Python’s urllib.robotparser module parses robots.txt files and answers whether a given user-agent may fetch a given URL. from urllib.robotparser import RobotFileParser. Create and fetch: rp = RobotFileParser(); rp.set_url("https://example.com/robots.txt"); rp.read() — fetches and parses; or rp.parse(lines) for pre-fetched content. Query: rp.can_fetch("*", url) → bool — "*" tests the wildcard agent; rp.can_fetch("Googlebot", url) → bool — agent-specific rule. Rate hints: rp.crawl_delay("*") → int | None — seconds to wait between requests; rp.request_rate("*") → RequestRate(requests=N, seconds=M) | None — N requests per M seconds. Freshness: rp.mtime() → float (time.time() of last read); rp.modified() → sets mtime to now (for cache freshness tracking). Site-specific: agent names are case-insensitive; the stdlib parser applies rules in file order (first matching prefix wins — not the RFC 9309 longest-match rule); Disallow: / blocks everything; an Allow: line overrides a Disallow: only when it precedes it in the file; * wildcards in paths are not supported by the stdlib parser (paths match as literal prefixes). Sitemaps: rp.site_maps() → list[str] | None returns the file's Sitemap: URLs (stdlib since Python 3.8). Claude Code generates polite crawlers, site scraper guards, fetch policy validators, and crawl rate limiters.
CLAUDE.md for urllib.robotparser
## urllib.robotparser Stack
- Stdlib: from urllib.robotparser import RobotFileParser
- Fetch: rp = RobotFileParser()
- rp.set_url("https://example.com/robots.txt")
- rp.read() # HTTP GET
- Query: rp.can_fetch("*", url) # True = allowed
- rp.can_fetch("MyCrawler", url)
- Rate: delay = rp.crawl_delay("*") # int seconds or None
- rate = rp.request_rate("*") # RequestRate(N, M) or None
- Parse: rp.parse(["User-agent: *", "Disallow: /admin"])
urllib.robotparser Robots.txt Pipeline
# app/robotsutil.py — fetch, cache, check, rate-limit, policy report
from __future__ import annotations
import time
import urllib.error
import urllib.parse
import urllib.request
from dataclasses import dataclass, field
from pathlib import Path
from urllib.robotparser import RobotFileParser
# ─────────────────────────────────────────────────────────────────────────────
# 1. Fetch and cache helpers
# ─────────────────────────────────────────────────────────────────────────────
def robots_url_for(site_url: str) -> str:
    """
    Build the canonical robots.txt URL for any URL on a site.

    Example:
        robots_url_for("https://example.com/path/page")
        # 'https://example.com/robots.txt'
    """
    parts = urllib.parse.urlsplit(site_url)
    return f"{parts.scheme}://{parts.netloc}/robots.txt"
def fetch_robots(
    robots_url: str,
    user_agent: str = "Mozilla/5.0 (compatible; PythonCrawler/1.0)",
    timeout: int = 10,
) -> RobotFileParser:
    """
    Fetch and parse a robots.txt file from a URL.

    Mirrors the error semantics of RobotFileParser.read():
      * HTTP 401/403 -> everything disallowed (site forbids robots access)
      * other 4xx (e.g. 404 "no robots.txt") -> everything allowed
      * unreachable host / timeout / bad URL -> everything allowed (best effort)

    Returns a RobotFileParser ready for can_fetch() queries.

    Example:
        rp = fetch_robots("https://example.com/robots.txt")
        print(rp.can_fetch("*", "https://example.com/private"))
    """
    rp = RobotFileParser()
    rp.set_url(robots_url)
    try:
        req = urllib.request.Request(
            robots_url,
            headers={"User-Agent": user_agent},
        )
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            content = resp.read().decode("utf-8", errors="replace")
        rp.parse(content.splitlines())
        rp.modified()
    except urllib.error.HTTPError as err:
        # Same policy as RobotFileParser.read(): an auth error means the
        # site refuses robots access entirely; any other 4xx means the
        # file simply does not exist, so no rules apply.
        if err.code in (401, 403):
            rp.parse(["User-agent: *", "Disallow: /"])
        else:
            rp.parse(["User-agent: *", "Allow: /"])
    except Exception:
        # Network failure or malformed URL -> allow all (best effort).
        rp.parse(["User-agent: *", "Allow: /"])
    return rp
def parse_robots_text(text: str) -> RobotFileParser:
    """
    Build a RobotFileParser from an in-memory robots.txt string (no HTTP).

    Example:
        rp = parse_robots_text('''
        User-agent: *
        Disallow: /admin
        Crawl-delay: 2
        ''')
    """
    parser = RobotFileParser()
    parser.parse(text.splitlines())
    return parser
# ─────────────────────────────────────────────────────────────────────────────
# 2. Per-domain robot cache
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class RobotCache:
    """
    Per-origin cache of parsed RobotFileParser objects with a TTL.

    Each origin (scheme + netloc) triggers at most one robots.txt fetch
    per `ttl_seconds` window; stale entries are refetched on demand.
    """
    ttl_seconds: int = 3600
    # origin -> (parser, unix timestamp of the fetch)
    _cache: dict[str, tuple[RobotFileParser, float]] = field(
        default_factory=dict, repr=False
    )

    def _origin(self, url: str) -> str:
        split = urllib.parse.urlparse(url)
        return f"{split.scheme}://{split.netloc}"

    def get(self, url: str) -> RobotFileParser:
        """
        Return a (possibly cached) RobotFileParser for the URL's origin.

        Example:
            cache = RobotCache(ttl_seconds=300)
            rp = cache.get("https://example.com/page")
            print(rp.can_fetch("*", "https://example.com/page"))
        """
        origin = self._origin(url)
        cached = self._cache.get(origin)
        if cached is not None:
            parser, stamp = cached
            if time.time() - stamp < self.ttl_seconds:
                return parser
        parser = fetch_robots(f"{origin}/robots.txt")
        self._cache[origin] = (parser, time.time())
        return parser

    def can_fetch(self, user_agent: str, url: str) -> bool:
        """True if cached robots.txt for url's origin permits user_agent."""
        return self.get(url).can_fetch(user_agent, url)

    def crawl_delay(self, url: str, user_agent: str = "*") -> float:
        """Crawl delay in seconds for the URL's origin, or 0.0 if unset."""
        parser = self.get(url)
        delay = parser.crawl_delay(user_agent)
        if delay is not None:
            return float(delay)
        rate = parser.request_rate(user_agent)
        if rate is not None:
            # Derive an average spacing from "N requests per M seconds".
            return rate.seconds / max(rate.requests, 1)
        return 0.0

    def clear(self) -> None:
        """Drop every cached entry."""
        self._cache.clear()
# ─────────────────────────────────────────────────────────────────────────────
# 3. Policy report
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class RobotPolicy:
    """Human-readable summary of a robots.txt policy for one user-agent."""
    origin: str
    agent: str
    crawl_delay: float | None  # seconds between requests, if declared
    request_rate: str | None  # formatted "N/M sec" string, if declared
    test_results: list[tuple[str, bool]]  # (url, allowed) pairs

    def __str__(self) -> str:
        out = [f"Origin: {self.origin} Agent: {self.agent}"]
        if self.crawl_delay is not None:
            out.append(f" Crawl-delay: {self.crawl_delay}s")
        if self.request_rate:
            out.append(f" Request-rate: {self.request_rate}")
        out.extend(
            f" {'✓' if ok else '✗'} {url}" for url, ok in self.test_results
        )
        return "\n".join(out)
def inspect_policy(
    rp: RobotFileParser,
    origin: str,
    agent: str,
    test_paths: list[str],
) -> RobotPolicy:
    """
    Build a RobotPolicy report for a user-agent against a parsed robots.txt.

    Example:
        rp = parse_robots_text("User-agent: *\\nDisallow: /admin")
        report = inspect_policy(rp, "https://example.com", "*",
                                ["/", "/admin", "/about"])
        print(report)
    """
    delay = rp.crawl_delay(agent)
    rate = rp.request_rate(agent)
    checks = [
        (full_url, rp.can_fetch(agent, full_url))
        for full_url in (urllib.parse.urljoin(origin, p) for p in test_paths)
    ]
    return RobotPolicy(
        origin=origin,
        agent=agent,
        crawl_delay=None if delay is None else float(delay),
        request_rate=None if rate is None else f"{rate.requests}/{rate.seconds}s",
        test_results=checks,
    )
# ─────────────────────────────────────────────────────────────────────────────
# 4. Polite crawl rate limiter
# ─────────────────────────────────────────────────────────────────────────────
class PoliteCrawler:
    """
    Robots-aware fetch gatekeeper with per-origin crawl-delay pacing.

    Example:
        crawler = PoliteCrawler("MyCrawler/1.0", default_delay=1.0)
        for url in urls:
            if crawler.may_fetch(url):
                crawler.wait(url)
                content = fetch(url)
                crawler.record_fetch(url)
    """

    def __init__(
        self,
        user_agent: str,
        default_delay: float = 1.0,
        robots_ttl: int = 3600,
    ):
        self.user_agent = user_agent
        self.default_delay = default_delay
        self._cache = RobotCache(ttl_seconds=robots_ttl)
        # origin -> unix timestamp of the most recent recorded fetch
        self._last_fetch: dict[str, float] = {}

    def _origin(self, url: str) -> str:
        parts = urllib.parse.urlparse(url)
        return f"{parts.scheme}://{parts.netloc}"

    def may_fetch(self, url: str) -> bool:
        """True when robots.txt permits this user-agent to fetch url."""
        return self._cache.can_fetch(self.user_agent, url)

    def seconds_to_wait(self, url: str) -> float:
        """How long to sleep before fetching url (honours Crawl-delay)."""
        required = self._cache.crawl_delay(url, self.user_agent) or self.default_delay
        since_last = time.time() - self._last_fetch.get(self._origin(url), 0.0)
        return max(0.0, required - since_last)

    def wait(self, url: str) -> None:
        """Block until the origin's crawl delay has elapsed."""
        pause = self.seconds_to_wait(url)
        if pause > 0:
            time.sleep(pause)

    def record_fetch(self, url: str) -> None:
        """Mark url's origin as fetched now, restarting its delay window."""
        self._last_fetch[self._origin(url)] = time.time()
# ─────────────────────────────────────────────────────────────────────────────
# Demo
# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
print("=== urllib.robotparser demo ===")
# ── parse_robots_text ─────────────────────────────────────────────────────
print("\n--- parse_robots_text ---")
robots_txt = """\
User-agent: *
Disallow: /admin/
Disallow: /private/
Crawl-delay: 2
Allow: /public/
User-agent: Googlebot
Disallow:
"""
rp = parse_robots_text(robots_txt)
origin = "https://example.com"
test_paths = ["/", "/about", "/admin/settings", "/private/data",
"/public/docs", "/admin/login"]
for path in test_paths:
url = f"{origin}{path}"
allowed = rp.can_fetch("*", url)
google_ok = rp.can_fetch("Googlebot", url)
print(f" {'✓' if allowed else '✗'} * | {'✓' if google_ok else '✗'} Googlebot {path}")
# ── inspect_policy ────────────────────────────────────────────────────────
print("\n--- inspect_policy ---")
report = inspect_policy(rp, origin, "*", test_paths[:5])
print(report)
# ── crawl_delay / request_rate ─────────────────────────────────────────────
print("\n--- crawl_delay / request_rate ---")
print(f" crawl_delay(*): {rp.crawl_delay('*')}")
# Robots with Request-rate
rp2 = parse_robots_text("User-agent: *\nRequest-rate: 3/10\nDisallow: /secret")
rate = rp2.request_rate("*")
print(f" request_rate(*): {rate}")
# ── robots_url_for ─────────────────────────────────────────────────────────
print("\n--- robots_url_for ---")
for url in [
"https://example.com/blog/post?id=1",
"https://shop.example.com/products",
"http://api.example.org/v2/users",
]:
print(f" {url}")
print(f" → {robots_url_for(url)}")
# ── PoliteCrawler demo ────────────────────────────────────────────────────
print("\n--- PoliteCrawler ---")
crawler = PoliteCrawler("TestBot/1.0", default_delay=0.5)
urls = [
f"{origin}/",
f"{origin}/about",
f"{origin}/admin/secret",
f"{origin}/public/page",
]
for url in urls:
# Use local rp instead of fetching
allowed = rp.can_fetch("*", url)
print(f" {'fetch' if allowed else 'skip ':5s} {url}")
print("\n=== done ===")
For the scrapy (PyPI) alternative — Scrapy’s RobotsTxtMiddleware automatically fetches, caches, and enforces robots.txt rules for every spider request via ROBOTSTXT_OBEY = True, with per-domain queuing and rate limiting built in — use Scrapy when running a full-featured production crawler; use urllib.robotparser for lightweight scripts, one-off scrapers, or any situation where Scrapy’s process model and configuration overhead is too heavy. For the reppy / robotexclusionrulesparser (PyPI) alternatives — these third-party parsers handle the RFC 9309 extended syntax (wildcards *, end-of-URL $) and longest-match rule precedence more completely than the stdlib, whose parser matches literal prefixes in file order (the stdlib does expose Sitemap: URLs via site_maps() since Python 3.8) — use a third-party parser for production crawlers that need strict RFC 9309 compliance; use urllib.robotparser for basic allow/disallow checking where the extended syntax is not required. The Claude Skills 360 bundle includes urllib.robotparser skill sets covering robots_url_for()/fetch_robots()/parse_robots_text() fetch helpers, RobotCache TTL-based per-origin cache, RobotPolicy with inspect_policy() report generator, and PoliteCrawler with may_fetch()/wait()/record_fetch() polite crawl rate limiter. Start with the free tier to try robots.txt patterns and urllib.robotparser pipeline code generation.