parsel is a CSS+XPath selector library used by Scrapy. pip install parsel. Create: from parsel import Selector; sel = Selector(text=html). CSS: sel.css("div.product h2").get(). GetAll: sel.css("ul.menu a::text").getall(). XPath: sel.xpath("//h2[@class='title']/text()").get(). Attribute: sel.css("a.link::attr(href)").get(). Attrib: sel.attrib["href"]. Re: sel.css("div.price::text").re(r"\$[\d.]+"). Re first: sel.css(".price::text").re_first(r"\d+\.\d+"). Chaining: for item in sel.css("li.item"): name = item.css("span::text").get(). Nested: item_sel = sel.css("div.card")[0]. JSON: sel = Selector(text=json_str, type="json"); sel.jmespath("items[0].name").get(). Root: sel.root — lxml element. XML: Selector(text=xml, type="xml"). Drop: sel.css("script, style").drop() — remove unwanted tags. Response text: Selector(response.text). get() default: sel.css("missing").get(default="N/A"). Scrapy: built-in to response.css()/response.xpath(). Extract: .extract() older equivalent of .getall(). Claude Code generates parsel scrapers, structured data extractors, pagination crawlers, and Scrapy spider helpers.
CLAUDE.md for parsel
## parsel Stack
- Version: parsel >= 1.9 | pip install parsel
- Create: sel = Selector(text=html_string) | Selector(text=xml, type="xml")
- CSS: sel.css("div.title::text").get() | .getall() | .re(r"pattern") | .re_first(r"p")
- XPath: sel.xpath("//a/@href").getall()
- Attrib: sel.css("img").attrib["src"] | sel.css("a::attr(href)").get()
- Chain: for item in sel.css("li.product"): price = item.css(".price::text").get()
parsel Web Data Extraction Pipeline
# app/scrape.py — parsel CSS/XPath selectors, extractors, pagination, and structured data
from __future__ import annotations
import json
import re
from dataclasses import dataclass, asdict, field
from typing import Any, Generator
from parsel import Selector, SelectorList
# ─────────────────────────────────────────────────────────────────────────────
# 1. Selector factory
# ─────────────────────────────────────────────────────────────────────────────
def html_selector(
    html: str,
    base_url: str | None = None,
    remove_tags: list[str] | None = None,
) -> Selector:
    """
    Build a parsel Selector from an HTML string.

    base_url: forwarded to the Selector constructor.
    remove_tags: tag names dropped from the parsed tree before selecting,
        e.g. ["script", "style"].

    Example:
        sel = html_selector(response.text, base_url="https://example.com")
        titles = sel.css("h2.title::text").getall()
    """
    selector = Selector(text=html, base_url=base_url)
    if remove_tags:
        for tag_name in remove_tags:
            for node in selector.css(tag_name):
                node.drop()
    return selector
def xml_selector(xml: str) -> Selector:
    """Build an XML-mode parsel Selector (RSS/Atom feeds, sitemaps, ...)."""
    sel = Selector(text=xml, type="xml")
    return sel
def json_selector(json_str: str | dict | list) -> Selector:
    """
    Create a parsel Selector for JSON using JMESPath.

    Accepts a JSON string, or an already-parsed dict or list — a top-level
    JSON array is a valid document too (previously only dicts were
    serialized; a list was passed through raw and rejected by Selector).

    Example:
        sel = json_selector(api_response_text)
        name = sel.jmespath("user.name").get()
        tags = sel.jmespath("items[*].tag").getall()
    """
    if isinstance(json_str, (dict, list)):
        # Re-serialize parsed containers so Selector always sees text.
        json_str = json.dumps(json_str)
    return Selector(text=json_str, type="json")
# ─────────────────────────────────────────────────────────────────────────────
# 2. Text extraction helpers
# ─────────────────────────────────────────────────────────────────────────────
def get_text(
    sel: Selector | SelectorList,
    css: str | None = None,
    xpath: str | None = None,
    default: str = "",
    strip: bool = True,
) -> str:
    """
    Extract clean text from the first matching element.

    Exactly one of css/xpath is normally given; with neither, a bare
    "::text" query is tried when the object supports .css().

    Example:
        price = get_text(sel, css=".product-price::text", default="0.00")
        title = get_text(sel, xpath="//h1[@class='title']/text()")
    """
    if css:
        raw = sel.css(css).get(default=default)
    elif xpath:
        raw = sel.xpath(xpath).get(default=default)
    elif hasattr(sel, "css"):
        raw = sel.css("::text").get(default=default)
    else:
        raw = default
    if strip and raw:
        return raw.strip()
    return raw
def get_all_text(
    sel: Selector | SelectorList,
    css: str | None = None,
    xpath: str | None = None,
    strip: bool = True,
    join: str | None = None,
) -> list[str] | str:
    """
    Extract text from all matching elements.

    strip: drop whitespace-only entries and trim the rest.
    join: when given, return one string joined by this separator
        instead of a list.

    Example:
        bullets = get_all_text(sel, css="ul.features li::text")
        description = get_all_text(sel, css="p::text", join=" ")
    """
    if css:
        texts = sel.css(css).getall()
    elif xpath:
        texts = sel.xpath(xpath).getall()
    else:
        texts = []
    if strip:
        texts = [clean for t in texts if (clean := t.strip())]
    return texts if join is None else join.join(texts)
def get_attr(
    sel: Selector | SelectorList,
    css: str,
    attr: str,
    default: str = "",
) -> str:
    """
    Extract an attribute value from the first element matching *css*.

    Example:
        href = get_attr(sel, "a.next-page", "href")
        src = get_attr(sel, "img.hero", "src")
    """
    query = f"{css}::attr({attr})"
    return sel.css(query).get(default=default)
def get_all_attrs(
    sel: Selector | SelectorList,
    css: str,
    attr: str,
) -> list[str]:
    """Extract the *attr* value from every element matching *css*."""
    query = f"{css}::attr({attr})"
    return sel.css(query).getall()
def extract_number(
    sel: Selector | SelectorList,
    css: str,
    pattern: str = r"[\d,]+\.?\d*",
    conv: type = float,
    default: Any = None,
) -> Any:
    """
    Extract a number from matched text using a regex pattern.

    Thousands-separator commas are stripped before conversion; *default*
    is returned when nothing matches or conversion fails.

    Example:
        price = extract_number(sel, ".price::text")                 # 29.99
        count = extract_number(sel, ".review-count::text", conv=int)  # 142
    """
    matched = sel.css(css).re_first(pattern)
    if matched is None:
        return default
    cleaned = matched.replace(",", "")
    try:
        return conv(cleaned)
    except (ValueError, TypeError):
        return default
# ─────────────────────────────────────────────────────────────────────────────
# 3. Structured data extractors
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class Product:
    """Structured product record produced by extract_product()."""
    name: str  # product title from <h1>; "" when nothing matched
    price: float | None  # numeric price (commas stripped); None when absent/unparseable
    url: str  # root node's data-url attribute, else the caller-supplied base_url
    image_url: str  # src of the product image; "" when no <img> matched
    rating: float | None  # parsed from .rating aria-label or .stars text; None when absent
    reviews: int | None  # review count; None when absent
    sku: str  # itemprop='sku' text or data-sku attribute; "" when absent
    in_stock: bool  # True when an .in-stock / non-disabled .add-to-cart node exists
def extract_product(sel: Selector, base_url: str = "") -> Product:
    """
    Extract structured product data from a product-page Selector.

    CSS selectors follow common e-commerce patterns; each field tries a
    few selectors in turn and keeps the first truthy result.

    Args:
        sel: Selector positioned at the product page (or product root node).
        base_url: fallback product URL when the root node carries no
            data-url attribute.

    Example:
        sel = html_selector(product_page_html, base_url="https://shop.example.com")
        product = extract_product(sel)
    """
    name = (
        get_text(sel, css="h1.product-title::text") or
        get_text(sel, css="h1::text") or
        get_text(sel, xpath="//h1/text()")
    )
    # NOTE(review): a genuine price of 0 is falsy and falls through to the
    # next selector — acceptable for typical shops, but worth knowing.
    price = (
        extract_number(sel, ".price::text") or
        extract_number(sel, "[itemprop='price']::text") or
        extract_number(sel, ".product-price::text")
    )
    url = sel.attrib.get("data-url", "") or base_url
    image_url = get_attr(sel, "img.product-image", "src") or get_attr(sel, "img", "src")
    rating_text = get_text(sel, css=".rating::attr(aria-label)") or get_text(sel, css=".stars::text")
    # Run the rating regex once and reuse the match (previously the same
    # re.search executed twice on the same text).
    rating_match = re.search(r"[\d.]+", rating_text) if rating_text else None
    rating = float(rating_match.group()) if rating_match else None
    reviews = extract_number(sel, ".review-count::text", conv=int)
    sku = get_text(sel, css="[itemprop='sku']::text") or get_attr(sel, "[data-sku]", "data-sku")
    # Truthy when any matching availability node exists.
    in_stock = bool(sel.css(".in-stock, .add-to-cart:not(.disabled)"))
    return Product(name=name, price=price, url=url, image_url=image_url,
                   rating=rating, reviews=reviews, sku=sku, in_stock=in_stock)
def extract_listing(
    sel: Selector,
    item_css: str,
    fields: dict[str, str],
) -> list[dict]:
    """
    Extract a list of items from a listing page.

    fields maps output keys to per-item CSS selectors; ::text and
    ::attr(x) suffixes are supported.  Non-matching selectors yield "".

    Example:
        items = extract_listing(sel,
            item_css="div.product-card",
            fields={
                "name": "h3::text",
                "price": ".price::text",
                "href": "a::attr(href)",
                "img": "img::attr(src)",
            },
        )
    """
    return [
        {
            key: item.css(query).get(default="").strip()
            for key, query in fields.items()
        }
        for item in sel.css(item_css)
    ]
def extract_table(sel: Selector, css: str = "table") -> list[dict]:
    """
    Extract an HTML table to a list of dicts using the header row as keys.

    Only the first table matching *css* is parsed.  Rows shorter or longer
    than the header row are silently truncated by zip(); tables with no
    <th> cells fall back to positional string keys ("0", "1", ...).

    Example:
        rows = extract_table(sel, "table.data")
        for row in rows:
            print(row["Name"], row["Price"])
    """
    tables = sel.css(css)
    if not tables:
        return []
    table = tables[0]
    # ::text + .get() takes only the FIRST text node of each <th>; headers
    # containing nested markup may come back truncated.
    headers = [th.css("::text").get("").strip() for th in table.css("th")]
    results = []
    # Union selector covers both <tbody>-wrapped tables and bare ones.
    # NOTE(review): when a <tbody> exists, both branches can match the same
    # rows — presumably deduped by the underlying XPath node-set union;
    # verify against parsel/lxml before restructuring.
    for tr in table.css("tbody tr, tr:not(:first-child)"):
        cells = [td.css("::text").get("").strip() for td in tr.css("td")]
        # Skip the header row (no <td>) and rows whose cells are all empty.
        if cells and any(cells):
            if headers:
                row = dict(zip(headers, cells))
            else:
                row = {str(i): v for i, v in enumerate(cells)}
            results.append(row)
    return results
# ─────────────────────────────────────────────────────────────────────────────
# 4. Pagination and link helpers
# ─────────────────────────────────────────────────────────────────────────────
def get_next_page_url(
    sel: Selector,
    css: str = "a[rel='next'], .next-page a, .pagination .next a",
    base_url: str = "",
) -> str | None:
    """
    Find the next-page URL from common pagination patterns.

    Returns None when no pagination link matches.  When base_url is given
    and the href is relative, the absolute URL is returned; otherwise the
    raw attribute value.
    """
    link = sel.css(f"{css}::attr(href)").get()
    if not link:
        return None
    if not base_url or link.startswith("http"):
        return link
    from urllib.parse import urljoin
    return urljoin(base_url, link)
def get_all_links(
    sel: Selector,
    css: str = "a",
    base_url: str = "",
    filter_pattern: str | None = None,
) -> list[str]:
    """
    Extract all links matching an optional pattern.

    Empty hrefs and pure fragment links ("#...") are always dropped —
    previously they were dropped only when base_url was given, so the
    no-base_url path leaked "" and "#top" style entries.

    base_url: when given, hrefs are resolved to absolute URLs via urljoin.
    filter_pattern: regex applied (re.search) to the final href values.

    Example:
        product_links = get_all_links(sel, css="a.product-link",
                                      filter_pattern=r"/products/\d+")
    """
    hrefs = [
        h for h in sel.css(f"{css}::attr(href)").getall()
        if h and not h.startswith("#")
    ]
    if base_url:
        from urllib.parse import urljoin
        hrefs = [urljoin(base_url, h) for h in hrefs]
    if filter_pattern:
        hrefs = [h for h in hrefs if re.search(filter_pattern, h)]
    return hrefs
# ─────────────────────────────────────────────────────────────────────────────
# 5. RSS / sitemap helpers
# ─────────────────────────────────────────────────────────────────────────────
def parse_rss(xml: str) -> list[dict]:
    """
    Parse an RSS 2.0 feed into a list of item dicts.

    Each dict carries title, link, pub_date, description, and author
    (falling back to dc:creator); missing elements become "".

    Example:
        items = parse_rss(requests.get("https://blog.example.com/rss").text)
        for item in items: print(item["title"], item["link"])
    """
    # Namespaces commonly seen alongside plain RSS 2.0 elements.
    namespaces = {
        "dc": "http://purl.org/dc/elements/1.1/",
        "content": "http://purl.org/rss/1.0/modules/content/",
        "media": "http://search.yahoo.com/mrss/",
    }
    sel = xml_selector(xml)

    def first(node: Selector, path: str) -> str:
        # First matching text node, "" when absent.
        return node.xpath(path, namespaces=namespaces).get("")

    return [
        {
            "title": first(item, "title/text()"),
            "link": first(item, "link/text()"),
            "pub_date": first(item, "pubDate/text()"),
            "description": first(item, "description/text()"),
            "author": first(item, "author/text() | dc:creator/text()"),
        }
        for item in sel.xpath("//item")
    ]
def parse_sitemap(xml: str) -> list[str]:
    """
    Extract URLs from a sitemap.xml or sitemap index.

    Real-world sitemaps declare the default namespace
    http://www.sitemaps.org/schemas/sitemap/0.9, which made the previous
    un-namespaced //url/loc and //sitemap/loc queries match nothing.
    Matching on local-name() handles both namespaced and namespace-free
    documents.

    Example:
        urls = parse_sitemap(requests.get("https://example.com/sitemap.xml").text)
    """
    sel = xml_selector(xml)
    return sel.xpath(
        "//*[local-name()='url']/*[local-name()='loc']/text()"
        " | //*[local-name()='sitemap']/*[local-name()='loc']/text()"
    ).getall()
# ─────────────────────────────────────────────────────────────────────────────
# Demo
# ─────────────────────────────────────────────────────────────────────────────
# Demo / smoke run: exercises each helper family against an inline sample page.
if __name__ == "__main__":
    # Small product page covering every selector the extractors target.
    SAMPLE_HTML = """
<html><body>
<h1 class="product-title">Widget Pro X200</h1>
<span class="price">$29.99</span>
<div class="rating" aria-label="4.5 out of 5 stars"></div>
<span class="review-count">142 reviews</span>
<img class="product-image" src="/images/widget.jpg">
<span class="in-stock">In Stock</span>
<table class="specs">
<tr><th>Material</th><th>Weight</th><th>Color</th></tr>
<tr><td>Aluminum</td><td>120g</td><td>Silver</td></tr>
<tr><td>Plastic</td><td>90g</td><td>Black</td></tr>
</table>
<ul class="features">
<li>Waterproof</li><li>USB-C charging</li><li>Bluetooth 5.2</li>
</ul>
<div class="related">
<a class="product-link" href="/products/101">Widget Mini</a>
<a class="product-link" href="/products/205">Widget Max</a>
</div>
</body></html>
"""
    sel = html_selector(SAMPLE_HTML, base_url="https://shop.example.com")
    # Structured product scrape (name/price/rating/reviews/stock).
    print("=== Product extraction ===")
    product = extract_product(sel, base_url="https://shop.example.com/products/x200")
    print(f" Name: {product.name}")
    print(f" Price: {product.price}")
    print(f" Rating: {product.rating}")
    print(f" Reviews: {product.reviews}")
    print(f" In stock: {product.in_stock}")
    # Header-keyed rows from the specs table.
    print("\n=== Table extraction ===")
    table_rows = extract_table(sel, "table.specs")
    for row in table_rows:
        print(f" {row}")
    # Bullet text list.
    print("\n=== Text list ===")
    features = get_all_text(sel, css="ul.features li::text")
    print(f" Features: {features}")
    # Absolute product links, regex-filtered.
    print("\n=== Links ===")
    links = get_all_links(sel, css="a.product-link",
                          base_url="https://shop.example.com",
                          filter_pattern=r"/products/\d+")
    print(f" Product links: {links}")
    # JMESPath queries over a parsed API payload.
    print("\n=== JSON selector ===")
    api_response = {"user": {"name": "Alice", "score": 42}, "items": [{"tag": "python"}, {"tag": "parsel"}]}
    jsel = json_selector(api_response)
    name = jsel.jmespath("user.name").get()
    tags = jsel.jmespath("items[*].tag").getall()
    print(f" User: {name}, tags: {tags}")
For the BeautifulSoup4 alternative — BeautifulSoup4 has excellent tolerance for malformed HTML and a beginner-friendly API (soup.find("div", class_="title")); parsel supports both CSS selectors and XPath with a consistent .get()/.getall() API inherited from Scrapy, avoids the need to navigate NavigableString objects, and has built-in regex (.re()) and JSON (.jmespath()) support — use BeautifulSoup4 for lenient parsing of broken HTML, parsel when you want XPath power plus CSS convenience in a unified interface. For the lxml alternative — lxml provides the underlying libxml2 parser with full XPath 1.0 support; parsel wraps lxml (and cssselect) into a higher-level developer-friendly API with method chaining, .getall() list returns, and .re() inline regex — use parsel when you want Scrapy-style concise extraction, lxml when you need schema validation, XSLT transforms, or streaming iterparse. The Claude Skills 360 bundle includes parsel skill sets covering html_selector()/xml_selector()/json_selector() factory, get_text()/get_all_text() with CSS/XPath, get_attr()/get_all_attrs(), extract_number() with regex coercion, extract_product() structured scraper, extract_listing() dict field map, extract_table() HTML table parser, get_next_page_url() pagination, get_all_links() with regex filter, parse_rss() feed parser, and parse_sitemap() URL extractor. Start with the free tier to try CSS XPath data extraction code generation.