parsel is a CSS+XPath selector library used by Scrapy. pip install parsel. Create: from parsel import Selector; sel = Selector(text=html). CSS: sel.css("div.product h2").get(). GetAll: sel.css("ul.menu a::text").getall(). XPath: sel.xpath("//h2[@class='title']/text()").get(). Attribute: sel.css("a.link::attr(href)").get(). Attrib: sel.attrib["href"]. Re: sel.css("div.price::text").re(r"\$[\d.]+"). Re first: sel.css(".price::text").re_first(r"\d+\.\d+"). Chaining: for item in sel.css("li.item"): name = item.css("span::text").get(). Nested: item_sel = sel.css("div.card")[0]. JSON: sel = Selector(text=json_str, type="json"); sel.jmespath("items[0].name").get(). Root: sel.root — lxml element. XML: Selector(text=xml, type="xml"). Drop: sel.css("script, style").drop() — remove unwanted tags. Response text: Selector(response.text). get() default: sel.css("missing").get(default="N/A"). Scrapy: built-in to response.css()/response.xpath(). Extract: .extract() older equivalent of .getall(). Claude Code generates parsel scrapers, structured data extractors, pagination crawlers, and Scrapy spider helpers.
CLAUDE.md for parsel
## parsel Stack
- Version: parsel >= 1.9 | pip install parsel
- Create: sel = Selector(text=html_string) | Selector(text=xml, type="xml")
- CSS: sel.css("div.title::text").get() | .getall() | .re(r"pattern") | .re_first(r"p")
- XPath: sel.xpath("//a/@href").getall()
- Attrib: sel.css("img").attrib["src"] | sel.css("a::attr(href)").get()
- Chain: for item in sel.css("li.product"): price = item.css(".price::text").get()
parsel Web Data Extraction Pipeline
# app/scrape.py — parsel CSS/XPath selectors, extractors, pagination, and structured data
from __future__ import annotations
import json
import re
from dataclasses import dataclass, asdict, field
from typing import Any, Generator
from parsel import Selector, SelectorList
# ─────────────────────────────────────────────────────────────────────────────
# 1. Selector factory
# ─────────────────────────────────────────────────────────────────────────────
def html_selector(
    html: str,
    base_url: str | None = None,
    remove_tags: list[str] | None = None,
) -> Selector:
    """
    Build a parsel Selector from an HTML string.

    base_url: forwarded to the Selector constructor.
    remove_tags: tag names dropped from the parsed tree before selecting,
        e.g. ["script", "style"].

    Example:
        sel = html_selector(response.text, base_url="https://example.com")
        titles = sel.css("h2.title::text").getall()
    """
    selector = Selector(text=html, base_url=base_url)
    if remove_tags:
        for tag_name in remove_tags:
            for node in selector.css(tag_name):
                node.drop()
    return selector
def xml_selector(xml: str) -> Selector:
    """Build an XML-mode parsel Selector (RSS/Atom feeds, sitemaps, ...)."""
    sel = Selector(text=xml, type="xml")
    return sel
def json_selector(json_str: str | dict | list) -> Selector:
    """
    Create a parsel Selector for JSON using JMESPath.

    Accepts a JSON string, or an already-parsed dict or list — a top-level
    JSON array is a valid document too (previously only dicts were
    serialized; a list was passed through raw and rejected by Selector).

    Example:
        sel = json_selector(api_response_text)
        name = sel.jmespath("user.name").get()
        tags = sel.jmespath("items[*].tag").getall()
    """
    if isinstance(json_str, (dict, list)):
        # Re-serialize parsed containers so Selector always sees text.
        json_str = json.dumps(json_str)
    return Selector(text=json_str, type="json")
# ─────────────────────────────────────────────────────────────────────────────
# 2. Text extraction helpers
# ─────────────────────────────────────────────────────────────────────────────
def get_text(
    sel: Selector | SelectorList,
    css: str | None = None,
    xpath: str | None = None,
    default: str = "",
    strip: bool = True,
) -> str:
    """
    Extract clean text from the first matching element.

    Exactly one of css/xpath is normally given; with neither, a bare
    "::text" query is tried when the object supports .css().

    Example:
        price = get_text(sel, css=".product-price::text", default="0.00")
        title = get_text(sel, xpath="//h1[@class='title']/text()")
    """
    if css:
        raw = sel.css(css).get(default=default)
    elif xpath:
        raw = sel.xpath(xpath).get(default=default)
    elif hasattr(sel, "css"):
        raw = sel.css("::text").get(default=default)
    else:
        raw = default
    if strip and raw:
        return raw.strip()
    return raw
def get_all_text(
    sel: Selector | SelectorList,
    css: str | None = None,
    xpath: str | None = None,
    strip: bool = True,
    join: str | None = None,
) -> list[str] | str:
    """
    Extract text from all matching elements.

    strip: drop whitespace-only entries and trim the rest.
    join: when given, return one string joined by this separator
        instead of a list.

    Example:
        bullets = get_all_text(sel, css="ul.features li::text")
        description = get_all_text(sel, css="p::text", join=" ")
    """
    if css:
        texts = sel.css(css).getall()
    elif xpath:
        texts = sel.xpath(xpath).getall()
    else:
        texts = []
    if strip:
        texts = [clean for t in texts if (clean := t.strip())]
    return texts if join is None else join.join(texts)
def get_attr(
    sel: Selector | SelectorList,
    css: str,
    attr: str,
    default: str = "",
) -> str:
    """
    Extract an attribute value from the first element matching *css*.

    Example:
        href = get_attr(sel, "a.next-page", "href")
        src = get_attr(sel, "img.hero", "src")
    """
    query = f"{css}::attr({attr})"
    return sel.css(query).get(default=default)
def get_all_attrs(
    sel: Selector | SelectorList,
    css: str,
    attr: str,
) -> list[str]:
    """Extract the *attr* value from every element matching *css*."""
    query = f"{css}::attr({attr})"
    return sel.css(query).getall()
def extract_number(
    sel: Selector | SelectorList,
    css: str,
    pattern: str = r"[\d,]+\.?\d*",
    conv: type = float,
    default: Any = None,
) -> Any:
    """
    Extract a number from matched text using a regex pattern.

    Thousands-separator commas are stripped before conversion; *default*
    is returned when nothing matches or conversion fails.

    Example:
        price = extract_number(sel, ".price::text")                 # 29.99
        count = extract_number(sel, ".review-count::text", conv=int)  # 142
    """
    matched = sel.css(css).re_first(pattern)
    if matched is None:
        return default
    cleaned = matched.replace(",", "")
    try:
        return conv(cleaned)
    except (ValueError, TypeError):
        return default
# ─────────────────────────────────────────────────────────────────────────────
# 3. Structured data extractors
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class Product:
    """Structured product record produced by extract_product()."""
    name: str  # product title from <h1>; "" when nothing matched
    price: float | None  # numeric price (commas stripped); None when absent/unparseable
    url: str  # root node's data-url attribute, else the caller-supplied base_url
    image_url: str  # src of the product image; "" when no <img> matched
    rating: float | None  # parsed from .rating aria-label or .stars text; None when absent
    reviews: int | None  # review count; None when absent
    sku: str  # itemprop='sku' text or data-sku attribute; "" when absent
    in_stock: bool  # True when an .in-stock / non-disabled .add-to-cart node exists
def extract_product(sel: Selector, base_url: str = "") -> Product:
    """
    Extract structured product data from a product-page Selector.

    CSS selectors follow common e-commerce patterns; each field tries a
    few selectors in turn and keeps the first truthy result.

    Args:
        sel: Selector positioned at the product page (or product root node).
        base_url: fallback product URL when the root node carries no
            data-url attribute.

    Example:
        sel = html_selector(product_page_html, base_url="https://shop.example.com")
        product = extract_product(sel)
    """
    name = (
        get_text(sel, css="h1.product-title::text") or
        get_text(sel, css="h1::text") or
        get_text(sel, xpath="//h1/text()")
    )
    # NOTE(review): a genuine price of 0 is falsy and falls through to the
    # next selector — acceptable for typical shops, but worth knowing.
    price = (
        extract_number(sel, ".price::text") or
        extract_number(sel, "[itemprop='price']::text") or
        extract_number(sel, ".product-price::text")
    )
    url = sel.attrib.get("data-url", "") or base_url
    image_url = get_attr(sel, "img.product-image", "src") or get_attr(sel, "img", "src")
    rating_text = get_text(sel, css=".rating::attr(aria-label)") or get_text(sel, css=".stars::text")
    # Run the rating regex once and reuse the match (previously the same
    # re.search executed twice on the same text).
    rating_match = re.search(r"[\d.]+", rating_text) if rating_text else None
    rating = float(rating_match.group()) if rating_match else None
    reviews = extract_number(sel, ".review-count::text", conv=int)
    sku = get_text(sel, css="[itemprop='sku']::text") or get_attr(sel, "[data-sku]", "data-sku")
    # Truthy when any matching availability node exists.
    in_stock = bool(sel.css(".in-stock, .add-to-cart:not(.disabled)"))
    return Product(name=name, price=price, url=url, image_url=image_url,
                   rating=rating, reviews=reviews, sku=sku, in_stock=in_stock)
def extract_listing(
    sel: Selector,
    item_css: str,
    fields: dict[str, str],
) -> list[dict]:
    """
    Extract a list of items from a listing page.

    fields maps output keys to per-item CSS selectors; ::text and
    ::attr(x) suffixes are supported.  Non-matching selectors yield "".

    Example:
        items = extract_listing(sel,
            item_css="div.product-card",
            fields={
                "name": "h3::text",
                "price": ".price::text",
                "href": "a::attr(href)",
                "img": "img::attr(src)",
            },
        )
    """
    return [
        {
            key: item.css(query).get(default="").strip()
            for key, query in fields.items()
        }
        for item in sel.css(item_css)
    ]
def extract_table(sel: Selector, css: str = "table") -> list[dict]:
    """
    Extract an HTML table to a list of dicts using the header row as keys.

    Only the first table matching *css* is parsed.  Rows shorter or longer
    than the header row are silently truncated by zip(); tables with no
    <th> cells fall back to positional string keys ("0", "1", ...).

    Example:
        rows = extract_table(sel, "table.data")
        for row in rows:
            print(row["Name"], row["Price"])
    """
    tables = sel.css(css)
    if not tables:
        return []
    table = tables[0]
    # ::text + .get() takes only the FIRST text node of each <th>; headers
    # containing nested markup may come back truncated.
    headers = [th.css("::text").get("").strip() for th in table.css("th")]
    results = []
    # Union selector covers both <tbody>-wrapped tables and bare ones.
    # NOTE(review): when a <tbody> exists, both branches can match the same
    # rows — presumably deduped by the underlying XPath node-set union;
    # verify against parsel/lxml before restructuring.
    for tr in table.css("tbody tr, tr:not(:first-child)"):
        cells = [td.css("::text").get("").strip() for td in tr.css("td")]
        # Skip the header row (no <td>) and rows whose cells are all empty.
        if cells and any(cells):
            if headers:
                row = dict(zip(headers, cells))
            else:
                row = {str(i): v for i, v in enumerate(cells)}
            results.append(row)
    return results
# ─────────────────────────────────────────────────────────────────────────────
# 4. Pagination and link helpers
# ─────────────────────────────────────────────────────────────────────────────
def get_next_page_url(
    sel: Selector,
    css: str = "a[rel='next'], .next-page a, .pagination .next a",
    base_url: str = "",
) -> str | None:
    """
    Find the next-page URL from common pagination patterns.

    Returns None when no pagination link matches.  When base_url is given
    and the href is relative, the absolute URL is returned; otherwise the
    raw attribute value.
    """
    link = sel.css(f"{css}::attr(href)").get()
    if not link:
        return None
    if not base_url or link.startswith("http"):
        return link
    from urllib.parse import urljoin
    return urljoin(base_url, link)
def get_all_links(
    sel: Selector,
    css: str = "a",
    base_url: str = "",
    filter_pattern: str | None = None,
) -> list[str]:
    """
    Extract all links matching an optional pattern.

    Empty hrefs and pure fragment links ("#...") are always dropped —
    previously they were dropped only when base_url was given, so the
    no-base_url path leaked "" and "#top" style entries.

    base_url: when given, hrefs are resolved to absolute URLs via urljoin.
    filter_pattern: regex applied (re.search) to the final href values.

    Example:
        product_links = get_all_links(sel, css="a.product-link",
                                      filter_pattern=r"/products/\d+")
    """
    hrefs = [
        h for h in sel.css(f"{css}::attr(href)").getall()
        if h and not h.startswith("#")
    ]
    if base_url:
        from urllib.parse import urljoin
        hrefs = [urljoin(base_url, h) for h in hrefs]
    if filter_pattern:
        hrefs = [h for h in hrefs if re.search(filter_pattern, h)]
    return hrefs
# ─────────────────────────────────────────────────────────────────────────────
# 5. RSS / sitemap helpers
# ─────────────────────────────────────────────────────────────────────────────
def parse_rss(xml: str) -> list[dict]:
    """
    Parse an RSS 2.0 feed into a list of item dicts.

    Each dict carries title, link, pub_date, description, and author
    (falling back to dc:creator); missing elements become "".

    Example:
        items = parse_rss(requests.get("https://blog.example.com/rss").text)
        for item in items: print(item["title"], item["link"])
    """
    # Namespaces commonly seen alongside plain RSS 2.0 elements.
    namespaces = {
        "dc": "http://purl.org/dc/elements/1.1/",
        "content": "http://purl.org/rss/1.0/modules/content/",
        "media": "http://search.yahoo.com/mrss/",
    }
    sel = xml_selector(xml)

    def first(node: Selector, path: str) -> str:
        # First matching text node, "" when absent.
        return node.xpath(path, namespaces=namespaces).get("")

    return [
        {
            "title": first(item, "title/text()"),
            "link": first(item, "link/text()"),
            "pub_date": first(item, "pubDate/text()"),
            "description": first(item, "description/text()"),
            "author": first(item, "author/text() | dc:creator/text()"),
        }
        for item in sel.xpath("//item")
    ]
def parse_sitemap(xml: str) -> list[str]:
    """
    Extract URLs from a sitemap.xml or sitemap index.

    Real-world sitemaps declare the default namespace
    http://www.sitemaps.org/schemas/sitemap/0.9, which made the previous
    un-namespaced //url/loc and //sitemap/loc queries match nothing.
    Matching on local-name() handles both namespaced and namespace-free
    documents.

    Example:
        urls = parse_sitemap(requests.get("https://example.com/sitemap.xml").text)
    """
    sel = xml_selector(xml)
    return sel.xpath(
        "//*[local-name()='url']/*[local-name()='loc']/text()"
        " | //*[local-name()='sitemap']/*[local-name()='loc']/text()"
    ).getall()
# ─────────────────────────────────────────────────────────────────────────────
# Demo
# ─────────────────────────────────────────────────────────────────────────────
# Demo / smoke run: exercises each helper family against an inline sample page.
if __name__ == "__main__":
    # Small product page covering every selector the extractors target.
    SAMPLE_HTML = """
<html><body>
<h1 class="product-title">Widget Pro X200</h1>
<span class="price">$29.99</span>
<div class="rating" aria-label="4.5 out of 5 stars"></div>
<span class="review-count">142 reviews</span>
<img class="product-image" src="/images/widget.jpg">
<span class="in-stock">In Stock</span>
<table class="specs">
<tr><th>Material</th><th>Weight</th><th>Color</th></tr>
<tr><td>Aluminum</td><td>120g</td><td>Silver</td></tr>
<tr><td>Plastic</td><td>90g</td><td>Black</td></tr>
</table>
<ul class="features">
<li>Waterproof</li><li>USB-C charging</li><li>Bluetooth 5.2</li>
</ul>
<div class="related">
<a class="product-link" href="/products/101">Widget Mini</a>
<a class="product-link" href="/products/205">Widget Max</a>
</div>
</body></html>
"""
    sel = html_selector(SAMPLE_HTML, base_url="https://shop.example.com")
    # Structured product scrape (name/price/rating/reviews/stock).
    print("=== Product extraction ===")
    product = extract_product(sel, base_url="https://shop.example.com/products/x200")
    print(f" Name: {product.name}")
    print(f" Price: {product.price}")
    print(f" Rating: {product.rating}")
    print(f" Reviews: {product.reviews}")
    print(f" In stock: {product.in_stock}")
    # Header-keyed rows from the specs table.
    print("\n=== Table extraction ===")
    table_rows = extract_table(sel, "table.specs")
    for row in table_rows:
        print(f" {row}")
    # Bullet text list.
    print("\n=== Text list ===")
    features = get_all_text(sel, css="ul.features li::text")
    print(f" Features: {features}")
    # Absolute product links, regex-filtered.
    print("\n=== Links ===")
    links = get_all_links(sel, css="a.product-link",
                          base_url="https://shop.example.com",
                          filter_pattern=r"/products/\d+")
    print(f" Product links: {links}")
    # JMESPath queries over a parsed API payload.
    print("\n=== JSON selector ===")
    api_response = {"user": {"name": "Alice", "score": 42}, "items": [{"tag": "python"}, {"tag": "parsel"}]}
    jsel = json_selector(api_response)
    name = jsel.jmespath("user.name").get()
    tags = jsel.jmespath("items[*].tag").getall()
    print(f" User: {name}, tags: {tags}")
For the BeautifulSoup4 alternative — BeautifulSoup4 has excellent tolerance for malformed HTML and a beginner-friendly API (soup.find("div", class_="title")); parsel supports both CSS selectors and XPath with a consistent .get()/.getall() API inherited from Scrapy, avoids the need to navigate NavigableString objects, and has built-in regex (.re()) and JSON (.jmespath()) support — use BeautifulSoup4 for lenient parsing of broken HTML, parsel when you want XPath power plus CSS convenience in a unified interface. For the lxml alternative — lxml provides the underlying libxml2 parser with full XPath 1.0 support; parsel wraps lxml (and cssselect) into a higher-level developer-friendly API with method chaining, .getall() list returns, and .re() inline regex — use parsel when you want Scrapy-style concise extraction, lxml when you need schema validation, XSLT transforms, or streaming iterparse. The Claude Skills 360 bundle includes parsel skill sets covering html_selector()/xml_selector()/json_selector() factory, get_text()/get_all_text() with CSS/XPath, get_attr()/get_all_attrs(), extract_number() with regex coercion, extract_product() structured scraper, extract_listing() dict field map, extract_table() HTML table parser, get_next_page_url() pagination, get_all_links() with regex filter, parse_rss() feed parser, and parse_sitemap() URL extractor. Start with the free tier to try CSS XPath data extraction code generation.