pdfplumber extracts tables and text with precise layout information from PDFs. pip install pdfplumber. Open: import pdfplumber; with pdfplumber.open("file.pdf") as pdf:. pdf.pages — list of pages. pdf.pages[0] — first page. Text: page.extract_text() — layout-aware text. page.extract_text(x_tolerance=3, y_tolerance=3). Words: page.extract_words() — list of word dicts with x0, x1, top, bottom. Lines: page.extract_text_lines() — words grouped by line. Tables: page.extract_table() — first table as list of rows. page.extract_tables() — all tables. Table settings: page.extract_table(table_settings={"vertical_strategy":"lines","horizontal_strategy":"lines"}). Crop: page.crop((x0, top, x1, bottom)) — extract a bounding box region. page.within_bbox((x0,top,x1,bottom)). Chars: page.chars — list of char dicts with x0 top x1 bottom fontname size. Rects: page.rects — rectangle objects; page.lines — horizontal/vertical line segments. Images: page.images — embedded image metadata. Size: page.width, page.height. PDF properties: pdf.metadata. Debug: page.to_image().debug_tablefinder() — visualize detected table cells. page.to_image().save("debug.png"). Pandas: import pandas as pd; pd.DataFrame(table[1:], columns=table[0]). Batch: for path in paths: with pdfplumber.open(path) as pdf:. Claude Code generates pdfplumber extractors, table parsers, and region croppers.
CLAUDE.md for pdfplumber
## pdfplumber Stack
- Version: pdfplumber >= 0.11 | pip install pdfplumber
- Open: with pdfplumber.open("file.pdf") as pdf: for page in pdf.pages:
- Text: page.extract_text() | extract_text_lines() | extract_words()
- Tables: page.extract_table() → list of rows | extract_tables() → all tables
- Table tuning: table_settings={"vertical_strategy":"lines","horizontal_strategy":"text"}
- Crop: page.crop((x0,y0,x1,y1)).extract_text() — extract region
- Pandas: pd.DataFrame(table[1:], columns=table[0]) from extract_table()
pdfplumber Extraction Pipeline
# app/pdf_extract.py — pdfplumber text, table, and region extraction
from __future__ import annotations
import io
from pathlib import Path
from typing import Any
import pdfplumber
# ─────────────────────────────────────────────────────────────────────────────
# 1. Text extraction
# ─────────────────────────────────────────────────────────────────────────────
def extract_all_text(source: str | Path | bytes) -> str:
    """Return the text of every page, joined with blank lines.

    pdfplumber uses pdfminer under the hood, so extraction is
    layout-aware and respects column/line breaks better than pypdf.
    Pages with no extractable text contribute an empty string.
    """
    chunks: list[str] = []
    with _open(source) as pdf:
        for page in pdf.pages:
            chunks.append(page.extract_text() or "")
    return "\n\n".join(chunks)
def extract_page_text(source: str | Path | bytes) -> list[str]:
    """Return the text of each page as a list, one entry per page."""
    texts: list[str] = []
    with _open(source) as pdf:
        for page in pdf.pages:
            texts.append(page.extract_text() or "")
    return texts
def extract_words(source: str | Path | bytes, page_num: int = 0) -> list[dict]:
    """
    Return word-level bounding boxes for one page.

    Each word dict carries "text", "x0", "x1", "top", "bottom", and
    "doctop" — pdfplumber does NOT emit "y0"/"y1" keys; vertical position
    is given as top/bottom measured from the top of the page.
    Useful for understanding layout and position of text elements.

    Args:
        source: file path (str/Path) or raw PDF bytes.
        page_num: zero-based page index.
    """
    with _open(source) as pdf:
        return pdf.pages[page_num].extract_words(
            x_tolerance=3,  # max horizontal gap (pt) still counted as one word
            y_tolerance=3,  # max vertical jitter (pt) within one word
            keep_blank_chars=False,
        )
def extract_chars(source: str | Path | bytes, page_num: int = 0) -> list[dict]:
    """
    Return per-character detail dicts (position, fontname, size, color).

    The most granular extraction pdfplumber offers — handy for font
    analysis or OCR post-processing.
    """
    with _open(source) as pdf:
        page = pdf.pages[page_num]
        return page.chars
# ─────────────────────────────────────────────────────────────────────────────
# 2. Table extraction
# ─────────────────────────────────────────────────────────────────────────────
# Table-finder settings for ruled tables: cell boundaries are taken from the
# lines actually drawn in the PDF, so this works best on bordered grids.
TABLE_SETTINGS_DEFAULT = {
    "vertical_strategy": "lines",  # use visible lines to detect columns
    "horizontal_strategy": "lines",  # use visible lines to detect rows
    "snap_tolerance": 3,  # points: snap nearly-collinear edges together
    "join_tolerance": 3,  # points: join nearby edge segments
    "edge_min_length": 3,  # ignore edges shorter than this
    # NOTE(review): the min_words_* keys only affect "text" strategies —
    # harmless here but presumably copied from a text-strategy config; confirm.
    "min_words_vertical": 3,
    "min_words_horizontal": 1,
}
# Table-finder settings for borderless tables: rows/columns are inferred from
# the alignment and spacing of the text itself (whitespace-delimited layouts).
TABLE_SETTINGS_TEXT = {
    "vertical_strategy": "text",  # use whitespace gaps between text columns
    "horizontal_strategy": "text",  # use vertical gaps between text rows
    "snap_tolerance": 3,  # points: snap nearly-collinear edges together
    "join_tolerance": 3,  # points: join nearby edge segments
    "edge_min_length": 3,  # ignore edges shorter than this
}
def extract_first_table(
    source: str | Path | bytes,
    page_num: int = 0,
    use_text_strategy: bool = False,
) -> list[list[str | None]]:
    """
    Extract the first table found on a page as a list of rows.

    Each row is a list of cell strings (None for empty cells).
    Pass use_text_strategy=True for borderless, whitespace-delimited
    tables; the default expects visible ruling lines.
    """
    if use_text_strategy:
        settings = TABLE_SETTINGS_TEXT
    else:
        settings = TABLE_SETTINGS_DEFAULT
    with _open(source) as pdf:
        page = pdf.pages[page_num]
        table = page.extract_table(table_settings=settings)
    return table or []
def extract_all_tables(
    source: str | Path | bytes,
    page_num: int = 0,
) -> list[list[list[str | None]]]:
    """Extract every table on one page; returns a list of tables."""
    with _open(source) as pdf:
        page = pdf.pages[page_num]
        tables = page.extract_tables(TABLE_SETTINGS_DEFAULT)
    return tables
def extract_tables_all_pages(
    source: str | Path | bytes,
) -> dict[int, list[list[list[str | None]]]]:
    """Extract tables from every page; returns {page_index: [table, ...]}.

    Pages with no detected tables are omitted from the mapping.
    """
    with _open(source) as pdf:
        per_page = (
            (idx, page.extract_tables(TABLE_SETTINGS_DEFAULT))
            for idx, page in enumerate(pdf.pages)
        )
        return {idx: tables for idx, tables in per_page if tables}
def table_to_dicts(table: list[list[str | None]]) -> list[dict[str, str]]:
"""
Convert a table (list of rows, first row = headers) to list of dicts.
Pairs with pandas: pd.DataFrame(table_to_dicts(table)).
"""
if not table or len(table) < 2:
return []
headers = [str(h or "").strip() for h in table[0]]
return [
{headers[j]: str(row[j] or "").strip() for j in range(len(headers))}
for row in table[1:]
]
def table_to_dataframe(table: list[list[str | None]]):
"""Convert extracted table to a pandas DataFrame."""
import pandas as pd
if not table:
return pd.DataFrame()
headers = [str(h or "").strip() or f"col_{i}" for i, h in enumerate(table[0])]
data = [[str(cell or "").strip() for cell in row] for row in table[1:]]
return pd.DataFrame(data, columns=headers)
# ─────────────────────────────────────────────────────────────────────────────
# 3. Region / bounding box extraction
# ─────────────────────────────────────────────────────────────────────────────
def extract_region(
    source: str | Path | bytes,
    bbox: tuple[float, float, float, float],
    page_num: int = 0,
) -> str:
    """
    Extract text from a rectangular region of one page.

    bbox = (x0, top, x1, bottom) in pdfplumber's coordinate system, whose
    origin is the TOP-left corner of the page with y increasing downward
    (the same convention as extract_words' "top"/"bottom" values) — NOT
    the raw PDF bottom-left origin.
    crop() returns a child page object, so all extraction methods still
    work on it.
    """
    with _open(source) as pdf:
        region = pdf.pages[page_num].crop(bbox)
        return region.extract_text() or ""
def extract_header_footer(
    source: str | Path | bytes,
    header_height: float = 50,
    footer_height: float = 50,
) -> list[dict[str, str]]:
    """
    Crop the top and bottom strips of each page and return their text.

    Useful for pulling out page numbers, running titles, or section
    headers. Returns one {"page", "header", "footer"} dict per page.
    """
    out: list[dict[str, str]] = []
    with _open(source) as pdf:
        for idx, page in enumerate(pdf.pages):
            width, height = page.width, page.height
            top_strip = page.crop((0, 0, width, header_height))
            bottom_strip = page.crop((0, height - footer_height, width, height))
            out.append(
                {
                    "page": idx,
                    "header": (top_strip.extract_text() or "").strip(),
                    "footer": (bottom_strip.extract_text() or "").strip(),
                }
            )
    return out
def find_text_position(
    source: str | Path | bytes,
    search_text: str,
    page_num: int = 0,
) -> list[dict[str, Any]]:
    """
    Locate every word containing *search_text* (case-insensitive).

    Matching is per extracted word via extract_words(); each hit carries
    the word's bounding box in page points (y measured from the top).
    """
    needle = search_text.lower()
    hits: list[dict[str, Any]] = []
    with _open(source) as pdf:
        for word in pdf.pages[page_num].extract_words():
            if needle not in word["text"].lower():
                continue
            hits.append(
                {
                    "text": word["text"],
                    "x0": word["x0"],
                    "y0": word["top"],
                    "x1": word["x1"],
                    "y1": word["bottom"],
                }
            )
    return hits
# ─────────────────────────────────────────────────────────────────────────────
# 4. Metadata and structure
# ─────────────────────────────────────────────────────────────────────────────
def get_pdf_info(source: str | Path | bytes) -> dict[str, Any]:
    """Return page count, per-page dimensions, and document metadata.

    Metadata values are stringified so the result is JSON-friendly.
    """
    with _open(source) as pdf:
        meta = pdf.metadata or {}
        sizes: list[dict[str, Any]] = []
        for idx, page in enumerate(pdf.pages):
            sizes.append({"page": idx, "width": page.width, "height": page.height})
        return {
            "pages": len(pdf.pages),
            "metadata": {key: str(value) for key, value in meta.items()},
            "sizes": sizes,
        }
# ─────────────────────────────────────────────────────────────────────────────
# 5. Batch processing
# ─────────────────────────────────────────────────────────────────────────────
def batch_extract_tables(
    paths: list[str | Path],
) -> list[dict[str, Any]]:
    """
    Extract all tables from a list of PDF files.

    Returns one record per table:
    {"file", "page", "table_index", "rows", "cols", "data": list_of_dicts}.
    A file that fails to parse contributes {"file", "error"} instead of
    aborting the whole batch (deliberate best-effort behavior).
    """
    results: list[dict[str, Any]] = []
    for path in paths:
        try:
            tables_by_page = extract_tables_all_pages(path)
            for page_num, tables in tables_by_page.items():
                for ti, table in enumerate(tables):
                    results.append({
                        "file": Path(path).name,
                        "page": page_num,
                        "table_index": ti,
                        # data-row count excludes the header; guarded so an
                        # empty table reports 0 rather than -1 (matches the
                        # guard already applied to "cols").
                        "rows": max(len(table) - 1, 0),
                        "cols": len(table[0]) if table else 0,
                        "data": table_to_dicts(table),
                    })
        except Exception as e:
            # Best-effort batch: record the failure and continue.
            results.append({"file": Path(path).name, "error": str(e)})
    return results
def _open(source):
    """
    Open *source* with pdfplumber, accepting a path or in-memory data.

    Accepts str/Path filesystem paths as well as bytes OR bytearray —
    fpdf2's output(), for example, returns a bytearray, which the previous
    bytes-only check silently routed to the (broken) path branch.
    """
    if isinstance(source, (bytes, bytearray)):
        return pdfplumber.open(io.BytesIO(source))
    return pdfplumber.open(str(source))
# ─────────────────────────────────────────────────────────────────────────────
# Demo — creates a test PDF with fpdf2 then extracts from it
# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Demo: build an in-memory test PDF with fpdf2, then run the extractors
    # on it end-to-end (text, table, metadata).
    try:
        from fpdf import FPDF

        def _make_table_pdf() -> bytes:
            # Render a one-page "Sales Report" with a bordered 3-column
            # table; the visible borders are what lets the "lines" table
            # strategy detect cell boundaries.
            pdf = FPDF()
            pdf.add_page()
            pdf.set_font("Helvetica", "B", 14)
            pdf.cell(0, 10, "Sales Report", align="C")
            pdf.ln(12)
            cols = ["Product", "Units", "Revenue"]
            col_w = [80, 40, 60]  # column widths in mm (FPDF's default unit)
            rows_data = [
                ["Widget A", "150", "$3,750"],
                ["Gadget B", "42", "$8,400"],
                ["Module C", "300", "$15,000"],
            ]
            # Header row: white text on a dark fill, bordered cells.
            pdf.set_font("Helvetica", "B", 11)
            pdf.set_fill_color(50, 50, 50)
            pdf.set_text_color(255, 255, 255)
            for w, h in zip(col_w, cols):
                pdf.cell(w, 8, h, border=1, fill=True, align="C")
            pdf.ln()
            # Body rows: plain black text, bordered cells.
            pdf.set_font("Helvetica", size=10)
            pdf.set_text_color(0, 0, 0)
            for r in rows_data:
                for w, c in zip(col_w, r):
                    pdf.cell(w, 7, c, border=1, align="C")
                pdf.ln()
            # NOTE(review): fpdf2's output() returns a bytearray, not bytes,
            # despite this function's -> bytes annotation — verify downstream
            # consumers accept bytearray input.
            return pdf.output()

        pdf_bytes = _make_table_pdf()
        print(f"Created test PDF: {len(pdf_bytes):,} bytes")
        print("\n=== Text extraction ===")
        text = extract_all_text(pdf_bytes)
        print(text[:200])
        print("\n=== Table extraction ===")
        # Lines strategy first: the demo table has drawn borders.
        table = extract_first_table(pdf_bytes, use_text_strategy=False)
        if table:
            for row in table:
                print(f" {row}")
            print(f"\n As dicts: {table_to_dicts(table)[:2]}")
        else:
            # Text strategy fallback
            table = extract_first_table(pdf_bytes, use_text_strategy=True)
            for row in (table or []):
                print(f" {row}")
        print("\n=== PDF info ===")
        info = get_pdf_info(pdf_bytes)
        print(f" Pages: {info['pages']}")
        for size in info["sizes"]:
            print(f" Page {size['page']}: {size['width']:.0f}×{size['height']:.0f}")
    except ImportError:
        print("fpdf2 not installed — install with: pip install fpdf2")
For the pypdf alternative — pypdf’s extract_text() is faster and sufficient for continuous body text, but pdfplumber is the correct tool when you need tables: extract_table() detects cell boundaries from visible lines or text alignment and returns a list[list[str]] ready to pass to pd.DataFrame(), while pypdf has no table detection at all. For the tabula-py / camelot alternative — tabula-py wraps the Java-based Tabula library (requires JVM) and camelot uses OpenCV for lattice/stream table detection; pdfplumber is a pure-Python solution that handles the majority of PDF table layouts without Java or OpenCV, making it easier to install in containers and serverless functions. The Claude Skills 360 bundle includes pdfplumber skill sets covering pdfplumber.open() context manager, page.extract_text() and extract_words() and extract_chars(), extract_table() with lines and text strategies, extract_tables() for all tables on a page, TABLE_SETTINGS_DEFAULT and TABLE_SETTINGS_TEXT configurations, table_to_dicts() and table_to_dataframe() converters, crop() bounding box region extraction, extract_header_footer() for page template regions, find_text_position() word search, batch_extract_tables() for multi-file processing, and get_pdf_info() metadata reader. Start with the free tier to try PDF table extraction pipeline code generation.