PyMuPDF (fitz) is a fast PDF/XPS processor for text and image extraction. Install: pip install pymupdf. Open: import fitz; doc = fitz.open("file.pdf"). Pages: doc.page_count. Page: page = doc[0]. Text: page.get_text() — plain text. Blocks: page.get_text("blocks") — list of (x0,y0,x1,y1,text,block_no,block_type). Words: page.get_text("words") — list of word tuples with coordinates. JSON: page.get_text("json") — structured with font info. Dict: page.get_text("dict") — dict with blocks/lines/spans. Search: rects = page.search_for("invoice") — list of Rect. Links: page.get_links() — list of dicts with keys such as uri and from (the link's Rect). Images: page.get_images() — list of (xref,…). Extract image: img = doc.extract_image(xref); img["image"] → bytes. Render: mat = fitz.Matrix(2,2); pix = page.get_pixmap(matrix=mat); pix.save("page.png"). PNG bytes: pix.tobytes("png"). Merge: doc1.insert_pdf(doc2). Draw: page.draw_rect(fitz.Rect(10,10,100,50), color=(1,0,0), width=2). Insert text: page.insert_text((50,100), "label", fontsize=12). Redact: page.add_redact_annot(rect); page.apply_redactions(). Save: doc.save("out.pdf"). In-memory bytes: buf = doc.tobytes(). Meta: doc.metadata — title, author, creator. TOC: doc.get_toc() — list of [level,title,page] entries. Close: doc.close(). Claude Code generates PyMuPDF text extractors, PDF-to-image converters, annotation tools, and document analysis pipelines.
CLAUDE.md for PyMuPDF
## PyMuPDF Stack
- Version: pymupdf >= 1.24 | pip install pymupdf
- Open: doc = fitz.open("file.pdf") | fitz.open(stream=bytes_io, filetype="pdf")
- Text: page.get_text() | page.get_text("dict") | page.get_text("blocks")
- Images: page.get_images() → xrefs → doc.extract_image(xref)["image"]
- Render: page.get_pixmap(matrix=fitz.Matrix(dpi/72, dpi/72)).tobytes("png")
- Save: doc.save("out.pdf") | doc.tobytes()
PyMuPDF PDF Processing Pipeline
# app/pdf_extract.py — PyMuPDF text extraction, images, search, render, metadata/TOC/links, merge, split
from __future__ import annotations
import io
import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
import fitz # PyMuPDF
log = logging.getLogger(__name__)
# ─────────────────────────────────────────────────────────────────────────────
# 1. Open / load helpers
# ─────────────────────────────────────────────────────────────────────────────
def open_pdf(
    source: str | Path | bytes | io.BytesIO,
    password: str | None = None,
) -> fitz.Document:
    """
    Open a PDF from a file path, raw bytes, or a binary file-like object.
    source: str/Path is opened from disk; bytes, bytearray and memoryview are
        opened from memory; any other object with a read() method
        (e.g. io.BytesIO) is read from its current position.
    password: tried only when the opened document reports is_encrypted;
        ignored otherwise.
    Returns the open document; the caller is responsible for closing it.
    Raises ValueError if the password is rejected (the document is closed
    first), TypeError if source is none of the supported kinds.
    Example:
        doc = open_pdf("report.pdf")
        doc = open_pdf(pdf_bytes)  # from download
        doc = open_pdf("secure.pdf", password="pw")
    """
    if isinstance(source, (str, Path)):
        doc = fitz.open(str(source))
    elif isinstance(source, (bytes, bytearray, memoryview)):
        # Normalise to immutable bytes so every in-memory input takes one path.
        doc = fitz.open(stream=bytes(source), filetype="pdf")
    elif hasattr(source, "read"):
        doc = fitz.open(stream=source.read(), filetype="pdf")
    else:
        raise TypeError(
            f"Unsupported PDF source type: {type(source).__name__}"
        )
    if password and doc.is_encrypted and not doc.authenticate(password):
        # Do not leak the open handle when we refuse to return the document.
        doc.close()
        raise ValueError("Invalid PDF password")
    return doc
# ─────────────────────────────────────────────────────────────────────────────
# 2. Text extraction
# ─────────────────────────────────────────────────────────────────────────────
def _page_content_to_str(content, mode: str) -> str:
    """Flatten one page's get_text() result into a plain string."""
    if isinstance(content, str):
        return content
    # Index 4 holds the text in both tuple layouts (per PyMuPDF's get_text docs).
    if mode == "words":
        return " ".join(item[4] for item in content)
    if mode == "blocks":
        # Index 6 is block_type; keep text blocks (type 0) only.
        return "\n".join(
            item[4].rstrip("\n") for item in content if item[6] == 0
        )
    raise ValueError(
        f"extract_text() cannot flatten mode {mode!r} into a string"
    )


def extract_text(
    source: str | Path | bytes | fitz.Document,
    pages: list[int] | None = None,
    mode: str = "text",
    join_pages: str = "\n\n",
) -> str:
    """
    Extract text from a PDF as a single string.
    source: anything open_pdf() accepts, or an already-open fitz.Document.
        Documents opened here are closed before returning; a Document
        passed in by the caller is left open.
    pages: 0-indexed page numbers (default: all pages). Indices outside
        0..page_count-1 are skipped silently.
    mode: passed to page.get_text(). Modes that yield a string are used
        as-is; "blocks" is flattened to the text of its text blocks (one per
        line) and "words" to space-separated words.
    join_pages: separator placed between the per-page strings.
    Raises ValueError for modes whose result cannot be flattened to a string.
    Example:
        text = extract_text("contract.pdf")
        first_page = extract_text("invoice.pdf", pages=[0])
    """
    owns_doc = not isinstance(source, fitz.Document)
    doc = open_pdf(source) if owns_doc else source
    try:
        page_indices = pages if pages is not None else range(doc.page_count)
        texts = []
        for i in page_indices:
            if not 0 <= i < doc.page_count:
                continue
            # Non-"text" modes may return lists; str.join() would reject them.
            texts.append(_page_content_to_str(doc[i].get_text(mode), mode))
        return join_pages.join(texts)
    finally:
        if owns_doc:
            doc.close()
def extract_text_structured(
    doc: fitz.Document,
    page_num: int = 0,
) -> list[dict]:
    """
    List every non-blank text span on one page with its geometry and font.
    Each entry is a dict with keys: text (stripped), x0, y0, x1, y1 (taken
    from the span's bbox), size (rounded to one decimal), font, bold, italic.
    Non-text blocks (type != 0) are ignored.
    Example:
        for block in extract_text_structured(doc, page_num=0):
            if block["bold"] and block["size"] > 14:
                print(f"Heading: {block['text']}")
    """
    bold_bit = 2**4
    italic_bit = 2**1

    def describe(span: dict, content: str) -> dict:
        # Translate one raw span into the flat record documented above.
        box = span["bbox"]
        style = span.get("flags", 0)
        return {
            "text": content,
            "x0": box[0],
            "y0": box[1],
            "x1": box[2],
            "y1": box[3],
            "size": round(span.get("size", 0), 1),
            "font": span.get("font", ""),
            "bold": bool(style & bold_bit),
            "italic": bool(style & italic_bit),
        }

    layout = doc[page_num].get_text("dict")
    text_blocks = (
        blk for blk in layout.get("blocks", []) if blk.get("type") == 0
    )
    records = []
    for blk in text_blocks:
        for row in blk.get("lines", []):
            for span in row.get("spans", []):
                content = span.get("text", "").strip()
                if content:
                    records.append(describe(span, content))
    return records
def _rect_text_matches(page, rect, needle: str) -> bool:
    """Check the text under a search hit against the query, respecting case."""
    found = " ".join(page.get_textbox(rect).split())
    # A hit that wraps across lines is reported as several rectangles, each
    # holding only part of the query, so accept containment either way.
    # NOTE(review): relies on get_textbox() returning the characters under
    # the hit rectangle — confirm on representative documents.
    return bool(found) and (needle in found or found in needle)


def find_text_positions(
    doc: fitz.Document,
    query: str,
    pages: list[int] | None = None,
    case_sensitive: bool = False,
) -> list[dict]:
    """
    Find all occurrences of a string and return their page and bounding rect.
    pages: 0-indexed page numbers to search (default: all pages).
    case_sensitive: page.search_for() itself ignores case, so when this is
        True each hit is re-checked against the text under its rectangle and
        hits that differ in case are dropped.
    Returns list of {"page": int, "rect": fitz.Rect, "text": str}, where
    "text" is the query as passed in.
    Example:
        hits = find_text_positions(doc, "Total Amount Due")
        for hit in hits:
            print(f"Found on page {hit['page']}: {hit['rect']}")
    """
    page_indices = pages if pages is not None else range(doc.page_count)
    # Collapse whitespace so the comparison tolerates line breaks in the hit.
    needle = " ".join(query.split())
    results = []
    for i in page_indices:
        page = doc[i]
        # search_for() runs with its default flags: the flags argument only
        # tunes text extraction and has no effect on case sensitivity.
        for rect in page.search_for(query):
            if case_sensitive and not _rect_text_matches(page, rect, needle):
                continue
            results.append({"page": i, "rect": rect, "text": query})
    return results
# ─────────────────────────────────────────────────────────────────────────────
# 3. Image extraction
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class ExtractedImage:
    """One embedded image pulled out of a PDF, with its raw encoded bytes."""

    page: int  # page index under which the image was collected
    xref: int  # PDF cross-reference number identifying the image object
    ext: str  # image format extension, used as the file suffix when saving
    width: int  # pixel width
    height: int  # pixel height
    # Raw image bytes; kept out of repr() so logging or printing an instance
    # does not dump the entire payload.
    data: bytes = field(repr=False)
def extract_images(
    doc: fitz.Document,
    pages: list[int] | None = None,
    min_width: int = 50,
    min_height: int = 50,
) -> list[ExtractedImage]:
    """
    Collect the embedded images of a PDF, one entry per distinct xref.
    pages: page indices to scan (default: every page of the document).
    min_width/min_height: images smaller than either bound are dropped
        (icons, decorations).
    An xref is processed only the first time it is seen, so an image reused
    on several pages is reported once, under the first page scanned.
    Extraction failures are logged as warnings and skipped.
    Example:
        images = extract_images(doc)
        for img in images:
            Path(f"img_{img.xref}.{img.ext}").write_bytes(img.data)
    """
    targets = range(doc.page_count) if pages is None else pages
    handled = set()
    collected = []
    for page_index in targets:
        for entry in doc[page_index].get_images(full=True):
            xref = entry[0]
            if xref in handled:
                continue
            handled.add(xref)
            try:
                raw = doc.extract_image(xref)
                width, height = raw["width"], raw["height"]
                if width >= min_width and height >= min_height:
                    collected.append(
                        ExtractedImage(
                            page=page_index,
                            xref=xref,
                            ext=raw["ext"],
                            width=width,
                            height=height,
                            data=raw["image"],
                        )
                    )
            except Exception as exc:
                log.warning("Could not extract image xref=%s: %s", xref, exc)
    return collected
def save_images(
    doc: fitz.Document,
    output_dir: str | Path,
    pages: list[int] | None = None,
    prefix: str = "img",
    min_width: int = 50,
    min_height: int = 50,
) -> list[Path]:
    """
    Extract images via extract_images() and write each one into output_dir.
    output_dir: created (including parents) if it does not exist.
    pages: page indices to scan (default: all pages).
    prefix: leading part of each file name; files are named
        "{prefix}_p{page}_{xref}.{ext}".
    min_width/min_height: size thresholds forwarded to extract_images(), so
        callers can also export images smaller than the default 50x50.
    Returns list of saved file paths, in extraction order.
    """
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    images = extract_images(
        doc, pages, min_width=min_width, min_height=min_height
    )
    paths = []
    for img in images:
        target = out / f"{prefix}_p{img.page}_{img.xref}.{img.ext}"
        target.write_bytes(img.data)
        paths.append(target)
    return paths
# ─────────────────────────────────────────────────────────────────────────────
# 4. Rendering (PDF → image)
# ─────────────────────────────────────────────────────────────────────────────
def page_to_png(
    doc: fitz.Document,
    page_num: int = 0,
    dpi: int = 150,
    colorspace: str = "rgb",
) -> bytes:
    """
    Render a PDF page to PNG bytes.
    page_num: index of the page to render.
    dpi: resolution (72 = original PDF units, 150 = medium, 300 = print
        quality). Must be positive.
    colorspace: "rgb" (compared case-insensitively, surrounding whitespace
        ignored) renders in RGB; any other value renders in grayscale.
    Raises ValueError if dpi is not positive.
    Example:
        png_data = page_to_png(doc, page_num=0, dpi=200)
        img_path = Path("preview.png")
        img_path.write_bytes(png_data)
    """
    if dpi <= 0:
        # A zero or negative scale cannot yield a usable pixmap; fail early
        # with a clear message.
        raise ValueError(f"dpi must be positive, got {dpi}")
    page = doc[page_num]
    scale = dpi / 72  # PDF user space is 72 units per inch
    matrix = fitz.Matrix(scale, scale)
    # Normalise first so "RGB" or " rgb " do not silently fall through to gray.
    wants_rgb = colorspace.strip().lower() == "rgb"
    cs = fitz.csRGB if wants_rgb else fitz.csGRAY
    pix = page.get_pixmap(matrix=matrix, colorspace=cs)
    return pix.tobytes("png")
def pdf_to_images(
    source: str | Path | bytes,
    dpi: int = 150,
    pages: list[int] | None = None,
    fmt: str = "png",
) -> list[bytes]:
    """
    Render pages of a PDF to encoded image bytes, in RGB.
    source: opened with open_pdf() and always closed again before returning.
    dpi: render resolution; the scale factor is dpi / 72.
    pages: page indices to render (default: every page).
    fmt: output format name handed to the pixmap's tobytes().
    Returns one bytes object per rendered page, in the order requested.
    Example:
        images = pdf_to_images("slides.pdf", dpi=100)
        for i, img in enumerate(images):
            Path(f"slide_{i+1}.png").write_bytes(img)
    """
    doc = open_pdf(source)
    try:
        zoom = dpi / 72
        transform = fitz.Matrix(zoom, zoom)
        selected = range(doc.page_count) if pages is None else pages
        return [
            doc[index]
            .get_pixmap(matrix=transform, colorspace=fitz.csRGB)
            .tobytes(fmt)
            for index in selected
        ]
    finally:
        doc.close()
# ─────────────────────────────────────────────────────────────────────────────
# 5. Metadata, TOC, links
# ─────────────────────────────────────────────────────────────────────────────
def get_metadata(doc: fitz.Document) -> dict:
    """
    Return PDF metadata (title, author, subject, creator, dates).
    The result has the keys title, author, subject, creator, producer,
    created and modified, plus pages (doc.page_count) and encrypted
    (doc.is_encrypted). The text fields are always strings: a field that is
    absent, or present with a None value, is reported as "".
    """
    meta = doc.metadata or {}
    # (output key, key in doc.metadata)
    text_fields = (
        ("title", "title"),
        ("author", "author"),
        ("subject", "subject"),
        ("creator", "creator"),
        ("producer", "producer"),
        ("created", "creationDate"),
        ("modified", "modDate"),
    )
    # `.get(key, "")` would let an explicit None through; `or ""` covers both
    # the missing-key and the None-value case.
    info = {name: meta.get(key) or "" for name, key in text_fields}
    info["pages"] = doc.page_count
    info["encrypted"] = doc.is_encrypted
    return info
def get_toc(doc: fitz.Document) -> list[dict]:
    """
    Return the table of contents as a list of {level, title, page} dicts.
    The first three items of each doc.get_toc() entry are passed through
    unchanged, in document order.
    NOTE(review): PyMuPDF documents TOC page numbers as 1-based, unlike the
    0-based page indices used elsewhere in this module — confirm before
    indexing doc[...] with them.
    """
    entries = []
    for item in doc.get_toc():
        level, title, page = item[0], item[1], item[2]
        entries.append({"level": level, "title": title, "page": page})
    return entries
def get_links(
    doc: fitz.Document,
    pages: list[int] | None = None,
) -> list[dict]:
    """
    Collect the URI links found on the given pages.
    pages: page indices to scan (default: every page).
    Returns a list of {"page": int, "url": str, "rect": link["from"]};
    links with a missing or empty "uri" entry are left out.
    """
    selected = range(doc.page_count) if pages is None else pages
    found = []
    for index in selected:
        found.extend(
            {"page": index, "url": link["uri"], "rect": link.get("from")}
            for link in doc[index].get_links()
            if link.get("uri", "")
        )
    return found
# ─────────────────────────────────────────────────────────────────────────────
# 6. Merge / split
# ─────────────────────────────────────────────────────────────────────────────
def merge_pdfs(sources: list[str | Path | bytes], output_path: str | Path | None = None) -> bytes:
    """
    Merge multiple PDFs into one, in the order given.
    sources: inputs accepted by open_pdf(); must not be empty.
    output_path: if given (and non-empty), the merged PDF is also written
        there.
    Returns the merged PDF as bytes.
    Raises ValueError if sources is empty.
    """
    if not sources:
        # Fail up front with a clear message instead of letting the
        # serialization of a page-less document fail later.
        raise ValueError("merge_pdfs() needs at least one source PDF")
    merged = fitz.open()
    try:
        for source in sources:
            doc = open_pdf(source)
            try:
                merged.insert_pdf(doc)
            finally:
                # Release each input even when insertion fails.
                doc.close()
        buf = merged.tobytes()
    finally:
        merged.close()
    if output_path:
        # Write the bytes already produced: the file on disk is exactly what
        # is returned, and the document is serialised only once.
        Path(output_path).write_bytes(buf)
    return buf
def split_pdf(
    source: str | Path | bytes,
    output_dir: str | Path,
    prefix: str = "page",
) -> list[Path]:
    """
    Split each page into a separate PDF file.
    source: input accepted by open_pdf(); always closed before returning,
        including when an error occurs part-way through.
    output_dir: created (including parents) if it does not exist.
    prefix: files are named "{prefix}_{n:04d}.pdf" with n starting at 1.
    Returns list of output file paths, in page order.
    """
    doc = open_pdf(source)
    try:
        out = Path(output_dir)
        out.mkdir(parents=True, exist_ok=True)
        paths = []
        for i in range(doc.page_count):
            single = fitz.open()
            try:
                single.insert_pdf(doc, from_page=i, to_page=i)
                p = out / f"{prefix}_{i+1:04d}.pdf"
                single.save(str(p))
            finally:
                # Close the one-page document even if insert or save fails.
                single.close()
            paths.append(p)
        return paths
    finally:
        doc.close()
# ─────────────────────────────────────────────────────────────────────────────
# Demo
# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Local import: only the demo needs it.
    import tempfile

    # Create a sample PDF for demonstration
    sample = fitz.open()
    try:
        page = sample.new_page()
        page.insert_text((50, 100), "Invoice #INV-2024-042", fontsize=18)
        page.insert_text((50, 130), "Total Amount Due: $1,250.00", fontsize=14)
        page.insert_text((50, 160), "Due Date: April 30, 2024", fontsize=12)
        page.draw_rect(fitz.Rect(40, 90, 400, 175), color=(0.1, 0.3, 0.6), width=2)
        sample_bytes = sample.tobytes()
    finally:
        sample.close()

    doc = open_pdf(sample_bytes)
    try:
        print("=== Metadata ===")
        meta = get_metadata(doc)
        print(f" Pages: {meta['pages']}, Encrypted: {meta['encrypted']}")

        print("\n=== Text extraction ===")
        text = extract_text(doc)
        for line in text.strip().split("\n"):
            if line.strip():
                print(f" {line}")

        print("\n=== Search ===")
        hits = find_text_positions(doc, "Amount Due")
        print(f" Found 'Amount Due' {len(hits)} time(s)")
        if hits:
            print(f" At rect: {hits[0]['rect']}")

        print("\n=== Render to PNG ===")
        png_bytes = page_to_png(doc, page_num=0, dpi=150)
        # Use the platform's temp directory instead of a hard-coded /tmp so
        # the demo also runs where /tmp does not exist.
        png_path = Path(tempfile.gettempdir()) / "sample_page.png"
        png_path.write_bytes(png_bytes)
        print(f" Rendered: {png_path} ({len(png_bytes):,} bytes)")
    finally:
        doc.close()

    print("\n=== Merge PDFs ===")
    merged_bytes = merge_pdfs([sample_bytes, sample_bytes])
    doc2 = open_pdf(merged_bytes)
    try:
        print(f" Merged {doc2.page_count} pages")
    finally:
        doc2.close()
For the pdfplumber alternative — pdfplumber is built on pdfminer.six and excels at precise table extraction with visual debugging; PyMuPDF is a C-extension binding to the MuPDF library and is significantly faster (5–20×) for text extraction, page rendering, and image extraction across large document sets — use pdfplumber when you need accurate table-cell boundary detection, PyMuPDF when extraction speed and image rendering performance matter. For the pikepdf alternative — pikepdf (built on QPDF) focuses on PDF manipulation: merging, splitting, encryption, metadata editing, and preserving PDF structure; PyMuPDF focuses on content extraction and rendering — use PyMuPDF to get text/images out of a PDF, pikepdf to restructure or secure an existing PDF. The Claude Skills 360 bundle includes PyMuPDF skill sets covering open_pdf() from path/bytes/BytesIO with password support, extract_text() per-page and full-doc, extract_text_structured() with font/bold/italic metadata, find_text_positions() bounding-box search, extract_images() with xref deduplication, save_images() bulk export, page_to_png()/pdf_to_images() DPI-controlled rendering, get_metadata()/get_toc()/get_links(), merge_pdfs() and split_pdf(). Start with the free tier to try fast PDF text and image extraction code generation.