pypdf reads, splits, merges, and modifies PDFs in pure Python. pip install pypdf. Read: from pypdf import PdfReader; reader = PdfReader("file.pdf"). len(reader.pages). reader.pages[0].extract_text(). All text: "".join(p.extract_text() for p in reader.pages). Metadata: reader.metadata.title, .author, .creator, .creation_date. Encrypted: reader.is_encrypted. reader.decrypt("password"). Write: from pypdf import PdfWriter; writer = PdfWriter(). writer.add_page(reader.pages[0]). writer.write("out.pdf"). Merge: from pypdf import PdfMerger; merger = PdfMerger(). merger.append("a.pdf"). merger.merge(0, "b.pdf") — insert at position. merger.write("merged.pdf"). Split: writer.add_page(reader.pages[i]). Encrypt: writer.encrypt("user_pass", "owner_pass", use_128bit=True). Metadata: writer.add_metadata({"/Title": "My Doc", "/Author": "Alice"}). Rotate: page.rotate(90). Scale: page.scale_by(0.5). Crop: page.cropbox.lower_left = (x, y). Images: reader.pages[0].images → list of ImageObject. img.data, img.name. BytesIO: writer.write(buf). PdfReader(io.BytesIO(data)). Watermark: reader.pages[i].merge_page(watermark_page). Claude Code generates pypdf merger, splitter, text extractor, and encryption pipelines.
CLAUDE.md for pypdf
## pypdf Stack
- Version: pypdf >= 4.0 | pip install pypdf
- Read: PdfReader("file.pdf") | reader.pages | page.extract_text()
- Write: PdfWriter(); writer.add_page(page); writer.write("out.pdf")
- Merge: merger = PdfMerger(); merger.append("a.pdf"); merger.append("b.pdf"); merger.write("merged.pdf") — append() returns None, so the calls cannot be chained
- Split: PdfWriter per chunk; writer.add_page(reader.pages[i])
- Encrypt: writer.encrypt(user_password, owner_password, algorithm="AES-256") — without algorithm=, pypdf falls back to RC4
- Metadata: reader.metadata.title | writer.add_metadata({"/Title": "..."})
pypdf PDF Processing Pipeline
# app/pdf_ops.py — pypdf text extraction, merge, split, encrypt, watermark
from __future__ import annotations
import io
import re
from pathlib import Path
from typing import Any
from pypdf import PdfMerger, PdfReader, PdfWriter
from pypdf.errors import PdfReadError
# ─────────────────────────────────────────────────────────────────────────────
# 1. Text extraction
# ─────────────────────────────────────────────────────────────────────────────
def extract_text(source: str | Path | bytes | io.IOBase) -> str:
    """
    Return the text of every page of a PDF as one string.

    ``source`` is opened through ``_open_reader``. Each page is extracted
    with ``extraction_mode="layout"``; a page that yields nothing contributes
    an empty string. Page texts are separated by a blank line.
    extract_text() is heuristic — complex layouts may need pdfplumber.
    """
    reader = _open_reader(source)
    return "\n\n".join(
        page.extract_text(extraction_mode="layout") or ""
        for page in reader.pages
    )
def extract_page_text(source: str | Path | bytes | io.IOBase) -> list[str]:
    """Extract text page by page; entry i holds page i's text ("" when none)."""
    per_page: list[str] = []
    for page in _open_reader(source).pages:
        # extract_text() may return a falsy value; normalise it to "".
        per_page.append(page.extract_text() or "")
    return per_page
def extract_text_range(
    source: str | Path | bytes,
    start: int = 0,
    end: int | None = None,
) -> str:
    """
    Return the text of ``pages[start:end]`` joined by blank lines.

    ``start`` and ``end`` are applied as an ordinary slice on ``reader.pages``
    (``end=None`` means "through the last page").
    """
    selected = _open_reader(source).pages[start:end]
    texts = []
    for page in selected:
        texts.append(page.extract_text() or "")
    return "\n\n".join(texts)
def _open_reader(source) -> PdfReader:
    """
    Build a PdfReader from in-memory data, a binary stream, or a path.

    Dispatch rules:
      * bytes / bytearray / memoryview → wrapped in an in-memory BytesIO.
      * io.IOBase instances, or any object exposing both read() and seek()
        → passed to PdfReader unchanged.
      * anything else (str, Path, ...) → converted with str() and treated
        as a file path.
    """
    # Bytes-like objects other than ``bytes`` (e.g. the bytearray returned by
    # some PDF generators) used to fall through and be treated as a path.
    if isinstance(source, (bytes, bytearray, memoryview)):
        return PdfReader(io.BytesIO(source))
    # Duck-typed streams need not inherit from io.IOBase.
    if isinstance(source, io.IOBase) or (
        hasattr(source, "read") and hasattr(source, "seek")
    ):
        return PdfReader(source)
    return PdfReader(str(source))
# ─────────────────────────────────────────────────────────────────────────────
# 2. Metadata
# ─────────────────────────────────────────────────────────────────────────────
def get_metadata(source: str | Path | bytes) -> dict[str, Any]:
    """
    Summarise a PDF's document information as a plain dict.

    Keys: title, author, subject, creator, producer (raw values, "" when
    absent), created / modified (stringified), pages (page count) and
    encrypted (``reader.is_encrypted``). A missing metadata object is
    treated as an empty mapping.
    """
    reader = _open_reader(source)
    info = reader.metadata or {}

    summary: dict[str, Any] = {}
    # Fields returned exactly as stored.
    for field, pdf_key in (
        ("title", "/Title"),
        ("author", "/Author"),
        ("subject", "/Subject"),
        ("creator", "/Creator"),
        ("producer", "/Producer"),
    ):
        summary[field] = info.get(pdf_key, "")
    # Date fields are always coerced to str.
    for field, pdf_key in (("created", "/CreationDate"), ("modified", "/ModDate")):
        summary[field] = str(info.get(pdf_key, ""))
    summary["pages"] = len(reader.pages)
    summary["encrypted"] = reader.is_encrypted
    return summary
def set_metadata(
    source: str | Path | bytes,
    title: str = "",
    author: str = "",
    subject: str = "",
) -> bytes:
    """
    Copy a PDF's pages into a new writer, apply metadata, return the bytes.

    Only non-empty arguments are written (as /Title, /Author, /Subject).
    If all three are empty, add_metadata() is not called at all.
    """
    reader = _open_reader(source)
    writer = PdfWriter()
    for page in reader.pages:
        writer.add_page(page)

    candidates = {"/Title": title, "/Author": author, "/Subject": subject}
    updates = {key: value for key, value in candidates.items() if value}
    if updates:
        writer.add_metadata(updates)

    with io.BytesIO() as out:
        writer.write(out)
        return out.getvalue()
# ─────────────────────────────────────────────────────────────────────────────
# 3. Merge PDFs
# ─────────────────────────────────────────────────────────────────────────────
def merge_pdfs(sources: list[str | Path | bytes]) -> bytes:
    """
    Merge multiple PDFs into one, preserving the order of ``sources``.

    Each entry is either in-memory PDF data (bytes / bytearray) or a path
    (str / Path). Returns the merged document as bytes.

    Implementation note: this uses PdfWriter.append() rather than PdfMerger.
    PdfMerger is deprecated in favour of PdfWriter (see the pypdf merging
    docs); PdfWriter.append() likewise appends all pages of the given file.
    """
    writer = PdfWriter()
    try:
        for src in sources:
            if isinstance(src, (bytes, bytearray)):
                writer.append(io.BytesIO(src))
            else:
                writer.append(str(src))
        buf = io.BytesIO()
        writer.write(buf)
        return buf.getvalue()
    finally:
        # Release the writer's resources even if an append or the write fails.
        writer.close()
def merge_pdf_files(paths: list[str | Path], output: str | Path) -> Path:
    """Merge the PDFs at ``paths`` via merge_pdfs() and write the result to ``output``; return that path."""
    merged = merge_pdfs(list(map(Path, paths)))
    destination = Path(output)
    destination.write_bytes(merged)
    return destination
# ─────────────────────────────────────────────────────────────────────────────
# 4. Split PDF
# ─────────────────────────────────────────────────────────────────────────────
def split_pages(source: str | Path | bytes) -> list[bytes]:
    """Return one single-page PDF (as bytes) per page of ``source``, in page order."""

    def as_single_page_pdf(page) -> bytes:
        # A fresh writer per page keeps every output document independent.
        single = PdfWriter()
        single.add_page(page)
        out = io.BytesIO()
        single.write(out)
        return out.getvalue()

    return [as_single_page_pdf(page) for page in _open_reader(source).pages]
def split_range(
    source: str | Path | bytes,
    chunks: list[tuple[int, int]],
) -> list[bytes]:
    """
    Cut a PDF into several documents, one per (start, end) pair.

    Each pair is applied as the slice ``pages[start:end]``, so ``end`` is
    exclusive: chunks=[(0, 4), (4, 9)] yields pages 0-3 and pages 4-8.
    The source is opened once and reused for every chunk.
    """
    reader = _open_reader(source)
    outputs = []
    for first, stop in chunks:
        part = PdfWriter()
        for page in reader.pages[first:stop]:
            part.add_page(page)
        with io.BytesIO() as out:
            part.write(out)
            outputs.append(out.getvalue())
    return outputs
def split_every_n_pages(source: str | Path | bytes, n: int) -> list[bytes]:
"""Split a PDF into chunks of n pages each."""
reader = _open_reader(source)
total = len(reader.pages)
chunks = [(i, min(i + n, total)) for i in range(0, total, n)]
return split_range(source, chunks)
# ─────────────────────────────────────────────────────────────────────────────
# 5. Encrypt / decrypt
# ─────────────────────────────────────────────────────────────────────────────
def encrypt_pdf(
    source: str | Path | bytes,
    user_password: str,
    owner_password: str | None = None,
) -> bytes:
    """
    Encrypt a PDF with AES-256 and return the encrypted bytes.

    user_password: allows reading.
    owner_password: allows editing/printing (defaults to user_password if
        None or empty).

    The algorithm is requested explicitly. Passing only ``use_128bit=False``
    does NOT select AES-256: without ``algorithm`` pypdf chooses between
    RC4-128 and RC4-40 based on that flag, so the old call produced 40-bit
    RC4. NOTE: per the pypdf docs, AES needs an optional crypto backend
    (e.g. ``pip install pypdf[crypto]``).
    """
    reader = _open_reader(source)
    writer = PdfWriter()
    for page in reader.pages:
        writer.add_page(page)
    writer.encrypt(
        user_password=user_password,
        owner_password=owner_password or user_password,
        algorithm="AES-256",  # when set, use_128bit is ignored
    )
    buf = io.BytesIO()
    writer.write(buf)
    return buf.getvalue()
def decrypt_pdf(source: str | Path | bytes, password: str) -> bytes:
    """
    Re-write a password-protected PDF without encryption and return the bytes.

    decrypt() is only called when the reader reports the file as encrypted;
    its return value is not inspected here. The pages are then copied into a
    fresh writer, which is serialised to memory.
    """
    reader = _open_reader(source)
    if reader.is_encrypted:
        reader.decrypt(password)

    plain = PdfWriter()
    for page in reader.pages:
        plain.add_page(page)

    with io.BytesIO() as out:
        plain.write(out)
        return out.getvalue()
# ─────────────────────────────────────────────────────────────────────────────
# 6. Page manipulation
# ─────────────────────────────────────────────────────────────────────────────
def rotate_pages(source: str | Path | bytes, degrees: int = 90) -> bytes:
    """Apply ``page.rotate(degrees)`` (90, 180, 270) to every page and return the new PDF bytes."""
    rotated = PdfWriter()
    for page in _open_reader(source).pages:
        # The page is rotated first, then added to the output writer.
        page.rotate(degrees)
        rotated.add_page(page)

    with io.BytesIO() as out:
        rotated.write(out)
        return out.getvalue()
def watermark(source: str | Path | bytes, watermark_pdf: str | Path | bytes) -> bytes:
    """
    Stamp the first page of ``watermark_pdf`` onto every page of ``source``.

    Only page 0 of the watermark document is used (typically a single-page
    PDF with transparent text/image). merge_page() composites it onto each
    source page before that page is added to the output.
    """
    source_reader = _open_reader(source)
    overlay = _open_reader(watermark_pdf).pages[0]

    stamped = PdfWriter()
    for page in source_reader.pages:
        page.merge_page(overlay)
        stamped.add_page(page)

    with io.BytesIO() as out:
        stamped.write(out)
        return out.getvalue()
# ─────────────────────────────────────────────────────────────────────────────
# 7. Image extraction
# ─────────────────────────────────────────────────────────────────────────────
def _sniff_image_type(data: bytes, name: str) -> str:
    """Best-effort image format label: magic bytes first, then the file extension of ``name``."""
    signatures = (
        (b"\xff\xd8", "JPEG"),
        (b"\x89PNG\r\n\x1a\n", "PNG"),
        (b"\x00\x00\x00\x0cjP  \r\n\x87\n", "JP2"),  # JPEG 2000 container
        (b"\xff\x4f\xff\x51", "JP2"),  # raw JPEG 2000 codestream
        (b"GIF87a", "GIF"),
        (b"GIF89a", "GIF"),
        (b"II*\x00", "TIFF"),
        (b"MM\x00*", "TIFF"),
        (b"BM", "BMP"),
    )
    for magic, label in signatures:
        if data.startswith(magic):
            return label
    # Unknown signature: fall back to the extension carried by the image name.
    ext = Path(name).suffix.lstrip(".").upper()
    if ext in ("JPG", "JPE"):
        return "JPEG"
    if ext == "TIF":
        return "TIFF"
    return ext or "UNKNOWN"


def extract_images(source: str | Path | bytes) -> list[dict[str, Any]]:
    """
    Extract all embedded images from a PDF.

    Returns a list of dicts with keys:
      "page": zero-based page index,
      "name": the image's name as reported by pypdf,
      "data": the image bytes,
      "type": format label such as "JPEG", "PNG", "JP2", "GIF", "TIFF", "BMP".

    The type is detected from the data's leading magic bytes, falling back
    to the extension in the image name and finally "UNKNOWN". Previously
    every non-JPEG image was reported as "PNG".
    """
    reader = _open_reader(source)
    results = []
    for page_num, page in enumerate(reader.pages):
        for img in page.images:
            data = img.data
            results.append({
                "page": page_num,
                "name": img.name,
                "data": data,
                "type": _sniff_image_type(data, img.name),
            })
    return results
# ─────────────────────────────────────────────────────────────────────────────
# Demo
# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Demo: build small PDFs with fpdf2 (optional dependency) and run each
    # helper against them. Only the fpdf import is guarded, so an ImportError
    # raised elsewhere in the demo is not misreported as "fpdf2 not installed".
    try:
        from fpdf import FPDF
    except ImportError:
        print("fpdf2 not installed — install with: pip install fpdf2")
    else:

        def _make_test_pdf(text: str = "Hello pypdf!") -> bytes:
            """Render ``text`` onto a one-page PDF and return it as bytes."""
            pdf = FPDF()
            pdf.add_page()
            pdf.set_font("Helvetica", size=14)
            pdf.cell(0, 10, text)
            # fpdf2's output() returns a bytearray; the helpers in this module
            # dispatch on ``bytes``, so convert explicitly.
            return bytes(pdf.output())

        # Plain hyphen: the core Helvetica font cannot encode an em dash.
        test_pdf = _make_test_pdf("Page 1 - test content")
        print(f"=== Test PDF: {len(test_pdf):,} bytes ===")

        print("\n=== Text extraction ===")
        text = extract_text(test_pdf)
        print(f" Extracted: {text.strip()[:80]!r}")

        print("\n=== Metadata ===")
        meta = get_metadata(test_pdf)
        for k, v in meta.items():
            print(f" {k}: {v!r}")

        print("\n=== Merge (3 pages) ===")
        pages = [_make_test_pdf(f"Page {i+1}") for i in range(3)]
        merged = merge_pdfs(pages)
        merged_reader = PdfReader(io.BytesIO(merged))
        print(f" Merged: {len(merged_reader.pages)} pages")

        print("\n=== Split ===")
        splits = split_pages(merged)
        print(f" Split into {len(splits)} single-page PDFs")

        print("\n=== Encrypt / decrypt ===")
        encrypted = encrypt_pdf(test_pdf, "secret123")
        enc_reader = PdfReader(io.BytesIO(encrypted))
        print(f" Encrypted: is_encrypted={enc_reader.is_encrypted}")
        decrypted = decrypt_pdf(encrypted, "secret123")
        dec_text = extract_text(decrypted)
        print(f" Decrypted text: {dec_text.strip()[:40]!r}")
For the PyPDF2 alternative — PyPDF2 was the original library; it was merged into pypdf (same maintainer) and PyPDF2 is now deprecated — pip install pypdf is the current package with the same API plus bug fixes and AES-256 encryption support. For the pdfminer.six alternative — pdfminer is a layout-aware text extractor that models character positions, line boxes, and columns, making it more accurate for complex multi-column documents; pypdf is faster and covers the common case (sequential text, metadata, merge/split/encrypt) without the layout analysis overhead. The Claude Skills 360 bundle includes pypdf skill sets covering PdfReader with file/bytes/BytesIO, extract_text() for all pages, extract_page_text() list, get_metadata() DocumentInformation parsing, PdfWriter add_page, merge_pdfs() with PdfMerger.append, split_pages() and split_range() and split_every_n_pages(), set_metadata() writer, encrypt_pdf() with user/owner passwords, decrypt_pdf() for locked files, rotate_pages(), watermark() with merge_page overlay, and extract_images() for embedded images. Start with the free tier to try PDF processing pipeline code generation.