Python’s pickletools module disassembles and optimizes pickle byte streams without unpickling them, making it invaluable for debugging serialization issues and auditing pickle data. import pickletools. Disassemble: pickletools.dis(pickle_bytes, out=None, memo=None, indentlevel=4, annotate=0) — prints an annotated opcode listing to stdout or to the out file object; annotate sets the column where opcode annotations start (0 disables them). Generate opcodes: pickletools.genops(pickle_bytes) → iterator of (opcode, arg, pos) tuples — opcode is an OpcodeInfo instance; arg is the decoded argument; pos is the byte offset. Optimize: pickletools.optimize(p) → bytes — removes duplicate PUT/BINPUT/LONG_BINPUT memoization opcodes that store values never retrieved by a GET, reducing stream size. Opcode list: pickletools.opcodes — list of OpcodeInfo objects with .name, .code, .arg, .stack_before, .stack_after, .proto, .doc attributes. Protocol constants: pickle protocols 0–5 with increasing efficiency. Claude Code generates pickle debuggers, protocol analyzers, size optimizers, and serialization validators.
CLAUDE.md for pickletools
## pickletools Stack
- Stdlib: import pickletools, pickle
- Dis: pickletools.dis(pickle.dumps(obj)) # print opcode listing
- Ops: for op, arg, pos in pickletools.genops(data): ...
- Opt: smaller = pickletools.optimize(data) # remove unused PUTs
- Note: Never unpickles — safe for inspecting the structure of untrusted pickle data
pickletools Pickle Bytecode Analyzer Pipeline
# app/pickletoolsutil.py — disassemble, analyze, optimize, compare, validate
from __future__ import annotations
import io
import pickle
import pickletools
import struct
from dataclasses import dataclass, field
from pathlib import Path
# ─────────────────────────────────────────────────────────────────────────────
# 1. Disassembly helpers
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class OpcodeRecord:
    """One decoded opcode from a pickle stream (produced by disassemble())."""
    pos: int     # byte offset of the opcode within the stream
    code: str    # one-character opcode code, e.g. "."
    name: str    # symbolic opcode name, e.g. "BINUNICODE"
    arg: object  # decoded argument, or None when the opcode takes none
    proto: int   # minimum protocol that introduced this opcode

    def __str__(self) -> str:
        # Render the argument compactly; long reprs are truncated to 60 chars.
        shown = "" if self.arg is None else repr(self.arg)
        if len(shown) > 60:
            shown = shown[:57] + "..."
        return f"{self.pos:6d} {self.code!r:4s} {self.name:<24s} {shown}"
def disassemble(data: bytes) -> list[OpcodeRecord]:
    """
    Decode a pickle byte stream into a list of OpcodeRecord entries,
    one per opcode, without ever unpickling the data.

    Example:
        for rec in disassemble(pickle.dumps({"a": 1})):
            print(rec)
    """
    return [
        OpcodeRecord(pos=offset, code=info.code, name=info.name,
                     arg=argument, proto=info.proto)
        for info, argument, offset in pickletools.genops(data)
    ]
def dis_to_string(data: bytes, annotate: int = 40) -> str:
    """
    Return the pickletools.dis opcode listing for *data* as a string.

    Parameters:
        data: pickle byte stream to disassemble.
        annotate: column where opcode annotations start (0 disables them).

    Example:
        print(dis_to_string(pickle.dumps([1, 2, 3])))
    """
    buf = io.StringIO()
    # Bug fix: pickletools.dis takes the destination via the keyword `out`;
    # passing `output=` raised TypeError (unexpected keyword argument).
    pickletools.dis(data, out=buf, annotate=annotate)
    return buf.getvalue()
# ─────────────────────────────────────────────────────────────────────────────
# 2. Stream analysis
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class PickleStats:
    """Aggregate metrics for one pickle byte stream (built by analyze())."""
    total_bytes: int             # length of the raw stream in bytes
    protocol: int                # protocol from the PROTO opcode (0 if absent)
    opcode_count: int            # total number of opcodes in the stream
    opcode_freq: dict[str, int]  # opcode name -> occurrence count
    memo_puts: int               # number of memoize (PUT) opcodes
    memo_gets: int               # number of memo fetch (GET) opcodes
    string_bytes: int            # bytes consumed by string/bytes opcodes
    nested_depth: int            # maximum stack depth reached

    def efficiency(self) -> float:
        """GET/PUT ratio — higher means the memo table is well used."""
        return 1.0 if self.memo_puts == 0 else self.memo_gets / self.memo_puts

    def __str__(self) -> str:
        # Show the five most frequent opcodes, busiest first.
        ranked = sorted(self.opcode_freq.items(),
                        key=lambda kv: kv[1], reverse=True)
        summary = ", ".join(f"{name}×{count}" for name, count in ranked[:5])
        return (f"PickleStats(proto={self.protocol}, "
                f"{self.total_bytes}B, {self.opcode_count} ops, "
                f"depth={self.nested_depth}, "
                f"memo efficiency={self.efficiency():.2f}, "
                f"top=[{summary}])")
# Opcode families used by analyze() below.
# Memoization stores: PUT/BINPUT/LONG_BINPUT (protocols 0-2) plus MEMOIZE,
# the implicit-index store introduced in protocol 4.
_PUT_OPCODES = {"PUT", "BINPUT", "LONG_BINPUT", "MEMOIZE"}
# Memo fetches that reuse a previously stored object.
_GET_OPCODES = {"GET", "BINGET", "LONG_BINGET"}
# Opcodes counted as pushing one stack entry for the rough depth estimate.
# NOTE(review): this is a heuristic — MARK/FRAME/PROTO do not push data
# values, and FROZENSET actually consumes the items gathered since MARK,
# so nested_depth is an approximation, not an exact stack trace.
_PUSH_OPCODES = {
    "MARK", "FRAME", "PROTO",
    "INT", "LONG", "LONG1", "LONG4",
    "STRING", "BINSTRING", "SHORT_BINSTRING",
    "BINBYTES", "SHORT_BINBYTES", "BINBYTES8",
    "BYTEARRAY8", "NEXT_BUFFER", "READONLY_BUFFER",
    "UNICODE", "SHORT_BINUNICODE", "BINUNICODE", "BINUNICODE8",
    "FLOAT", "BINFLOAT",
    "TRUE", "FALSE", "NONE",
    "NEWFALSE", "NEWTRUE",
    "EMPTY_LIST", "EMPTY_TUPLE", "EMPTY_DICT", "EMPTY_SET", "FROZENSET",
}
# Opcodes counted as popping one stack entry for the depth estimate.
_POP_OPCODES = {
    "APPEND", "APPENDS", "SETITEM", "SETITEMS",
    "ADDITEMS", "BUILD", "STOP",
}
def analyze(data: bytes) -> PickleStats:
    """
    Analyze a pickle byte stream for size, protocol, opcode frequency,
    memo usage, and approximate nesting depth — without unpickling.

    Protocol detection: protocols 2+ declare themselves with a PROTO opcode,
    which is used verbatim when present. Protocol 0/1 streams carry no PROTO
    opcode, so for those the protocol is inferred as the highest protocol any
    opcode in the stream requires (previously such streams were always
    reported as protocol 0, which misclassified protocol-1 pickles).

    Example:
        stats = analyze(pickle.dumps(my_obj, protocol=5))
        print(stats)
    """
    records = disassemble(data)
    freq: dict[str, int] = {}
    depth = 0
    max_depth = 0
    memo_puts = 0
    memo_gets = 0
    string_bytes = 0
    declared: int | None = None  # explicit PROTO argument, if any
    inferred = 0                 # max protocol required by opcodes seen
    for r in records:
        freq[r.name] = freq.get(r.name, 0) + 1
        inferred = max(inferred, r.proto)
        if r.name == "PROTO" and isinstance(r.arg, int):
            declared = r.arg
        if r.name in _PUT_OPCODES:
            memo_puts += 1
        if r.name in _GET_OPCODES:
            memo_gets += 1
        if r.name in {"STRING", "BINSTRING", "SHORT_BINSTRING",
                      "BINBYTES", "SHORT_BINBYTES", "BINUNICODE",
                      "SHORT_BINUNICODE", "BINUNICODE8", "BINBYTES8"}:
            if isinstance(r.arg, (str, bytes)):
                string_bytes += len(r.arg)
        # Rough depth heuristic: count known push/pop opcodes (see the
        # NOTE on _PUSH_OPCODES — this is an approximation).
        if r.name in _PUSH_OPCODES:
            depth += 1
            max_depth = max(max_depth, depth)
        elif r.name in _POP_OPCODES:
            depth = max(0, depth - 1)
    return PickleStats(
        total_bytes=len(data),
        protocol=declared if declared is not None else inferred,
        opcode_count=len(records),
        opcode_freq=freq,
        memo_puts=memo_puts,
        memo_gets=memo_gets,
        string_bytes=string_bytes,
        nested_depth=max_depth,
    )
# ─────────────────────────────────────────────────────────────────────────────
# 3. Optimization and size comparison
# ─────────────────────────────────────────────────────────────────────────────
def optimize(data: bytes) -> bytes:
    """
    Strip memoization PUT opcodes whose values are never fetched by a GET.

    Thin wrapper around pickletools.optimize(): the returned stream
    unpickles to the same object, usually in fewer bytes.

    Example:
        original = pickle.dumps(obj)
        smaller = optimize(original)
        print(f"saved {len(original) - len(smaller)} bytes")
    """
    slimmed = pickletools.optimize(data)
    return slimmed
@dataclass
class SizeComparison:
    """Raw vs optimized pickle size at one protocol (see compare_protocols())."""
    protocol: int   # pickle protocol the object was dumped with
    original: int   # raw pickle size in bytes
    optimized: int  # size after pickletools.optimize()
    saving: int     # original - optimized

    @property
    def saving_pct(self) -> float:
        """Saving as a percentage of the original size (0.0 when empty)."""
        return 0.0 if self.original == 0 else 100 * self.saving / self.original

    def __str__(self) -> str:
        return (f"proto={self.protocol} "
                f"{self.original}B → {self.optimized}B "
                f"(-{self.saving}B / {self.saving_pct:.1f}%)")
def compare_protocols(obj: object) -> list[SizeComparison]:
    """
    Dump *obj* at every protocol (0 through pickle.HIGHEST_PROTOCOL) and
    report raw vs optimized sizes for each.

    Example:
        for row in compare_protocols(my_data):
            print(row)
    """
    rows: list[SizeComparison] = []
    for proto in range(pickle.HIGHEST_PROTOCOL + 1):
        raw = pickle.dumps(obj, protocol=proto)
        slim = optimize(raw)
        rows.append(SizeComparison(protocol=proto,
                                   original=len(raw),
                                   optimized=len(slim),
                                   saving=len(raw) - len(slim)))
    return rows
# ─────────────────────────────────────────────────────────────────────────────
# 4. Safe structure probe (no unpickling)
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class PickleProbe:
    """Lightweight structural summary extracted without unpickling."""
    protocol: int            # declared protocol (0 if no PROTO opcode)
    has_global: bool         # True if stream contains GLOBAL/STACK_GLOBAL opcodes
    global_names: list[str]  # module.name strings referenced
    has_reduce: bool         # REDUCE/NEWOBJ/NEWOBJ_EX seen
    has_build: bool          # BUILD seen (object state restoration)
    probable_type: str       # heuristic guess at top-level type

    def is_safe_scalars_only(self) -> bool:
        """No GLOBAL or REDUCE opcodes → only primitive values."""
        if self.has_global:
            return False
        return not self.has_reduce
def probe(data: bytes) -> PickleProbe:
    """
    Probe pickle bytes for structure without unpickling.
    Useful for auditing untrusted data.

    Example:
        p = probe(data)
        if not p.is_safe_scalars_only():
            raise ValueError(f"Untrusted: uses {p.global_names}")
    """
    records = disassemble(data)
    protocol = 0
    globals_: list[str] = []
    has_reduce = False
    has_build = False
    probable_type = "unknown"
    for r in records:
        if r.name == "PROTO" and isinstance(r.arg, int):
            protocol = r.arg
        if r.name == "GLOBAL" and isinstance(r.arg, str):
            # GLOBAL's argument is "module name"; normalize to dotted form.
            globals_.append(r.arg.replace(" ", "."))
        if r.name == "STACK_GLOBAL":
            # Name is assembled on the stack; exact target unknown statically.
            globals_.append("<stack_global>")
        if r.name in {"REDUCE", "NEWOBJ", "NEWOBJ_EX"}:
            has_reduce = True
        if r.name == "BUILD":
            has_build = True
        # Heuristic top-level type guess. The outermost object is built from
        # the FIRST data-pushing opcode in the stream, so only the first match
        # counts. (Bug fix: previously every match overwrote the guess, so
        # e.g. pickle.dumps({"a": 1}) was reported as "int" — the type of its
        # last scalar — instead of "dict".)
        if probable_type == "unknown":
            if r.name == "EMPTY_LIST":
                probable_type = "list"
            elif r.name == "EMPTY_DICT":
                probable_type = "dict"
            elif r.name == "EMPTY_TUPLE":
                probable_type = "tuple"
            elif r.name == "EMPTY_SET":
                probable_type = "set"
            elif r.name in {"INT", "LONG", "LONG1", "LONG4",
                            "BININT", "BININT1", "BININT2"}:
                # BININT* added: binary protocols encode small ints with
                # these, which the original heuristic missed entirely.
                probable_type = "int"
            elif r.name in {"FLOAT", "BINFLOAT"}:
                probable_type = "float"
            elif r.name in {"STRING", "BINSTRING", "SHORT_BINSTRING",
                            "BINUNICODE", "SHORT_BINUNICODE", "BINUNICODE8"}:
                probable_type = "str"
    return PickleProbe(
        protocol=protocol,
        has_global=len(globals_) > 0,
        global_names=globals_,
        has_reduce=has_reduce,
        has_build=has_build,
        probable_type=probable_type,
    )
# ─────────────────────────────────────────────────────────────────────────────
# Demo
# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Self-contained demo exercising every helper in this module.
    import datetime
    print("=== pickletools demo ===")
    # ── disassemble simple object ──────────────────────────────────────────────
    print("\n--- dis_to_string({'key': [1, 2]}) ---")
    data = pickle.dumps({"key": [1, 2]}, protocol=2)
    print(dis_to_string(data))
    # ── disassemble as records ────────────────────────────────────────────────
    print("--- disassemble records ---")
    for r in disassemble(data):
        print(f" {r}")
    # ── analyze ───────────────────────────────────────────────────────────────
    print("\n--- analyze ---")
    complex_obj = {
        "x": list(range(20)),
        "y": {"nested": "data", "flag": True},
        "z": (1, 2, 3),
        "shared": [1, 2],
    }
    # Aliasing the same list under two keys forces the pickler to memoize it,
    # which exercises the PUT/GET counters in analyze().
    complex_obj["also_shared"] = complex_obj["shared"]  # creates memo entry
    stats = analyze(pickle.dumps(complex_obj, protocol=4))
    print(f" {stats}")
    # ── compare_protocols ─────────────────────────────────────────────────────
    print("\n--- compare_protocols ---")
    for row in compare_protocols(complex_obj):
        print(f" {row}")
    # ── probe (safety audit) ──────────────────────────────────────────────────
    print("\n--- probe ---")
    scalars = pickle.dumps({"a": 1, "b": [2, 3]})
    # datetime instances pickle via GLOBAL/REDUCE, so probe() flags them unsafe.
    class_data = pickle.dumps(datetime.datetime.now())
    for label, d in [("scalars", scalars), ("datetime", class_data)]:
        p = probe(d)
        safe = "SAFE" if p.is_safe_scalars_only() else f"UNSAFE ({p.global_names})"
        print(f" {label:12s}: proto={p.protocol}, "
              f"type={p.probable_type}, {safe}")
    print("\n=== done ===")
For the pickle alternative — pickle.loads(data) and pickle.load(f) fully deserialize data into live Python objects, and pickle.dumps(obj, protocol=N) controls protocol version and wire format — use pickle when you need to actually reconstruct objects; use pickletools when you want to inspect or audit pickle streams without executing them (security reviews, debugging truncated streams, understanding why serialization is large). For the marshal alternative — marshal.dumps(obj) / marshal.loads(data) serialize basic Python types (used internally for .pyc files) with a simpler byte format — use marshal only for code objects and .pyc-adjacent tooling; use pickle + pickletools for general-purpose object serialization with full class support and the ability to inspect the resulting byte stream. The Claude Skills 360 bundle includes pickletools skill sets covering OpcodeRecord with disassemble()/dis_to_string() disassemblers, PickleStats with analyze() stream analyzer, SizeComparison with optimize()/compare_protocols() size tools, and PickleProbe with probe() safety auditor. Start with the free tier to try pickle inspection patterns and pickletools pipeline code generation.