Python’s codecs module provides an interface to all text and binary codecs available in the interpreter. import codecs. codecs.open: f = codecs.open("file.txt", "r", encoding="utf-8", errors="replace") — like open() but accepts any codec name. encode/decode: codecs.encode("hello", "rot_13") → "uryyb"; codecs.decode(b"\x89PNG...", "latin-1"). lookup: info = codecs.lookup("utf-8") → CodecInfo(name, encode, decode, streamreader, streamwriter, ...); info.name is the canonical name. Encodings: Python supports "utf-8", "utf-16", "utf-32", "latin-1", "ascii", "idna", "punycode", "base64", "hex_codec", "zlib_codec", "bz2_codec", "rot_13", "uu_codec". BOM: codecs.BOM_UTF8 = b"\xef\xbb\xbf"; codecs.BOM_UTF16 follows the platform’s native byte order — it equals BOM_UTF16_LE (b"\xff\xfe") on little-endian systems and BOM_UTF16_BE (b"\xfe\xff") on big-endian systems; use the explicit _LE/_BE constants when the byte order matters. StreamReader/StreamWriter: wrap a file-like object — reader = codecs.getreader("utf-8")(raw_io). IncrementalDecoder: dec = codecs.getincrementaldecoder("utf-8")(); dec.decode(chunk, final=True) — stateful; handles multi-byte boundaries. errors: "strict", "ignore", "replace", "xmlcharrefreplace", "backslashreplace", "surrogateescape". register: codecs.register(search_fn) — add custom codec. codecs.encode(data, "zlib_codec") — zlib compress as one-liner. Claude Code generates encoding-aware file readers, BOM-stripping utilities, incremental streaming decoders, and multi-encoding document converters.
CLAUDE.md for codecs
## codecs Stack
- Stdlib: import codecs
- File: f = codecs.open("file.txt", "r", encoding="utf-8-sig", errors="replace")
- Quick: codecs.encode(text, "utf-8"); codecs.decode(data, "utf-8")
- Zlib: compressed = codecs.encode(data, "zlib_codec")
- BOM: data = data.removeprefix(codecs.BOM_UTF8)  # never lstrip(): bytes.lstrip strips any of those byte *values* repeatedly, not a single prefix
- Stream: reader = codecs.getreader("utf-8")(raw_binary_io)
## codecs Encoding Pipeline
# app/codecutil.py — open, detect BOM, incremental decode, transform, custom
from __future__ import annotations
import codecs
import io
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Callable, Generator, Iterator
# ─────────────────────────────────────────────────────────────────────────────
# 1. File and BOM helpers
# ─────────────────────────────────────────────────────────────────────────────
# BOM prefixes mapped to codec names. The four-byte UTF-32 marks MUST come
# before the two-byte UTF-16 ones: BOM_UTF32_LE begins with BOM_UTF16_LE,
# so a first-match scan would otherwise misreport UTF-32 input as UTF-16.
_BOM_TABLE: list[tuple[bytes, str]] = [
    (codecs.BOM_UTF32_BE, "utf-32-be"),
    (codecs.BOM_UTF32_LE, "utf-32-le"),
    (codecs.BOM_UTF8, "utf-8-sig"),
    (codecs.BOM_UTF16_BE, "utf-16-be"),
    (codecs.BOM_UTF16_LE, "utf-16-le"),
]


def detect_bom_encoding(data: bytes) -> str | None:
    """
    Identify the encoding signalled by a BOM at the start of *data*.

    Returns the codec name ("utf-8-sig", "utf-16-le", ...) for the first
    matching BOM prefix, or None when the bytes carry no recognised BOM.

    Example:
        enc = detect_bom_encoding(Path("file.txt").read_bytes())
        if enc:
            text = data.decode(enc)
    """
    return next((enc for bom, enc in _BOM_TABLE if data.startswith(bom)), None)
def strip_bom(data: bytes) -> tuple[bytes, str | None]:
    """
    Remove a leading BOM from *data* when one is present.

    Returns (payload, codec_name); when no BOM is recognised the payload is
    returned untouched and codec_name is None.

    Example:
        stripped, enc = strip_bom(raw_bytes)
        text = stripped.decode(enc or "utf-8")
    """
    for prefix, codec_name in _BOM_TABLE:
        if data.startswith(prefix):
            # Slice off exactly the matched BOM bytes.
            return data[len(prefix):], codec_name
    return data, None
def read_text_auto(path: str | Path, fallback: str = "utf-8",
                   errors: str = "replace") -> str:
    """
    Read a text file, auto-detecting BOM encoding and falling back to `fallback`.

    Args:
        path: File to read.
        fallback: Codec used when no BOM is found.
        errors: Decode error handler (default "replace", so malformed bytes
            become U+FFFD instead of raising).

    Example:
        text = read_text_auto("notes.txt")
    """
    raw = Path(path).read_bytes()
    # strip_bom() both detects and removes the BOM in a single prefix scan;
    # the previous detect-then-strip sequence walked the BOM table twice.
    stripped, enc = strip_bom(raw)
    return stripped.decode(enc or fallback, errors=errors)
def open_text(
    path: str | Path,
    mode: str = "r",
    encoding: str = "utf-8",
    errors: str = "strict",
) -> codecs.StreamReaderWriter:
    """
    Open a text file via the codecs machinery, returning a file-like object.

    Unlike builtin open(), codecs.open() accepts any registered codec name;
    note that it always opens the underlying file in binary mode and does no
    universal-newline translation, so it is not a drop-in open() replacement.

    Args:
        path: File to open.
        mode: File mode; codecs.open() adds 'b' to it internally.
        encoding: Any codec name the codecs registry knows.
        errors: Codec error handler.

    Example:
        with open_text("win.txt", encoding="cp1252") as f:
            for line in f:
                print(line)
    """
    # Fix: codecs.open() returns a codecs.StreamReaderWriter, not an
    # io.TextIOWrapper — with the annotation corrected, the old
    # "type: ignore[return-value]" is no longer needed.
    return codecs.open(str(path), mode, encoding=encoding, errors=errors)
# ─────────────────────────────────────────────────────────────────────────────
# 2. Encoding / decoding helpers
# ─────────────────────────────────────────────────────────────────────────────
def encode_to(text: str, encoding: str, errors: str = "strict") -> bytes:
    """
    Encode a string to bytes using the named text codec.

    Args:
        text: String to encode.
        encoding: Codec name, e.g. "utf-8", "latin-1".
        errors: Error handler ("strict", "replace", "ignore", ...).

    Raises:
        LookupError: Unknown codec name (or a non-text codec such as
            "hex_codec", which str.encode refuses).
        UnicodeEncodeError: Unrepresentable characters with errors="strict".

    Example:
        data = encode_to("hello", "utf-8")
        data = encode_to("naïve", "latin-1", errors="replace")
    """
    # Fix: the previous implementation built `codecs.lookup(...).incrementalencoder`
    # and immediately discarded it — dead code. str.encode() already resolves
    # the codec and raises LookupError for unknown names, so nothing is lost.
    return text.encode(encoding, errors=errors)
def decode_from(data: bytes, encoding: str, errors: str = "strict") -> str:
    """
    Decode *data* to text with the named codec.

    Example:
        text = decode_from(b"\\xff\\xfeh\\x00i\\x00", "utf-16")
    """
    # str(bytes, encoding, errors) is the constructor form of bytes.decode().
    return str(data, encoding, errors)
def transcode(data: bytes, src_enc: str, dst_enc: str, errors: str = "replace") -> bytes:
    """
    Re-encode *data* from src_enc to dst_enc.

    Example:
        utf8_bytes = transcode(latin1_bytes, "latin-1", "utf-8")
    """
    # Two explicit steps: bytes -> text under the source codec, then
    # text -> bytes under the destination codec.
    text = data.decode(src_enc, errors=errors)
    return text.encode(dst_enc, errors=errors)
def try_decode(data: bytes, encodings: list[str],
               errors: str = "strict") -> tuple[str, str] | None:
    """
    Attempt each encoding in order; return (text, encoding) for the first
    that decodes *data* successfully, or None when every attempt fails.

    Unknown codec names are skipped, not raised.

    Example:
        result = try_decode(raw, ["utf-8", "utf-16", "latin-1"])
        if result:
            text, enc = result
    """
    for candidate in encodings:
        try:
            text = data.decode(candidate, errors=errors)
        except (UnicodeDecodeError, LookupError):
            continue
        return text, candidate
    return None
# ─────────────────────────────────────────────────────────────────────────────
# 3. Incremental decoder for streaming
# ─────────────────────────────────────────────────────────────────────────────
class StreamingDecoder:
    """
    Incremental decoder wrapper for byte streams that arrive in chunks
    (sockets, HTTP bodies, pipes).

    Partial multi-byte sequences are buffered between feed() calls, so chunk
    boundaries may fall anywhere inside a character.

    Example:
        dec = StreamingDecoder("utf-8")
        for chunk in socket_chunks:
            process(dec.feed(chunk))
        process(dec.finish())
    """

    def __init__(self, encoding: str = "utf-8", errors: str = "replace") -> None:
        # Resolve the codec's incremental-decoder factory, then build one
        # stateful decoder instance from it.
        factory = codecs.getincrementaldecoder(encoding)
        self._decoder = factory(errors=errors)
        self._encoding = encoding

    def feed(self, data: bytes) -> str:
        """Decode one chunk; a trailing partial sequence is held back."""
        return self._decoder.decode(data, final=False)

    def finish(self) -> str:
        """Signal end-of-stream and flush whatever is still buffered."""
        return self._decoder.decode(b"", final=True)

    def reset(self) -> None:
        # Drop buffered state so the same instance can decode a new stream.
        self._decoder.reset()

    def decode_iter(self, source: Iterator[bytes]) -> Generator[str, None, None]:
        """
        Lazily decode every chunk from *source*, yielding only non-empty
        strings, then yield the flushed tail (if any).

        Example:
            for text in dec.decode_iter(response.iter_content()):
                print(text, end="")
        """
        for piece in source:
            decoded = self.feed(piece)
            if decoded:
                yield decoded
        leftover = self.finish()
        if leftover:
            yield leftover
# ─────────────────────────────────────────────────────────────────────────────
# 4. Data transform codecs
# ─────────────────────────────────────────────────────────────────────────────
def zlib_compress(data: bytes) -> bytes:
    """
    Compress *data* with zlib through the codecs transform interface.

    Example:
        compressed = zlib_compress(large_bytes)
        original = zlib_decompress(compressed)
    """
    # CodecInfo.encode returns (output_bytes, input_length); keep the bytes.
    packed, _consumed = codecs.lookup("zlib_codec").encode(data)
    return packed
def zlib_decompress(data: bytes) -> bytes:
    """Inflate zlib-compressed *data* back to the original bytes."""
    # CodecInfo.decode returns (output_bytes, input_length); keep the bytes.
    restored, _consumed = codecs.lookup("zlib_codec").decode(data)
    return restored
def bz2_compress(data: bytes) -> bytes:
    """Compress *data* with bzip2 via the codecs transform interface."""
    packed = codecs.encode(data, "bz2_codec")
    return packed
def bz2_decompress(data: bytes) -> bytes:
    """Reverse bz2_compress: expand bzip2 bytes to the original payload."""
    unpacked = codecs.decode(data, "bz2_codec")
    return unpacked
def rot13(text: str) -> str:
    """
    Apply ROT-13 to *text* via codecs.

    ROT-13 is its own inverse, so the same call both encodes and decodes.

    Example:
        rot13("Hello World") # "Uryyb Jbeyq"
        rot13("Uryyb Jbeyq") # "Hello World"
    """
    # decode and encode are identical transforms for rot_13; decode is
    # spelled here to emphasise the symmetry.
    return codecs.decode(text, "rot_13")
def hex_encode(data: bytes) -> str:
    """Return the lowercase hex representation of *data* as a str."""
    raw_hex = codecs.encode(data, "hex_codec")  # bytes, e.g. b"deadbeef"
    return raw_hex.decode("ascii")
def hex_decode(hex_str: str) -> bytes:
    """Parse a hex string (as produced by hex_encode) back into bytes."""
    ascii_bytes = hex_str.encode("ascii")
    return codecs.decode(ascii_bytes, "hex_codec")
# ─────────────────────────────────────────────────────────────────────────────
# 5. Codec info and custom codec helpers
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class CodecSummary:
    """Lightweight view of a codec: canonical name plus alias spellings."""

    name: str           # canonical name, as reported by codecs.lookup().name
    aliases: list[str]  # alternative spellings resolving to the same codec

    def __str__(self) -> str:
        shown = self.aliases[:4]  # cap the display at four aliases
        return f"{self.name} aliases={shown}"
def codec_info(name: str) -> CodecSummary:
    """
    Build a CodecSummary (canonical name + alias spellings) for *name*.

    Raises LookupError if *name* itself is not a known codec.

    Example:
        print(codec_info("utf-8"))
        print(codec_info("latin-1"))
    """
    info = codecs.lookup(name)
    # Probe a handful of common spelling variants; keep those that resolve
    # to the same canonical codec without being the canonical spelling.
    probes = [name.replace("-", "_"), name.replace("_", "-"),
              name.upper(), name.lower()]
    found: list[str] = []
    for probe in probes:
        try:
            canonical = codecs.lookup(probe).name
        except LookupError:
            continue
        if canonical == info.name and probe != info.name:
            found.append(probe)
    return CodecSummary(name=info.name, aliases=found)
def supported_encodings() -> list[str]:
    """
    List well-known codec names that resolve on this interpreter.

    Probes a fixed candidate set with codecs.lookup() and keeps every name
    that does not raise LookupError.
    """
    candidates = [
        "utf-8", "utf-16", "utf-32", "ascii", "latin-1", "cp1252",
        "cp1251", "iso-8859-1", "iso-8859-2", "gbk", "big5", "shift_jis",
        "euc-jp", "euc-kr", "utf-8-sig", "idna", "punycode",
        "base64", "hex_codec", "zlib_codec", "bz2_codec", "rot_13",
    ]
    available: list[str] = []
    for candidate in candidates:
        try:
            codecs.lookup(candidate)
        except LookupError:
            continue
        available.append(candidate)
    return available
# ─────────────────────────────────────────────────────────────────────────────
# Demo
# ─────────────────────────────────────────────────────────────────────────────
# Smoke-test demo: exercises every helper above and prints the results.
if __name__ == "__main__":
    print("=== codecs demo ===")
    # ── BOM detection ─────────────────────────────────────────────────────────
    print("\n--- BOM detection ---")
    # Label → raw bytes; only the first two samples carry a BOM prefix.
    samples = {
        "UTF-8 BOM": codecs.BOM_UTF8 + "hello world".encode("utf-8"),
        "UTF-16 LE BOM": codecs.BOM_UTF16_LE + "hello".encode("utf-16-le"),
        "No BOM": "plain text".encode("utf-8"),
    }
    for label, raw in samples.items():
        # enc is None for the BOM-less sample; strip_bom leaves it untouched.
        stripped, enc = strip_bom(raw)
        print(f" {label:15s}: enc={enc!r:15s} stripped starts={stripped[:8]!r}")
    # ── transcode ─────────────────────────────────────────────────────────────
    print("\n--- transcode latin-1 → utf-8 ---")
    # Accented characters are 1 byte in latin-1 but 2 bytes in UTF-8.
    latin1 = "café résumé".encode("latin-1")
    utf8 = transcode(latin1, "latin-1", "utf-8")
    print(f" latin1: {latin1!r}")
    print(f" utf-8: {utf8!r}")
    # ── try_decode ────────────────────────────────────────────────────────────
    print("\n--- try_decode ---")
    snippets = [
        "hello 世界".encode("utf-8"),
        # latin-1 "café" is invalid UTF-8, so try_decode falls through to latin-1.
        "café".encode("latin-1"),
    ]
    for raw in snippets:
        result = try_decode(raw, ["utf-8", "latin-1"])
        if result:
            text, enc = result
            print(f" decoded as {enc!r}: {text!r}")
    # ── StreamingDecoder ──────────────────────────────────────────────────────
    print("\n--- StreamingDecoder ---")
    msg = "hello 世界".encode("utf-8")
    dec = StreamingDecoder("utf-8")
    # Chunk boundaries deliberately split the 3-byte CJK characters to show
    # that the incremental decoder buffers partial sequences correctly.
    parts = [msg[:5], msg[5:10], msg[10:]]
    reconstructed = ""
    for chunk in parts:
        reconstructed += dec.feed(chunk)
    reconstructed += dec.finish()
    print(f" streamed in 3 chunks → {reconstructed!r}")
    # ── transform codecs ──────────────────────────────────────────────────────
    print("\n--- transform codecs ---")
    # Highly repetitive payload, so zlib achieves a large compression ratio.
    data = b"hello world " * 100
    compressed = zlib_compress(data)
    decompressed = zlib_decompress(compressed)
    print(f" zlib: {len(data)} → {len(compressed)} bytes (ratio {len(data)/len(compressed):.1f}x)")
    print(f" roundtrip ok: {decompressed == data}")
    print(f"\n rot13('Hello World') = {rot13('Hello World')!r}")
    print(f" hex_encode(b'\\xde\\xad') = {hex_encode(b'\\xde\\xad')!r}")
    print(f" hex_decode('deadbeef') = {hex_decode('deadbeef')!r}")
    # ── supported encodings ───────────────────────────────────────────────────
    print("\n--- supported_encodings (sample) ---")
    # Show only the first eight names to keep the demo output short.
    for enc in supported_encodings()[:8]:
        print(f" {enc}")
    print("\n=== done ===")
For the chardet / charset-normalizer alternative — chardet (PyPI) samples the byte distribution of an unknown-encoding document and returns a probability-weighted encoding guess; charset-normalizer does the same as a drop-in replacement used by requests — use these when you receive a file with no declared encoding and must infer it from the bytes; use codecs when you already know or can determine the encoding (from BOM, HTTP header, XML declaration, or user configuration) and simply need a streaming-capable encoder/decoder interface. For the io.TextIOWrapper alternative — io.TextIOWrapper wraps a binary io.RawIOBase with a codec and is what Python’s built-in open() returns for text mode; it offers newline, line_buffering, and write_through options; codecs.StreamReader/StreamWriter provide the same wrapping but accept any codec name including zlib_codec and base64 — use io.TextIOWrapper for standard text-file I/O; use codecs.open() or StreamReader/StreamWriter when you need a codec name that open() doesn’t recognise (e.g. "zlib_codec") or when building a custom codec pipeline. The Claude Skills 360 bundle includes codecs skill sets covering detect_bom_encoding()/strip_bom()/read_text_auto()/open_text() BOM-aware readers, encode_to()/decode_from()/transcode()/try_decode() encode/decode helpers, StreamingDecoder with feed()/finish()/decode_iter() incremental streaming, zlib_compress()/zlib_decompress()/bz2_compress()/rot13()/hex_encode()/hex_decode() transform codec wrappers, and codec_info()/supported_encodings() introspection. Start with the free tier to try encoding pipeline patterns and codecs pipeline code generation.