charset-normalizer detects text encoding without heuristic guessing. pip install charset-normalizer. from_bytes: from charset_normalizer import from_bytes; results = from_bytes(raw_bytes). Best: best = results.best(); best.encoding → “utf-8”. Decoded: str(best). First: results.first(). from_path: from charset_normalizer import from_path; results = from_path("file.txt"). from_fp: from charset_normalizer import from_fp; results = from_fp(file_obj). normalize: removed in charset-normalizer 3.0 — write best.output(encoding="utf-8") to a file instead. decode: decoded = best.output().decode("utf-8"). output: best.output() → re-encoded bytes (UTF-8 by default). encoding: best.encoding → “windows-1252”. chaos: best.chaos → disorder score (0.0=clean, 1.0=chaos). coherence: best.coherence → text coherence score. language: best.languages → detected language list. bom: best.bom → whether BOM was detected. raw: best.raw → original bytes. encode: best.output(encoding="utf-8"). alphabets: best.alphabets. could_be_from_charset: best.could_be_from_charset → list of candidate encodings. len(results): number of viable candidates. Iterate: for r in results: print(r.encoding, r.chaos). is_large_sequence: optimizes detection for large files. cli: python -m charset_normalizer file.txt. Preferred over chardet: faster, more accurate, actively maintained. Claude Code generates charset-normalizer decoders, CSV normalizers, log file converters, and encoding validators.
CLAUDE.md for charset-normalizer
## charset-normalizer Stack
- Version: charset-normalizer >= 3.3 | pip install charset-normalizer
- Detect bytes: results = from_bytes(raw) | results.best().encoding
- Detect file: results = from_path("file.txt") | str(results.best())
- Decode: safe_decode(raw) → str | normalize_to_utf8(raw) → bytes
- Scores: best.chaos (0=clean, 1=corrupted) | best.coherence | best.languages
- Output: best.output(encoding="utf-8") → re-encoded bytes
charset-normalizer Encoding Pipeline
# app/encoding.py — charset-normalizer detection, normalization, bulk convert, validation
from __future__ import annotations
import logging
from collections.abc import Iterable
from pathlib import Path
from typing import Generator
from charset_normalizer import from_bytes, from_fp, from_path, normalize
# NOTE(review): `normalize` and `NormalizedInputOrDefault` were removed in
# charset-normalizer 3.0, but the header above pins ">= 3.3" — these two
# imports will raise ImportError on 3.x; confirm the installed version.
from charset_normalizer.models import NormalizedInputOrDefault
log = logging.getLogger(__name__)
# ─────────────────────────────────────────────────────────────────────────────
# 1. Detection helpers
# ─────────────────────────────────────────────────────────────────────────────
def detect_encoding(raw: bytes) -> dict:
    """
    Inspect a byte string and report its most likely text encoding.

    Returns a dict with keys: encoding, chaos, coherence, languages,
    bom, and candidates (every viable encoding found). When no viable
    candidate exists, encoding is None and chaos is pinned to 1.0.

    Example:
        info = detect_encoding(b"\\xff\\xfe" + "Hello".encode("utf-16-le"))
        info["encoding"]  # a UTF-16 variant
        info["chaos"]     # near 0.0 (clean)
    """
    matches = from_bytes(raw)
    top = matches.best()
    if top is None:
        # Nothing decodable — report maximum disorder, no candidates.
        return {
            "encoding": None,
            "chaos": 1.0,
            "coherence": 0.0,
            "languages": [],
            "bom": False,
            "candidates": [],
        }
    return {
        "encoding": top.encoding,
        "chaos": round(top.chaos, 4),
        "coherence": round(top.coherence, 4),
        "languages": list(top.languages),
        "bom": top.bom,
        "candidates": [m.encoding for m in matches],
    }
def detect_file_encoding(path: str | Path) -> dict:
    """
    Report the detected encoding of a file on disk.

    Returns {"encoding": None, "path": ...} when detection yields no
    viable candidate; otherwise the same score fields as
    detect_encoding() plus the file path.

    Example:
        info = detect_file_encoding("legacy_data.csv")
        print(info["encoding"])  # e.g. "windows-1252"
    """
    matches = from_path(str(path))
    top = matches.best()
    if top is None:
        return {"encoding": None, "path": str(path)}
    return {
        "encoding": top.encoding,
        "chaos": round(top.chaos, 4),
        "coherence": round(top.coherence, 4),
        "languages": list(top.languages),
        "bom": top.bom,
        "path": str(path),
    }
def is_valid_encoding(raw: bytes, encoding: str) -> bool:
    """
    Report whether *raw* decodes cleanly under *encoding*.

    Unknown codec names (LookupError) count as invalid rather than
    raising, so callers can probe arbitrary encoding labels safely.

    Example:
        is_valid_encoding(data, "utf-8")  # True or False
    """
    try:
        raw.decode(encoding)
    except (UnicodeDecodeError, LookupError):
        return False
    return True
# ─────────────────────────────────────────────────────────────────────────────
# 2. Decoding helpers
# ─────────────────────────────────────────────────────────────────────────────
def safe_decode(
    raw: bytes,
    fallback_encoding: str = "latin-1",
    chaos_threshold: float = 0.2,
) -> str:
    """
    Decode arbitrary bytes to text without ever raising.

    Strategy, in order:
      1. Plain UTF-8 (the common fast path).
      2. charset-normalizer detection, accepted only when the best
         match's chaos score is below *chaos_threshold*.
      3. *fallback_encoding* with errors="replace" — latin-1 maps every
         byte value, so this step cannot fail (non-latin input just
         comes out as mojibake).

    Example:
        text = safe_decode(b"Caf\\xe9")        # "Café" (latin-1/windows-1252)
        text = safe_decode(response.content)   # any HTTP response body
    """
    try:
        return raw.decode("utf-8")
    except UnicodeDecodeError:
        pass
    match = from_bytes(raw).best()
    if match and match.chaos < chaos_threshold:
        return str(match)
    return raw.decode(fallback_encoding, errors="replace")
def decode_safely(raw: bytes, encoding: str, errors: str = "replace") -> str:
    """
    Decode *raw* with a known *encoding*, applying the given codec
    error strategy ("replace", "ignore", ...) instead of raising on
    bad byte sequences.

    Example:
        text = decode_safely(data, "windows-1252", errors="replace")
    """
    decoded = raw.decode(encoding, errors=errors)
    return decoded
def read_text_file(
    path: str | Path,
    encoding: str | None = None,
    fallback: str = "latin-1",
) -> str:
    """
    Read a text file as a string.

    When *encoding* is supplied it is used directly (undecodable bytes
    become U+FFFD via errors="replace"); otherwise the encoding is
    auto-detected through safe_decode(), with *fallback* as its
    last-resort codec.

    Example:
        content = read_text_file("legacy.csv")           # auto-detect
        content = read_text_file("known.txt", "utf-8")   # explicit
    """
    raw = Path(path).read_bytes()
    if not encoding:
        return safe_decode(raw, fallback_encoding=fallback)
    return raw.decode(encoding, errors="replace")
# ─────────────────────────────────────────────────────────────────────────────
# 3. Normalization (convert to UTF-8)
# ─────────────────────────────────────────────────────────────────────────────
def normalize_to_utf8(
    raw: bytes,
    chaos_threshold: float = 0.5,
) -> bytes:
    """
    Re-encode bytes of any detected encoding as UTF-8 bytes.

    The input is returned untouched when it already decodes as UTF-8,
    and also when detection fails or the best match is too noisy
    (chaos >= *chaos_threshold*) — passing bytes through unchanged is
    safer than corrupting them with a low-confidence guess.

    Example:
        utf8 = normalize_to_utf8(windows1252_bytes)
        json.loads(utf8)
    """
    try:
        raw.decode("utf-8")
    except UnicodeDecodeError:
        pass
    else:
        return raw  # already valid UTF-8
    match = from_bytes(raw).best()
    if match and match.chaos < chaos_threshold:
        return match.output(encoding="utf-8")
    return raw  # cannot convert with confidence
def normalize_file_to_utf8(
    src: str | Path,
    dst: str | Path | None = None,
    backup: bool = True,
) -> Path:
    """
    Convert a file's contents to UTF-8 on disk.

    Writes to *dst* when given, otherwise overwrites *src* in place.
    When overwriting in place and the bytes actually changed, a
    "<name>.bak" copy of the original is saved first (unless
    backup=False).

    Example:
        normalize_file_to_utf8("legacy.csv")              # overwrite
        normalize_file_to_utf8("en.csv", "en_utf8.csv")   # new file
    """
    src_path = Path(src)
    dst_path = Path(dst) if dst else src_path
    original = src_path.read_bytes()
    converted = normalize_to_utf8(original)
    if backup and dst_path == src_path and converted != original:
        # Keep the pre-conversion bytes next to the source file.
        src_path.with_suffix(src_path.suffix + ".bak").write_bytes(original)
    dst_path.parent.mkdir(parents=True, exist_ok=True)
    dst_path.write_bytes(converted)
    return dst_path
# ─────────────────────────────────────────────────────────────────────────────
# 4. Bulk file processing
# ─────────────────────────────────────────────────────────────────────────────
def scan_directory_encodings(
    directory: str | Path,
    pattern: str = "*.txt",
    recursive: bool = True,
) -> list[dict]:
    """
    Detect the encoding of every file under *directory* matching *pattern*.

    Per-file failures are captured as {"path", "encoding": None,
    "error"} entries instead of aborting the whole scan.

    Example:
        report = scan_directory_encodings("data/", pattern="*.csv")
        non_utf8 = [f for f in report if f["encoding"] != "utf-8"]
    """
    base = Path(directory)
    matcher = base.rglob if recursive else base.glob
    report: list[dict] = []
    for candidate in matcher(pattern):
        try:
            report.append(detect_file_encoding(candidate))
        except Exception as exc:  # keep scanning past unreadable files
            report.append(
                {"path": str(candidate), "encoding": None, "error": str(exc)}
            )
    return report
def batch_normalize(
    files: Iterable[str | Path],
    output_dir: str | Path | None = None,
    skip_utf8: bool = True,
) -> dict[str, str]:
    """
    Normalize each file in *files* to UTF-8.

    Accepts any iterable of paths — the example below passes a
    generator, which the previous ``list[str | Path]`` annotation
    incorrectly rejected (runtime behavior is unchanged; only the
    annotation is generalized).

    Args:
        files: paths of the files to convert.
        output_dir: write converted files here (created if missing);
            when None, each file is rewritten in place.
        skip_utf8: leave files that already decode as UTF-8 untouched.

    Returns:
        {src_path: status} where status is "ok", "utf-8 (skipped)",
        or "error: <message>".

    Example:
        results = batch_normalize(Path("data").glob("*.csv"), output_dir="data_utf8")
        failed = {k: v for k, v in results.items() if v != "ok"}
    """
    status_map: dict[str, str] = {}
    out_dir = Path(output_dir) if output_dir else None
    if out_dir:
        out_dir.mkdir(parents=True, exist_ok=True)
    for src in files:
        src = Path(src)
        try:
            raw = src.read_bytes()
            # Fast path: detect files that are already valid UTF-8.
            try:
                raw.decode("utf-8")
                if skip_utf8:
                    status_map[str(src)] = "utf-8 (skipped)"
                    continue
            except UnicodeDecodeError:
                pass
            utf8 = normalize_to_utf8(raw)
            dst = (out_dir / src.name) if out_dir else src
            dst.write_bytes(utf8)
            status_map[str(src)] = "ok"
        except Exception as e:
            # Record the failure and keep going — a bulk job should not
            # abort because one file is unreadable.
            log.error("Failed to normalize %s: %s", src, e)
            status_map[str(src)] = f"error: {e}"
    return status_map
def iter_decoded_files(
    directory: str | Path,
    pattern: str = "**/*.txt",
) -> Generator[tuple[Path, str], None, None]:
    """
    Lazily yield (path, decoded_text) pairs for files matching *pattern*.

    Files that cannot be read or decoded are logged at WARNING level
    and skipped rather than stopping the iteration.

    Example:
        for path, text in iter_decoded_files("logs/", "**/*.log"):
            index.add_document(path, text)
    """
    root = Path(directory)
    for candidate in root.glob(pattern):
        try:
            decoded = read_text_file(candidate)
        except Exception as exc:
            log.warning("Could not decode %s: %s", candidate, exc)
            continue
        yield candidate, decoded
# ─────────────────────────────────────────────────────────────────────────────
# Demo
# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
print("=== Encoding detection ===")
samples = [
("UTF-8", "Hello, World! 🌍".encode("utf-8")),
("UTF-16", "Hello".encode("utf-16")),
("windows-1252", "Caf\xe9 au lait".encode("windows-1252")),
("ISO-8859-1", "Beyonc\xe9".encode("iso-8859-1")),
("pure ASCII", b"Hello World"),
]
for label, raw in samples:
info = detect_encoding(raw)
print(f" {label:15s}: detected={info['encoding']:15s} chaos={info['chaos']:.3f}")
print("\n=== safe_decode ===")
for label, raw in samples:
decoded = safe_decode(raw)
print(f" {label:15s}: {decoded!r}")
print("\n=== normalize_to_utf8 ===")
win1252 = "Café résumé naïve".encode("windows-1252")
utf8 = normalize_to_utf8(win1252)
print(f" windows-1252 → utf-8: {utf8.decode('utf-8')!r}")
print("\n=== is_valid_encoding ===")
print(f" UTF-8 valid: {is_valid_encoding(b'hello', 'utf-8')}")
print(f" ASCII via UTF-8: {is_valid_encoding(b'hello', 'ascii')}")
print(f" Bad UTF-8: {is_valid_encoding(b'Caf\\xe9', 'utf-8')}")
For the chardet alternative — chardet is the original Python encoding detection library from Mozilla, supports 30+ encodings, and is widely used as a dependency; charset-normalizer is a newer, actively maintained replacement that is faster, more accurate, has better Unicode normalization, and can optionally detect the natural language of the text — use charset-normalizer for new projects (it is the default detector in the requests library since v2.26). For the ftfy alternative — ftfy fixes already-decoded but incorrectly-encoded Unicode strings (mojibake like “CafÃ©” → “Café”); charset-normalizer detects the encoding of raw bytes before decoding — use charset-normalizer first to decode bytes correctly, ftfy as a downstream repair step when you receive already-decoded text that looks garbled. The Claude Skills 360 bundle includes charset-normalizer skill sets covering detect_encoding()/detect_file_encoding()/is_valid_encoding(), safe_decode()/decode_safely()/read_text_file(), normalize_to_utf8()/normalize_file_to_utf8(), scan_directory_encodings()/batch_normalize()/iter_decoded_files(). Start with the free tier to try encoding detection and UTF-8 normalization code generation.