Python’s email.charset module controls how the email package encodes non-ASCII text in headers and bodies. Import it with from email import charset as _charset. Charset(input_charset="us-ascii") — create a charset descriptor. Key attributes: .header_encoding — QP (quoted-printable), BASE64, SHORTEST (whichever of the two is shorter), or None; .body_encoding — QP, BASE64, or None (SHORTEST is not allowed for bodies); .output_codec — Python codec name for wire encoding; .input_codec — codec for decoding incoming text. Methods: cs.header_encode(string) — return encoded-word form for headers; cs.body_encode(string) — return encoded body string; cs.convert(string) — transcode input_codec → output_codec (a Python 2-era method that Python 3’s Charset no longer provides). Global registry: charset.add_charset(name, header_enc=None, body_enc=None, output_charset=None) — register a new charset; charset.add_alias(alias, canonical) — add an alias; charset.add_codec(charset_name, codec_name) — map a charset name to a Python codec. Common encodings: utf-8 → SHORTEST headers, BASE64 body; us-ascii → no encoding; iso-8859-1 → QP headers, QP body; iso-2022-jp → BASE64 headers, no body encoding. The email.mime.text.MIMEText(body, "plain", cs) constructor accepts a Charset object as its third argument (keyword name _charset) to control encoding. Claude Code generates international email senders, non-ASCII header encoders, Content-Transfer-Encoding selectors, charset-aware message constructors, and MIME encoding pipelines.
CLAUDE.md for email.charset
## email.charset Stack
- Stdlib: from email import charset as _charset
- from email.charset import Charset, QP, add_charset, add_alias
- Create: cs = Charset("utf-8")
- cs.header_encoding # QP / BASE64 / SHORTEST / None
- cs.body_encoding # QP / BASE64 / None
- cs.output_codec # "utf-8"
- Encode: cs.header_encode("héllo") # =?utf-8?b?...?=
- cs.body_encode("body text")
- Register: add_charset("windows-1252", header_enc=QP, body_enc=QP)
- add_alias("win-1252", "windows-1252")
- Use w/ MIME: MIMEText(body, "plain", cs)
email.charset Encoding Pipeline
# app/emailcharsetutil.py — inspect, encode, build, register, detect, compare
from __future__ import annotations
import base64
import quopri
from dataclasses import dataclass
from email import charset as _cs_mod
from email.charset import (
BASE64,
QP,
SHORTEST,
Charset,
add_alias,
add_charset,
add_codec,
)
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from typing import Any
# ─────────────────────────────────────────────────────────────────────────────
# 1. Charset inspection helpers
# ─────────────────────────────────────────────────────────────────────────────
# Display labels for the header/body encoding values a Charset can carry.
_ENC_NAMES = {
    None: "none",
    QP: "QP",
    BASE64: "base64",
    SHORTEST: "shortest",
}
@dataclass
class CharsetInfo:
    """Plain-string summary of one charset's encoding settings.

    Instances are produced by inspect_charset(); values that the underlying
    Charset leaves unset are stored as empty strings.
    """

    name: str              # charset name exactly as the caller supplied it
    input_codec: str       # Charset.input_codec, or "" when unset
    output_codec: str      # Charset.output_codec, or "" when unset
    header_encoding: str   # label for Charset.header_encoding
    body_encoding: str     # label for Charset.body_encoding
    output_charset: str    # Charset.get_output_charset(), or "" when unset
def inspect_charset(name: str) -> CharsetInfo | None:
    """
    Describe how the email package encodes the charset called *name*.

    Returns a CharsetInfo built from a fresh Charset(name), or None when
    constructing the Charset raises.

    Example:
        info = inspect_charset("utf-8")
        print(info.header_encoding, info.body_encoding)
    """
    def _label(encoding) -> str:
        # Unknown encoding values fall back to their str() form.
        return _ENC_NAMES.get(encoding, str(encoding))

    try:
        descriptor = Charset(name)
    except Exception:
        return None

    return CharsetInfo(
        name=name,
        input_codec=descriptor.input_codec or "",
        output_codec=descriptor.output_codec or "",
        header_encoding=_label(descriptor.header_encoding),
        body_encoding=_label(descriptor.body_encoding),
        output_charset=descriptor.get_output_charset() or "",
    )
def charset_table(names: "list[str] | None" = None) -> list[CharsetInfo]:
    """
    Collect CharsetInfo records for several charsets at once.

    When *names* is None or empty, a built-in list of common charsets is
    used. Names for which inspect_charset() returns None are left out.

    Example:
        for info in charset_table():
            print(info.name, info.header_encoding, info.body_encoding)
    """
    common = [
        "us-ascii", "utf-8", "iso-8859-1", "iso-8859-2",
        "iso-2022-jp", "euc-jp", "shift_jis", "big5", "gb2312", "koi8-r",
    ]
    selected = names if names else common
    return [info for info in map(inspect_charset, selected) if info]
# ─────────────────────────────────────────────────────────────────────────────
# 2. Header encoding helpers
# ─────────────────────────────────────────────────────────────────────────────
def encode_header_value(text: str,
                        charset_name: str = "utf-8") -> str:
    """
    Turn *text* into an RFC 2047 encoded-word suitable for a header.

    ASCII-only text needs no encoding and is handed back as-is; anything
    else is encoded by Charset(charset_name).header_encode().

    Example:
        enc = encode_header_value("Héllo Wörld")
        # "=?utf-8?b?SMOpbGxvIFf...?="
    """
    # The Charset is only constructed when encoding is actually required.
    return text if text.isascii() else Charset(charset_name).header_encode(text)
def decode_header_value(encoded: str) -> str:
    """
    Decode an RFC 2047 encoded-word or plain text header value.

    The value is split with email.header.decode_header(); byte fragments
    are decoded with their declared charset (UTF-8 when none is declared)
    and undecodable bytes are replaced rather than raising. A fragment
    whose charset label is not a codec Python knows is decoded as UTF-8
    instead of failing with LookupError.

    Example:
        text = decode_header_value("=?utf-8?b?SMOpbGxv?=")
        print(text) # "Héllo"
    """
    import email.header

    fragments: list[str] = []
    for data, charset in email.header.decode_header(encoded):
        if not isinstance(data, bytes):
            # Headers without any encoded-word come back as plain str.
            fragments.append(data)
            continue
        try:
            fragments.append(data.decode(charset or "utf-8", errors="replace"))
        except LookupError:
            # The charset label comes from the header itself and may name
            # an encoding with no Python codec; degrade gracefully.
            fragments.append(data.decode("utf-8", errors="replace"))
    return "".join(fragments)
# ─────────────────────────────────────────────────────────────────────────────
# 3. MIME message body encoding selectors
# ─────────────────────────────────────────────────────────────────────────────
def best_charset_for(text: str) -> str:
    """
    Pick the charset name to use for a text body.

    ASCII-only text (including the empty string) gets 'us-ascii';
    everything else gets 'utf-8'.

    Example:
        charset = best_charset_for("Hello!") # "us-ascii"
        charset = best_charset_for("こんにちは") # "utf-8"
    """
    return "us-ascii" if text.isascii() else "utf-8"
def build_text_part(body: str,
                    subtype: str = "plain") -> MIMEText:
    """
    Wrap *body* in a MIMEText part of the given text subtype.

    The charset is whatever best_charset_for() selects for the body; it is
    passed to MIMEText as a Charset object.

    Example:
        part = build_text_part("Hello!")
        part_jp = build_text_part("こんにちは", "plain")
    """
    descriptor = Charset(best_charset_for(body))
    return MIMEText(body, subtype, descriptor)
def _encode_address(addr: str) -> str:
    """Return *addr* with any non-ASCII display name RFC 2047-encoded (UTF-8).

    ASCII-only input is returned untouched. Input that cannot be parsed into
    mailboxes, or whose mailbox part is itself non-ASCII, is also returned
    unchanged rather than being rewritten lossily.
    """
    from email.utils import formataddr, getaddresses

    if addr.isascii():
        return addr
    pairs = getaddresses([addr])
    if not pairs or any(not mailbox or not mailbox.isascii()
                        for _, mailbox in pairs):
        return addr
    return ", ".join(formataddr(pair, charset="utf-8") for pair in pairs)


def build_international_email(
    subject: str,
    plain_body: str,
    html_body: str | None,
    from_addr: str,
    to_addrs: "list[str]",
) -> MIMEMultipart:
    """
    Build a multipart/alternative message with charset-aware headers and body.

    The subject is RFC 2047-encoded as UTF-8 when it is not pure ASCII.
    Display names in *from_addr* and *to_addrs* are encoded the same way,
    while the mailbox itself is left readable. The plain body is always
    attached; the HTML body is attached only when it is non-empty.

    Example:
        msg = build_international_email(
            "Grüß Gott!",
            "Willkommen in München.",
            "<b>München</b>",
            "Jürgen Müller <sender@example.com>",
            ["recipient@example.com"],
        )
    """
    from email.header import Header
    from email.utils import formatdate, make_msgid

    outer = MIMEMultipart("alternative")
    # Encode only the display name: a raw non-ASCII value would have the
    # whole header, address included, turned into an encoded-word when the
    # message is serialised.
    outer["From"] = _encode_address(from_addr)
    outer["To"] = ", ".join(_encode_address(addr) for addr in to_addrs)
    outer["Date"] = formatdate(localtime=True)
    outer["Message-ID"] = make_msgid()

    # Encode subject with charset if non-ASCII
    if subject.isascii():
        outer["Subject"] = subject
    else:
        outer["Subject"] = Header(subject, "utf-8").encode()

    outer.attach(build_text_part(plain_body, "plain"))
    if html_body:
        outer.attach(build_text_part(html_body, "html"))
    return outer
# ─────────────────────────────────────────────────────────────────────────────
# 4. Custom charset registration
# ─────────────────────────────────────────────────────────────────────────────
def register_windows_charsets() -> list[str]:
    """
    Register common Windows code page charsets with appropriate encoding rules.

    Each entry is added to (or overwrites its entry in) the global
    email.charset registry, and the aliases "cp1252" and "cp1251" are mapped
    to their windows-* names. Calling the function again simply re-applies
    the same settings.

    Returns the list of registered charset names, in registration order.

    Example:
        registered = register_windows_charsets()
        print(registered)
    """
    # (charset name, header encoding, body encoding, output charset)
    code_pages = [
        ("windows-1250", QP, QP, "windows-1250"),
        ("windows-1251", BASE64, BASE64, "windows-1251"),
        ("windows-1252", QP, QP, "windows-1252"),
        ("windows-1253", BASE64, BASE64, "windows-1253"),
        ("windows-1254", QP, QP, "windows-1254"),
    ]
    registered: list[str] = []
    for name, header_enc, body_enc, output_charset in code_pages:
        # add_charset() names its last parameter ``output_charset``; passing
        # ``output_codec`` raises TypeError, which previously went unnoticed
        # because it was swallowed, leaving nothing registered.
        add_charset(name, header_enc=header_enc, body_enc=body_enc,
                    output_charset=output_charset)
        registered.append(name)

    # Aliases
    add_alias("cp1252", "windows-1252")
    add_alias("cp1251", "windows-1251")
    return registered
# ─────────────────────────────────────────────────────────────────────────────
# 5. Transfer-encoding comparison
# ─────────────────────────────────────────────────────────────────────────────
def compare_encodings(text: str,
                      charset_name: str = "utf-8") -> dict[str, Any]:
    """
    Measure how large *text* becomes under quoted-printable and base64.

    The text is first encoded to bytes with *charset_name* (unencodable
    characters are replaced). The result reports the raw, QP and base64
    byte counts, the overhead of each encoding relative to the raw bytes,
    and which encoding is preferred; ties go to QP.

    Example:
        info = compare_encodings("Hello Wörld! " * 10)
        print(info)
    """
    payload = text.encode(charset_name, errors="replace")

    raw_len = len(payload)
    qp_len = len(quopri.encodestring(payload, quotetabs=False))
    b64_len = len(base64.encodebytes(payload))

    preferred = "BASE64" if b64_len < qp_len else "QP"
    return {
        "raw_bytes": raw_len,
        "qp_bytes": qp_len,
        "base64_bytes": b64_len,
        "qp_overhead": qp_len - raw_len,
        "base64_overhead": b64_len - raw_len,
        "prefer": preferred,
    }
# ─────────────────────────────────────────────────────────────────────────────
# Demo
# ─────────────────────────────────────────────────────────────────────────────
def _run_demo() -> None:
    """Call each helper in this module and print what it returns."""
    print("=== email.charset demo ===")

    # charset_table: one line per charset in the default set
    print("\n--- charset_table ---")
    for row in charset_table():
        print(f" {row.name:20s} hdr={row.header_encoding:8s} "
              f"body={row.body_encoding:8s} codec={row.output_codec}")

    # encode_header_value / decode_header_value round trip
    print("\n--- encode_header_value ---")
    header_samples = ["Hello World", "Héllo Wörld", "こんにちは", "Grüß Gott"]
    for text in header_samples:
        enc = encode_header_value(text)
        dec = decode_header_value(enc)
        print(f" {text!r:25s} → {enc!r}")
        print(f" {'':25s} decoded back: {dec!r}")

    # best_charset_for + build_text_part
    print("\n--- best_charset_for ---")
    body_samples = ["Hello!", "Привет!", "こんにちは", "Ça va?"]
    for text in body_samples:
        cs_name = best_charset_for(text)
        part = build_text_part(text)
        print(f" {text!r:20s} charset={cs_name} "
              f"cte={part.get('Content-Transfer-Encoding', 'none')}")

    # compare_encodings on ASCII, Latin and CJK text
    print("\n--- compare_encodings ---")
    size_samples = [
        "Hello plain ASCII text. " * 5,
        "Héllo Wörld! Grüß Gott! " * 5,
        "こんにちは世界!" * 5,
    ]
    for sample in size_samples:
        sizes = compare_encodings(sample)
        print(f" {sample[:20]!r} ...")
        print(f" raw={sizes['raw_bytes']} qp={sizes['qp_bytes']}"
              f" b64={sizes['base64_bytes']} prefer={sizes['prefer']}")

    # register_windows_charsets
    print("\n--- register_windows_charsets ---")
    regs = register_windows_charsets()
    print(f" registered: {regs}")

    print("\n=== done ===")


if __name__ == "__main__":
    _run_demo()
For the email.header.Header stdlib companion — email.header.Header(text, charset) / email.header.decode_header(s) handle RFC 2047 encoded-word encoding and decoding at the header level; Header accepts a Charset object or charset name string as its second argument and is used directly in header generation — use Header in legacy email.message.Message code; use email.policy.default with EmailMessage for modern code where header encoding is handled automatically by the typed header system. For the chardet (PyPI) alternative — chardet.detect(raw_bytes)["encoding"] auto-detects the charset of incoming bytes without requiring a prior charset declaration — use chardet (or charset-normalizer) to detect the charset of suspect inbound email bytes before constructing a Charset object; use email.charset for outbound encoding where the charset is known. The Claude Skills 360 bundle includes email.charset skill sets covering CharsetInfo/inspect_charset()/charset_table() inspection helpers, encode_header_value()/decode_header_value() header encoders, best_charset_for()/build_text_part() message builders, build_international_email() full-charset email composer, register_windows_charsets() registry extender, and compare_encodings() transfer-encoding comparator. Start with the free tier to try charset encoding patterns and email.charset pipeline code generation.