Python’s tarfile module creates and reads TAR archives with optional compression. import tarfile. open: tarfile.open("archive.tar.gz", "w:gz") — modes: "r" auto-detect, "r:gz" / "r:bz2" / "r:xz" compressed read, "w" / "w:gz" / "w:bz2" / "w:xz" write, "a" append (uncompressed only), "r|gz" streaming read pipe. add: tf.add("src/", arcname="src", recursive=True, filter=fn) — filter receives TarInfo and returns modified or None to exclude. extract: tf.extract(member, path=".", set_attrs=True, numeric_owner=False). extractall: tf.extractall(path, members=None, filter="data") — filter="data" (Python 3.12+, also backported to security releases of older versions; detect with hasattr(tarfile, "data_filter")) blocks dangerous paths. getmembers: tf.getmembers() → list of TarInfo. TarInfo attrs: .name, .size, .mtime, .mode, .uid, .gid, .type (REGTYPE/DIRTYPE/SYMTYPE), .linkname, .isfile()/.isdir()/.issym(). addfile: tf.addfile(tarinfo, fileobj) — add in-memory data. is_tarfile: tarfile.is_tarfile("file.tar.gz") → bool. Context manager: with tarfile.open(...) as tf: — auto-closes. getnames: tf.getnames() → list of str. tarfile.TarFile.OPEN_METH — maps compression suffixes ("tar", "gz", "bz2", "xz") to opener method names (taropen, gzopen, bz2open, xzopen). Claude Code generates cross-platform archivers, reproducible build tarballs, streaming extractors, and incremental backup tools.
CLAUDE.md for tarfile
## tarfile Stack
- Stdlib: import tarfile
- Write: with tarfile.open("out.tar.gz", "w:gz") as tf: tf.add("dir/", arcname="dir")
- Read: with tarfile.open("in.tar.gz") as tf: tf.extractall(dest, filter="data")
- List: with tarfile.open("a.tar") as tf: print(tf.getnames())
- Filter: def strip(t): t.uid=t.gid=0; t.uname=t.gname=""; return t # reproducible
- Memory: info=tarfile.TarInfo(name="f.txt"); info.size=len(data); tf.addfile(info, io.BytesIO(data))
tarfile Archive Pipeline
# app/tarutil.py — create, extract, list, stream, reproducible, in-memory
from __future__ import annotations

import gzip
import io
import os
import stat
import tarfile
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Callable, Generator, Iterator
# ─────────────────────────────────────────────────────────────────────────────
# 1. Archive creation
# ─────────────────────────────────────────────────────────────────────────────
def _strip_filter(tarinfo: tarfile.TarInfo) -> tarfile.TarInfo:
"""
Strip non-reproducible metadata: uid/gid, uname/gname, mtime.
Use as the filter= argument to tf.add() for reproducible archives.
"""
tarinfo.uid = tarinfo.gid = 0
tarinfo.uname = tarinfo.gname = ""
tarinfo.mtime = 0
return tarinfo
def create_archive(
    output: str | Path,
    sources: list[str | Path],
    compression: str = "gz",
    arcname_map: dict[str, str] | None = None,
    reproducible: bool = False,
    exclude: Callable[[tarfile.TarInfo], bool] | None = None,
) -> Path:
    """
    Build a TAR archive from one or more source paths and return its Path.

    compression: "gz" (default), "bz2", "xz", or "" for no compression.
    arcname_map: {str(source_path): arcname} overriding the in-archive name;
        unmapped sources fall back to their basename.
    reproducible: strip uid/gid/owner-names and pin mtime to 0 on every entry.
    exclude: predicate over TarInfo; returning True drops that entry.
    Example:
        create_archive("dist.tar.gz", ["src/", "README.md"])
        create_archive("pkg.tar.xz", ["myapp/"], reproducible=True)
    """
    out_path = Path(output)
    mode = "w" if not compression else "w:" + compression
    name_overrides = arcname_map or {}

    def _member_filter(info: tarfile.TarInfo) -> tarfile.TarInfo | None:
        # None tells tarfile to skip the entry entirely.
        if exclude is not None and exclude(info):
            return None
        return _strip_filter(info) if reproducible else info

    with tarfile.open(out_path, mode) as archive:
        for source in sources:
            source_path = Path(source)
            arcname = name_overrides.get(str(source_path), source_path.name)
            archive.add(str(source_path), arcname=arcname,
                        recursive=True, filter=_member_filter)
    return out_path
def create_from_bytes(
    output: str | Path,
    files: dict[str, bytes],
    compression: str = "gz",
    mtime: float | None = None,
) -> Path:
    """
    Write a TAR archive from an in-memory mapping of {arcname: data}.

    No source files are touched on disk — each entry is synthesized from a
    TarInfo plus a BytesIO payload.
    mtime: timestamp applied to every member; defaults to "now".
    Example:
        create_from_bytes("bundle.tar.gz", {
            "config.json": b'{"key": "value"}',
            "README.txt": b"My bundle",
        })
    """
    out_path = Path(output)
    mode = "w" if not compression else "w:" + compression
    stamp = int(time.time() if mtime is None else mtime)
    with tarfile.open(out_path, mode) as archive:
        for arcname, payload in files.items():
            member = tarfile.TarInfo(name=arcname)
            member.size = len(payload)
            member.mtime = stamp
            archive.addfile(member, io.BytesIO(payload))
    return out_path
# ─────────────────────────────────────────────────────────────────────────────
# 2. Extraction
# ─────────────────────────────────────────────────────────────────────────────
def extract_archive(
    archive: str | Path,
    dest: str | Path = ".",
    members: list[str] | None = None,
    safe: bool = True,
) -> Path:
    """
    Extract a TAR archive to dest and return dest as a Path.

    members: list of arcnames to extract (None = all); raises KeyError if a
        requested name is not present in the archive.
    safe: apply the "data" extraction filter when this Python supports it,
        blocking absolute paths, ".." traversal, device nodes, and other
        dangerous members.
    Example:
        extract_archive("dist.tar.gz", "/tmp/out")
        extract_archive("bundle.tar.gz", ".", members=["config.json"])
    """
    dest_path = Path(dest)
    dest_path.mkdir(parents=True, exist_ok=True)
    with tarfile.open(str(archive)) as tf:
        selected = None if members is None else [tf.getmember(m) for m in members]
        kwargs: dict = {}
        # BUG FIX: the old sys.version_info >= (3, 12) check missed the
        # extraction-filter backports shipped in security releases of
        # 3.8-3.11.  The tarfile docs recommend feature-detecting via
        # hasattr(tarfile, "data_filter") instead.
        if safe and hasattr(tarfile, "data_filter"):
            kwargs["filter"] = "data"
        tf.extractall(path=dest_path, members=selected, **kwargs)
    return dest_path
def extract_to_memory(archive: str | Path) -> dict[str, bytes]:
    """
    Load every regular file in a TAR archive into memory.

    Directories, symlinks, and other special members are skipped.
    Returns {arcname: data}.
    Example:
        files = extract_to_memory("bundle.tar.gz")
        config = files["config.json"]
    """
    contents: dict[str, bytes] = {}
    with tarfile.open(str(archive)) as tf:
        for info in tf.getmembers():
            if not info.isfile():
                continue
            stream = tf.extractfile(info)
            if stream is None:
                continue
            contents[info.name] = stream.read()
    return contents
def extract_file_bytes(archive: str | Path, arcname: str) -> bytes:
    """
    Read one file out of an archive and return its content as bytes.

    Raises KeyError when *arcname* is missing or is not a regular file.
    Example:
        data = extract_file_bytes("bundle.tar.gz", "config.json")
    """
    with tarfile.open(str(archive)) as tf:
        stream = tf.extractfile(arcname)
        if stream is not None:
            return stream.read()
    raise KeyError(f"{arcname!r} is not a regular file in the archive")
# ─────────────────────────────────────────────────────────────────────────────
# 3. Listing and inspection
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class ArchiveMember:
    """Flattened view of one TAR entry, convenient for listings."""

    name: str    # arcname inside the archive
    size: int    # uncompressed size in bytes
    mtime: float
    mode: int    # raw permission bits
    type: str    # "file", "dir", "symlink", "other"
    link: str    # symlink target, "" for non-links

    @property
    def mode_str(self) -> str:
        """Unix ls-style permission string derived from ``mode``."""
        return stat.filemode(self.mode)

    def __str__(self) -> str:
        return f"{self.mode_str} {self.type:7s} {self.size:10,d} {self.name}"
def list_archive(archive: str | Path) -> list[ArchiveMember]:
    """
    Return one ArchiveMember per entry in the archive, in archive order.

    Example:
        for m in list_archive("dist.tar.gz"):
            print(m)
    """

    def _kind(info: tarfile.TarInfo) -> str:
        # Classify by the TarInfo predicates; anything exotic is "other".
        if info.isfile():
            return "file"
        if info.isdir():
            return "dir"
        if info.issym():
            return "symlink"
        return "other"

    with tarfile.open(str(archive)) as tf:
        return [
            ArchiveMember(
                name=info.name, size=info.size, mtime=info.mtime,
                mode=info.mode, type=_kind(info), link=info.linkname,
            )
            for info in tf.getmembers()
        ]
def archive_info(archive: str | Path) -> dict:
    """
    Summarize an archive: member/file counts and sizes.

    Keys: path, compressed_size (on-disk bytes), member_count,
    file_count (regular files only), total_size (uncompressed bytes).
    Example:
        info = archive_info("dist.tar.gz")
        print(f"{info['file_count']} files, {info['total_size']:,d} bytes")
    """
    members = list_archive(archive)
    regular = [m for m in members if m.type == "file"]
    return {
        "path": str(archive),
        "compressed_size": Path(archive).stat().st_size,
        "member_count": len(members),
        "file_count": len(regular),
        "total_size": sum(m.size for m in regular),
    }
# ─────────────────────────────────────────────────────────────────────────────
# 4. Streaming (pipe mode)
# ─────────────────────────────────────────────────────────────────────────────
def stream_archive_members(
    archive: str | Path,
) -> Generator[tuple[tarfile.TarInfo, bytes], None, None]:
    """
    Yield (TarInfo, data) for each regular file, reading sequentially.

    Uses pipe mode ("r|*") so the archive is decompressed as a stream and
    never fully buffered in memory — only one member's data is held at a time.
    Example:
        for info, data in stream_archive_members("huge.tar.gz"):
            if info.name.endswith(".log"):
                process(data)
    """
    with tarfile.open(str(archive), "r|*") as tf:
        for info in tf:
            if not info.isfile():
                continue
            stream = tf.extractfile(info)
            if stream is None:
                continue
            yield info, stream.read()
def stream_create_archive(
    output: str | Path,
    file_iter: Iterator[tuple[str, bytes]],
    compression: str = "gz",
) -> Path:
    """
    Write a TAR archive by consuming (arcname, data) tuples lazily.

    Pipe mode ("w|...") streams straight to disk, so memory use stays
    proportional to the largest single member, not the archive size.
    Example:
        def gen():
            for path in Path("logs/").glob("*.log"):
                yield path.name, path.read_bytes()
        stream_create_archive("logs.tar.gz", gen())
    """
    out_path = Path(output)
    mode = ("w|" + compression) if compression else "w|"
    with tarfile.open(str(out_path), mode) as tf:
        for arcname, payload in file_iter:
            member = tarfile.TarInfo(name=arcname)
            member.size = len(payload)
            tf.addfile(member, io.BytesIO(payload))
    return out_path
# ─────────────────────────────────────────────────────────────────────────────
# 5. Reproducible build helper
# ─────────────────────────────────────────────────────────────────────────────
def reproducible_archive(
    output: str | Path,
    sources: list[str | Path],
    compression: str = "gz",
    source_date_epoch: int = 0,
) -> Path:
    """
    Create a bitwise-reproducible TAR archive for distribution packaging.

    Strips uid/gid/uname/gname, pins every member's mtime to
    source_date_epoch, and sorts members alphabetically for deterministic
    order.  For gzip output the gzip container header is pinned as well:
    its MTIME field is set to source_date_epoch and its FNAME field is
    left empty, so the bytes do not depend on build time or output path.
    Example:
        reproducible_archive("release.tar.gz", ["myapp/", "LICENSE"])
        # Identical on every build given identical source files.
    """
    out_path = Path(output)
    # Collect all (arcname, real_path) pairs sorted alphabetically.
    pairs: list[tuple[str, Path]] = []
    for src in sources:
        src_path = Path(src)
        if src_path.is_file():
            pairs.append((src_path.name, src_path))
        else:
            for child in sorted(src_path.rglob("*")):
                if child.is_file():
                    rel = child.relative_to(src_path.parent)
                    pairs.append((str(rel), child))
    pairs.sort(key=lambda x: x[0])

    def _write_members(tf: tarfile.TarFile) -> None:
        # Emit each file with normalized ownership and a pinned timestamp.
        for arcname, real_path in pairs:
            info = tf.gettarinfo(str(real_path), arcname=arcname)
            info.uid = info.gid = 0
            info.uname = info.gname = ""
            info.mtime = source_date_epoch
            with real_path.open("rb") as f:
                tf.addfile(info, f)

    if compression == "gz":
        # BUG FIX: tarfile.open(..., "w:gz") stamps the *current* time into
        # the gzip header, so two otherwise-identical builds differed at the
        # byte level.  Wrap our own GzipFile with mtime pinned; filename=""
        # keeps the output path out of the header's FNAME field too.
        with out_path.open("wb") as raw:
            with gzip.GzipFile(filename="", mode="wb", fileobj=raw,
                               mtime=source_date_epoch) as gz:
                with tarfile.open(fileobj=gz, mode="w") as tf:
                    _write_members(tf)
    else:
        # bz2 and xz containers carry no timestamp; plain modes are fine.
        mode = f"w:{compression}" if compression else "w"
        with tarfile.open(out_path, mode) as tf:
            _write_members(tf)
    return out_path
# ─────────────────────────────────────────────────────────────────────────────
# Demo
# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Self-contained smoke test: exercises every public helper in a temp dir.
    import tempfile
    print("=== tarfile demo ===")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmp = Path(tmpdir)
        # ── create source files ────────────────────────────────────────────────
        src_dir = tmp / "myapp"
        src_dir.mkdir()
        (src_dir / "main.py").write_text("print('hello')\n")
        (src_dir / "config.json").write_text('{"debug": false}\n')
        (src_dir / "data").mkdir()
        (src_dir / "data" / "records.csv").write_text("id,name\n1,Alice\n")
        # ── create_archive ─────────────────────────────────────────────────────
        # Same tree packed three ways, to compare compressed sizes.
        print("\n--- create_archive ---")
        gz_path = create_archive(tmp / "dist.tar.gz", [src_dir])
        print(f" created: {gz_path.name} ({gz_path.stat().st_size:,d} bytes)")
        bz2_path = create_archive(tmp / "dist.tar.bz2", [src_dir], compression="bz2")
        print(f" created: {bz2_path.name} ({bz2_path.stat().st_size:,d} bytes)")
        xz_path = create_archive(tmp / "dist.tar.xz", [src_dir], compression="xz",
                                 reproducible=True)
        print(f" created: {xz_path.name} ({xz_path.stat().st_size:,d} bytes)")
        # ── list_archive ───────────────────────────────────────────────────────
        print("\n--- list_archive ---")
        for m in list_archive(gz_path):
            print(f" {m}")
        info = archive_info(gz_path)
        print(f" summary: {info['file_count']} files, "
              f"{info['total_size']:,d} uncompressed bytes, "
              f"{info['compressed_size']:,d} on disk")
        # ── create_from_bytes ──────────────────────────────────────────────────
        print("\n--- create_from_bytes ---")
        mem_path = create_from_bytes(tmp / "bundle.tar.gz", {
            "config.json": b'{"key": "value"}',
            "README.txt": b"Generated bundle\n",
            "data/sample.csv": b"id,val\n1,42\n",
        })
        print(f" bundle: {mem_path.name} ({mem_path.stat().st_size:,d} bytes)")
        # ── extract_to_memory ──────────────────────────────────────────────────
        print("\n--- extract_to_memory ---")
        files = extract_to_memory(mem_path)
        for name, data in sorted(files.items()):
            print(f" {name}: {data!r}")
        # ── extract_file_bytes ─────────────────────────────────────────────────
        print("\n--- extract_file_bytes ---")
        config = extract_file_bytes(mem_path, "config.json")
        print(f" config.json: {config!r}")
        # ── extract_archive ────────────────────────────────────────────────────
        print("\n--- extract_archive ---")
        extract_dir = tmp / "extracted"
        extract_archive(gz_path, extract_dir)
        extracted = sorted(extract_dir.rglob("*"))
        for p in extracted:
            if p.is_file():
                print(f" {p.relative_to(extract_dir)}: {p.stat().st_size} bytes")
        # ── stream_create_archive / stream_archive_members ────────────────────
        print("\n--- streaming ---")
        def _log_gen() -> Iterator[tuple[str, bytes]]:
            # Five small synthetic log files, produced lazily.
            for i in range(5):
                yield f"log_{i:04d}.txt", f"log entry {i}\n".encode() * 100
        stream_path = stream_create_archive(tmp / "logs.tar.gz", _log_gen())
        print(f" streamed archive: {stream_path.name} ({stream_path.stat().st_size:,d} bytes)")
        count = sum(1 for _ in stream_archive_members(stream_path))
        print(f" streamed back {count} members")
        # ── reproducible_archive ───────────────────────────────────────────────
        print("\n--- reproducible_archive ---")
        r1 = reproducible_archive(tmp / "repro1.tar.gz", [src_dir])
        r2 = reproducible_archive(tmp / "repro2.tar.gz", [src_dir])
        # NOTE(review): if the gz container header embeds wall-clock time,
        # this equality can fail when the two builds straddle a second
        # boundary — verify reproducible_archive pins the container
        # timestamp, not just the per-member mtimes.
        print(f" identical: {r1.read_bytes() == r2.read_bytes()}")
    print("\n=== done ===")
For the zipfile alternative — zipfile (stdlib) creates .zip archives; ZIP is universally supported on Windows, macOS, and Linux without extra tools, supports per-file compression (Deflate, bzip2, lzma), and allows random-access reads without scanning the whole archive — use zipfile when your users are Windows-centric, when you need to read or update individual files inside a large archive efficiently, or when your tool ecosystem expects .zip; use tarfile for Unix-style archives, Linux distribution packages (.tar.gz, .tar.xz), and when you need to preserve Unix permissions and symbolic links accurately. For the shutil alternative — shutil.make_archive(base, "gztar", root_dir, base_dir) and shutil.unpack_archive(path, dest) wrap tarfile and zipfile in a single-call interface; make_archive handles format selection and temporary file management automatically, which simplifies simple “archive this directory” tasks — use shutil for straightforward directory packing where you don’t need filtering, streaming, or in-memory control; use tarfile directly when you need reproducible builds, incremental extraction, streaming pipe mode, in-memory file injection, or custom per-member filtering. The Claude Skills 360 bundle includes tarfile skill sets covering create_archive() with reproducible and exclude filter support, create_from_bytes() in-memory archive builder, extract_archive()/extract_to_memory()/extract_file_bytes() extractors, ArchiveMember dataclass with list_archive()/archive_info(), stream_archive_members()/stream_create_archive() pipe-mode streaming, and reproducible_archive() deterministic packaging. Start with the free tier to try TAR archive patterns and tarfile pipeline code generation.