Python’s tarfile module creates and reads TAR archives with optional compression. import tarfile. open: tarfile.open("archive.tar.gz", "w:gz") — modes: "r" auto-detect, "r:gz" / "r:bz2" / "r:xz" compressed read, "w" / "w:gz" / "w:bz2" / "w:xz" write, "a" append (uncompressed only), "r|gz" streaming read pipe. add: tf.add("src/", arcname="src", recursive=True, filter=fn) — filter receives TarInfo and returns modified or None to exclude. extract: tf.extract(member, path=".", set_attrs=True, numeric_owner=False). extractall: tf.extractall(path, members=None, filter="data") — filter="data" (Python 3.12+, also backported to security releases of older versions; detect with hasattr(tarfile, "data_filter")) blocks dangerous paths. getmembers: tf.getmembers() → list of TarInfo. TarInfo attrs: .name, .size, .mtime, .mode, .uid, .gid, .type (REGTYPE/DIRTYPE/SYMTYPE), .linkname, .isfile()/.isdir()/.issym(). addfile: tf.addfile(tarinfo, fileobj) — add in-memory data. is_tarfile: tarfile.is_tarfile("file.tar.gz") → bool. Context manager: with tarfile.open(...) as tf: — auto-closes. getnames: tf.getnames() → list of str. tarfile.TarFile.OPEN_METH — maps compression suffixes ("tar", "gz", "bz2", "xz") to opener method names (taropen, gzopen, bz2open, xzopen). Claude Code generates cross-platform archivers, reproducible build tarballs, streaming extractors, and incremental backup tools.
CLAUDE.md for tarfile
## tarfile Stack
- Stdlib: import tarfile
- Write: with tarfile.open("out.tar.gz", "w:gz") as tf: tf.add("dir/", arcname="dir")
- Read: with tarfile.open("in.tar.gz") as tf: tf.extractall(dest, filter="data")
- List: with tarfile.open("a.tar") as tf: print(tf.getnames())
- Filter: def strip(t): t.uid=t.gid=0; t.uname=t.gname=""; return t # reproducible
- Memory: info=tarfile.TarInfo(name="f.txt"); info.size=len(data); tf.addfile(info, io.BytesIO(data))
tarfile Archive Pipeline
# app/tarutil.py — create, extract, list, stream, reproducible, in-memory
from __future__ import annotations

import gzip
import io
import os
import stat
import tarfile
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Callable, Generator, Iterator
# ─────────────────────────────────────────────────────────────────────────────
# 1. Archive creation
# ─────────────────────────────────────────────────────────────────────────────
def _strip_filter(tarinfo: tarfile.TarInfo) -> tarfile.TarInfo:
"""
Strip non-reproducible metadata: uid/gid, uname/gname, mtime.
Use as the filter= argument to tf.add() for reproducible archives.
"""
tarinfo.uid = tarinfo.gid = 0
tarinfo.uname = tarinfo.gname = ""
tarinfo.mtime = 0
return tarinfo
def create_archive(
    output: str | Path,
    sources: list[str | Path],
    compression: str = "gz",
    arcname_map: dict[str, str] | None = None,
    reproducible: bool = False,
    exclude: Callable[[tarfile.TarInfo], bool] | None = None,
) -> Path:
    """
    Build a TAR archive from one or more source paths and return its Path.

    compression: "gz" (default), "bz2", "xz", or "" for no compression.
    arcname_map: {str(source_path): arcname} overriding the in-archive name;
        unmapped sources fall back to their basename.
    reproducible: strip uid/gid/owner-names and pin mtime to 0 on every entry.
    exclude: predicate over TarInfo; returning True drops that entry.
    Example:
        create_archive("dist.tar.gz", ["src/", "README.md"])
        create_archive("pkg.tar.xz", ["myapp/"], reproducible=True)
    """
    out_path = Path(output)
    mode = "w" if not compression else "w:" + compression
    name_overrides = arcname_map or {}

    def _member_filter(info: tarfile.TarInfo) -> tarfile.TarInfo | None:
        # None tells tarfile to skip the entry entirely.
        if exclude is not None and exclude(info):
            return None
        return _strip_filter(info) if reproducible else info

    with tarfile.open(out_path, mode) as archive:
        for source in sources:
            source_path = Path(source)
            arcname = name_overrides.get(str(source_path), source_path.name)
            archive.add(str(source_path), arcname=arcname,
                        recursive=True, filter=_member_filter)
    return out_path
def create_from_bytes(
    output: str | Path,
    files: dict[str, bytes],
    compression: str = "gz",
    mtime: float | None = None,
) -> Path:
    """
    Write a TAR archive from an in-memory mapping of {arcname: data}.

    No source files are touched on disk — each entry is synthesized from a
    TarInfo plus a BytesIO payload.
    mtime: timestamp applied to every member; defaults to "now".
    Example:
        create_from_bytes("bundle.tar.gz", {
            "config.json": b'{"key": "value"}',
            "README.txt": b"My bundle",
        })
    """
    out_path = Path(output)
    mode = "w" if not compression else "w:" + compression
    stamp = int(time.time() if mtime is None else mtime)
    with tarfile.open(out_path, mode) as archive:
        for arcname, payload in files.items():
            member = tarfile.TarInfo(name=arcname)
            member.size = len(payload)
            member.mtime = stamp
            archive.addfile(member, io.BytesIO(payload))
    return out_path
# ─────────────────────────────────────────────────────────────────────────────
# 2. Extraction
# ─────────────────────────────────────────────────────────────────────────────
def extract_archive(
    archive: str | Path,
    dest: str | Path = ".",
    members: list[str] | None = None,
    safe: bool = True,
) -> Path:
    """
    Extract a TAR archive to dest and return dest as a Path.

    members: list of arcnames to extract (None = all); raises KeyError if a
        requested name is not present in the archive.
    safe: apply the "data" extraction filter when this Python supports it,
        blocking absolute paths, ".." traversal, device nodes, and other
        dangerous members.
    Example:
        extract_archive("dist.tar.gz", "/tmp/out")
        extract_archive("bundle.tar.gz", ".", members=["config.json"])
    """
    dest_path = Path(dest)
    dest_path.mkdir(parents=True, exist_ok=True)
    with tarfile.open(str(archive)) as tf:
        selected = None if members is None else [tf.getmember(m) for m in members]
        kwargs: dict = {}
        # BUG FIX: the old sys.version_info >= (3, 12) check missed the
        # extraction-filter backports shipped in security releases of
        # 3.8-3.11.  The tarfile docs recommend feature-detecting via
        # hasattr(tarfile, "data_filter") instead.
        if safe and hasattr(tarfile, "data_filter"):
            kwargs["filter"] = "data"
        tf.extractall(path=dest_path, members=selected, **kwargs)
    return dest_path
def extract_to_memory(archive: str | Path) -> dict[str, bytes]:
    """
    Load every regular file in a TAR archive into memory.

    Directories, symlinks, and other special members are skipped.
    Returns {arcname: data}.
    Example:
        files = extract_to_memory("bundle.tar.gz")
        config = files["config.json"]
    """
    contents: dict[str, bytes] = {}
    with tarfile.open(str(archive)) as tf:
        for info in tf.getmembers():
            if not info.isfile():
                continue
            stream = tf.extractfile(info)
            if stream is None:
                continue
            contents[info.name] = stream.read()
    return contents
def extract_file_bytes(archive: str | Path, arcname: str) -> bytes:
    """
    Read one file out of an archive and return its content as bytes.

    Raises KeyError when *arcname* is missing or is not a regular file.
    Example:
        data = extract_file_bytes("bundle.tar.gz", "config.json")
    """
    with tarfile.open(str(archive)) as tf:
        stream = tf.extractfile(arcname)
        if stream is not None:
            return stream.read()
    raise KeyError(f"{arcname!r} is not a regular file in the archive")
# ─────────────────────────────────────────────────────────────────────────────
# 3. Listing and inspection
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class ArchiveMember:
    """Flattened view of one TAR entry, convenient for listings."""

    name: str    # arcname inside the archive
    size: int    # uncompressed size in bytes
    mtime: float
    mode: int    # raw permission bits
    type: str    # "file", "dir", "symlink", "other"
    link: str    # symlink target, "" for non-links

    @property
    def mode_str(self) -> str:
        """Unix ls-style permission string derived from ``mode``."""
        return stat.filemode(self.mode)

    def __str__(self) -> str:
        return f"{self.mode_str} {self.type:7s} {self.size:10,d} {self.name}"
def list_archive(archive: str | Path) -> list[ArchiveMember]:
    """
    Return one ArchiveMember per entry in the archive, in archive order.

    Example:
        for m in list_archive("dist.tar.gz"):
            print(m)
    """

    def _kind(info: tarfile.TarInfo) -> str:
        # Classify by the TarInfo predicates; anything exotic is "other".
        if info.isfile():
            return "file"
        if info.isdir():
            return "dir"
        if info.issym():
            return "symlink"
        return "other"

    with tarfile.open(str(archive)) as tf:
        return [
            ArchiveMember(
                name=info.name, size=info.size, mtime=info.mtime,
                mode=info.mode, type=_kind(info), link=info.linkname,
            )
            for info in tf.getmembers()
        ]
def archive_info(archive: str | Path) -> dict:
    """
    Summarize an archive: member/file counts and sizes.

    Keys: path, compressed_size (on-disk bytes), member_count,
    file_count (regular files only), total_size (uncompressed bytes).
    Example:
        info = archive_info("dist.tar.gz")
        print(f"{info['file_count']} files, {info['total_size']:,d} bytes")
    """
    members = list_archive(archive)
    regular = [m for m in members if m.type == "file"]
    return {
        "path": str(archive),
        "compressed_size": Path(archive).stat().st_size,
        "member_count": len(members),
        "file_count": len(regular),
        "total_size": sum(m.size for m in regular),
    }
# ─────────────────────────────────────────────────────────────────────────────
# 4. Streaming (pipe mode)
# ─────────────────────────────────────────────────────────────────────────────
def stream_archive_members(
    archive: str | Path,
) -> Generator[tuple[tarfile.TarInfo, bytes], None, None]:
    """
    Yield (TarInfo, data) for each regular file, reading sequentially.

    Uses pipe mode ("r|*") so the archive is decompressed as a stream and
    never fully buffered in memory — only one member's data is held at a time.
    Example:
        for info, data in stream_archive_members("huge.tar.gz"):
            if info.name.endswith(".log"):
                process(data)
    """
    with tarfile.open(str(archive), "r|*") as tf:
        for info in tf:
            if not info.isfile():
                continue
            stream = tf.extractfile(info)
            if stream is None:
                continue
            yield info, stream.read()
def stream_create_archive(
    output: str | Path,
    file_iter: Iterator[tuple[str, bytes]],
    compression: str = "gz",
) -> Path:
    """
    Write a TAR archive by consuming (arcname, data) tuples lazily.

    Pipe mode ("w|...") streams straight to disk, so memory use stays
    proportional to the largest single member, not the archive size.
    Example:
        def gen():
            for path in Path("logs/").glob("*.log"):
                yield path.name, path.read_bytes()
        stream_create_archive("logs.tar.gz", gen())
    """
    out_path = Path(output)
    mode = ("w|" + compression) if compression else "w|"
    with tarfile.open(str(out_path), mode) as tf:
        for arcname, payload in file_iter:
            member = tarfile.TarInfo(name=arcname)
            member.size = len(payload)
            tf.addfile(member, io.BytesIO(payload))
    return out_path
# ─────────────────────────────────────────────────────────────────────────────
# 5. Reproducible build helper
# ─────────────────────────────────────────────────────────────────────────────
def reproducible_archive(
    output: str | Path,
    sources: list[str | Path],
    compression: str = "gz",
    source_date_epoch: int = 0,
) -> Path:
    """
    Create a bitwise-reproducible TAR archive for distribution packaging.

    Strips uid/gid/uname/gname, pins every member's mtime to
    source_date_epoch, and sorts members alphabetically for deterministic
    order.  For gzip output the gzip container header is pinned as well:
    its MTIME field is set to source_date_epoch and its FNAME field is
    left empty, so the bytes do not depend on build time or output path.
    Example:
        reproducible_archive("release.tar.gz", ["myapp/", "LICENSE"])
        # Identical on every build given identical source files.
    """
    out_path = Path(output)
    # Collect all (arcname, real_path) pairs sorted alphabetically.
    pairs: list[tuple[str, Path]] = []
    for src in sources:
        src_path = Path(src)
        if src_path.is_file():
            pairs.append((src_path.name, src_path))
        else:
            for child in sorted(src_path.rglob("*")):
                if child.is_file():
                    rel = child.relative_to(src_path.parent)
                    pairs.append((str(rel), child))
    pairs.sort(key=lambda x: x[0])

    def _write_members(tf: tarfile.TarFile) -> None:
        # Emit each file with normalized ownership and a pinned timestamp.
        for arcname, real_path in pairs:
            info = tf.gettarinfo(str(real_path), arcname=arcname)
            info.uid = info.gid = 0
            info.uname = info.gname = ""
            info.mtime = source_date_epoch
            with real_path.open("rb") as f:
                tf.addfile(info, f)

    if compression == "gz":
        # BUG FIX: tarfile.open(..., "w:gz") stamps the *current* time into
        # the gzip header, so two otherwise-identical builds differed at the
        # byte level.  Wrap our own GzipFile with mtime pinned; filename=""
        # keeps the output path out of the header's FNAME field too.
        with out_path.open("wb") as raw:
            with gzip.GzipFile(filename="", mode="wb", fileobj=raw,
                               mtime=source_date_epoch) as gz:
                with tarfile.open(fileobj=gz, mode="w") as tf:
                    _write_members(tf)
    else:
        # bz2 and xz containers carry no timestamp; plain modes are fine.
        mode = f"w:{compression}" if compression else "w"
        with tarfile.open(out_path, mode) as tf:
            _write_members(tf)
    return out_path
# ─────────────────────────────────────────────────────────────────────────────
# Demo
# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Self-contained smoke test: exercises every public helper in a temp dir.
    import tempfile
    print("=== tarfile demo ===")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmp = Path(tmpdir)
        # ── create source files ────────────────────────────────────────────────
        src_dir = tmp / "myapp"
        src_dir.mkdir()
        (src_dir / "main.py").write_text("print('hello')\n")
        (src_dir / "config.json").write_text('{"debug": false}\n')
        (src_dir / "data").mkdir()
        (src_dir / "data" / "records.csv").write_text("id,name\n1,Alice\n")
        # ── create_archive ─────────────────────────────────────────────────────
        # Same tree packed three ways, to compare compressed sizes.
        print("\n--- create_archive ---")
        gz_path = create_archive(tmp / "dist.tar.gz", [src_dir])
        print(f" created: {gz_path.name} ({gz_path.stat().st_size:,d} bytes)")
        bz2_path = create_archive(tmp / "dist.tar.bz2", [src_dir], compression="bz2")
        print(f" created: {bz2_path.name} ({bz2_path.stat().st_size:,d} bytes)")
        xz_path = create_archive(tmp / "dist.tar.xz", [src_dir], compression="xz",
                                 reproducible=True)
        print(f" created: {xz_path.name} ({xz_path.stat().st_size:,d} bytes)")
        # ── list_archive ───────────────────────────────────────────────────────
        print("\n--- list_archive ---")
        for m in list_archive(gz_path):
            print(f" {m}")
        info = archive_info(gz_path)
        print(f" summary: {info['file_count']} files, "
              f"{info['total_size']:,d} uncompressed bytes, "
              f"{info['compressed_size']:,d} on disk")
        # ── create_from_bytes ──────────────────────────────────────────────────
        print("\n--- create_from_bytes ---")
        mem_path = create_from_bytes(tmp / "bundle.tar.gz", {
            "config.json": b'{"key": "value"}',
            "README.txt": b"Generated bundle\n",
            "data/sample.csv": b"id,val\n1,42\n",
        })
        print(f" bundle: {mem_path.name} ({mem_path.stat().st_size:,d} bytes)")
        # ── extract_to_memory ──────────────────────────────────────────────────
        print("\n--- extract_to_memory ---")
        files = extract_to_memory(mem_path)
        for name, data in sorted(files.items()):
            print(f" {name}: {data!r}")
        # ── extract_file_bytes ─────────────────────────────────────────────────
        print("\n--- extract_file_bytes ---")
        config = extract_file_bytes(mem_path, "config.json")
        print(f" config.json: {config!r}")
        # ── extract_archive ────────────────────────────────────────────────────
        print("\n--- extract_archive ---")
        extract_dir = tmp / "extracted"
        extract_archive(gz_path, extract_dir)
        extracted = sorted(extract_dir.rglob("*"))
        for p in extracted:
            if p.is_file():
                print(f" {p.relative_to(extract_dir)}: {p.stat().st_size} bytes")
        # ── stream_create_archive / stream_archive_members ────────────────────
        print("\n--- streaming ---")
        def _log_gen() -> Iterator[tuple[str, bytes]]:
            # Five small synthetic log files, produced lazily.
            for i in range(5):
                yield f"log_{i:04d}.txt", f"log entry {i}\n".encode() * 100
        stream_path = stream_create_archive(tmp / "logs.tar.gz", _log_gen())
        print(f" streamed archive: {stream_path.name} ({stream_path.stat().st_size:,d} bytes)")
        count = sum(1 for _ in stream_archive_members(stream_path))
        print(f" streamed back {count} members")
        # ── reproducible_archive ───────────────────────────────────────────────
        print("\n--- reproducible_archive ---")
        r1 = reproducible_archive(tmp / "repro1.tar.gz", [src_dir])
        r2 = reproducible_archive(tmp / "repro2.tar.gz", [src_dir])
        # NOTE(review): if the gz container header embeds wall-clock time,
        # this equality can fail when the two builds straddle a second
        # boundary — verify reproducible_archive pins the container
        # timestamp, not just the per-member mtimes.
        print(f" identical: {r1.read_bytes() == r2.read_bytes()}")
    print("\n=== done ===")
For the zipfile alternative — zipfile (stdlib) creates .zip archives; ZIP is universally supported on Windows, macOS, and Linux without extra tools, supports per-file compression (Deflate, bzip2, lzma), and allows random-access reads without scanning the whole archive — use zipfile when your users are Windows-centric, when you need to read or update individual files inside a large archive efficiently, or when your tool ecosystem expects .zip; use tarfile for Unix-style archives, Linux distribution packages (.tar.gz, .tar.xz), and when you need to preserve Unix permissions and symbolic links accurately. For the shutil alternative — shutil.make_archive(base, "gztar", root_dir, base_dir) and shutil.unpack_archive(path, dest) wrap tarfile and zipfile in a single-call interface; make_archive handles format selection and temporary file management automatically, which simplifies simple “archive this directory” tasks — use shutil for straightforward directory packing where you don’t need filtering, streaming, or in-memory control; use tarfile directly when you need reproducible builds, incremental extraction, streaming pipe mode, in-memory file injection, or custom per-member filtering. The Claude Skills 360 bundle includes tarfile skill sets covering create_archive() with reproducible and exclude filter support, create_from_bytes() in-memory archive builder, extract_archive()/extract_to_memory()/extract_file_bytes() extractors, ArchiveMember dataclass with list_archive()/archive_info(), stream_archive_members()/stream_create_archive() pipe-mode streaming, and reproducible_archive() deterministic packaging. Start with the free tier to try TAR archive patterns and tarfile pipeline code generation.