Python’s mmap module maps files directly into virtual memory for efficient random-access and large-file processing. import mmap. mmap: m = mmap.mmap(fileno, length, access=mmap.ACCESS_READ) — maps length bytes of fileno; length=0 maps the whole file. Access modes: ACCESS_READ (read-only), ACCESS_WRITE (read-write, syncs to file), ACCESS_COPY (copy-on-write, changes not written to disk). File-like API: m.read(n), m.write(b), m.seek(pos), m.tell(), m.readline(). Slice access: m[start:end] → bytes; m[start:end] = b"..." (write mode). find/rfind: m.find(b"pattern", start, end) → int offset or -1; supports re.search on mmap objects directly. flush: m.flush([offset, size]) — ensure writes reach disk. close: m.close(). mmap as context manager: with mmap.mmap(...) as m:. ALLOCATIONGRANULARITY: alignment required for offset parameter. PAGESIZE: OS memory page size. madvise (Unix 3.8+): m.madvise(mmap.MADV_SEQUENTIAL) — hint prefetch; MADV_RANDOM, MADV_WILLNEED, MADV_DONTNEED. Anonymous maps: mmap.mmap(-1, size) — no backing file (for IPC). Claude Code generates large log file parsers, binary database scanners, fast pattern searchers, and shared memory IPC buffers.
CLAUDE.md for mmap
## mmap Stack
- Stdlib: import mmap
- Read: with open(f,"rb") as fh, mmap.mmap(fh.fileno(), 0, access=mmap.ACCESS_READ) as m:
- Slice: m[start:end] # returns a bytes copy of that range; use memoryview(m)[start:end] for a zero-copy view
- Find: m.find(b"pattern") # search without loading all data
- Regex: re.search(pattern, m) # regex directly on mmap
- Anon: mmap.mmap(-1, size) # anonymous map, no file
mmap Large-File Processing Pipeline
# app/mmaputil.py — read, search, write, index, grep, IPC buffer
from __future__ import annotations
import mmap
import os
import re
import struct
from dataclasses import dataclass
from pathlib import Path
from typing import Generator, Iterator
# ─────────────────────────────────────────────────────────────────────────────
# 1. Context-manager helpers
# ─────────────────────────────────────────────────────────────────────────────
def open_read(path: str | Path) -> tuple:
    """
    Open a file for memory-mapped read access.

    Returns (file_obj, mmap_obj); the caller must close both.
    Prefer using the mmap_read() context manager instead.

    Any exception raised by open() or mmap.mmap() propagates unchanged
    (for instance, an empty file cannot be mapped). If the mapping step
    fails, the file object opened here is closed before re-raising, so no
    descriptor is leaked.

    Example:
        fh, m = open_read("large.bin")
        try:
            data = m[100:200]
        finally:
            m.close(); fh.close()
    """
    fh = open(str(path), "rb")
    try:
        m = mmap.mmap(fh.fileno(), 0, access=mmap.ACCESS_READ)
    except BaseException:
        # The caller never receives fh on failure, so it must be closed here.
        fh.close()
        raise
    return fh, m
from contextlib import contextmanager
@contextmanager
def mmap_read(path: str | Path) -> Generator[mmap.mmap, None, None]:
    """
    Yield a read-only memory map covering the whole file at `path`.

    The mapping is closed first and the underlying file second when the
    `with` block exits, whether it exits normally or through an exception.

    Example:
        with mmap_read("data.bin") as m:
            header = m[:16]
            pos = m.find(b"\\xde\\xad")
    """
    with open(str(path), "rb") as source:
        # length=0 maps the file's full current size.
        with mmap.mmap(source.fileno(), 0, access=mmap.ACCESS_READ) as mapped:
            yield mapped
@contextmanager
def mmap_write(path: str | Path) -> Generator[mmap.mmap, None, None]:
    """
    Context manager for read-write memory-mapped file access.

    The file is opened in "r+b" mode, so it must already exist, and the
    whole file is mapped with ACCESS_WRITE. On exit the mapping is flushed,
    then the mapping and the file are closed. The mapping is closed even
    if the final flush raises, so a failed flush cannot leak it.

    Example:
        with mmap_write("data.bin") as m:
            m[0:4] = b"NEW!"
            m.flush()
    """
    with open(str(path), "r+b") as fh:
        with mmap.mmap(fh.fileno(), 0, access=mmap.ACCESS_WRITE) as m:
            try:
                yield m
            finally:
                # Flush inside the mmap's own `with` so that close() still
                # runs when flush() raises.
                m.flush()
# ─────────────────────────────────────────────────────────────────────────────
# 2. Pattern search helpers
# ─────────────────────────────────────────────────────────────────────────────
def find_all(m: mmap.mmap, pattern: bytes) -> list[int]:
    """
    Find all byte offsets of pattern in a memory-mapped file.

    The scan always starts at offset 0, independent of the map's current
    read/write position, and reports overlapping occurrences (the search
    resumes one byte after each hit).

    Raises:
        ValueError: if pattern is empty. An empty pattern matches at every
            offset and can keep the scan from terminating.

    Example:
        with mmap_read("log.bin") as m:
            offsets = find_all(m, b"ERROR")
    """
    if not pattern:
        raise ValueError("pattern must not be empty")
    offsets: list[int] = []
    # Pass an explicit start: with no start argument, mmap.find() begins at
    # the map's current position, which would silently skip earlier matches
    # after any read()/write()/seek().
    pos = m.find(pattern, 0)
    while pos != -1:
        offsets.append(pos)
        pos = m.find(pattern, pos + 1)
    return offsets
def find_all_regex(m: mmap.mmap, pattern: str | bytes, flags: int = 0) -> list[re.Match]:
    """
    Collect every regex match found in a memory-mapped file.

    The re module accepts the mmap object directly as the subject. A str
    pattern is encoded to bytes first, because the subject is bytes-like.

    Example:
        with mmap_read("app.log") as m:
            errors = find_all_regex(m, rb"ERROR.*?\\n")
    """
    raw = pattern.encode() if isinstance(pattern, str) else pattern
    compiled = re.compile(raw, flags)
    return [hit for hit in compiled.finditer(m)]
def count_occurrences(m: mmap.mmap, pattern: bytes) -> int:
    """
    Count occurrences of pattern in m without collecting all offsets.

    The scan always starts at offset 0, independent of the map's current
    read/write position, and counts overlapping occurrences (the search
    resumes one byte after each hit).

    Raises:
        ValueError: if pattern is empty. An empty pattern matches at every
            offset and can keep the scan from terminating.

    Example:
        with mmap_read("access.log") as m:
            n = count_occurrences(m, b"404")
    """
    if not pattern:
        raise ValueError("pattern must not be empty")
    count = 0
    # Pass an explicit start: with no start argument, mmap.find() begins at
    # the map's current position rather than at the beginning of the map.
    pos = m.find(pattern, 0)
    while pos != -1:
        count += 1
        pos = m.find(pattern, pos + 1)
    return count
def grep_lines(path: str | Path, pattern: str | bytes, max_results: int = 1000) -> list[tuple[int, bytes]]:
    """
    Search a text file for lines matching pattern; return (line_no, line) pairs.

    Uses mmap for efficiency on large files. Lines are split on the newline
    byte; line numbers are 1-based and the returned line bytes exclude the
    terminating newline. `pattern` is a regular expression (a str pattern is
    encoded to bytes). Scanning stops once `max_results` matches have been
    collected. An empty file yields an empty list.

    Example:
        for lineno, line in grep_lines("server.log", b"500 Internal Server Error"):
            print(f" line {lineno}: {line.decode()!r}")
    """
    rx = re.compile(pattern.encode() if isinstance(pattern, str) else pattern)
    results: list[tuple[int, bytes]] = []
    # An empty file has no lines to search; return before mapping it.
    if os.stat(str(path)).st_size == 0:
        return results
    with mmap_read(path) as m:
        size = len(m)
        lineno = 1
        start = 0
        while start < size and len(results) < max_results:
            # Search for a real newline byte (0x0A), not the two-character
            # sequence backslash + "n".
            end = m.find(b"\n", start)
            if end == -1:
                end = size  # last line has no trailing newline
            line = m[start:end]
            if rx.search(line):
                results.append((lineno, line))
            lineno += 1
            start = end + 1
    return results
# ─────────────────────────────────────────────────────────────────────────────
# 3. Binary file reader
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class MappedReader:
    """
    A struct-aware sequential reader over a memory-mapped file.

    The reader keeps its own cursor (`_pos`) and never uses or changes the
    mmap object's file position. The fixed-width multi-byte helpers
    (read_uint16, read_uint32, ...) decode big-endian values.

    Example:
        with mmap_read("records.bin") as m:
            reader = MappedReader(m)
            count = reader.read_uint32()
            for _ in range(count):
                x, y = reader.read_struct("ff")
    """

    _m: mmap.mmap  # the mapping being read
    _pos: int = 0  # reader's own cursor: byte offset of the next read

    def seek(self, pos: int) -> None:
        """Move the cursor to absolute byte offset `pos`."""
        self._pos = pos

    def tell(self) -> int:
        """Return the cursor's current byte offset."""
        return self._pos

    def read(self, n: int) -> bytes:
        """Return up to `n` bytes from the cursor and advance by the amount read.

        Fewer than `n` bytes are returned when the map ends first.
        """
        data = self._m[self._pos : self._pos + n]
        self._pos += len(data)
        return data

    def read_struct(self, fmt: str) -> tuple:
        """Read struct.calcsize(fmt) bytes and unpack them with `fmt`.

        Raises struct.error when fewer bytes remain than the format needs.
        """
        s = struct.Struct(fmt)
        data = self.read(s.size)
        return s.unpack(data)

    def read_uint8(self) -> int:
        """Read one unsigned byte."""
        return self.read_struct("B")[0]

    def read_uint16(self) -> int:
        """Read a big-endian unsigned 16-bit integer."""
        return self.read_struct(">H")[0]

    def read_uint32(self) -> int:
        """Read a big-endian unsigned 32-bit integer."""
        return self.read_struct(">I")[0]

    def read_uint64(self) -> int:
        """Read a big-endian unsigned 64-bit integer."""
        return self.read_struct(">Q")[0]

    def read_int32(self) -> int:
        """Read a big-endian signed 32-bit integer."""
        return self.read_struct(">i")[0]

    def read_float(self) -> float:
        """Read a big-endian 32-bit float."""
        return self.read_struct(">f")[0]

    def read_double(self) -> float:
        """Read a big-endian 64-bit float."""
        return self.read_struct(">d")[0]

    def read_cstring(self, max_len: int = 4096) -> bytes:
        """Read a null-terminated C string.

        Searches at most `max_len` bytes from the cursor for a NUL byte.
        When one is found, the bytes before it are returned and the cursor
        moves past the terminator. When none is found within the window
        (or before the end of the map), the bytes in the window are
        returned and the cursor stops at the end of the window, with no
        byte skipped.
        """
        # Clamp the search window to the map; never let it end before the cursor.
        limit = max(self._pos, min(self._pos + max_len, len(self._m)))
        end = self._m.find(b"\x00", self._pos, limit)
        if end == -1:
            data = self._m[self._pos : limit]
            self._pos = limit
            return data
        data = self._m[self._pos : end]
        self._pos = end + 1  # step over the NUL terminator
        return data

    def remaining(self) -> int:
        """Return the number of bytes between the cursor and the end of the map."""
        return len(self._m) - self._pos
# ─────────────────────────────────────────────────────────────────────────────
# 4. Anonymous mmap (for IPC / buffers)
# ─────────────────────────────────────────────────────────────────────────────
def shared_buffer(size: int) -> mmap.mmap:
    """
    Allocate an anonymous memory map of `size` bytes (no backing file).

    Handy as a scratch buffer, or as a region handed to child processes
    by inheritance. The caller is responsible for closing it.

    Example:
        buf = shared_buffer(4096)
        buf.write(b"data")
        buf.seek(0)
        buf.read(4) # b"data"
        buf.close()
    """
    anonymous_fileno = -1  # fileno -1 asks mmap for an anonymous mapping
    buffer = mmap.mmap(anonymous_fileno, size)
    return buffer
# ─────────────────────────────────────────────────────────────────────────────
# 5. File utilities
# ─────────────────────────────────────────────────────────────────────────────
def file_size(path: str | Path) -> int:
    """Report the size of the file at `path` in bytes, via os.stat (no mmap involved)."""
    info = os.stat(str(path))
    return info.st_size
def patch_bytes(path: str | Path, offset: int, data: bytes) -> None:
    """
    Replace len(data) bytes of the file, starting at `offset`, with `data`.

    The write happens in place through a read-write memory map, so the
    file must already cover the whole target range; it is never grown.

    Example:
        patch_bytes("firmware.bin", 0x100, b"\\x00\\x00")
    """
    stop = offset + len(data)
    with mmap_write(path) as mapped:
        mapped[offset:stop] = data
def extract_region(path: str | Path, offset: int, length: int) -> bytes:
    """
    Return `length` bytes of the file starting at `offset`.

    The file is memory-mapped rather than read in full. The slice is cut
    short if the requested range runs past the end of the file.

    Example:
        header = extract_region("data.bin", 0, 128)
    """
    stop = offset + length
    with mmap_read(path) as mapped:
        region = mapped[offset:stop]
    return bytes(region)
def iter_chunks(path: str | Path, chunk_size: int = 65536) -> Iterator[bytes]:
    """
    Iterate over a file in fixed-size chunks via mmap.

    Useful for streaming hash or checksum computation on large files.
    Every chunk is `chunk_size` bytes long except possibly the last one.
    An empty file yields no chunks. This is a generator: the file stays
    mapped until the generator is exhausted or closed.

    Raises:
        ValueError: if chunk_size is not positive (raised when iteration
            starts). A non-positive size would never advance through the
            file.

    Example:
        import hashlib
        h = hashlib.sha256()
        for chunk in iter_chunks("large.bin"):
            h.update(chunk)
        print(h.hexdigest())
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # There is nothing to yield for an empty file; return before mapping it.
    if os.stat(str(path)).st_size == 0:
        return
    with mmap_read(path) as m:
        for pos in range(0, len(m), chunk_size):
            # Slicing a map already returns a bytes copy, and the slice is
            # clamped at the end of the map for the final, shorter chunk.
            yield m[pos : pos + chunk_size]
# ─────────────────────────────────────────────────────────────────────────────
# Demo
# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Self-contained demo: builds throwaway fixture files, exercises each
    # helper defined in this module, and removes the fixtures afterwards.
    import hashlib
    import tempfile

    print("=== mmap demo ===")

    # Build a temp file for testing: 100 newline-terminated records with
    # known patterns (every fifth record carries ERROR=True).
    content = b"".join(
        f"line{i:04d}: data={i*3:06d} ERROR={i%5==0}\n".encode()
        for i in range(100)
    )
    with tempfile.NamedTemporaryFile(delete=False, suffix=".bin") as tf:
        tmp_path = tf.name
        tf.write(content)

    # Assigned once the struct fixture exists, so cleanup can tell whether
    # there is anything to remove.
    struct_path = None
    try:
        print("\n--- file_size ---")
        print(f" {file_size(tmp_path)} bytes")

        print("\n--- mmap_read + slice ---")
        with mmap_read(tmp_path) as m:
            print(f" first 40 bytes: {m[:40]!r}")
            print(f" map length: {len(m)}")

        print("\n--- find_all ---")
        with mmap_read(tmp_path) as m:
            offsets = find_all(m, b"ERROR=True")
            print(f" 'ERROR=True' at {len(offsets)} offsets: first={offsets[:3]}")

        print("\n--- count_occurrences ---")
        with mmap_read(tmp_path) as m:
            n = count_occurrences(m, b"line")
            print(f" 'line' count: {n}")

        print("\n--- find_all_regex ---")
        with mmap_read(tmp_path) as m:
            matches = find_all_regex(m, rb"line00[12][0-9]")
            print(f" regex 'line00[12][0-9]': {len(matches)} matches")

        print("\n--- extract_region ---")
        region = extract_region(tmp_path, 0, 50)
        print(f" first 50 bytes: {region!r}")

        print("\n--- iter_chunks (sha256) ---")
        h = hashlib.sha256()
        n_chunks = 0
        for chunk in iter_chunks(tmp_path, chunk_size=512):
            h.update(chunk)
            n_chunks += 1
        print(f" sha256 over {n_chunks} chunks: {h.hexdigest()[:16]}...")

        print("\n--- patch_bytes ---")
        patch_bytes(tmp_path, 0, b"PATCHED!")
        region_after = extract_region(tmp_path, 0, 20)
        print(f" after patch, first 20 bytes: {region_after!r}")

        print("\n--- shared_buffer ---")
        buf = shared_buffer(64)
        buf.write(b"hello from anonymous mmap")
        buf.seek(0)
        print(f" read back: {buf.read(25)!r}")
        buf.close()

        print("\n--- MappedReader ---")
        # Create a small binary struct file: a big-endian uint32 count
        # followed by that many big-endian float32 values.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".struct") as sf:
            struct_path = sf.name
            sf.write(struct.pack(">I", 3))  # count = 3
            for v in [1.5, 2.5, 3.5]:
                sf.write(struct.pack(">f", v))
        with mmap_read(struct_path) as m:
            reader = MappedReader(m)
            count = reader.read_uint32()
            values = [reader.read_float() for _ in range(count)]
            print(f" count={count} values={values}")
    finally:
        os.unlink(tmp_path)
        if struct_path is not None:
            try:
                os.unlink(struct_path)
            except OSError:
                pass  # best-effort cleanup of a throwaway demo fixture

    print("\n=== done ===")
For the io.BytesIO alternative — io.BytesIO provides an in-memory byte buffer that accepts the same file-like API (read, write, seek, tell); it loads the entire content into RAM; mmap maps the file into virtual address space so the OS pages in only the parts you actually access — use BytesIO for constructing byte streams, processing results of network I/O, or when the data is already in memory; use mmap when the file is too large to load entirely, when you need random access to specific byte ranges, or when minimizing RSS memory footprint matters. For the numpy.memmap alternative — numpy.memmap wraps the OS mmap system call and presents the file content as a NumPy array with full vectorized math, slicing, and broadcasting; stdlib mmap gives a byte-level view without numerical semantics — use numpy.memmap for large numerical datasets (sensor recordings, audio, embeddings, image tensors) where you need array operations without loading all data; use mmap for generic binary files, text logs, and protocols where NumPy array semantics are not needed. The Claude Skills 360 bundle includes mmap skill sets covering mmap_read()/mmap_write() context managers, find_all()/find_all_regex()/count_occurrences()/grep_lines() pattern search helpers, MappedReader with read_struct/uint8/uint16/uint32/uint64/float/double/cstring, shared_buffer() anonymous IPC map, and patch_bytes()/extract_region()/iter_chunks() file utilities. Start with the free tier to try memory-mapped file patterns and mmap pipeline code generation.