pydub manipulates audio with a simple Python API. pip install pydub (requires ffmpeg: brew install ffmpeg or apt install ffmpeg). Load: from pydub import AudioSegment; audio = AudioSegment.from_file("track.mp3"). WAV: AudioSegment.from_wav("file.wav"). Duration: len(audio) — milliseconds. Frame rate: audio.frame_rate. Channels: audio.channels. Sample width: audio.sample_width. Slice: clip = audio[1000:5000] — ms indices. Concatenate: combined = a + b. Overlay: mixed = a.overlay(b, position=500). Loop: looped = b * 3. Export: audio.export("out.mp3", format="mp3", bitrate="192k"). Export WAV: audio.export("out.wav", format="wav"). Volume: louder = audio + 6 — dB. quieter = audio - 3. Normalize: from pydub.effects import normalize; n = normalize(audio). Fade: audio.fade_in(2000).fade_out(3000). Reverse: audio.reverse(). Speed (frame-rate hack; raises pitch too): faster = audio._spawn(audio.raw_data, overrides={"frame_rate": int(audio.frame_rate * 1.5)}).set_frame_rate(audio.frame_rate). Split on silence: from pydub.silence import split_on_silence; chunks = split_on_silence(audio, min_silence_len=500, silence_thresh=-40). Detect leading silence: from pydub.silence import detect_leading_silence; start = detect_leading_silence(audio). Strip it: audio[start:]. Stereo to mono: audio.set_channels(1). Resample: audio.set_frame_rate(16000). Generators: from pydub.generators import Sine; tone = Sine(440).to_audio_segment(duration=1000). WhiteNoise: from pydub.generators import WhiteNoise. Filter: from pydub.effects import low_pass_filter; muffled = low_pass_filter(audio, 3000). Claude Code generates pydub audio editors, podcast processors, voice normalizers, and silence removers.
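A minimal end-to-end sketch of the calls above, assuming a local track.mp3 and ffmpeg on PATH:

# Quickstart sketch: load, slice, adjust volume, export
from pydub import AudioSegment
from pydub.effects import normalize

audio = AudioSegment.from_file("track.mp3")            # any ffmpeg-readable format
clip = normalize(audio[1000:5000] + 3).fade_in(200)    # ms slice, +3 dB, peak-normalize, fade
clip.export("clip.mp3", format="mp3", bitrate="192k")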
CLAUDE.md for pydub
## pydub Stack
- Version: pydub >= 0.25 | pip install pydub | requires ffmpeg in PATH
- Load: AudioSegment.from_file("path") | .from_mp3 | .from_wav | .from_ogg
- Slice: audio[start_ms:end_ms] | Concat: a + b | Overlay: a.overlay(b)
- Volume: audio + 6 # dB | normalize(audio) | fade_in(ms) | fade_out(ms)
- Export: audio.export("out.mp3", format="mp3", bitrate="192k")
- Silence: split_on_silence(audio, min_silence_len=500, silence_thresh=-40)
pydub Audio Processing Pipeline
# app/audio.py — pydub load, slicing, mixing, normalization, silence splitting, export
from __future__ import annotations
import io
import logging
from pathlib import Path
from pydub import AudioSegment
from pydub.effects import compress_dynamic_range, normalize
from pydub.generators import Sine, WhiteNoise
from pydub.silence import detect_leading_silence, detect_silence, split_on_silence
log = logging.getLogger(__name__)
# ─────────────────────────────────────────────────────────────────────────────
# 1. Loading helpers
# ─────────────────────────────────────────────────────────────────────────────
def load(
source: str | Path | bytes | io.BytesIO,
fmt: str | None = None,
) -> AudioSegment:
"""
Load an AudioSegment from a file path, bytes, or BytesIO.
fmt: "mp3" | "wav" | "ogg" | "flac" | "m4a" | ... (auto-detected if None)
Example:
audio = load("podcast.mp3")
audio = load(audio_bytes, fmt="wav")
"""
if isinstance(source, (str, Path)):
return AudioSegment.from_file(str(source), format=fmt)
if isinstance(source, bytes):
return AudioSegment.from_file(io.BytesIO(source), format=fmt)
return AudioSegment.from_file(source, format=fmt)
def info(audio: AudioSegment) -> dict:
"""Return basic metadata about an AudioSegment."""
return {
"duration_ms": len(audio),
"duration_s": round(len(audio) / 1000, 2),
"frame_rate": audio.frame_rate,
"channels": audio.channels,
"sample_width": audio.sample_width,
"max_dBFS": round(audio.max_dBFS, 2),
"rms": audio.rms,
}
# ─────────────────────────────────────────────────────────────────────────────
# 2. Editing — trim, cut, concat
# ─────────────────────────────────────────────────────────────────────────────
def trim(
audio: AudioSegment,
start_ms: int = 0,
end_ms: int | None = None,
) -> AudioSegment:
"""
Trim audio to [start_ms, end_ms].
Example:
clip = trim(audio, start_ms=10000, end_ms=60000) # first minute after intro
"""
    return audio[start_ms: len(audio) if end_ms is None else end_ms]
def strip_silence(
audio: AudioSegment,
silence_thresh_dBFS: float = -50.0,
) -> AudioSegment:
"""
Remove leading and trailing silence.
Example:
clean = strip_silence(recording, silence_thresh_dBFS=-45)
"""
start = detect_leading_silence(audio, silence_threshold=silence_thresh_dBFS)
end = len(audio) - detect_leading_silence(audio.reverse(), silence_threshold=silence_thresh_dBFS)
return audio[start:end]
def concatenate(
segments: list[AudioSegment],
gap_ms: int = 0,
crossfade_ms: int = 0,
) -> AudioSegment:
"""
Join a list of segments with optional gap or crossfade.
gap_ms: silence between segments.
crossfade_ms: overlap for smooth transition.
Example:
full = concatenate([intro, main, outro], crossfade_ms=500)
"""
if not segments:
return AudioSegment.empty()
if crossfade_ms > 0:
result = segments[0]
for seg in segments[1:]:
result = result.append(seg, crossfade=crossfade_ms)
return result
gap = AudioSegment.silent(duration=gap_ms) if gap_ms else None
result = segments[0]
for seg in segments[1:]:
if gap:
result += gap
result += seg
return result
def loop(audio: AudioSegment, count: int = 2) -> AudioSegment:
"""Repeat audio count times."""
return audio * count
# ─────────────────────────────────────────────────────────────────────────────
# 3. Volume and dynamics
# ─────────────────────────────────────────────────────────────────────────────
def adjust_volume(audio: AudioSegment, db: float) -> AudioSegment:
"""Add or subtract dB from audio. Positive = louder, negative = quieter."""
return audio + db
def normalize_audio(audio: AudioSegment, headroom_db: float = 0.5) -> AudioSegment:
    """
    Normalize audio so its peak sits headroom_db below full scale (0 dBFS).
    Note: pydub's normalize() expects headroom as a positive dB value, so
    headroom_db=1.0 puts the peak at -1.0 dBFS.
    Example:
        normalized = normalize_audio(quiet_recording, headroom_db=1.0)
    """
    return normalize(audio, headroom=headroom_db)
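# A hedged alternative to peak normalization: match a segment's average loudness
# (its dBFS property) to a target using the real AudioSegment.apply_gain() API.
# match_loudness() itself is a sketch, not part of pydub.
def match_loudness(audio: AudioSegment, target_dBFS: float = -18.0) -> AudioSegment:
    """Shift gain so average loudness lands at target_dBFS (may clip peaky audio)."""
    return audio.apply_gain(target_dBFS - audio.dBFS)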
def duck(
    background: AudioSegment,
    foreground: AudioSegment,
    duck_db: float = -15.0,
) -> AudioSegment:
    """
    Statically duck background audio under foreground (e.g. music under speech):
    the whole background is attenuated by duck_db, then mixed beneath foreground.
    The result has the foreground's length. See duck_dynamic() below for a
    sketch that attenuates only during speech.
    Example:
        narration_with_music = duck(music, speech, duck_db=-14)
    """
    attenuated = background + duck_db
    return foreground.overlay(attenuated)
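# Sketch of dynamic ducking (not a pydub built-in): attenuate the music bed only
# while speech is present, using detect_speech_segments() from section 5 below.
# duck_dynamic() is a hypothetical helper; boundaries are hard cuts, not fades.
def duck_dynamic(
    music: AudioSegment,
    voice: AudioSegment,
    duck_db: float = -15.0,
    silence_thresh_dBFS: float = -40.0,
) -> AudioSegment:
    bed = music[: len(voice)]
    for start, end in detect_speech_segments(voice, silence_thresh_dBFS=silence_thresh_dBFS):
        bed = bed[:start] + (bed[start:end] + duck_db) + bed[end:]
    return bed.overlay(voice)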
def compress(audio: AudioSegment, threshold: float = -20.0, ratio: float = 4.0) -> AudioSegment:
"""Apply dynamic range compression."""
return compress_dynamic_range(audio, threshold=threshold, ratio=ratio)
# ─────────────────────────────────────────────────────────────────────────────
# 4. Mixing / overlay
# ─────────────────────────────────────────────────────────────────────────────
def overlay_at(
base: AudioSegment,
overlay_seg: AudioSegment,
position_ms: int = 0,
gain_during_overlay: float = 0.0,
) -> AudioSegment:
"""
Place overlay_seg on top of base starting at position_ms.
Example:
result = overlay_at(background_music, voiceover, position_ms=3000)
"""
return base.overlay(overlay_seg, position=position_ms, gain_during_overlay=gain_during_overlay)
def mix(
segments: list[tuple[AudioSegment, int]],
base_length_ms: int | None = None,
) -> AudioSegment:
"""
Mix multiple segments: [(audio, start_ms), ...].
base_length_ms: final length (default: max end position).
Example:
result = mix([
(background, 0),
(effect1, 5000),
(jingle, 10000),
])
"""
if not segments:
return AudioSegment.empty()
length = base_length_ms or max(start + len(seg) for seg, start in segments)
result = AudioSegment.silent(duration=length)
for seg, start in segments:
result = result.overlay(seg, position=start)
return result
# ─────────────────────────────────────────────────────────────────────────────
# 5. Silence detection and splitting
# ─────────────────────────────────────────────────────────────────────────────
def split_silence(
audio: AudioSegment,
min_silence_len_ms: int = 700,
silence_thresh_dBFS: float = -40.0,
keep_silence_ms: int = 100,
) -> list[AudioSegment]:
"""
Split audio on silent pauses. Returns non-silent chunks.
Example:
chunks = split_silence(interview, min_silence_len_ms=600, silence_thresh_dBFS=-38)
for i, chunk in enumerate(chunks):
chunk.export(f"chunk_{i:03d}.wav", format="wav")
"""
return split_on_silence(
audio,
min_silence_len=min_silence_len_ms,
silence_thresh=silence_thresh_dBFS,
keep_silence=keep_silence_ms,
)
def detect_speech_segments(
audio: AudioSegment,
min_silence_len_ms: int = 700,
silence_thresh_dBFS: float = -40.0,
) -> list[tuple[int, int]]:
"""
Return list of (start_ms, end_ms) tuples for non-silent regions.
Example:
for start, end in detect_speech_segments(interview):
print(f"Speech: {start/1000:.1f}s – {end/1000:.1f}s")
"""
silent_ranges = detect_silence(
audio,
min_silence_len=min_silence_len_ms,
silence_thresh=silence_thresh_dBFS,
)
segments = []
prev_end = 0
for sil_start, sil_end in silent_ranges:
if sil_start > prev_end:
segments.append((prev_end, sil_start))
prev_end = sil_end
if prev_end < len(audio):
segments.append((prev_end, len(audio)))
return segments
def remove_silence(
audio: AudioSegment,
min_silence_len_ms: int = 700,
silence_thresh_dBFS: float = -40.0,
keep_silence_ms: int = 100,
) -> AudioSegment:
"""
Remove all silent gaps and concatenate the speech segments.
Example:
tight = remove_silence(lecture, silence_thresh_dBFS=-42)
"""
chunks = split_silence(audio, min_silence_len_ms, silence_thresh_dBFS, keep_silence_ms)
return concatenate(chunks)
# ─────────────────────────────────────────────────────────────────────────────
# 6. Format conversion and export
# ─────────────────────────────────────────────────────────────────────────────
def to_mono(audio: AudioSegment) -> AudioSegment:
"""Convert stereo to mono."""
return audio.set_channels(1)
def to_stereo(audio: AudioSegment) -> AudioSegment:
"""Convert mono to stereo."""
return audio.set_channels(2)
def resample(audio: AudioSegment, frame_rate: int = 16000) -> AudioSegment:
"""Change sample rate (e.g. 44100 → 16000 for speech models)."""
return audio.set_frame_rate(frame_rate)
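# Sketch: prepare audio for a typical speech/ASR model (16 kHz, mono, 16-bit PCM).
# prepare_for_speech_model() is a hypothetical helper; the target rate and sample
# width depend on your model and are assumptions here.
def prepare_for_speech_model(audio: AudioSegment, rate: int = 16000) -> AudioSegment:
    return audio.set_channels(1).set_frame_rate(rate).set_sample_width(2)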
def export_bytes(
audio: AudioSegment,
fmt: str = "mp3",
bitrate: str = "128k",
parameters: list[str] | None = None,
) -> bytes:
"""
Export AudioSegment to bytes in the given format.
Example:
mp3_bytes = export_bytes(audio, fmt="mp3", bitrate="192k")
wav_bytes = export_bytes(audio, fmt="wav")
"""
buf = io.BytesIO()
audio.export(buf, format=fmt, bitrate=bitrate, parameters=parameters)
return buf.getvalue()
def export_file(
audio: AudioSegment,
path: str | Path,
fmt: str | None = None,
bitrate: str = "192k",
tags: dict | None = None,
) -> Path:
"""
Save audio to file.
fmt: inferred from extension if None.
Example:
export_file(normalized, "output/episode_01.mp3", tags={"title": "Episode 1"})
"""
p = Path(path)
fmt = fmt or p.suffix.lstrip(".").lower() or "mp3"
p.parent.mkdir(parents=True, exist_ok=True)
audio.export(str(p), format=fmt, bitrate=bitrate, tags=tags or {})
return p
# ─────────────────────────────────────────────────────────────────────────────
# 7. Pipeline
# ─────────────────────────────────────────────────────────────────────────────
def process_podcast(
voice_path: str | Path,
music_path: str | Path | None = None,
output_path: str | Path = "podcast_out.mp3",
target_dBFS: float = -18.0,
silence_thresh_dBFS: float = -42.0,
music_duck_dB: float = -18.0,
intro_ms: int = 3000,
) -> AudioSegment:
"""
Full podcast processing pipeline:
1. Load and strip silence from voice
2. Normalize voice to target loudness
3. Optionally overlay background music (ducked)
4. Export
Example:
result = process_podcast("raw_recording.wav", music_path="bg_music.mp3")
"""
voice = load(voice_path)
log.info("Voice: %s", info(voice))
# Strip silence
voice = strip_silence(voice, silence_thresh_dBFS=silence_thresh_dBFS)
    # Match average loudness (RMS dBFS) to target_dBFS
    voice = voice.apply_gain(target_dBFS - voice.dBFS)
# Apply fade
voice = voice.fade_in(300).fade_out(500)
if music_path:
music = load(music_path)
# Loop music to match voice length + intro
needed = len(voice) + intro_ms
repeats = (needed // len(music)) + 2
music = loop(music, repeats)[:needed]
        music = normalize_audio(music) + music_duck_dB  # normalize first, then duck
# Combine: music plays alone for intro_ms, then voice overlaid
result = music.overlay(voice, position=intro_ms)
else:
result = voice
log.info("Output: %s", info(result))
export_file(result, output_path)
return result
# ─────────────────────────────────────────────────────────────────────────────
# Demo
# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
print("=== Generating test audio ===")
# 440 Hz tone for 2 seconds
tone = Sine(440).to_audio_segment(duration=2000)
# White noise burst
noise = WhiteNoise().to_audio_segment(duration=500) - 20 # quieter
# Silence pad
silence = AudioSegment.silent(duration=500)
print(f" Tone: {info(tone)}")
print("\n=== Concatenation + fade ===")
jingle = concatenate([tone, silence, noise], crossfade_ms=50)
jingle = jingle.fade_in(200).fade_out(200)
print(f" Jingle: {len(jingle)}ms")
print("\n=== Normalization ===")
quiet = tone - 12
loud = normalize_audio(quiet)
print(f" Before: {round(quiet.max_dBFS,1)} dBFS → After: {round(loud.max_dBFS,1)} dBFS")
print("\n=== Slice and volume ===")
first_sec = trim(jingle, 0, 1000)
louder = adjust_volume(first_sec, +6)
print(f" Slice: {len(first_sec)}ms | Boosted: {round(louder.max_dBFS,1)} dBFS")
print("\n=== Silence splitting ===")
# Create speech-like audio with gaps
word1 = tone[:300]
word2 = (tone + 3)[:400]
pad = AudioSegment.silent(800)
spoken = word1 + pad + word2 + pad + word1
chunks = split_silence(spoken, min_silence_len_ms=600, silence_thresh_dBFS=-80)
print(f" Chunks from silence split: {len(chunks)}")
print("\n=== Export to bytes ===")
wav_bytes = export_bytes(tone, fmt="wav")
print(f" WAV bytes: {len(wav_bytes):,}")
print("\nInstall: pip install pydub && brew install ffmpeg # or apt install ffmpeg")
For the librosa alternative — librosa provides advanced audio analysis (spectrograms, MFCC, chroma, beat tracking, pitch estimation) and is the standard for music information retrieval and ML feature extraction. pydub excels at audio editing tasks (slicing, mixing, format conversion, silence removal) behind a simple, ffmpeg-backed API. Use pydub for audio production workflows (trim, normalize, mix, export); use librosa when you need to analyze audio content for machine learning or signal processing.
For the soundfile alternative — soundfile (via libsndfile) reads and writes WAV/FLAC/AIFF as NumPy arrays with exact sample fidelity, making it ideal for signal-processing pipelines that operate directly on samples. pydub wraps ffmpeg to support MP3, OGG, M4A, and dozens of other formats. Use soundfile when you need raw NumPy arrays for DSP; use pydub when you need broad format support and a human-friendly editing API (see the NumPy round-trip sketch below).
The Claude Skills 360 bundle includes pydub skill sets covering load() from path/bytes/BytesIO, info() metadata, trim()/strip_silence()/concatenate()/loop() editing, adjust_volume()/normalize_audio()/duck()/compress() dynamics, overlay_at()/mix() mixing, split_silence()/detect_speech_segments()/remove_silence() VAD, to_mono()/to_stereo()/resample() conversion, export_bytes()/export_file(), and the process_podcast() full pipeline. Start with the free tier to try audio manipulation and processing code generation.
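Where the two approaches meet, a pydub segment converts to and from NumPy for custom DSP. A minimal sketch using pydub's real get_array_of_samples() and _spawn() APIs; the half-gain math is just an illustration:

# Round-trip a pydub segment through NumPy for custom DSP
import numpy as np
from pydub import AudioSegment

seg = AudioSegment.from_file("track.mp3")
samples = np.array(seg.get_array_of_samples())     # int samples, interleaved if stereo
quieter = (samples * 0.5).astype(samples.dtype)    # any NumPy processing goes here
out = seg._spawn(quieter.tobytes())                # keeps frame rate/width/channels
out.export("processed.wav", format="wav")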