TorchAudio provides audio I/O, transforms, and pretrained models. pip install torchaudio. Load: import torchaudio, waveform, sample_rate = torchaudio.load("audio.wav") — returns (channels, time) tensor. Save: torchaudio.save("out.wav", waveform, 22050). Resample: resampled = torchaudio.functional.resample(waveform, orig_freq=44100, new_freq=16000). Spectrogram: transform = torchaudio.transforms.Spectrogram(n_fft=1024, hop_length=256), spec = transform(waveform). MelSpectrogram: torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=80, n_fft=1024). MFCC: torchaudio.transforms.MFCC(sample_rate=16000, n_mfcc=40). dB: torchaudio.transforms.AmplitudeToDB(). Augment: torchaudio.transforms.FrequencyMasking(freq_mask_param=30) — SpecAugment. torchaudio.transforms.TimeMasking(time_mask_param=100). AddNoise: torchaudio.functional.add_noise(waveform, noise, snr). Time stretch (note: operates on complex spectrograms, not raw waveforms): torchaudio.transforms.TimeStretch(fixed_rate=1.2). Wav2Vec2: from torchaudio.pipelines import WAV2VEC2_BASE, bundle = WAV2VEC2_BASE, model = bundle.get_model(), features, _ = model.extract_features(waveform). HuBERT: from torchaudio.pipelines import HUBERT_BASE. ASR: from torchaudio.pipelines import WAV2VEC2_ASR_BASE_960H, labels = bundle.get_labels() for greedy CTC decoding. Tacotron2 TTS: from torchaudio.pipelines import TACOTRON2_WAVERNN_PHONE_LJSPEECH, bundle.get_tacotron2(). Datasets: ds = torchaudio.datasets.SPEECHCOMMANDS("./data", download=True). Claude Code generates TorchAudio feature extraction pipelines, SpecAugment training transforms, Wav2Vec2 fine-tuning code, and audio dataset loaders.
CLAUDE.md for TorchAudio
## TorchAudio Stack
- Version: torchaudio >= 2.4
- I/O: torchaudio.load(path) → (waveform, sr) | torchaudio.save(path, waveform, sr)
- Resample: torchaudio.functional.resample(waveform, orig_freq, new_freq)
- Features: transforms.MelSpectrogram | MFCC | Spectrogram | AmplitudeToDB
- Augment: transforms.FrequencyMasking | TimeMasking | TimeStretch
- Noise: torchaudio.functional.add_noise(waveform, noise, snr=torch.tensor([10.0]))
- Pretrained: from torchaudio.pipelines import WAV2VEC2_BASE | HUBERT_BASE | WAV2VEC2_ASR_BASE_960H
- Dataset: torchaudio.datasets.SPEECHCOMMANDS | LibriSpeech | VCTK | LJSPEECH
TorchAudio Feature Extraction and Training
# audio/torchaudio_pipeline.py — audio feature extraction and processing
from __future__ import annotations
import os
from pathlib import Path
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
import torchaudio.transforms as T
import torchaudio.functional as AF
import numpy as np
# ── 1. Audio I/O utilities ────────────────────────────────────────────────────
def load_mono_16k(audio_path: str) -> tuple[torch.Tensor, int]:
    """Load an audio file as a mono waveform resampled to 16 kHz.

    Returns (waveform, 16000) where waveform has shape (1, time).
    """
    waveform, orig_sr = torchaudio.load(audio_path)
    # Downmix multi-channel audio by averaging across the channel axis.
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)
    # Resample only when the source rate differs from the 16 kHz target.
    if orig_sr != 16000:
        waveform = AF.resample(waveform, orig_sr, 16000)
    return waveform, 16000
def save_audio(waveform: torch.Tensor, path: str, sample_rate: int = 16000):
    """Write a waveform tensor to an audio file (.wav, .mp3, .flac by extension).

    The tensor is moved to CPU first since torchaudio.save requires CPU memory.
    """
    cpu_waveform = waveform.cpu()
    torchaudio.save(path, cpu_waveform, sample_rate)
def get_audio_info(audio_path: str) -> dict:
    """Summarize audio metadata from the file header (samples are not decoded)."""
    meta = torchaudio.info(audio_path)
    return {
        "sample_rate": meta.sample_rate,
        "num_frames": meta.num_frames,
        "num_channels": meta.num_channels,
        # Duration is derived — it is not stored directly in the header.
        "duration_sec": meta.num_frames / meta.sample_rate,
        "encoding": meta.encoding,
        "bits_per_sample": meta.bits_per_sample,
    }
def split_audio(
    waveform: torch.Tensor,
    sample_rate: int,
    chunk_sec: float = 30.0,
    overlap_sec: float = 1.0,
) -> list[torch.Tensor]:
    """Split long audio into overlapping chunks along the time axis.

    Args:
        waveform: (channels, time) tensor; chunking slices the last axis.
        sample_rate: samples per second, used to convert seconds to samples.
        chunk_sec: length of each chunk in seconds.
        overlap_sec: overlap between consecutive chunks in seconds; must be
            strictly smaller than chunk_sec.

    Returns:
        List of (channels, chunk_time) tensors; the final chunk may be shorter.

    Raises:
        ValueError: if overlap_sec >= chunk_sec — previously this either
            raised an opaque range() error (step == 0) or silently returned
            an empty list (step < 0).
    """
    chunk_len = int(chunk_sec * sample_rate)
    overlap_len = int(overlap_sec * sample_rate)
    step = chunk_len - overlap_len
    if step <= 0:
        raise ValueError(
            f"overlap_sec ({overlap_sec}) must be smaller than chunk_sec ({chunk_sec})"
        )
    n_samples = waveform.shape[-1]
    chunks = []
    for start in range(0, n_samples, step):
        end = min(start + chunk_len, n_samples)
        chunks.append(waveform[:, start:end])
        if end == n_samples:  # final chunk reached the end of the signal
            break
    return chunks
# ── 2. Feature extraction transforms ─────────────────────────────────────────
def build_mel_transform(
    sample_rate: int = 16000,
    n_mels: int = 80,
    n_fft: int = 1024,
    hop_length: int = 256,
    win_length: int | None = None,  # fixed annotation: default is None
    f_min: float = 0.0,
    f_max: float = 8000.0,
) -> nn.Sequential:
    """Build mel spectrogram + dB transform (Whisper-style features).

    Args:
        sample_rate: input sampling rate in Hz.
        n_mels: number of mel filterbank channels.
        n_fft: FFT size.
        hop_length: hop between STFT frames in samples.
        win_length: STFT window length; falls back to n_fft when None.
        f_min / f_max: mel filterbank frequency range in Hz (8000 is the
            Nyquist limit for the 16 kHz default).

    Returns:
        nn.Sequential mapping (C, time) waveforms to (C, n_mels, frames)
        log-mel features in dB, clamped to an 80 dB dynamic range.
    """
    return nn.Sequential(
        T.MelSpectrogram(
            sample_rate=sample_rate,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length or n_fft,
            n_mels=n_mels,
            f_min=f_min,
            f_max=f_max,
            power=2.0,  # power spectrogram, matching AmplitudeToDB(stype="power")
        ),
        T.AmplitudeToDB(stype="power", top_db=80.0),
    )
def build_mfcc_transform(
    sample_rate: int = 16000,
    n_mfcc: int = 40,
    n_mels: int = 80,
    n_fft: int = 1024,
    hop_length: int = 256,
) -> T.MFCC:
    """Create an MFCC transform for speech-recognition features.

    The underlying mel filterbank spans 20 Hz up to the Nyquist frequency.
    """
    mel_settings = {
        "n_fft": n_fft,
        "hop_length": hop_length,
        "n_mels": n_mels,
        "f_min": 20.0,
        "f_max": float(sample_rate // 2),
    }
    return T.MFCC(sample_rate=sample_rate, n_mfcc=n_mfcc, melkwargs=mel_settings)
def extract_features(
    waveform: torch.Tensor,
    transforms: nn.Module,
    normalize: bool = True,
) -> torch.Tensor:
    """Run `transforms` on the waveform, optionally standardizing each bin.

    Normalization subtracts the per-bin mean and divides by the per-bin
    standard deviation, both computed over the time (last) axis, with a
    small epsilon for numerical stability.
    """
    features = transforms(waveform)  # expected layout: (C, freq, time)
    if not normalize:
        return features
    time_mean = features.mean(dim=-1, keepdim=True)
    time_std = features.std(dim=-1, keepdim=True) + 1e-8
    return (features - time_mean) / time_std
# ── 3. Data augmentation ──────────────────────────────────────────────────────
class AudioAugmentation(nn.Module):
    """Randomized augmentation pipeline for robust ASR/classification training.

    Waveform-level ops (speed perturbation, additive noise) run in
    `augment_waveform`; SpecAugment masking runs on spectrograms via
    `augment_spectrogram` after the mel transform. `forward` applies only
    the waveform stage.
    """
    def __init__(
        self,
        sample_rate: int = 16000,
        speed_perturb: bool = True,
        add_noise: bool = True,
        spec_augment: bool = True,
        noise_snr_range: tuple = (5.0, 20.0),
        freq_mask_param: int = 30,
        time_mask_param: int = 100,
        n_freq_masks: int = 2,
        n_time_masks: int = 2,
    ):
        super().__init__()
        self.sample_rate = sample_rate
        self.speed_perturb = speed_perturb
        self.add_noise = add_noise
        self.spec_augment = spec_augment
        self.noise_snr_range = noise_snr_range
        # Stacked SpecAugment masks, applied to the spectrogram post-mel.
        # Note: self.spec_aug exists only when spec_augment is enabled.
        if spec_augment:
            masks = [T.FrequencyMasking(freq_mask_param) for _ in range(n_freq_masks)]
            masks += [T.TimeMasking(time_mask_param) for _ in range(n_time_masks)]
            self.spec_aug = nn.Sequential(*masks)
    def augment_waveform(self, waveform: torch.Tensor) -> torch.Tensor:
        """Apply waveform-level augmentations, each with 50% probability."""
        # Speed perturbation: tempo change without shifting pitch.
        if self.speed_perturb and torch.rand(1).item() < 0.5:
            factor = torch.empty(1).uniform_(0.9, 1.1).item()
            sox_chain = [["speed", str(factor)], ["rate", str(self.sample_rate)]]
            waveform, _ = torchaudio.sox_effects.apply_effects_tensor(
                waveform, self.sample_rate, sox_chain, channels_first=True
            )
        # White noise stands in for recorded room noise.
        if self.add_noise and torch.rand(1).item() < 0.5:
            snr = torch.empty(1).uniform_(*self.noise_snr_range)
            waveform = AF.add_noise(waveform, torch.randn_like(waveform), snr)
        return waveform
    def augment_spectrogram(self, spec: torch.Tensor) -> torch.Tensor:
        """Apply SpecAugment masks when enabled; otherwise pass through."""
        return self.spec_aug(spec) if self.spec_augment else spec
    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        return self.augment_waveform(waveform)
# ── 4. Audio filtering ────────────────────────────────────────────────────────
def apply_filters(
    waveform: torch.Tensor,
    sample_rate: int,
    highpass_hz: float | None = None,  # Remove low-frequency rumble
    lowpass_hz: float | None = None,   # Remove high-frequency hiss
    preemphasis: float = 0.97,         # Pre-emphasis for speech clarity
) -> torch.Tensor:
    """Apply optional biquad filters and pre-emphasis for speech preprocessing.

    Args:
        waveform: audio tensor (filtered along the last axis).
        sample_rate: sampling rate in Hz, used to place the filter cutoffs.
        highpass_hz: high-pass cutoff in Hz; None disables the filter.
        lowpass_hz: low-pass cutoff in Hz; None disables the filter.
        preemphasis: pre-emphasis coefficient; values <= 0 disable it.

    Returns:
        Filtered waveform with the same shape as the input.
    """
    # `is not None` (rather than truthiness) so an explicit 0.0 cutoff is
    # not silently skipped; annotations fixed from `float = None`.
    if highpass_hz is not None:
        waveform = AF.highpass_biquad(waveform, sample_rate, highpass_hz)
    if lowpass_hz is not None:
        waveform = AF.lowpass_biquad(waveform, sample_rate, lowpass_hz)
    if preemphasis > 0:
        waveform = AF.preemphasis(waveform, preemphasis)
    return waveform
# ── 5. Pretrained models via torchaudio.pipelines ────────────────────────────
def extract_wav2vec2_features(
    audio_paths: list[str],
    layer: int = -1,  # -1 = last layer, 0..N = specific layer
    device: str = "cpu",
) -> list[torch.Tensor]:
    """Extract mean-pooled Wav2Vec2 contextual embeddings, one per file.

    Each file is resampled to the bundle's expected rate, passed through
    the model, and the selected layer's features are averaged over time
    into a single embedding capturing acoustic + linguistic patterns.
    """
    from torchaudio.pipelines import WAV2VEC2_BASE
    bundle = WAV2VEC2_BASE
    model = bundle.get_model().to(device)
    model.eval()
    target_sr = bundle.sample_rate  # 16000 for this bundle
    results: list[torch.Tensor] = []
    with torch.no_grad():
        for audio_path in audio_paths:
            waveform, sr = torchaudio.load(audio_path)
            if sr != target_sr:
                waveform = AF.resample(waveform, sr, target_sr)
            layer_outputs, _ = model.extract_features(waveform.to(device))
            # `layer == -1` and list index -1 select the same entry, so a
            # single indexed lookup covers both the default and explicit cases.
            pooled = layer_outputs[layer].mean(dim=1)  # mean-pool time axis
            results.append(pooled.cpu())
    return results
def transcribe_wav2vec2(
    audio_path: str,
    device: str = "cpu",
) -> str:
    """Transcribe speech with the Wav2Vec2 ASR pipeline (greedy CTC decoding).

    Fix: `Wav2Vec2ASRBundle` exposes `get_labels()` but has no
    `get_decoder()` method, so the original `bundle.get_decoder()` raised
    AttributeError. Decoding is done here by greedy CTC: take the argmax
    label per frame, collapse consecutive repeats, drop the blank token,
    and map "|" to a word separator.

    Args:
        audio_path: path to the audio file to transcribe.
        device: torch device string for model inference.

    Returns:
        The decoded transcript as an uppercase string.
    """
    from torchaudio.pipelines import WAV2VEC2_ASR_BASE_960H
    bundle = WAV2VEC2_ASR_BASE_960H
    model = bundle.get_model().to(device)
    model.eval()
    labels = bundle.get_labels()  # CTC vocabulary; index 0 is the blank token
    waveform, sr = torchaudio.load(audio_path)
    if sr != bundle.sample_rate:
        waveform = AF.resample(waveform, sr, bundle.sample_rate)
    waveform = waveform.to(device)
    with torch.no_grad():
        emission, _ = model(waveform)
    # Greedy CTC: best label per frame → merge duplicates → strip blanks.
    indices = torch.unique_consecutive(emission[0].argmax(dim=-1))
    transcript = "".join(labels[i] for i in indices if i != 0)
    # "|" marks word boundaries in this bundle's vocabulary.
    return transcript.replace("|", " ").strip()
# ── 6. Audio dataset loading ──────────────────────────────────────────────────
def build_speechcommands_loader(
    root: str = "./data",
    subset: str = "training",  # "training" | "validation" | "testing"
    sample_rate: int = 16000,
    batch_size: int = 64,
    num_workers: int = 4,
) -> torch.utils.data.DataLoader:
    """Build a DataLoader for the Google Speech Commands dataset.

    Each item is a (mel_features, label_index) pair. Waveforms are
    padded/trimmed to exactly 1 second so batches stack cleanly, and
    training items pass through the waveform augmentation pipeline.

    Args:
        root: dataset download/cache directory.
        subset: which official split to load.
        sample_rate: target sampling rate for all items.
        batch_size / num_workers: standard DataLoader settings.

    Returns:
        DataLoader yielding (features, label_index) batches; shuffling is
        enabled only for the training subset.
    """
    from torch.utils.data import DataLoader
    mel_transform = build_mel_transform(sample_rate)
    augment = AudioAugmentation(sample_rate) if subset == "training" else None
    class SpeechCommandsDataset(torch.utils.data.Dataset):
        def __init__(self):
            self.ds = torchaudio.datasets.SPEECHCOMMANDS(
                root, download=True, subset=subset
            )
            # Fix: get_metadata(i) returns (path, sr, label, ...) without
            # decoding audio. The previous `self.ds[i][2]` scan loaded and
            # decoded every waveform in the dataset just to read its label.
            self.labels = sorted(
                {self.ds.get_metadata(i)[2] for i in range(len(self.ds))}
            )
            self.label2idx = {l: i for i, l in enumerate(self.labels)}
        def __len__(self):
            return len(self.ds)
        def __getitem__(self, idx):
            waveform, sr, label, *_ = self.ds[idx]
            if sr != sample_rate:
                waveform = AF.resample(waveform, sr, sample_rate)
            # Pad or trim to exactly 1 second.
            target_len = sample_rate
            if waveform.shape[-1] < target_len:
                waveform = F.pad(waveform, (0, target_len - waveform.shape[-1]))
            else:
                waveform = waveform[:, :target_len]
            if augment:
                waveform = augment(waveform)
            features = mel_transform(waveform)
            return features, self.label2idx[label]
    dataset = SpeechCommandsDataset()
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=(subset == "training"),
        num_workers=num_workers,
        pin_memory=True,
    )
# ── 7. Spectrogram visualization ─────────────────────────────────────────────
def plot_waveform_and_spectrogram(
    audio_path: str,
    output_path: str = "audio_analysis.png",
):
    """Render a file's waveform and mel spectrogram into a single PNG."""
    import matplotlib
    matplotlib.use("Agg")  # Non-interactive backend
    import matplotlib.pyplot as plt
    waveform, sr = torchaudio.load(audio_path)
    # Downmix to mono so both panels show a single signal.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    mel_db = build_mel_transform(sr)(waveform).squeeze().numpy()
    duration = waveform.shape[-1] / sr
    times = np.linspace(0, duration, waveform.shape[-1])
    fig, (wave_ax, spec_ax) = plt.subplots(2, 1, figsize=(12, 6))
    # Top panel: raw waveform against time.
    wave_ax.plot(times, waveform.squeeze().numpy(), linewidth=0.5)
    wave_ax.set_xlabel("Time (s)")
    wave_ax.set_ylabel("Amplitude")
    wave_ax.set_title(f"{Path(audio_path).name} — Waveform")
    # Bottom panel: mel spectrogram in dB.
    image = spec_ax.imshow(
        mel_db, aspect="auto", origin="lower",
        extent=[0, duration, 0, mel_db.shape[0]],
    )
    spec_ax.set_xlabel("Time (s)")
    spec_ax.set_ylabel("Mel bin")
    spec_ax.set_title("Mel Spectrogram")
    plt.colorbar(image, ax=spec_ax, label="dB")
    plt.tight_layout()
    plt.savefig(output_path, dpi=150, bbox_inches="tight")
    plt.close()
    print(f"Saved: {output_path}")
if __name__ == "__main__":
    # Feature-extraction demo on a synthetic 440 Hz (A4) sine tone.
    sample_rate = 16000
    duration = 3.0
    freq = 440.0  # A4 tone
    time_points = torch.linspace(0, duration, int(sample_rate * duration))
    waveform = torch.sin(2 * torch.pi * freq * time_points).unsqueeze(0)
    # Mel spectrogram features, normalized per frequency bin.
    mel_transform = build_mel_transform(sample_rate)
    mel_features = extract_features(waveform, mel_transform)
    print(f"Waveform: {waveform.shape} @ {sample_rate}Hz")
    print(f"Mel features: {mel_features.shape} (channels, mels, time)")
    # MFCC features (unnormalized).
    mfcc_transform = build_mfcc_transform(sample_rate)
    mfcc_features = mfcc_transform(waveform)
    print(f"MFCC features: {mfcc_features.shape} (channels, mfcc, time)")
    # Randomized waveform augmentation.
    augment = AudioAugmentation(sample_rate)
    aug_wave = augment(waveform)
    print(f"Augmented waveform: {aug_wave.shape}")
    # Write the tone to disk for listening/inspection.
    torchaudio.save("demo_tone.wav", waveform, sample_rate)
    print("Saved: demo_tone.wav")
For the Librosa alternative when needing NumPy-based audio analysis with extensive time-domain and frequency-domain feature functions (chroma, tonnetz, onset detection, beat tracking) that integrate with scikit-learn — Librosa provides richer musicological feature extraction while TorchAudio’s tensor-first API with GPU support, native transforms as nn.Module components, SpecAugment for training pipelines, and direct integration with PyTorch DataLoaders make it the natural choice for deep learning audio model training. For the SoundFile + scipy alternative when loading audio purely for offline batch processing with minimal dependencies and no GPU requirement — SoundFile/scipy loading is simpler for pure signal processing scripts while TorchAudio bundles data loading, resampling, transforms, SpecAugment, and pretrained Wav2Vec2/HuBERT models in a single framework specifically designed for PyTorch-native audio deep learning pipelines. The Claude Skills 360 bundle includes TorchAudio skill sets covering audio I/O, mel spectrograms, MFCC extraction, SpecAugment, noise augmentation, Wav2Vec2 feature extraction, Speech Commands DataLoader, and audio visualization. Start with the free tier to try audio processing pipeline generation.