Librosa analyzes audio and extracts music information features. pip install librosa. import librosa. Load: y, sr = librosa.load("audio.wav", sr=22050, mono=True) — returns NumPy float32 array. MFCC: mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40) — (40, frames). Mel spectrogram: mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128), mel_db = librosa.power_to_db(mel, ref=np.max). Chroma: chroma = librosa.feature.chroma_stft(y=y, sr=sr) — (12, frames). Spectral: centroid = librosa.feature.spectral_centroid(y=y, sr=sr), rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr), bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr). Zero crossing: zcr = librosa.feature.zero_crossing_rate(y). Tempo: tempo, beats = librosa.beat.beat_track(y=y, sr=sr). Onset: onset_frames = librosa.onset.onset_detect(y=y, sr=sr), onset_times = librosa.frames_to_time(onset_frames, sr=sr). Effects: y_stretch = librosa.effects.time_stretch(y, rate=1.2), y_shift = librosa.effects.pitch_shift(y, sr=sr, n_steps=2). Separation: harmonic, percussive = librosa.effects.hpss(y). STFT: D = librosa.stft(y), magnitude, phase = librosa.magphase(D). Inverse: y_recon = librosa.istft(D). Display: librosa.display.specshow(mel_db, x_axis="time", y_axis="mel"). librosa.frames_to_time(frames, sr=sr, hop_length=512). Claude Code generates Librosa feature extraction pipelines, beat synchronous features, music genre classifiers, and audio similarity systems.
CLAUDE.md for Librosa
## Librosa Stack
- Version: librosa >= 0.10
- Load: librosa.load(path, sr=22050, mono=True) → (y, sr) float32 array
- MFCC: librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40) → (n_mfcc, n_frames)
- Mel: librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128) → power_to_db for dB
- Chroma: librosa.feature.chroma_stft(y=y, sr=sr) → (12, n_frames) pitch class
- Tempo: librosa.beat.beat_track(y=y, sr=sr) → (tempo_bpm, beat_frames)
- Onset: librosa.onset.onset_detect(y=y, sr=sr) → frame indices
- Effects: time_stretch(y, rate=rate) | pitch_shift(y, sr=sr, n_steps=n_steps) | hpss(y)
- Beat-sync: librosa.util.sync(features, beats, aggregate=np.mean)
Librosa Music Analysis Pipeline
# audio/librosa_analysis.py — music feature extraction and analysis with Librosa
from __future__ import annotations
import os
from pathlib import Path
import numpy as np
import librosa
import librosa.display
# ── 1. Audio loading and preprocessing ───────────────────────────────────────
def load_audio(
    path: str,
    sr: int = 22050,
    duration: float | None = None,  # None = decode the full file
    offset: float = 0.0,  # Start position in seconds
    mono: bool = True,
) -> tuple[np.ndarray, int]:
    """
    Load an audio file with automatic format detection.

    Supports: WAV, MP3, FLAC, OGG, M4A.

    Args:
        path: Path to the audio file.
        sr: Target sample rate; librosa resamples if the file differs.
        duration: Maximum seconds to load (None = entire file).
        offset: Seconds to skip from the start of the file.
        mono: Downmix multi-channel audio to one channel.

    Returns:
        (y, sample_rate): float32 waveform and the sample rate actually used.
    """
    y, sample_rate = librosa.load(
        path,
        sr=sr,
        mono=mono,
        duration=duration,
        offset=offset,
    )
    return y, sample_rate
def get_audio_info(path: str) -> dict:
    """Return lightweight metadata for *path* without decoding all samples."""
    return {
        "duration_sec": librosa.get_duration(path=path),
        "path": path,
    }
def normalize(y: np.ndarray) -> np.ndarray:
    """Scale *y* so its largest absolute sample is 1.0 (peak normalization).

    An all-zero signal is returned unchanged to avoid division by zero.
    """
    amplitude = np.abs(y).max()
    if amplitude > 0:
        return y / amplitude
    return y
# ── 2. Feature extraction ─────────────────────────────────────────────────────
def extract_mfcc(
    y: np.ndarray,
    sr: int = 22050,
    n_mfcc: int = 40,
    delta: bool = True,  # Include delta and delta-delta
) -> np.ndarray:
    """
    Compute MFCCs, optionally stacked with first- and second-order deltas.

    Shape is (n_mfcc, n_frames) when delta=False, otherwise
    (3 * n_mfcc, n_frames): [static | delta | delta-delta].
    """
    base = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    if delta:
        stacked = [
            base,
            librosa.feature.delta(base, order=1),
            librosa.feature.delta(base, order=2),
        ]
        return np.vstack(stacked)
    return base
def extract_mel_spectrogram(
    y: np.ndarray,
    sr: int = 22050,
    n_mels: int = 128,
    n_fft: int = 2048,
    hop_length: int = 512,
    fmin: float = 0.0,
    fmax: float | None = None,
    power_to_db: bool = True,
) -> np.ndarray:
    """Extract a (log-)mel spectrogram — shape (n_mels, n_frames).

    Args:
        y: Audio time series.
        sr: Sample rate of *y*.
        n_mels: Number of mel bands.
        n_fft: FFT window size.
        hop_length: Hop between adjacent frames, in samples.
        fmin: Lowest mel-band frequency in Hz.
        fmax: Highest mel-band frequency in Hz; None defaults to Nyquist.
        power_to_db: Convert power to decibels (referenced to the max).

    Returns:
        (n_mels, n_frames) array — dB-scaled if power_to_db, else power.
    """
    mel = librosa.feature.melspectrogram(
        y=y, sr=sr,
        n_mels=n_mels, n_fft=n_fft,
        hop_length=hop_length,
        fmin=fmin,
        # Explicit None check (not `fmax or ...`) so a caller-supplied
        # fmax of 0.0 is not silently replaced with Nyquist.
        fmax=fmax if fmax is not None else sr // 2,
    )
    if power_to_db:
        mel = librosa.power_to_db(mel, ref=np.max)
    return mel
def extract_chroma(
    y: np.ndarray,
    sr: int = 22050,
    hop_length: int = 512,
    variant: str = "cqt",  # "stft" | "cqt" | "cens"
) -> np.ndarray:
    """
    Compute chroma (pitch-class) features — shape (12, n_frames).

    CQT chroma is more robust to noise than STFT chroma; CENS is smoothed
    and suited to music-similarity tasks. Any other value of *variant*
    falls back to the STFT variant.
    """
    extractors = {
        "cqt": librosa.feature.chroma_cqt,
        "cens": librosa.feature.chroma_cens,
    }
    chroma_fn = extractors.get(variant, librosa.feature.chroma_stft)
    return chroma_fn(y=y, sr=sr, hop_length=hop_length)
def extract_spectral_features(
    y: np.ndarray,
    sr: int = 22050,
) -> dict[str, np.ndarray]:
    """Compute a bundle of timbral descriptors.

    Each entry is a 1-D per-frame series, except "contrast", which is
    averaged over time into a (7,) band vector.
    """
    features: dict[str, np.ndarray] = {}
    features["centroid"] = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
    features["bandwidth"] = librosa.feature.spectral_bandwidth(y=y, sr=sr)[0]
    features["rolloff"] = librosa.feature.spectral_rolloff(y=y, sr=sr, roll_percent=0.85)[0]
    features["flatness"] = librosa.feature.spectral_flatness(y=y)[0]
    # Band-wise contrast collapsed to its temporal mean — shape (7,)
    features["contrast"] = librosa.feature.spectral_contrast(y=y, sr=sr).mean(axis=1)
    features["zcr"] = librosa.feature.zero_crossing_rate(y)[0]
    features["rms"] = librosa.feature.rms(y=y)[0]
    return features
# ── 3. Beat and rhythm analysis ───────────────────────────────────────────────
def analyze_rhythm(
    y: np.ndarray,
    sr: int = 22050,
    hop_length: int = 512,
) -> dict:
    """Estimate tempo and beat positions.

    Args:
        y: Audio time series.
        sr: Sample rate of *y*.
        hop_length: Hop size (samples) for the onset envelope and beat grid.

    Returns:
        dict with keys "tempo_bpm", "beat_count", "beat_times" (seconds),
        and "onset_tempo" (tempo re-estimated from the onset envelope).
    """
    # Compute the onset-strength envelope once and share it between both
    # beat_track calls (the original recomputed it inside the first call).
    onset_env = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)
    tempo, beat_frames = librosa.beat.beat_track(
        onset_envelope=onset_env, sr=sr, hop_length=hop_length, trim=True
    )
    beat_times = librosa.frames_to_time(beat_frames, sr=sr, hop_length=hop_length)
    onset_tempo, _ = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr)
    return {
        # Recent librosa returns tempo as a 1-element ndarray; float() on
        # that is deprecated/failing, so unwrap defensively.
        "tempo_bpm": float(np.atleast_1d(tempo)[0]),
        "beat_count": len(beat_frames),
        "beat_times": beat_times.tolist(),
        "onset_tempo": float(np.atleast_1d(onset_tempo)[0]),
    }
def detect_onsets(
    y: np.ndarray,
    sr: int = 22050,
    hop_length: int = 512,
    backtrack: bool = True,
) -> np.ndarray:
    """
    Locate note/event onsets and return their positions in seconds.

    With backtrack=True each detected onset is rolled back toward the
    preceding energy minimum, which gives cleaner segmentation points.
    """
    frames = librosa.onset.onset_detect(
        y=y,
        sr=sr,
        hop_length=hop_length,
        backtrack=backtrack,
    )
    times = librosa.frames_to_time(frames, sr=sr, hop_length=hop_length)
    return times
def beat_sync_features(
    features: np.ndarray,  # (n_features, n_frames)
    beat_frames: np.ndarray,
    aggregate: callable = np.mean,
) -> np.ndarray:
    """
    Collapse frame-level features into one vector per beat.

    Frames between consecutive beats are reduced with *aggregate*
    (mean by default), producing a (n_features, n_beats) matrix.
    """
    synced = librosa.util.sync(features, beat_frames, aggregate=aggregate)
    return synced
# ── 4. Harmonic and source separation ────────────────────────────────────────
def separate_harmonic_percussive(
    y: np.ndarray,
    margin: float = 1.0,  # Larger = more aggressive separation
) -> tuple[np.ndarray, np.ndarray]:
    """
    Split *y* into (harmonic, percussive) components via HPSS.

    The harmonic part carries tonal content (melody, chords); the
    percussive part carries transients (drums, attacks).
    """
    harmonic, percussive = librosa.effects.hpss(y, margin=margin)
    return harmonic, percussive
def separate_vocal_accompaniment(
    y: np.ndarray,
    sr: int = 22050,
    n_fft: int = 2048,
    hop_length: int = 512,
    n_components: int = 4,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Use NMF to separate vocals from accompaniment (rough heuristic).

    Args:
        y: Audio time series.
        sr: Sample rate (kept for API symmetry; not used in the math).
        n_fft: FFT window size for the STFT.
        hop_length: STFT hop size in samples.
        n_components: Number of NMF basis components.

    Returns:
        (vocal_estimate, accompaniment_estimate) time-domain signals.

    Note:
        Component assignment is a heuristic — the first two NMF
        components are treated as "vocal", the rest as accompaniment.
    """
    # Compute the STFT once and reuse it for magnitude and phase. The
    # original took the phase from a SECOND stft with *default* n_fft and
    # hop_length, which produced mismatched shapes whenever the caller
    # passed non-default values.
    D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length)
    S = np.abs(D)
    phase = np.angle(D)
    components, activations = librosa.decompose.decompose(S, n_components=n_components)
    # Heuristic split: first two components → vocal, remainder → accompaniment.
    vocal_S = components[:, :2] @ activations[:2, :]
    accomp_S = components[:, 2:] @ activations[2:, :]
    # Reconstruct with the mixture phase; istft must use the same
    # hop_length as the forward transform to invert correctly.
    vocal = librosa.istft(vocal_S * np.exp(1j * phase), hop_length=hop_length)
    accomp = librosa.istft(accomp_S * np.exp(1j * phase), hop_length=hop_length)
    return vocal, accomp
# ── 5. Audio effects ──────────────────────────────────────────────────────────
def time_stretch(y: np.ndarray, rate: float) -> np.ndarray:
    """Change playback speed by *rate* without altering pitch.

    rate > 1 shortens the signal (faster); rate < 1 lengthens it (slower).
    """
    stretched = librosa.effects.time_stretch(y, rate=rate)
    return stretched
def pitch_shift(y: np.ndarray, sr: int, n_steps: float) -> np.ndarray:
    """Transpose *y* by *n_steps* semitones (positive = up, negative = down)."""
    shifted = librosa.effects.pitch_shift(y, sr=sr, n_steps=n_steps)
    return shifted
def trim_silence(
    y: np.ndarray,
    threshold: float = 20.0,  # dB below peak considered silence
) -> tuple[np.ndarray, np.ndarray]:
    """Trim leading and trailing silence from *y*.

    Args:
        y: Audio time series.
        threshold: Anything quieter than (peak - threshold) dB at the
            edges is treated as silence.

    Returns:
        (y_trimmed, interval): the trimmed signal and a (2,) ndarray of
        [start, end] sample indices of the retained region within *y*.
        (librosa.effects.trim returns an index array, not a tuple of
        ints — the original annotation was wrong.)
    """
    y_trimmed, interval = librosa.effects.trim(y, top_db=threshold)
    return y_trimmed, interval
# ── 6. Music fingerprint / all-in-one feature vector ────────────────────────
def extract_music_fingerprint(
    audio_path: str,
    sr: int = 22050,
    duration: float = 30.0,  # Analyze first 30 seconds
) -> dict:
    """
    Build a compact feature fingerprint for music similarity search.

    Combines MFCC/chroma summary statistics with tempo and spectral
    summaries, plus a flat "vector" entry ready for classifiers or
    nearest-neighbor indexes.
    """
    y, sr = librosa.load(audio_path, sr=sr, duration=duration, mono=True)

    # Frame-level features
    mfcc = extract_mfcc(y, sr, n_mfcc=20, delta=False)
    chroma = extract_chroma(y, sr, variant="cens")
    rhythm = analyze_rhythm(y, sr)
    spectral = extract_spectral_features(y, sr)

    def stat(x):
        # Summarize a (features, frames) matrix by per-feature mean and std.
        return np.concatenate([x.mean(axis=-1), x.std(axis=-1)])

    fingerprint = {
        "mfcc_stats": stat(mfcc),      # (40,)
        "chroma_stats": stat(chroma),  # (24,)
        "tempo": rhythm["tempo_bpm"],
        "beat_count": rhythm["beat_count"],
        "centroid_mean": spectral["centroid"].mean(),
        "rolloff_mean": spectral["rolloff"].mean(),
        "zcr_mean": spectral["zcr"].mean(),
    }
    scalar_block = np.array([
        fingerprint["tempo"] / 200.0,             # BPM roughly scaled to [0, 1]
        fingerprint["centroid_mean"] / (sr / 2),  # Centroid as fraction of Nyquist
        fingerprint["zcr_mean"],
    ])
    # Flat feature vector for ML
    fingerprint["vector"] = np.concatenate([
        fingerprint["mfcc_stats"],
        fingerprint["chroma_stats"],
        scalar_block,
    ])
    return fingerprint
def compute_similarity(fp1: dict, fp2: dict) -> float:
    """Cosine similarity between two fingerprints' "vector" entries.

    A small epsilon in the denominator guards against zero-norm vectors.
    """
    a = fp1["vector"]
    b = fp2["vector"]
    denom = np.linalg.norm(a) * np.linalg.norm(b) + 1e-8
    return float(np.dot(a, b) / denom)
# ── 7. Visualization ──────────────────────────────────────────────────────────
def save_analysis_plot(
    audio_path: str,
    output_path: str = "analysis.png",
):
    """Render waveform, mel spectrogram, and MFCCs into one PNG figure."""
    import matplotlib
    matplotlib.use("Agg")  # Headless backend — no display required
    import matplotlib.pyplot as plt

    y, sr = librosa.load(audio_path, sr=22050, duration=60.0)
    mel_db = librosa.power_to_db(
        librosa.feature.melspectrogram(y=y, sr=sr), ref=np.max
    )
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)

    fig, (ax_wave, ax_mel, ax_mfcc) = plt.subplots(3, 1, figsize=(14, 9))

    # Waveform
    librosa.display.waveshow(y, sr=sr, ax=ax_wave)
    ax_wave.set_title(f"Waveform — {Path(audio_path).name}")

    # Mel spectrogram with dB colorbar
    mel_img = librosa.display.specshow(
        mel_db, sr=sr, x_axis="time", y_axis="mel", ax=ax_mel
    )
    plt.colorbar(mel_img, ax=ax_mel, format="%+2.0f dB")
    ax_mel.set_title("Mel Spectrogram")

    # MFCC heatmap
    mfcc_img = librosa.display.specshow(mfcc, x_axis="time", ax=ax_mfcc)
    plt.colorbar(mfcc_img, ax=ax_mfcc)
    ax_mfcc.set_title("MFCC (20 coefficients)")

    plt.tight_layout()
    plt.savefig(output_path, dpi=150, bbox_inches="tight")
    plt.close()
    print(f"Saved: {output_path}")
if __name__ == "__main__":
    import soundfile as sf

    # Synthesize a 5-second test tone: 440 Hz fundamental plus 880 Hz octave.
    sr = 22050
    t = np.linspace(0, 5.0, int(sr * 5.0))
    y = 0.5 * np.sin(2 * np.pi * 440 * t) + 0.25 * np.sin(2 * np.pi * 880 * t)
    sf.write("test_tone.wav", y, sr)

    # Feature extraction round-trip
    y_loaded, sr = librosa.load("test_tone.wav", sr=22050)
    mfcc = extract_mfcc(y_loaded, sr, n_mfcc=20)
    mel = extract_mel_spectrogram(y_loaded, sr)
    print(f"MFCC shape: {mfcc.shape}")  # (60, frames) with deltas
    print(f"Mel shape: {mel.shape}")    # (128, frames)

    # Rhythm analysis
    rhythm = analyze_rhythm(y_loaded, sr)
    print(f"Tempo: {rhythm['tempo_bpm']:.1f} BPM, beats: {rhythm['beat_count']}")

    # Fingerprint summary
    fp = extract_music_fingerprint("test_tone.wav")
    print(f"Feature vector shape: {fp['vector'].shape}")
    print("\nLibrosa analysis complete.")
For the TorchAudio alternative when building PyTorch-native deep learning pipelines with GPU-accelerated transforms and SpecAugment data augmentation for training — TorchAudio integrates seamlessly with PyTorch DataLoaders while Librosa’s NumPy-based API with its rich set of MIR features (chroma, beat tracking, onset detection, HPSS separation, tempo estimation) makes it the standard choice for music information retrieval research, audio fingerprinting, and feature engineering where the goal is analysis rather than model training. For the Essentia alternative when needing production-grade music analysis with music-theory–aware tonal features, key detection, and scale recognition used in professional music software — Essentia provides more music-domain depth while Librosa’s simpler API, extensive documentation, and wider Python ecosystem adoption make it easier to integrate into ML pipelines and prototype music classification, recommendation, and cover song detection systems. The Claude Skills 360 bundle includes Librosa skill sets covering MFCC and mel spectrogram extraction, chroma analysis, beat tracking, onset detection, harmonic-percussive separation, NMF decomposition, pitch shifting, and music fingerprinting. Start with the free tier to try music analysis code generation.