SpeechBrain is a PyTorch speech processing toolkit. pip install speechbrain. Speaker verification: from speechbrain.pretrained import SpeakerRecognition, model = SpeakerRecognition.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb"), score, prediction = model.verify_files("spk1.wav", "spk2.wav"). Speaker embedding: model = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb"), embeddings = model.encode_batch(wavs) — 192-dim ECAPA embeddings. Language ID: model = EncoderClassifier.from_hparams(source="speechbrain/lang-id-voxlingua107-ecapa"), out_prob, score, index, text_lab = model.classify_file("audio.wav"). Speech separation: from speechbrain.pretrained import SepformerSeparation, model = SepformerSeparation.from_hparams(source="speechbrain/sepformer-wham"), est_sources = model.separate_file("noisy.wav"). ASR: from speechbrain.pretrained import EncoderDecoderASR, asr = EncoderDecoderASR.from_hparams(source="speechbrain/asr-crdnn-rnnlm-librispeech"), transcript = asr.transcribe_file("audio.wav"). TTS: from speechbrain.pretrained import Tacotron2, HIFIGAN, tacotron = Tacotron2.from_hparams(source="speechbrain/tts-tacotron2-ljspeech"), mel_output, _, _ = tacotron.encode_text("Hello world"), hifigan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech"), waveforms = hifigan.decode_batch(mel_output). Custom training: from speechbrain.core import Brain, subclass with compute_forward, compute_objectives, call brain.fit(epoch_counter, train_set, valid_set). Claude Code generates SpeechBrain speaker verification, speech enhancement, custom training recipes, and audio processing pipelines.
CLAUDE.md for SpeechBrain
## SpeechBrain Stack
- Version: speechbrain >= 1.0 (note: the `speechbrain.pretrained` namespace is deprecated in 1.x in favor of `speechbrain.inference`; both work)
- Pretrained: ModelClass.from_hparams(source="speechbrain/model-name", savedir="./pretrained")
- Speaker: SpeakerRecognition.verify_files(s1, s2) | EncoderClassifier.encode_batch(wavs)
- Lang ID: EncoderClassifier(source="lang-id-*").classify_file(path)
- Separation: SepformerSeparation.separate_file(path) → est_sources tensor
- ASR: EncoderDecoderASR.transcribe_file(path) | transcribe_batch(paths)
- TTS: Tacotron2.encode_text(text) → mel | HIFIGAN.decode_batch(mel) → waveform
- Custom: subclass Brain → compute_forward + compute_objectives → brain.fit(...)
- Audio: torchaudio.load(path) → (waveform, sample_rate)
SpeechBrain Processing Pipeline
# audio/speechbrain_pipeline.py — speech processing with SpeechBrain
from __future__ import annotations
import os
from pathlib import Path
from typing import Optional
import torch
import torchaudio
import numpy as np
# ── 1. Speaker verification and identification ────────────────────────────────
class SpeakerSystem:
    """
    Speaker recognition: verification (same/different person) and
    identification (who is speaking from a set of enrolled speakers).

    Uses ECAPA-TDNN models pretrained on VoxCeleb. `embed()` resamples to
    16 kHz and downmixes to mono automatically.
    """

    def __init__(
        self,
        savedir: str = "./pretrained/speaker",
        device: str = "cpu",
    ):
        # NOTE(review): `speechbrain.pretrained` is the legacy namespace;
        # speechbrain >= 1.0 aliases it to `speechbrain.inference`.
        from speechbrain.pretrained import SpeakerRecognition, EncoderClassifier

        self.device = device
        self.savedir = savedir
        # Verification model: scores a pair of files as same/different speaker.
        self.verifier = SpeakerRecognition.from_hparams(
            source="speechbrain/spkrec-ecapa-voxceleb",
            savedir=os.path.join(savedir, "verification"),
            run_opts={"device": device},
        )
        # Embedding encoder (ECAPA-TDNN, 192-dim embeddings).
        self.encoder = EncoderClassifier.from_hparams(
            source="speechbrain/spkrec-ecapa-voxceleb",
            savedir=os.path.join(savedir, "encoder"),
            run_opts={"device": device},
        )
        # speaker_id -> L2-normalized reference embedding.
        self._enrolled: dict[str, torch.Tensor] = {}

    def verify(self, file1: str, file2: str, threshold: float = 0.25) -> dict:
        """
        Verify if two audio files are from the same speaker.

        Returns a dict with the raw score (cosine-style, in [-1, 1]),
        the decision ``same_speaker`` (score > threshold), and |score|
        as a rough confidence measure.
        """
        score, prediction = self.verifier.verify_files(file1, file2)
        s = float(score.squeeze())
        return {
            "score": s,
            "same_speaker": s > threshold,
            "confidence": abs(s),
        }

    def embed(self, audio_path: str) -> torch.Tensor:
        """Extract 192-dim ECAPA speaker embedding from audio file."""
        signal, sr = torchaudio.load(audio_path)  # (channels, time)
        # Downmix to mono: encode_batch expects a (batch, time) tensor.
        # (BUGFIX: the previous unsqueeze(0) produced an invalid
        # (1, channels, time) shape and never handled stereo input.)
        if signal.shape[0] > 1:
            signal = signal.mean(dim=0, keepdim=True)
        if sr != 16000:
            signal = torchaudio.functional.resample(signal, sr, 16000)
        with torch.no_grad():
            # signal is (1, time): a batch of one mono waveform.
            embedding = self.encoder.encode_batch(signal)
        return embedding.squeeze()  # (192,)

    def enroll(self, speaker_id: str, audio_paths: list[str]):
        """Enroll a speaker by averaging embeddings from multiple recordings."""
        embeddings = [self.embed(p) for p in audio_paths]
        mean_emb = torch.stack(embeddings).mean(dim=0)
        # L2-normalize so identification scores are cosine similarities.
        self._enrolled[speaker_id] = mean_emb / mean_emb.norm()
        print(f"Enrolled speaker '{speaker_id}' from {len(audio_paths)} recordings")

    def identify(self, audio_path: str, threshold: float = 0.3) -> tuple[str, float]:
        """
        Identify the speaker from enrolled speakers.

        Returns (speaker_id, score); ("unknown", score) when the best
        cosine similarity falls below ``threshold``.

        Raises:
            RuntimeError: if no speakers have been enrolled yet.
        """
        if not self._enrolled:
            raise RuntimeError("No speakers enrolled. Call enroll() first.")
        query_emb = self.embed(audio_path)
        query_emb = query_emb / query_emb.norm()
        best_id = "unknown"
        best_score = float("-inf")
        for spk_id, ref_emb in self._enrolled.items():
            # Both vectors are unit-norm, so the dot product is cosine similarity.
            score = float(torch.dot(query_emb, ref_emb))
            if score > best_score:
                best_score = score
                best_id = spk_id
        if best_score < threshold:
            best_id = "unknown"
        return best_id, best_score
# ── 2. Language identification ────────────────────────────────────────────────
class LanguageIdentifier:
    """Identify the language spoken in an audio file (107 languages)."""

    def __init__(self, savedir: str = "./pretrained/langid", device: str = "cpu"):
        from speechbrain.pretrained import EncoderClassifier

        # VoxLingua107 ECAPA model: covers 107 languages.
        self.model = EncoderClassifier.from_hparams(
            source="speechbrain/lang-id-voxlingua107-ecapa",
            savedir=savedir,
            run_opts={"device": device},
        )

    def identify(self, audio_path: str) -> dict:
        """Identify language and return top-5 probabilities."""
        out_prob, score, index, text_lab = self.model.classify_file(audio_path)
        # Normalize the model outputs into a probability distribution,
        # then keep the five most likely languages.
        probs = torch.nn.functional.softmax(out_prob.squeeze(), dim=0)
        topk = torch.topk(probs, 5)
        decoder = self.model.hparams.label_encoder.ind2lab
        top5_langs = {}
        for idx, prob in zip(topk.indices, topk.values):
            top5_langs[decoder[idx.item()]] = float(prob)
        return {
            "language": text_lab[0],
            "score": float(score.squeeze()),
            "top5": top5_langs,
        }
# ── 3. Speech separation and enhancement ─────────────────────────────────────
class SpeechEnhancer:
    """
    Speech separation and noise reduction with SepFormer.

    Separates mixed speech or removes background noise. The WHAM-trained
    model operates at 8 kHz, so separated outputs are saved at 8 kHz.
    """

    def __init__(self, savedir: str = "./pretrained/separation", device: str = "cpu"):
        from speechbrain.pretrained import SepformerSeparation

        # WHAM model: speech + noise separation.
        self.separator = SepformerSeparation.from_hparams(
            source="speechbrain/sepformer-wham",
            savedir=savedir,
            run_opts={"device": device},
        )

    def separate(self, audio_path: str, output_dir: str = "./enhanced") -> list[str]:
        """
        Separate audio into clean speech source(s).

        Returns list of output file paths, one WAV per estimated source.
        """
        Path(output_dir).mkdir(parents=True, exist_ok=True)
        est_sources = self.separator.separate_file(audio_path)
        # est_sources: (batch, time, n_sources) tensor.
        output_paths = []
        stem = Path(audio_path).stem
        for i in range(est_sources.shape[-1]):
            # (1, time) — already channels-first as torchaudio.save expects.
            # (BUGFIX: the previous `.T` transposed this to (time, 1), which
            # torchaudio would interpret as `time` one-sample channels.)
            source = est_sources[:, :, i]
            out_path = os.path.join(output_dir, f"{stem}_source{i+1}.wav")
            torchaudio.save(out_path, source.cpu(), 8000)  # WHAM uses 8kHz
            output_paths.append(out_path)
            print(f"Source {i+1} saved: {out_path}")
        return output_paths

    def enhance_batch(
        self,
        audio_paths: list[str],
        output_dir: str = "./enhanced",
    ) -> list[str]:
        """Batch speech enhancement: run separate() over each input file."""
        all_outputs = []
        for path in audio_paths:
            outputs = self.separate(path, output_dir)
            all_outputs.extend(outputs)
        return all_outputs
# ── 4. Automatic speech recognition ──────────────────────────────────────────
class ASRSystem:
    """Encoder-decoder ASR (CRDNN) with RNN language-model rescoring."""

    def __init__(self, savedir: str = "./pretrained/asr", device: str = "cpu"):
        from speechbrain.pretrained import EncoderDecoderASR

        self.asr = EncoderDecoderASR.from_hparams(
            source="speechbrain/asr-crdnn-rnnlm-librispeech",
            savedir=savedir,
            run_opts={"device": device},
        )

    def transcribe(self, audio_path: str) -> str:
        """Transcribe a single audio file to text."""
        return self.asr.transcribe_file(audio_path)

    def transcribe_batch(self, audio_paths: list[str]) -> list[str]:
        """Transcribe several files one at a time, preserving input order."""
        return list(map(self.asr.transcribe_file, audio_paths))
# ── 5. Text-to-speech ─────────────────────────────────────────────────────────
class TTSSystem:
    """Tacotron2 (text → mel) + HiFi-GAN (mel → waveform) TTS pipeline."""

    def __init__(
        self,
        tts_savedir: str = "./pretrained/tts-tacotron2",
        vocoder_savedir: str = "./pretrained/tts-hifigan",
        device: str = "cpu",
    ):
        from speechbrain.pretrained import Tacotron2, HIFIGAN

        # Acoustic model: text -> mel spectrogram.
        self.tacotron = Tacotron2.from_hparams(
            source="speechbrain/tts-tacotron2-ljspeech",
            savedir=tts_savedir,
            run_opts={"device": device},
        )
        # Vocoder: mel spectrogram -> waveform.
        self.hifigan = HIFIGAN.from_hparams(
            source="speechbrain/tts-hifigan-ljspeech",
            savedir=vocoder_savedir,
            run_opts={"device": device},
        )
        # LJSpeech models operate at 22.05 kHz.
        self.sample_rate = 22050

    def synthesize(self, text: str) -> torch.Tensor:
        """Synthesize speech from text. Returns a (1, time) waveform tensor."""
        with torch.no_grad():
            mel_output, _mel_length, _alignment = self.tacotron.encode_text(text)
            waveforms = self.hifigan.decode_batch(mel_output)
        # decode_batch output carries an extra channel dim; drop it.
        return waveforms.squeeze(1)

    def save_audio(self, text: str, output_path: str = "output.wav") -> str:
        """Synthesize `text` and write the result to a WAV file."""
        waveforms = self.synthesize(text)
        torchaudio.save(output_path, waveforms.cpu(), self.sample_rate)
        print(f"Saved: {output_path} ({waveforms.shape[-1]/self.sample_rate:.1f}s)")
        return output_path

    def batch_synthesize(
        self,
        texts: list[str],
        output_dir: str = "./tts_output",
    ) -> list[str]:
        """Synthesize each text to `utterance_NNNN.wav` under output_dir."""
        Path(output_dir).mkdir(parents=True, exist_ok=True)
        paths = []
        for idx, sentence in enumerate(texts):
            target = os.path.join(output_dir, f"utterance_{idx:04d}.wav")
            self.save_audio(sentence, target)
            paths.append(target)
        return paths
# ── 6. Audio utilities ────────────────────────────────────────────────────────
def load_audio_16k(audio_path: str) -> tuple[torch.Tensor, int]:
    """Load an audio file as 16 kHz mono.

    Returns (waveform, 16000) where waveform has shape (1, time).
    """
    waveform, rate = torchaudio.load(audio_path)
    # Average channels down to a single mono channel when needed.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    # Resample only when the file is not already at 16 kHz.
    if rate != 16000:
        waveform = torchaudio.functional.resample(waveform, rate, 16000)
    return waveform, 16000
def compute_snr(clean: torch.Tensor, noisy: torch.Tensor) -> float:
    """Compute the Signal-to-Noise Ratio in dB.

    SNR = 10 * log10(P_signal / P_noise), where the noise is the residual
    ``noisy - clean``. Returns +inf when the two signals are identical.
    """
    residual = noisy - clean
    p_signal = clean.pow(2).mean()
    p_noise = residual.pow(2).mean()
    if p_noise == 0:
        return float("inf")
    return float(10 * torch.log10(p_signal / p_noise))
# ── Demo ──────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # 1) TTS demo — requires no input audio; produces a WAV we reuse below.
    tts = TTSSystem(device="cpu")
    tts.save_audio(
        "SpeechBrain provides a clean interface for speech processing tasks.",
        "speechbrain_demo.wav",
    )
    print("TTS demo complete: speechbrain_demo.wav")

    # 2) ASR demo — transcribe the synthesized waveform.
    asr = ASRSystem(device="cpu")
    transcript = asr.transcribe("speechbrain_demo.wav")
    print(f"Transcript: {transcript}")

    # 3) Language-ID demo on the same file.
    lang_id = LanguageIdentifier(device="cpu")
    result = lang_id.identify("speechbrain_demo.wav")
    print(f"Language: {result['language']} (score={result['score']:.3f})")
    print(f"Top 5: {result['top5']}")
Consider Whisper as an alternative when you need multilingual transcription with the highest accuracy on challenging real-world audio — Whisper excels at transcription, while SpeechBrain’s speaker recognition (ECAPA-TDNN embeddings, cosine-similarity enrollment), speech separation (SepFormer), and 107-language identification provide capabilities that Whisper doesn’t cover, making SpeechBrain the right toolkit for multi-speaker, speaker-aware, or speech enhancement applications. Consider TorchAudio as an alternative when you need audio augmentations, spectral features, and codec transforms from a thin PyTorch-native library — TorchAudio provides low-level audio operations, while SpeechBrain adds complete pretrained pipelines (ECAPA speaker embeddings, SepFormer separation, Tacotron2+HiFi-GAN TTS) with a uniform from_hparams(source=...) loader and a full training framework for fine-tuning on custom datasets. The Claude Skills 360 bundle includes SpeechBrain skill sets covering speaker verification and enrollment, language identification, speech separation, ASR transcription, TTS synthesis, custom Brain training loops, and audio utility functions. Start with the free tier to try speech processing pipeline generation.