NeMo builds speech and NLP models on NVIDIA hardware. pip install nemo_toolkit[all]. ASR inference: from nemo.collections.asr.models import EncDecCTCModelBPE, asr = EncDecCTCModelBPE.from_pretrained("nvidia/stt_en_conformer_ctc_large"), transcripts = asr.transcribe(["audio.wav"]). Batched inference: asr.transcribe(["audio.wav"], batch_size=8). Fine-tune ASR: prepare manifest — {"audio_filepath": "path.wav", "duration": 3.2, "text": "hello world"}. Train config via Hydra YAML: trainer.max_epochs=50, model.train_ds.manifest_filepath=train.json. python train.py model=conformer_ctc_bpe model.train_ds.manifest_filepath=train.json. TTS inference: from nemo.collections.tts.models import FastPitchModel, HifiGanModel, spec = FastPitchModel.from_pretrained("nvidia/tts_en_fastpitch"), vocoder = HifiGanModel.from_pretrained("nvidia/tts_hifigan"), parsed = spec.parse("Hello world"), spectrogram = spec.generate_spectrogram(tokens=parsed), audio = vocoder.convert_spectrogram_to_audio(spec=spectrogram). NLP: from nemo.collections.nlp.models.language_modeling import MegatronGPTModel, load pretrained or train from scratch with tensor/pipeline parallelism. Custom model: subclass nemo.core.ModelPT (PyTorch Lightning module with NeMo features), implement training_step/validation_step, use self.register_artifact to bundle auxiliary files (e.g. tokenizers) into the saved .nemo archive alongside the config. exp_manager: from nemo.utils import exp_manager, exp_manager(trainer, cfg.get("exp_manager", None)) — handles checkpointing, W&B, TensorBoard logging. Config: @hydra_runner(config_path="conf", config_name="config"), access via OmegaConf.to_container(cfg). CTC decode: BeamCTCInfer(beam_size=128, return_best_hypothesis=True) with n-gram LM. Claude Code generates NeMo ASR training configs, TTS pipelines, custom model subclasses, manifest creation scripts, and Hydra experiment configs.
CLAUDE.md for NeMo
## NeMo Stack
- Version: nemo_toolkit >= 1.23 + pytorch >= 2.0 + lightning >= 2.0
- ASR: EncDecCTCModelBPE.from_pretrained("nvidia/...") → transcribe(["audio.wav"])
- TTS: FastPitchModel.from_pretrained + HifiGanModel → parse → generate_spectrogram → audio
- Manifest: {"audio_filepath": str, "duration": float, "text": str} — one JSON per line
- Config: Hydra YAML — trainer.max_epochs, model.train_ds.manifest_filepath, etc.
- exp_manager: exp_manager(trainer, cfg.exp_manager) handles checkpoints + logging
- Custom: subclass nemo.core.ModelPT → training_step + validation_step + register_artifact
ASR Fine-Tuning and TTS Pipeline
# nemo_pipeline/asr_finetune.py — NeMo ASR fine-tuning and TTS inference
from __future__ import annotations
import json
import os
import subprocess
from pathlib import Path
import torch
# ── 1. Data manifest creation ─────────────────────────────────────────────────
def create_manifest(
    audio_dir: str,
    transcripts: dict[str, str],  # {filename: transcript}
    output_path: str,
) -> str:
    """
    Create a NeMo data manifest JSONL file.

    NeMo manifest format: one JSON object per line with the keys
    audio_filepath, duration, text.

    Args:
        audio_dir: Directory containing the audio files.
        transcripts: Mapping of audio filename -> transcript text.
        output_path: Destination path for the manifest JSONL.

    Returns:
        output_path, for chaining.
    """
    import librosa

    entries: list[dict] = []
    audio_dir_path = Path(audio_dir)
    for filename, text in transcripts.items():
        audio_path = audio_dir_path / filename
        if not audio_path.exists():
            print(f"Warning: {audio_path} not found, skipping")
            continue
        # Get duration without loading full audio
        try:
            duration = librosa.get_duration(path=str(audio_path))
        except Exception as exc:
            # Keep the sample, but surface the failure instead of silently
            # recording a bogus duration of 0.0.
            print(f"Warning: could not read duration of {audio_path}: {exc}")
            duration = 0.0
        entries.append({
            "audio_filepath": str(audio_path.resolve()),
            "duration": round(duration, 4),
            # NeMo CTC models are typically trained on lowercased text.
            "text": text.lower().strip(),
        })
    with open(output_path, "w") as f:
        for entry in entries:
            f.write(json.dumps(entry) + "\n")
    print(f"Manifest written: {output_path} ({len(entries)} entries)")
    return output_path
def split_manifest(
    manifest_path: str,
    train_ratio: float = 0.9,
    seed: int = 42,
) -> tuple[str, str]:
    """
    Split a JSONL manifest into train/val sets.

    Args:
        manifest_path: Path to the source manifest (one JSON object per line).
        train_ratio: Fraction of samples assigned to the train split.
        seed: RNG seed so the split is reproducible.

    Returns:
        (train_path, val_path) of the two written manifests.
    """
    import random

    random.seed(seed)
    with open(manifest_path) as f:
        # Normalize every record to end in exactly one newline and drop blank
        # lines; otherwise a final line without "\n" fuses with its neighbour
        # after shuffling, corrupting the output manifest.
        lines = [line.rstrip("\n") + "\n" for line in f if line.strip()]
    random.shuffle(lines)
    split = int(len(lines) * train_ratio)
    # Strip only a *trailing* ".json"; str.replace would also remove a
    # ".json" occurring in the middle of the path.
    base = manifest_path.removesuffix(".json")
    train_path = f"{base}_train.json"
    val_path = f"{base}_val.json"
    with open(train_path, "w") as f:
        f.writelines(lines[:split])
    with open(val_path, "w") as f:
        f.writelines(lines[split:])
    print(f"Train: {train_path} ({split} samples)")
    print(f"Val: {val_path} ({len(lines) - split} samples)")
    return train_path, val_path
# ── 2. ASR inference ──────────────────────────────────────────────────────────
class ASRInferencer:
    """Thin wrapper around NeMo CTC ASR models with batched transcription."""

    def __init__(
        self,
        model_name: str = "nvidia/stt_en_conformer_ctc_large",
        device: str = "cuda",
    ):
        from nemo.collections.asr.models import EncDecCTCModelBPE

        # Download/load the pretrained checkpoint, move it to the target
        # device, and switch to inference mode.
        loaded = EncDecCTCModelBPE.from_pretrained(model_name)
        self.model = loaded.to(device)
        self.model.eval()
        print(f"Loaded ASR model: {model_name}")

    def transcribe(
        self,
        audio_paths: list[str],
        batch_size: int = 8,
    ) -> list[str]:
        """Transcribe a list of audio files, one transcript per input path."""
        with torch.no_grad():
            return self.model.transcribe(
                audio_paths,
                batch_size=batch_size,
                verbose=False,
            )

    def transcribe_with_timestamps(
        self,
        audio_path: str,
    ) -> dict:
        """Transcribe a single file and expose the hypothesis timesteps.

        NOTE(review): assumes the model's decoding produces a hypothesis
        object exposing .text and .timestep — confirm for the loaded model.
        """
        hypothesis = self.model.transcribe(
            [audio_path],
            return_hypotheses=True,
        )[0]
        return {"text": hypothesis.text, "timesteps": hypothesis.timestep}

    @classmethod
    def from_checkpoint(cls, checkpoint_path: str, device: str = "cuda") -> "ASRInferencer":
        """Load a fine-tuned model from a saved .nemo checkpoint."""
        from nemo.collections.asr.models import EncDecCTCModelBPE

        # Bypass __init__ so we don't download a pretrained model first.
        instance = cls.__new__(cls)
        instance.model = EncDecCTCModelBPE.restore_from(checkpoint_path).to(device)
        instance.model.eval()
        return instance
# ── 3. TTS pipeline ───────────────────────────────────────────────────────────
class TTSPipeline:
    """FastPitch + HiFiGAN text-to-speech pipeline."""

    def __init__(
        self,
        spec_model: str = "nvidia/tts_en_fastpitch",
        vocoder_model: str = "nvidia/tts_hifigan",
        device: str = "cuda",
    ):
        from nemo.collections.tts.models import FastPitchModel, HifiGanModel

        self.spec_model = FastPitchModel.from_pretrained(spec_model).to(device)
        self.vocoder = HifiGanModel.from_pretrained(vocoder_model).to(device)
        self.spec_model.eval()
        self.vocoder.eval()
        # nvidia/tts_en_fastpitch + tts_hifigan operate at 22.05 kHz.
        self.sample_rate = 22050
        print("TTS pipeline loaded")

    def synthesize(
        self,
        text: str,
        pace: float = 1.0,
        pitch_shift: float = 0.0,
    ) -> "np.ndarray":
        """Convert text to a waveform (float32 numpy array).

        Args:
            text: Input text to synthesize.
            pace: Speaking-rate multiplier passed to FastPitch.
            pitch_shift: Kept for backward compatibility; FastPitch's
                generate_spectrogram API exposes no pitch-scaling kwarg,
                so a nonzero value only emits a warning.
        """
        with torch.no_grad():
            parsed = self.spec_model.parse(text)
            # FastPitchModel.generate_spectrogram returns a single
            # spectrogram tensor, not a tuple — the previous
            # `spectrogram, _, _ = ...` unpacking was incorrect, and the
            # `pitch_contour_factor` kwarg does not exist in the API.
            spectrogram = self.spec_model.generate_spectrogram(
                tokens=parsed,
                pace=pace,
            )
            if pitch_shift != 0.0:
                print("Warning: pitch_shift is not supported by FastPitchModel.generate_spectrogram; ignoring")
            audio = self.vocoder.convert_spectrogram_to_audio(spec=spectrogram)
        return audio.squeeze().cpu().numpy()

    def save_wav(self, text: str, output_path: str) -> None:
        """Synthesize and save to WAV file."""
        import soundfile as sf
        audio = self.synthesize(text)
        sf.write(output_path, audio, self.sample_rate)
        print(f"Audio saved: {output_path} ({len(audio)/self.sample_rate:.2f}s)")
# ── 4. Custom NeMo model subclass ────────────────────────────────────────────
def example_custom_model():
    """
    Build and return a minimal NeMo custom model class.

    ModelPT inherits from LightningModule and layers NeMo utilities on top
    (config handling, artifact registration, .nemo save/restore).
    """
    from nemo.core import ModelPT
    from omegaconf import DictConfig

    class CustomClassifier(ModelPT):
        def __init__(self, cfg: DictConfig, trainer=None):
            super().__init__(cfg, trainer)
            import torch.nn as nn

            # Two-layer classifier: linear encoder + linear head.
            self.encoder = nn.Linear(cfg.input_dim, cfg.hidden_dim)
            self.head = nn.Linear(cfg.hidden_dim, cfg.num_classes)
            self.loss = nn.CrossEntropyLoss()

        def forward(self, x):
            hidden = torch.relu(self.encoder(x))
            return self.head(hidden)

        def _step(self, batch):
            # Shared forward + loss used by both train and validation steps.
            features, labels = batch
            return self.loss(self(features), labels)

        def training_step(self, batch, batch_idx):
            loss = self._step(batch)
            self.log("train_loss", loss, prog_bar=True)
            return loss

        def validation_step(self, batch, batch_idx):
            loss = self._step(batch)
            self.log("val_loss", loss, prog_bar=True)

        def setup_training_data(self, train_data_config):
            pass  # Return a dataloader

        def setup_validation_data(self, val_data_config):
            pass  # Return a dataloader

        @classmethod
        def list_available_models(cls):
            return []

    return CustomClassifier
# ── 5. Generate NeMo ASR training config ─────────────────────────────────────
def generate_asr_config(
    train_manifest: str,
    val_manifest: str,
    output_dir: str = "experiments/asr",
    max_epochs: int = 50,
    config_path: str = "conf/asr_finetune.yaml",
) -> str:
    """
    Generate a Hydra-compatible ASR fine-tuning YAML config.

    Args:
        train_manifest: Path to the training manifest JSONL.
        val_manifest: Path to the validation manifest JSONL.
        output_dir: exp_manager experiment directory written into the config.
        max_epochs: Trainer epoch budget.
        config_path: Where to write the YAML (parent dirs are created).

    Returns:
        The path of the written config file.
    """
    config = f"""\
name: conformer_ctc_finetune
model:
  pretrained_model: nvidia/stt_en_conformer_ctc_large
  train_ds:
    manifest_filepath: {train_manifest}
    sample_rate: 16000
    batch_size: 16
    shuffle: true
    num_workers: 4
    trim_silence: true
    max_duration: 20.0
  validation_ds:
    manifest_filepath: {val_manifest}
    sample_rate: 16000
    batch_size: 16
    shuffle: false
    num_workers: 4
  optim:
    name: adamw
    lr: 5e-5
    weight_decay: 1e-3
    sched:
      name: CosineAnnealing
      warmup_steps: 500
      min_lr: 1e-6
trainer:
  devices: 1
  accelerator: gpu
  strategy: auto
  max_epochs: {max_epochs}
  gradient_clip_val: 1.0
  log_every_n_steps: 10
  val_check_interval: 0.25
  precision: 16-mixed
exp_manager:
  exp_dir: {output_dir}
  name: asr_finetune
  create_tensorboard_logger: true
  create_wandb_logger: false
  checkpoint_callback_params:
    monitor: val_wer
    mode: min
    save_top_k: 3
"""
    out = Path(config_path)
    # Create the target directory instead of assuming ./conf exists.
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(config)
    print(f"Config saved: {out}")
    # `python -m` takes a module name (no .py suffix), and NeMo ships its
    # training entry points as example scripts, not importable modules.
    print(
        f"Train with: python <NeMo>/examples/asr/asr_ctc/speech_to_text_ctc_bpe.py "
        f"--config-path {out.parent} --config-name {out.stem}"
    )
    return str(out)
For the OpenAI Whisper alternative when needing a simple, battle-tested ASR model that runs anywhere without NVIDIA-specific tooling — Whisper’s simplicity and broad language support make it the default choice for most transcription tasks, while NeMo’s Conformer CTC/RNNT models achieve lower word error rates on English and domain-specific speech, offer streaming ASR for real-time applications, and integrate tightly with NVIDIA’s Riva production serving stack. For the Coqui TTS alternative when needing open-source voices with fine-tuning for custom voices without NVIDIA hardware — Coqui handles CPU inference, while NeMo’s FastPitch + HiFiGAN pipeline is specifically optimized for NVIDIA GPU throughput and integrates with the NeMo training framework for custom voice cloning on GPU clusters. The Claude Skills 360 bundle includes NeMo skill sets covering ASR inference and fine-tuning, TTS synthesis, manifest creation, Hydra configs, custom ModelPT subclasses, and exp_manager experiment tracking. Start with the free tier to try speech model code generation.