Whisper transcribes audio in 99 languages with high accuracy. pip install openai-whisper. import whisper. model = whisper.load_model("base") — sizes: tiny (39M), base (74M), small (244M), medium (769M), large-v3 (1.5B). Transcribe: result = model.transcribe("audio.mp3"), result["text"] — full transcript. Language: result = model.transcribe("audio.mp3", language="fr"). Translate to English: result = model.transcribe("audio.mp3", task="translate"). Word timestamps: result = model.transcribe("audio.mp3", word_timestamps=True), result["segments"][0]["words"] — list with {word, start, end, probability}. Segment-level: result["segments"] — list with {id, start, end, text}. Decoding options: options = whisper.DecodingOptions(language="en", without_timestamps=False, beam_size=5, best_of=5, temperature=0.0). Initial prompt: model.transcribe(audio, initial_prompt="This is a medical lecture about cardiology.") — guides vocabulary. Long audio: whisper.audio.load_audio("file.mp3") → whisper.audio.pad_or_trim(audio). Detect language: mel = whisper.log_mel_spectrogram(audio), probs = model.detect_language(mel)[1], max(probs, key=probs.get). faster-whisper: from faster_whisper import WhisperModel, model = WhisperModel("large-v3", device="cpu", compute_type="int8"), segments, info = model.transcribe("audio.mp3", beam_size=5, word_timestamps=True). OpenAI API: from openai import OpenAI, client.audio.transcriptions.create(model="whisper-1", file=open("audio.mp3","rb"), response_format="verbose_json"). Claude Code generates Whisper transcription pipelines, batch processors, subtitle exporters, and speaker-aware transcription scripts.
CLAUDE.md for Whisper
## Whisper Stack
- Version: openai-whisper >= 20231117 | faster-whisper >= 1.0
- Load: whisper.load_model("base" | "small" | "medium" | "large-v3")
- Transcribe: model.transcribe(audio_path, language=, task="transcribe"|"translate")
- Timestamps: word_timestamps=True → result["segments"][i]["words"][j] with start/end/prob
- Prompt: initial_prompt="domain vocab" to guide recognition
- FasterWhisper: WhisperModel(size, device, compute_type="int8") → transcribe(path, beam_size)
- OpenAI API: client.audio.transcriptions.create(model="whisper-1", file=..., response_format)
- Formats: response_format="json"|"text"|"srt"|"vtt"|"verbose_json"
Whisper Transcription Pipeline
# audio/whisper_pipeline.py — speech recognition with OpenAI Whisper
from __future__ import annotations
import os
import json
import time
from pathlib import Path
from typing import Optional
# ── 1. Model loading ──────────────────────────────────────────────────────────
def load_whisper_model(
    size: str = "base",   # tiny | base | small | medium | large-v3
    device: str = "cpu",  # cpu | cuda
):
    """Load an openai-whisper checkpoint; larger sizes trade speed for accuracy."""
    import whisper

    print(f"Loading whisper-{size} on {device}...")
    model = whisper.load_model(size, device=device)
    n_params = sum(p.numel() for p in model.parameters())
    print(f"Model ready: {n_params:,} params")
    return model
def load_faster_whisper(
    size: str = "base",
    device: str = "cpu",
    compute_type: str = "int8",  # int8 | float16 | float32 | int8_float16
):
    """
    Build a faster-whisper model (CTranslate2 backend).

    Runs roughly 2-4x quicker than the reference implementation; int8
    quantization makes CPU inference practical.
    """
    from faster_whisper import WhisperModel

    threads = os.cpu_count() or 4
    model = WhisperModel(
        size,
        device=device,
        compute_type=compute_type,
        cpu_threads=threads,
        num_workers=2,
    )
    print(f"faster-whisper-{size} ({compute_type}) ready")
    return model
# ── 2. Basic transcription ────────────────────────────────────────────────────
def transcribe(
    model,
    audio_path: str,
    language: Optional[str] = None,       # None = auto-detect
    task: str = "transcribe",             # "transcribe" | "translate"
    beam_size: int = 5,
    temperature: float = 0.0,
    initial_prompt: Optional[str] = None,
    word_timestamps: bool = False,
) -> dict:
    """
    Transcribe an audio file; returns {"text", "segments", "language"}.

    Works with both openai-whisper and faster-whisper models.

    Fix: the old detection `not hasattr(model, "transcribe")` was always
    False because BOTH libraries expose a .transcribe() method, so
    faster-whisper models went down the openai-whisper path and a raw
    (segments, info) tuple was returned instead of a dict. We now dispatch
    on the module that defines the model's class. The unused
    `import whisper as _whisper` was also removed.
    """
    is_faster = type(model).__module__.split(".", 1)[0] == "faster_whisper"
    if is_faster:
        # faster-whisper returns a lazy segment generator plus an info object.
        segments, info = model.transcribe(
            audio_path,
            language=language,
            task=task,
            beam_size=beam_size,
            temperature=temperature,
            initial_prompt=initial_prompt,
            word_timestamps=word_timestamps,
            vad_filter=True,  # voice-activity filter skips long silences
            vad_parameters={"min_silence_duration_ms": 300},
        )
        segment_list = []
        full_text = []
        for seg in segments:  # consuming the generator performs the decode
            s = {"id": seg.id, "start": seg.start, "end": seg.end, "text": seg.text.strip()}
            if word_timestamps and seg.words:
                s["words"] = [
                    {"word": w.word, "start": w.start, "end": w.end, "probability": w.probability}
                    for w in seg.words
                ]
            segment_list.append(s)
            full_text.append(seg.text.strip())
        return {
            "text": " ".join(full_text),
            "language": info.language,
            "segments": segment_list,
        }
    # openai-whisper already returns the dict shape we want — pass through.
    return model.transcribe(
        audio_path,
        language=language,
        task=task,
        beam_size=beam_size,
        temperature=temperature,
        initial_prompt=initial_prompt,
        word_timestamps=word_timestamps,
    )
def detect_language(model, audio_path: str) -> tuple[str, dict]:
    """
    Detect the spoken language from the start of the audio.

    Returns (top_language_code, top5) where top5 maps language code →
    probability, sorted descending.

    Fix: removed the unused `import numpy as np`.
    """
    import whisper

    audio = whisper.load_audio(audio_path)
    audio = whisper.pad_or_trim(audio)  # model expects a fixed-length window
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    _, probs = model.detect_language(mel)
    top_lang = max(probs, key=probs.get)
    top5 = dict(sorted(probs.items(), key=lambda kv: kv[1], reverse=True)[:5])
    return top_lang, top5
# ── 3. Subtitle generation ────────────────────────────────────────────────────
def format_timestamp(seconds: float, srt: bool = False) -> str:
    """
    Format seconds as an SRT ("00:00:00,000") or VTT ("00:00:00.000") timestamp.

    Fix: derive all fields from one rounded millisecond total. The old
    per-field float arithmetic truncated instead of rounding, so e.g.
    1.9996 s rendered as "...:01,999" rather than "...:02,000". Negative
    inputs (possible from upstream alignment jitter) clamp to zero.
    """
    total_ms = max(0, round(seconds * 1000))
    h, rest = divmod(total_ms, 3_600_000)
    m, rest = divmod(rest, 60_000)
    s, ms = divmod(rest, 1000)
    sep = "," if srt else "."
    return f"{h:02d}:{m:02d}:{s:02d}{sep}{ms:03d}"


def to_srt(result: dict) -> str:
    """Convert a Whisper result dict to SRT subtitle text (1-based cue index)."""
    cues = []
    for i, seg in enumerate(result["segments"], 1):
        start = format_timestamp(seg["start"], srt=True)
        end = format_timestamp(seg["end"], srt=True)
        cues.append(f"{i}\n{start} --> {end}\n{seg['text'].strip()}\n")
    return "\n".join(cues)


def to_vtt(result: dict) -> str:
    """Convert a Whisper result dict to WebVTT subtitle text."""
    cues = ["WEBVTT\n"]  # mandatory WebVTT header line
    for seg in result["segments"]:
        start = format_timestamp(seg["start"], srt=False)
        end = format_timestamp(seg["end"], srt=False)
        cues.append(f"{start} --> {end}\n{seg['text'].strip()}\n")
    return "\n".join(cues)
def save_subtitles(result: dict, base_path: str):
    """Write both <base_path>.srt and <base_path>.vtt from a Whisper result."""
    srt_path = f"{base_path}.srt"
    vtt_path = f"{base_path}.vtt"
    Path(srt_path).write_text(to_srt(result), encoding="utf-8")
    Path(vtt_path).write_text(to_vtt(result), encoding="utf-8")
    print(f"Saved: {srt_path}, {vtt_path}")
# ── 4. Batch audio processing ─────────────────────────────────────────────────
def batch_transcribe(
    model,
    audio_files: list[str],
    output_dir: str = "./transcripts",
    language: Optional[str] = None,  # fix: annotation was `str = None`
    format: str = "json",            # "json" | "txt" | "srt"
) -> list[dict]:
    """
    Transcribe multiple audio files and persist each result.

    model: openai-whisper or faster-whisper model (see transcribe()).
    audio_files: paths to input audio, processed in order.
    output_dir: created if missing; one output file per input stem.
    language: forced language code, or None for auto-detect.
    format: "json" (full result), "txt" (plain text), "srt" (SRT+VTT pair).
    Returns the list of result dicts in input order.
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    results = []
    for i, audio_file in enumerate(audio_files):
        stem = Path(audio_file).stem
        print(f"[{i+1}/{len(audio_files)}] Transcribing: {audio_file}...")
        t0 = time.perf_counter()
        result = transcribe(model, audio_file, language=language)
        elapsed = time.perf_counter() - t0
        print(f" Done in {elapsed:.1f}s: {result['language']} — {len(result['text'].split())} words")
        if format == "json":
            out_file = output_path / f"{stem}.json"
            out_file.write_text(json.dumps(result, ensure_ascii=False, indent=2), encoding="utf-8")
        elif format == "txt":
            out_file = output_path / f"{stem}.txt"
            out_file.write_text(result["text"], encoding="utf-8")
        elif format == "srt":
            save_subtitles(result, str(output_path / stem))
        results.append(result)
    print(f"\nCompleted {len(results)} files → {output_dir}")
    return results
# ── 5. Domain-specific transcription ─────────────────────────────────────────
# Domain → initial_prompt text handed to Whisper. The prompt biases decoding
# toward the vocabulary it mentions (see transcribe(initial_prompt=...)).
DOMAIN_PROMPTS = {
    "medical": "This is a medical consultation discussing symptoms, diagnoses, and treatments.",
    "legal": "This is a legal deposition with formal legal terminology and case discussions.",
    "technical": "This is a software engineering discussion about APIs, microservices, and machine learning.",
    "finance": "This is a financial earnings call discussing revenue, EBITDA, and market outlook.",
    "podcast": "This is a podcast interview with casual conversation.",
}
def transcribe_domain(
    model,
    audio_path: str,
    domain: str = "technical",
    language: str = "en",
) -> dict:
    """Transcribe with a domain vocabulary prompt picked from DOMAIN_PROMPTS."""
    # Unknown domains fall back to an empty prompt rather than raising.
    return transcribe(
        model,
        audio_path,
        language=language,
        initial_prompt=DOMAIN_PROMPTS.get(domain, ""),
        temperature=0.0,
        beam_size=5,
        word_timestamps=True,
    )
# ── 6. OpenAI Whisper API ─────────────────────────────────────────────────────
def transcribe_via_api(
    audio_path: str,
    language: Optional[str] = None,
    response_format: str = "verbose_json",           # json | text | srt | vtt | verbose_json
    timestamp_granularities: Optional[list] = None,  # ["word"] | ["segment"]
) -> dict | str:
    """
    Transcribe through the hosted OpenAI Whisper API — no local GPU needed.
    Costs ~$0.006/minute of audio.

    Fix: the audio file handle was opened inline (`file=open(...)`) and never
    closed; it is now managed with a `with` block. Also fixed the
    `str = None` / `list = None` annotations.

    Returns a parsed dict for json/verbose_json formats, otherwise the raw
    string body (text, srt, vtt).
    """
    from openai import OpenAI
    client = OpenAI()
    with open(audio_path, "rb") as audio_file:
        params = dict(
            model="whisper-1",
            file=audio_file,
            response_format=response_format,
        )
        if language:
            params["language"] = language
        if timestamp_granularities:
            params["timestamp_granularities"] = timestamp_granularities
        result = client.audio.transcriptions.create(**params)
    if response_format in ("json", "verbose_json"):
        return result.model_dump()
    return result  # text, srt, vtt as string
def translate_via_api(audio_path: str) -> str:
    """
    Translate audio in any language → English text via the Whisper API.

    Fix: the audio file handle was opened inline and never closed; it is
    now managed with a `with` block.
    """
    from openai import OpenAI
    client = OpenAI()
    with open(audio_path, "rb") as audio_file:
        result = client.audio.translations.create(
            model="whisper-1",
            file=audio_file,
            response_format="text",
        )
    return result
# ── 7. WhisperX with speaker diarization ─────────────────────────────────────
def transcribe_with_diarization(
    audio_path: str,
    model_size: str = "base",
    device: str = "cpu",
    hf_token: str = None,      # Required for pyannote diarization models
    num_speakers: int = None,  # None = auto-detect
    language: str = None,
) -> dict:
    """
    WhisperX pipeline: transcription → word alignment → speaker diarization.
    pip install whisperx
    Segments carry speaker labels, e.g.
    {"start": 0.0, "end": 2.3, "text": "...", "speaker": "SPEAKER_00"}
    """
    import whisperx

    # Step 1 — transcribe with the batched WhisperX front-end.
    asr_model = whisperx.load_model(model_size, device, compute_type="int8")
    waveform = whisperx.load_audio(audio_path)
    result = asr_model.transcribe(waveform, batch_size=16, language=language)

    # Step 2 — refine segment times to word-level with an alignment model.
    align_model, align_meta = whisperx.load_align_model(
        language_code=result["language"],
        device=device,
    )
    result = whisperx.align(
        result["segments"], align_model, align_meta, waveform, device,
        return_char_alignments=False,
    )

    # Step 3 — optional diarization (HF token needed for pyannote weights).
    if hf_token:
        diarizer = whisperx.DiarizationPipeline(use_auth_token=hf_token, device=device)
        speaker_turns = diarizer(waveform, num_speakers=num_speakers)
        result = whisperx.assign_word_speakers(speaker_turns, result)
    return result
# ── Demo ──────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Demo with faster-whisper — the cheapest way to run on CPU.
    model = load_faster_whisper("base", device="cpu", compute_type="int8")

    # Transcribe (replace "audio.mp3" with a real file)
    result = transcribe(model, "audio.mp3", word_timestamps=True)
    print(f"Language: {result['language']}")
    print(f"Transcript: {result['text'][:200]}...")

    segments = result["segments"]
    if segments:
        head = segments[0]
        print(f"\nFirst segment: [{head['start']:.2f}s - {head['end']:.2f}s]")
        print(f" Text: {head['text']}")
        if "words" in head:
            for w in head["words"][:5]:
                print(f" [{w['start']:.2f}-{w['end']:.2f}] {w['word']} (p={w['probability']:.2f})")

    # Subtitle preview
    srt = to_srt(result)
    print(f"\nSRT preview:\n{srt[:200]}")
For the AssemblyAI API alternative when needing production-grade cloud transcription with speaker diarization, auto-chapters, entity detection, and sentiment analysis in a single managed API — AssemblyAI bundles multiple post-processing features while Whisper runs completely on your own hardware with no per-minute cost, no data leaving your infrastructure, and full control over the model size and accuracy/speed trade-off. For the Google Speech-to-Text alternative when needing real-time streaming transcription of telephony audio with automatic punctuation and speaker tagging via managed cloud infrastructure — Google STT offers streaming with low first-byte latency while Whisper’s batch transcription achieves higher accuracy on challenging audio (accents, noise, domain vocabulary) and its initial_prompt parameter provides domain vocabulary guidance without any API configuration. The Claude Skills 360 bundle includes Whisper skill sets covering model loading, transcription with word timestamps, subtitle SRT/VTT generation, domain-specific prompting, batch audio processing, OpenAI Whisper API, faster-whisper CPU acceleration, and WhisperX speaker diarization. Start with the free tier to try speech recognition pipeline generation.