AudioCraft generates music and audio from text descriptions. pip install audiocraft. from audiocraft.models import MusicGen. Load: model = MusicGen.get_pretrained("facebook/musicgen-small") — sizes: small (300M), medium (1.5B), large (3.3B), melody (1.5B), stereo variants. Set params: model.set_generation_params(duration=15, temperature=1.0, top_k=250, cfg_coef=3.0). Generate: wav = model.generate(["upbeat jazz piano with walking bass"]) — returns (batch, channels, samples) tensor. Multiple: wavs = model.generate(["classical piano", "electronic dance music", "acoustic guitar folk"]). Save: from audiocraft.data.audio import audio_write; audio_write("output", wav[0].cpu(), model.sample_rate, strategy="loudness"). Melody conditioning: melody_waveform, sr = torchaudio.load("melody.wav"), model = MusicGen.get_pretrained("facebook/musicgen-melody"), wavs = model.generate_with_chroma(["orchestral version"], melody_waveform[None], sr). Continuation: model.generate_continuation(prompt_waveform, prompt_sample_rate, ["continues with bridge section"]). AudioGen: from audiocraft.models import AudioGen, model = AudioGen.get_pretrained("facebook/audiogen-medium"), model.set_generation_params(duration=5), wav = model.generate(["dog barking", "thunder storm", "office keyboard typing"]). EnCodec compression: from audiocraft.models import EncodecModel, codec = EncodecModel.get_pretrained("facebook/encodec_24khz"), codec.set_target_bandwidth(6.0). Stereo: model = MusicGen.get_pretrained("facebook/musicgen-stereo-medium") — returns 2-channel audio. Claude Code generates AudioCraft music generation scripts, batch audio pipelines, melody-conditioned generation, and sound effect synthesis code.
CLAUDE.md for AudioCraft
## AudioCraft Stack
- Version: audiocraft >= 1.3
- MusicGen: MusicGen.get_pretrained("facebook/musicgen-small|medium|large|melody|stereo-*")
- AudioGen: AudioGen.get_pretrained("facebook/audiogen-medium")
- Params: model.set_generation_params(duration, temperature, top_k, cfg_coef)
- Generate: model.generate(["text prompt 1", "text prompt 2"]) → (B, C, T) tensor
- Melody: model.generate_with_chroma(descriptions, melody_wavs, melody_sr)
- Continue: model.generate_continuation(prompt_wav, prompt_sr, descriptions)
- Save: audio_write(stem, wav.cpu(), model.sample_rate, strategy="loudness")
- EnCodec: EncodecModel.get_pretrained("facebook/encodec_24khz") for compression
AudioCraft Generation Pipeline
# audio/audiocraft_pipeline.py — AI music and audio generation with AudioCraft
from __future__ import annotations
import os
from pathlib import Path
from typing import Optional
import torch
import torchaudio
from audiocraft.utils.audio import audio_write
# ── 1. Model loading ──────────────────────────────────────────────────────────
def load_musicgen(
    model_size: str = "small",  # small | medium | large | melody | stereo-small | stereo-medium
    device: str = "cuda" if torch.cuda.is_available() else "cpu",
):
    """
    Load a MusicGen model on the requested device (GPU auto-detected).

    Memory: small=~2GB, medium=~5GB, large=~10GB.

    Returns the loaded MusicGen wrapper, ready for set_generation_params /
    generate calls.
    """
    from audiocraft.models import MusicGen
    model_name = f"facebook/musicgen-{model_size}"
    print(f"Loading {model_name} on {device}...")
    # NOTE(review): MusicGen is a plain wrapper, not an nn.Module — it has
    # no .to() method in audiocraft 1.x. The device must be passed to
    # get_pretrained directly; the previous model.to(device) call raised
    # AttributeError.
    model = MusicGen.get_pretrained(model_name, device=device)
    print(f"MusicGen-{model_size} ready | sample_rate={model.sample_rate}Hz")
    return model
def load_audiogen(
    model_size: str = "medium",  # medium is the only released size
    device: str = "cuda" if torch.cuda.is_available() else "cpu",
):
    """
    Load AudioGen for sound-effects generation on the requested device.

    Returns the loaded AudioGen wrapper.
    """
    from audiocraft.models import AudioGen
    # NOTE(review): like MusicGen, AudioGen exposes no .to() method; pass
    # the device to get_pretrained (the previous model.to(device) call
    # raised AttributeError).
    model = AudioGen.get_pretrained(f"facebook/audiogen-{model_size}", device=device)
    print(f"AudioGen-{model_size} ready")
    return model
# ── 2. Basic music generation ─────────────────────────────────────────────────
def generate_music(
    model,
    descriptions: list[str],
    duration: float = 15.0,  # seconds
    temperature: float = 1.0,
    top_k: int = 250,
    cfg_coef: float = 3.0,  # Classifier-free guidance scale
    output_dir: str = "./generated_music",
) -> list[str]:
    """Generate one music clip per text description and write each to disk.

    Sampling parameters are applied with a single set_generation_params
    call, then the whole batch is produced by one model.generate call.
    Clips are saved via audio_write with loudness normalisation.

    Returns the written .wav paths, in description order.
    """
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    model.set_generation_params(
        duration=duration,
        temperature=temperature,
        top_k=top_k,
        cfg_coef=cfg_coef,
        two_step_cfg=False,
    )

    print(f"Generating {len(descriptions)} music clips ({duration}s each)...")
    with torch.no_grad():
        batch = model.generate(descriptions)  # (B, C, T)

    written: list[str] = []
    for idx, (clip, text) in enumerate(zip(batch, descriptions)):
        stem = os.path.join(output_dir, f"music_{idx:04d}")
        wav_path = stem + ".wav"
        audio_write(
            stem,
            clip.cpu(),
            model.sample_rate,
            strategy="loudness",  # normalise perceived loudness
            loudness_compressor=True,
        )
        written.append(wav_path)
        label = text if len(text) <= 50 else text[:50] + "..."
        print(f" [{idx+1}] '{label}' → {wav_path}")
    return written
# ── 3. Melody-conditioned generation ─────────────────────────────────────────
def generate_from_melody(
    model,
    descriptions: list[str],
    melody_path: str,
    duration: float = 20.0,
    output_dir: str = "./generated_music",
) -> list[str]:
    """
    Generate music that follows the melodic contour of a reference file.

    Model must be the 'melody' variant:
    MusicGen.get_pretrained("facebook/musicgen-melody").

    Returns the written .wav paths, one per description.
    """
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Reference audio → mono; chroma conditioning needs one channel.
    reference, reference_sr = torchaudio.load(melody_path)
    if reference.shape[0] > 1:
        reference = reference.mean(dim=0, keepdim=True)

    # One copy of the melody per prompt so the whole batch is conditioned.
    conditioning = reference.unsqueeze(0).repeat(len(descriptions), 1, 1)

    model.set_generation_params(duration=duration, cfg_coef=3.0)
    print(f"Melody-conditioned generation from '{melody_path}'...")
    with torch.no_grad():
        batch = model.generate_with_chroma(
            descriptions=descriptions,
            melody_wavs=conditioning.to(model.device),
            melody_sample_rate=reference_sr,
        )

    results = []
    for idx, clip in enumerate(batch):
        stem = os.path.join(output_dir, f"melody_cond_{idx:04d}")
        audio_write(stem, clip.cpu(), model.sample_rate, strategy="loudness")
        results.append(stem + ".wav")
    return results
# ── 4. Music continuation ─────────────────────────────────────────────────────
def continue_music(
    model,
    prompt_path: str,
    descriptions: list[str],
    prompt_dur: float = 5.0,  # Use first N seconds as prompt
    total_dur: float = 20.0,
    output_dir: str = "./generated_music",
) -> list[str]:
    """
    Extend an existing recording into new material.

    The first `prompt_dur` seconds of `prompt_path` seed the generation;
    `total_dur` is the duration passed to set_generation_params. Useful
    for extending or transforming existing music.

    Returns the written .wav paths, one per description.
    """
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Keep only the leading prompt_dur seconds as the seed.
    seed_wav, seed_sr = torchaudio.load(prompt_path)
    seed_wav = seed_wav[:, : int(prompt_dur * seed_sr)]

    model.set_generation_params(duration=total_dur)

    # Same seed replicated for every description in the batch.
    seeds = seed_wav.unsqueeze(0).repeat(len(descriptions), 1, 1)

    print(f"Music continuation ({prompt_dur}s prompt → {total_dur}s total)...")
    with torch.no_grad():
        batch = model.generate_continuation(
            prompt=seeds.to(model.device),
            prompt_sample_rate=seed_sr,
            descriptions=descriptions,
        )

    paths = []
    for idx, clip in enumerate(batch):
        stem = os.path.join(output_dir, f"continuation_{idx:04d}")
        audio_write(stem, clip.cpu(), model.sample_rate, strategy="loudness")
        paths.append(stem + ".wav")
    return paths
# ── 5. Sound effects generation ──────────────────────────────────────────────
def generate_sound_effects(
    model,
    descriptions: list[str],
    duration: float = 5.0,
    temperature: float = 1.0,
    output_dir: str = "./generated_sfx",
) -> list[str]:
    """
    Synthesise sound effects from text with an AudioGen model.

    Best for: foley, ambient sounds, UI sounds, nature sounds.

    Returns the written .wav paths, one per description.
    """
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    model.set_generation_params(duration=duration, temperature=temperature, top_k=250)

    print(f"Generating {len(descriptions)} sound effects ({duration}s each)...")
    with torch.no_grad():
        batch = model.generate(descriptions)

    paths: list[str] = []
    for idx, (clip, text) in enumerate(zip(batch, descriptions)):
        stem = os.path.join(output_dir, f"sfx_{idx:04d}")
        audio_write(stem, clip.cpu(), model.sample_rate, strategy="loudness")
        paths.append(stem + ".wav")
        print(f" [{idx+1}] '{text[:50]}' → {stem}.wav")
    return paths
# ── 6. Prompt engineering for music ──────────────────────────────────────────
# Prompt templates keyed by style name; "{placeholder}" slots are filled
# by build_music_prompt from caller-supplied keyword arguments.
STYLE_TEMPLATES = {
    "cinematic": "{mood} cinematic score, orchestral, {instruments}, film music",
    "electronic": "{bpm}bpm {subgenre} electronic, synthesizer, {energy} energy",
    "acoustic": "acoustic {instruments}, {mood}, {setting}, no drums",
    "jazz": "{tempo} jazz, {instruments}, improvisation, swing rhythm",
    "ambient": "ambient {mood} soundscape, {setting}, atmospheric, no lyrics",
    "game_music": "{game_type} video game music, {mood}, 8-bit inspired, looping",
}


class _KeepMissing(dict):
    """str.format_map helper: re-emit unknown placeholders verbatim."""

    def __missing__(self, key: str) -> str:
        return "{" + key + "}"


def build_music_prompt(
    template: str = "cinematic",
    **kwargs,
) -> str:
    """Build a structured music-generation prompt.

    Looks up *template* in STYLE_TEMPLATES (unknown names fall back to a
    generic "{mood} {genre} music" pattern) and substitutes the provided
    keyword arguments. Placeholders without a matching kwarg are left
    as-is (e.g. "{instruments}") so callers can fill them later.

    Values are coerced with str(), so non-string kwargs (e.g. bpm=120)
    are accepted; the previous str.replace fallback raised TypeError on
    non-string values.
    """
    template_str = STYLE_TEMPLATES.get(template, "{mood} {genre} music")
    # format_map never raises KeyError here: _KeepMissing reinstates any
    # unmatched "{key}" text instead.
    return template_str.format_map(_KeepMissing((k, str(v)) for k, v in kwargs.items()))
def generate_music_set(
    model,
    style: str = "cinematic",
    variations: int = 4,
    duration: float = 15.0,
    output_dir: str = "./music_set",
    **style_kwargs,
) -> list[str]:
    """Render several prompt variations of one style and save them all.

    Builds a base prompt with build_music_prompt, appends ", variation N"
    for each requested clip, then delegates to generate_music.
    """
    base = build_music_prompt(style, **style_kwargs)
    prompts = [f"{base}, variation {n}" for n in range(1, variations + 1)]
    return generate_music(model, prompts, duration=duration, output_dir=output_dir)
# ── 7. EnCodec audio compression ─────────────────────────────────────────────
def compress_audio(
    audio_path: str,
    bandwidth: float = 6.0,  # kbps: 1.5, 3.0, 6.0, 12.0, 24.0
    output_path: Optional[str] = None,
) -> tuple[torch.Tensor, int]:
    """
    Round-trip audio through the EnCodec neural codec.

    Loads `audio_path`, resamples to the codec's sample rate if needed,
    encodes at `bandwidth` kbps, decodes, and optionally writes the
    reconstruction to `output_path`. Lower bandwidth = more compression,
    lower quality.

    Returns (decoded_waveform, sample_rate).
    """
    from audiocraft.models import EncodecModel
    # 24 kHz checkpoint for typical bandwidths, 48 kHz for high ones.
    # NOTE(review): the 48 kHz EnCodec checkpoint is trained on stereo —
    # confirm the input channel count matches the chosen model.
    if bandwidth <= 12.0:
        model = EncodecModel.get_pretrained("facebook/encodec_24khz")
    else:
        model = EncodecModel.get_pretrained("facebook/encodec_48khz")
    model.set_target_bandwidth(bandwidth)

    waveform, sr = torchaudio.load(audio_path)
    if sr != model.sample_rate:
        waveform = torchaudio.functional.resample(waveform, sr, model.sample_rate)

    with torch.no_grad():
        # encode returns a (codes, scale) pair; decode expects them as
        # separate arguments. The original passed the whole tuple to
        # decode, which fails at runtime.
        codes, scale = model.encode(waveform.unsqueeze(0))
        decoded = model.decode(codes, scale)
    decoded_wav = decoded.squeeze(0)

    if output_path:
        torchaudio.save(output_path, decoded_wav.cpu(), model.sample_rate)
        print(f"Compressed ({bandwidth}kbps) → {output_path}")
    return decoded_wav, model.sample_rate
# ── Demo ──────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # ── Demo 1: plain text-to-music with the small checkpoint ──
    music_model = load_musicgen("small")
    music_prompts = [
        "upbeat jazz piano with walking bass and light drums, 120bpm",
        "ambient electronic meditation music, slow tempo, peaceful atmosphere",
        "energetic rock guitar riff, distorted electric guitar, heavy drums",
    ]
    music_paths = generate_music(music_model, music_prompts, duration=10.0, output_dir="./demo_music")
    print(f"\nGenerated {len(music_paths)} music clips")
    for path in music_paths:
        print(f" {path}")

    # ── Demo 2: templated prompt construction ──
    cinematic_prompt = build_music_prompt(
        "cinematic",
        mood="tense",
        instruments="strings and brass",
    )
    print(f"\nCinematic prompt: {cinematic_prompt}")

    # ── Demo 3: sound effects via AudioGen ──
    sfx_model = load_audiogen("medium")
    sfx_prompts = [
        "heavy rain on a rooftop with distant thunder",
        "busy coffee shop ambience with background chatter",
        "notification ping, clean digital sound",
    ]
    sfx_paths = generate_sound_effects(sfx_model, sfx_prompts, duration=5.0)
    print(f"\nGenerated {len(sfx_paths)} sound effects")
For the Suno API alternative when needing AI music generation with lyrics and full song structure (verse, chorus, bridge) via a managed cloud service — Suno handles vocal synthesis and song structure while AudioCraft runs fully on local hardware with no API costs, enabling unlimited generation for training data creation, game audio production, and research applications where cloud API rate limits and per-generation costs are prohibitive. For the Stable Audio alternative when needing high-quality 44.1kHz stereo music generation with longer durations (up to 3+ minutes) and more consistent genre fidelity — Stable Audio excels at quality for consumer music while AudioCraft’s open-source Apache 2.0 license, melody conditioning, and AudioGen sound effects generation make it the complete open-source toolkit for developers building audio generation into products without licensing restrictions. The Claude Skills 360 bundle includes AudioCraft skill sets covering MusicGen text-to-music, melody conditioning, music continuation, AudioGen sound effects, prompt engineering templates, EnCodec compression, and batch generation pipelines. Start with the free tier to try AI music generation code.