Voice cloning allows you to generate speech that matches a specific voice using reference audio. Fish Audio supports two approaches:
- Using pre-trained voice models (`reference_id`)
- Providing reference audio directly in your request (`references`)
Use `reference_id` when you’ll reuse a voice multiple times — it’s faster and more efficient. Use `references` for one-off voice cloning or for testing different voices without creating models.
Clone a voice by providing reference audio directly:
Copy
Ask AI
# Clone a voice by passing one reference clip (audio bytes plus its
# transcript) inline with the TTS request, then stream the result to disk.
from fish_audio_sdk import Session, TTSRequest, ReferenceAudio

session = Session("your_api_key")

# Load reference audio
with open("voice_sample.wav", "rb") as f:
    audio_data = f.read()

request = TTSRequest(
    text="This will sound like the reference voice",
    references=[
        ReferenceAudio(
            audio=audio_data,
            text="Text spoken in the reference audio",
        )
    ],
)

# Generate speech
# session.tts(request) is iterated chunk by chunk; each chunk is written
# straight to the output file as it arrives.
with open("cloned_voice.mp3", "wb") as f:
    for chunk in session.tts(request):
        f.write(chunk)
For repeated use, create a persistent voice model:
Copy
Ask AI
# Create a voice model from samples
# NOTE: relies on `session` and `TTSRequest` from the setup snippet earlier
# in this guide.
voices = []
texts = []
for i in range(3):
    # Each sample is read as raw bytes from voice_0.wav .. voice_2.wav.
    with open(f"voice_{i}.wav", "rb") as f:
        voices.append(f.read())
    # One text entry is appended per audio sample, in the same order.
    texts.append(f"Sample text {i}")

model = session.create_model(
    title="My Custom Voice",
    description="Voice cloned from samples",
    voices=voices,
    texts=texts,
    visibility="private",  # or "public", "unlist"
)
print(f"Created model: {model.id}")

# Use the model
request = TTSRequest(
    text="Using my saved voice model",
    reference_id=model.id,
)
def create_voice_bank():
    """Return a dict mapping model title to model ID.

    The models come from ``session.list_models(self_only=True)``; only the
    entries exposed on the result's ``items`` attribute are included.
    If two models share a title, the later one wins.

    NOTE(review): relies on a module-level ``session`` created elsewhere in
    this guide. Whether ``items`` holds every model or only one page of
    results is not visible here — confirm against the SDK.
    """
    # List existing models
    models = session.list_models(self_only=True)
    return {model.title: model.id for model in models.items}


def generate_with_voice(text, voice_name, *, voice_bank=None):
    """Synthesize ``text`` with a saved voice and write it to an MP3 file.

    The output is written to ``"<voice_name>_output.mp3"`` in the current
    working directory.

    Args:
        text: The text to synthesize.
        voice_name: Title of the saved model to use; looked up in the bank.
        voice_bank: Optional title-to-ID mapping, as returned by
            ``create_voice_bank()``. When omitted, a fresh bank is built on
            every call; pass one in to reuse it across calls.

    Returns:
        None. If ``voice_name`` is not in the bank, a message is printed and
        no file is written.
    """
    if voice_bank is None:
        voice_bank = create_voice_bank()

    if voice_name not in voice_bank:
        print(f"Voice '{voice_name}' not found")
        return

    request = TTSRequest(
        text=text,
        reference_id=voice_bank[voice_name],
    )

    with open(f"{voice_name}_output.mp3", "wb") as f:
        for chunk in session.tts(request):
            f.write(chunk)
# Requests whose text carries parenthesized tags such as "(happy)" or
# "(calm)". Presumably the service interprets these as emotion/tone markers —
# confirm against the Fish Audio documentation.
request = TTSRequest(
    text="(happy) This is exciting news! (calm) Let me explain the details.",
    reference_id="your_model_id",
)

# Or with direct references
# NOTE(review): `reference_audio` is not defined in this snippet; presumably
# a ReferenceAudio instance like the one built in the first example.
request = TTSRequest(
    text="(excited) Amazing discovery!",
    references=[reference_audio],
)
# Run a TTS request and turn two known failure messages into friendlier
# hints; any other error propagates unchanged.
# NOTE(review): `reference_audio` and `session` are not defined in this
# snippet; presumably they come from the earlier examples.
try:
    request = TTSRequest(
        text="Test speech",
        references=[reference_audio],
    )
    for chunk in session.tts(request):
        # Process audio
        pass
except Exception as e:
    # The error is classified by substring match on its message text.
    message = str(e)
    if "Invalid audio format" in message:
        print("Check audio format - use WAV or MP3")
    elif "Audio too short" in message:
        print("Reference audio should be at least 10 seconds")
    else:
        # Bare `raise` re-raises the active exception with its original
        # traceback, without adding this line as an extra frame.
        raise