from typing import Annotated, AsyncGenerator, Literal
import httpx
import ormsgpack
from pydantic import AfterValidator, BaseModel, conint
class ServeReferenceAudio(BaseModel):
    """One reference sample for in-context voice cloning: the raw audio
    bytes plus the transcript of what is spoken in them."""

    # Raw contents of the reference audio file (read in binary mode).
    audio: bytes
    # Transcript of the speech contained in `audio`.
    text: str
class ServeTTSRequest(BaseModel):
    """Request payload for the Fish Audio `/v1/tts` endpoint (sent as
    MessagePack — see the request code below)."""

    # Text to synthesize.
    text: str
    # Chunk length used when splitting long text; constrained to [100, 300].
    chunk_length: Annotated[int, conint(ge=100, le=300, strict=True)] = 200
    # Audio format of the response.
    format: Literal["wav", "pcm", "mp3"] = "mp3"
    # Bitrate used when `format` is "mp3".
    mp3_bitrate: Literal[64, 128, 192] = 128
    # References audios for in-context learning (voice cloning).
    # NOTE: a mutable default is safe here — pydantic copies field defaults
    # per instance, unlike plain function defaults.
    references: list[ServeReferenceAudio] = []
    # Reference id of a voice model hosted on fish.audio.
    # For example, if you want use https://fish.audio/m/7f92f8afb8ec43bf81429cc1c9199cb1/
    # Just pass 7f92f8afb8ec43bf81429cc1c9199cb1
    reference_id: str | None = None
    # Normalize text for en & zh, this increase stability for numbers
    normalize: bool = True
    # Balance mode will reduce latency to 300ms, but may decrease stability
    latency: Literal["normal", "balanced"] = "normal"
# Build the TTS request, attaching one reference clip for in-context
# voice cloning.  The reference file is opened via a context manager so
# the handle is closed promptly instead of being leaked
# (the original `open(...).read()` never closed the file).
with open("lengyue.wav", "rb") as ref_file:
    request = ServeTTSRequest(
        text="Hello, world!",
        references=[
            ServeReferenceAudio(
                audio=ref_file.read(),
                text="Text in reference AUDIO",
            )
        ],
    )
# Stream the synthesized audio from the Fish Audio TTS endpoint straight
# to disk, chunk by chunk, so the full response never has to fit in memory.
with (
    httpx.Client() as client,
    open("hello.mp3", "wb") as f,
):
    with client.stream(
        "POST",
        "https://api.fish.audio/v1/tts",
        # ormsgpack serializes the pydantic model directly to MessagePack.
        content=ormsgpack.packb(request, option=ormsgpack.OPT_SERIALIZE_PYDANTIC),
        headers={
            "authorization": "Bearer YOUR_API_KEY",  # NOTE: replace with a real API key
            "content-type": "application/msgpack",
            "model": "speech-1.6",  # Specify which TTS model to use
        },
        timeout=None,  # synthesis of long text can exceed the default timeout
    ) as response:
        # Fail fast on an HTTP error instead of silently writing an error
        # body into hello.mp3 as if it were audio.
        response.raise_for_status()
        for chunk in response.iter_bytes():
            f.write(chunk)