kyutai/tts_pytorch.ipynb

165 lines
264 KiB
Plaintext
Raw Permalink Normal View History

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "0b7eed16",
"metadata": {},
"outputs": [],
"source": [
"# Install moshi from the `moshi/` subdirectory of the GitHub repository.\n",
"# The URL must be quoted: an unquoted `&` makes the shell run pip in the background\n",
"# and drops the `subdirectory=moshi` part of the URL fragment.\n",
"%pip install \"git+https://git@github.com/kyutai-labs/moshi#egg=moshi&subdirectory=moshi\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "353b9498",
"metadata": {},
"outputs": [],
"source": [
"import argparse\n",
"import sys\n",
"\n",
"import numpy as np\n",
"import sphn\n",
"import torch\n",
"from moshi.models.loaders import CheckpointInfo\n",
"from moshi.models.tts import DEFAULT_DSM_TTS_REPO, DEFAULT_DSM_TTS_VOICE_REPO, TTSModel\n",
"\n",
"from IPython.display import display, Audio"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "8846418a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"See https://huggingface.co/datasets/kyutai/tts-voices for available voices.\n"
]
}
],
"source": [
"# Configuration: which voice to use and which sentence to synthesize.\n",
"# `voice` is handed to `tts_model.get_voice_path` during setup; presumably it names\n",
"# a file inside the voice repository linked by the message below.\n",
"voice = \"expresso/ex03-ex01_happy_001_channel1_334s.wav\"\n",
"text = \"Hey there! How are you? I had the craziest day today.\"\n",
"\n",
"print(\n",
"    f\"See https://huggingface.co/datasets/{DEFAULT_DSM_TTS_VOICE_REPO} for available voices.\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "b9f022ec",
"metadata": {},
"outputs": [],
"source": [
"# Set everything up: checkpoint info first, then the model itself\n",
"# (CUDA device, half-precision weights).\n",
"checkpoint_info = CheckpointInfo.from_hf_repo(DEFAULT_DSM_TTS_REPO)\n",
"tts_model = TTSModel.from_checkpoint_info(\n",
"    checkpoint_info,\n",
"    n_q=32,\n",
"    temp=0.6,\n",
"    device=torch.device(\"cuda\"),\n",
"    dtype=torch.half,\n",
")\n",
"\n",
"# `prepare_script` receives a list of texts; here it holds only the configured one.\n",
"# Original author's note: you could also generate multiple audios at once by passing\n",
"# a list of texts.\n",
"# NOTE(review): confirm against the moshi documentation how extra list items are interpreted.\n",
"entries = tts_model.prepare_script([text], padding_between=1)\n",
"\n",
"voice_path = tts_model.get_voice_path(voice)\n",
"\n",
"# The CFG coefficient is supplied here because the model was trained with CFG\n",
"# distillation, so it is not _actually_ doing CFG at inference time.\n",
"condition_attributes = tts_model.make_condition_attributes([voice_path], cfg_coef=2.0)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "f4f76c73",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Generating audio...\n"
]
}
],
"source": [
"print(\"Generating audio...\")\n",
"\n",
"# This doesn't do streaming generation: all frames are produced by this one call,\n",
"# and the audio is decoded from them afterwards.\n",
"# Presumably a batch of one: a single script with its matching condition attributes.\n",
"result = tts_model.generate([entries], [condition_attributes])\n",
"\n",
"# Join the generated frames along the last dimension, then skip the first\n",
"# `audio_offset` entries of dim 1 and the first `delay_steps` entries of the last dim.\n",
"frames = torch.cat(result.frames, dim=-1)\n",
"audio_tokens = frames[\n",
"    :,\n",
"    tts_model.lm.audio_offset :,\n",
"    tts_model.delay_steps :,\n",
"]\n",
"\n",
"# Decoding runs with gradient tracking disabled.\n",
"with torch.no_grad():\n",
"    audios = tts_model.mimi.decode(audio_tokens)\n",
"\n",
"# Keep the first decoded item as a NumPy array on the CPU; the next cell plays it.\n",
"audio = audios[0].cpu().numpy()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "732e4b4b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" <audio controls=\"controls\" autoplay=\"autoplay\">\n",
" <source src=\"data:audio/wav;base64,UklGRiQMAwBXQVZFZm10IBAAAAABAAEAwF0AAIC7AAACABAAZGF0YQAMAwC2AeIBwv87ABkBBgHdAJoAfQB1AGgAYQBVADoAOQApABAABQD9/+7/5P/d/8z/yv/J/77/rv+n/6P/nv+c/5n/n/+q/6z/pP+i/6X/of+f/6f/t//F/83/zf/R/93/6P/x//j//v8AAAMABwAJAAgAAQD5//T/9f/1//D/7f/t/+3/6v/q/+//8f/y//L/8f/x//f/AAAHAAcABQAJABAAEgAOAA4ADQAMAAwACwAKAA4AEgAOAAgACAAHAAYABwAHAAcACQAJAAcABwAHAAUACgASABYAGQAaABsAHQAhACEAHQAaABgAGAAZABsAHAAcAB4AHwAeABsAGAAVABUAFgAYABgAGQAbABsAGwAcABwAGgAYABcAFgAWABcAFgAUABIAEAAOAAwACgAIAAgABwAFAAQABQAEAAQABAADAAIAAAAAAP7//v/+//7//v/9//z/+v/4//f/9v/1//X/9P/0//T/9P/z//P/8v/x//D/8P/x//H/8//0//T/8//z//L/8f/v/+//7//v/+7/7//u/+3/7f/s/+z/7P/s/+z/7f/u//D/8v/z//T/9P/2//f/+P/4//n/+P/3//b/9f/1//T/9f/1//X/9P/z//L/8v/z//T/9P/1//b/9//5//r/+v/6//r/+f/6//v//P/8//z//f/+//7///////7//f/8//r/+f/4//f/9//3//f/9//3//b/9v/2//X/9v/1//T/8//z//P/9P/1//X/9f/2//b/9//3//b/9v/2//b/9v/3//j/+P/4//n/+v/6//r/+f/4//j/+P/3//f/9//3//j/+P/4//j/+P/4//j/+P/5//n/+P/4//j/+P/4//j/+P/4//f/9v/2//b/9//4//n/+f/5//n/+f/5//n/+P/4//j/9//3//b/9v/3//j/9//3//f/9//3//f/9//4//j/+P/4//j/+P/4//j/9//3//b/9v/2//b/9//3//f/9//4//n/+f/5//n/+P/4//f/9//3//j/+P/4//j/+P/4//n/+v/6//v/+//7//v/+//7//z//f/9//z/+//7//v//P/8//z//P/8//3//f/+//7//v/+//7//v/9//3//P/8//z//f/9//3//P/9//3//////wAAAAD//////v//////AAAAAP///v/+//7//v/+//7//v/9//3//v/+////AAAAAAAAAAAAAAAAAAAAAAEAAQAAAAAAAAAAAAEAAwADAAQAAwADAAMAAwAEAAQABAAEAAIAAQABAAEAAQABAAEAAAABAAEAAQABAAAAAAAAAP//////////AAD///7//f/8//z//f/+//7//v/9//3//f/9////AAABAAIAAgACAAIAAQABAAEAAQAAAAAAAAAAAAAAAAAAAP/////////////+//7//f/9//z/+//6//n/+f/5//j/9//2//b/9v/2//b/9f/1//T/9P/0//P/8//z//P/9P/0//T/9f/2//b/9//3//f/9//2//X/9f/0//P/8//y//L/8f/w//D/7//v/+7/7v/u/+7/7//v/+//8P/x//H/8v/y//P/8//z//P/8v/y//L/8v/y//L/8f/x//H/8f/x//D/8P/w//D/7//v/+//7//v/+//8P/w//H/8f/x//D/8P/v/+//7v/u/+7/7//v/+7/7//v/+//7//v/+//7//v/+//8P/w//D/8P/w//D/8P/w//D/8P/w/+//7//v/+7/7v/u/+7/7v/u/+7/7//v/+//7//v/+//8P/w//D/8f/x//H/8v/x//H/8f/w//D/8P/w//D/8P/w//D/8P/w//H/8f/x//H/8P/w//D/8P/x//H/8P/w/+//7//v/+//8P/w//D/7//w//D/8P
/x//H/8f/x//H/8v/z//P/9P/z//T/9P/1//X/9f/1//X/9v/2//b/9f/1//X/9f/2//X/9f/1//X/9f/1//X/9f/0//T/9P/1//X/9f/1//T/9P/0//P/8//z//P/8v/y//H/8P/x//D/8P/w//D/8P/w//D/8P/x//H/8v/x//H/8v/y//L/8v/y//L/8v/z//P/9P/0//T/9f/1//b/9v/3//j/+f/5//r/+v/6//v/+//7//z//P/9//3//f/9//3//f/9//3//f/9//7//v/+///////+//7//v/+//7/////////////////AAAAAAAA/////////////////////wAAAAAAAAAAAQAAAAAAAQABAAEAAQABAAEAAQABAAEAAgACAAIAAwADAAMAAwADAAMAAwADAAMABAAEAAUABQAEAAQABAADAAMABAAEAAQABAAEAAQABQAFAAUABgAFAAYABgAFAAUABQAFAAQABAAEAAQABAAEAAQABAADAAMAAwADAAMAAwADAAMAAgACAAIAAQABAAEAAQACAAIAAgACAAMAAwADAAQABAAEAAUABQAFAAUABQAFAAUABQAFAAYABgAHAAgACAAIAAgACQAJAAoACgAJAAgACAAHAAcABwAGAAYABQAFAAUABQAFAAUABgAHAAcABwAHAAYABgAGAAYABgAFAAYABgAGAAUABgAGAAcACAAIAAkACQAJAAoACgAKAAoACgAKAAoACgAKAAoACQAJAAgACAAIAAcABwAHAAcABwAHAAYABgAGAAUABQAFAAYABgAHAAcABwAIAAgACAAJAAkACQAJAAoACgAJAAkACQAIAAgABwAHAAcABwAGAAUABQAFAAUABAAEAAUABQAFAAUABQAFAAUABQAEAAUABQAFAAYABgAGAAYABwAGAAYABgAGAAcACAAHAAcABwAGAAUABQAFAAUABQAFAAUABAAEAAQABQAFAAUABgAHAAcABwAHAAcABwAGAAYABgAGAAYABgAGAAYABgAGAAcABwAIAAkACQAJAAkACQAJAAgACAAIAAgACAAJAAkACQAJAAkACgALAAsADAANAA4ADgAOAA0ADQAMAAwACwALAAwADAAMAAwADAAMAAwADAANAA0ADgAOAA4ADgAOAA4ADQANAAwADAANAA0ADQANAA0ADQAOAA4ADwAQABEAEQARABIAEgARABEAEAAQABAAEAARABEAEQARABEAEgATABQAFQAVABYAFwAXABcAFgAWABYAFQAVABUAFQAVABQAFAAUABYAFwAYABoAHAAdAB8AIAAgACAAIAAfAB8AHwAfACAAIAAgAB8AIAAgACEAIgAjACQAJgAnACgAKAAoACgAJwAmACYAJgAnACcAJwAnACcAJwAnACgAKAApACoAKwAsACsAKwAqACoAKQApACkAKQApACkAKQApACoAKgArACwALQAuAC8AMAAwADAAMAAvAC8ALgAvAC8ALwAvAC8ALwAuAC4ALgAvADAAMQAzADMAMwAyADEAMAAvAC8ALwAvAC8ALwAvAC8AMAAwADEAMgA0ADUANgA3ADcANwA2ADUANAA0ADQAMwAzADMAMgAyADIAMgAyADMANAA1ADYANgA2ADYANgA1ADQAMwAzADMAMgAxADEAMQAwADAAMAAxADEAMgAzADQANAAzADMAMgAyADEAMQAyADIAMgAyADIAMgAyADIAMwA0ADUANgA3ADYANwA2ADUANQA0ADQANAA0ADQANAAzADMAMwAyADMAMwA0ADUANgA2ADYANgA1ADUANAA0ADQANAA0ADQAMwAyADIAMQAwADAAMQAyADIAMgAyADEAMAAwADAAMAAwADAAMQAxADAAMAAvAC8ALwAvAC8AMAAwAC8ALwAuAC4ALQAsACwALAAsACwAKwAqACoAKAAoACcAJwAnACcAJwAmACUAJAAjACEAIQAhACEAIQAgAB8AHgAdAB0AHA
AcABwAHQAdAB0AHQAdABwAHAAbABsAGwAbABsAGwAbABoAGgAZABkAGQAaABoAGwAbABs
" Your browser does not support the audio element.\n",
" </audio>\n",
" "
],
"text/plain": [
"<IPython.lib.display.Audio object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Show an inline player for the generated audio at the codec's sample rate;\n",
"# `autoplay=True` starts playback as soon as the widget renders.\n",
"display(Audio(audio, rate=tts_model.mimi.sample_rate, autoplay=True))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2dbdd275",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}