kyutai/tts_pytorch.ipynb

141 lines
3.8 KiB
Plaintext
Raw Permalink Normal View History

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "0",
"metadata": {},
"outputs": [],
"source": [
"# Fast install: --no-deps skips dependency resolution (assumes torch is already\n",
"# installed in this environment). Might break in the future.\n",
"# %pip is used instead of !pip so packages land in the running kernel's environment.\n",
"%pip install 'sphn<0.2'\n",
"%pip install --no-deps \"moshi==0.2.10\"\n",
"# Slow install (will download torch and cuda), but future proof.\n",
"# %pip install \"moshi==0.2.10\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1",
"metadata": {},
"outputs": [],
"source": [
"import argparse\n",
"import sys\n",
"\n",
"import numpy as np\n",
"import torch\n",
"from moshi.models.loaders import CheckpointInfo\n",
"from moshi.models.tts import DEFAULT_DSM_TTS_REPO, DEFAULT_DSM_TTS_VOICE_REPO, TTSModel\n",
"\n",
"from IPython.display import display, Audio"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2",
"metadata": {},
"outputs": [],
"source": [
"# Configuration: what to say, and which voice to say it with.\n",
"print(f\"See https://huggingface.co/{DEFAULT_DSM_TTS_VOICE_REPO} for available voices.\")\n",
"\n",
"# Text that will be synthesized.\n",
"text = \"Hey there! How are you? I had the craziest day today.\"\n",
"# Voice sample identifier (presumably a path inside the voice repo printed above).\n",
"voice = \"expresso/ex03-ex01_happy_001_channel1_334s.wav\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3",
"metadata": {},
"outputs": [],
"source": [
"# Fetch the checkpoint description and build the TTS model on the CUDA device.\n",
"checkpoint_info = CheckpointInfo.from_hf_repo(DEFAULT_DSM_TTS_REPO)\n",
"tts_model = TTSModel.from_checkpoint_info(\n",
"    checkpoint_info,\n",
"    n_q=32,\n",
"    temp=0.6,\n",
"    device=torch.device(\"cuda\"),\n",
")\n",
"\n",
"# A dialog is written as several turns in speaking order:\n",
"# [text_speaker_1, text_speaker_2, text_2_speaker_1, ...]\n",
"entries = tts_model.prepare_script([text], padding_between=1)\n",
"voice_path = tts_model.get_voice_path(voice)\n",
"\n",
"# The model was trained with CFG distillation, so the CFG coefficient is supplied\n",
"# here as a conditioning attribute; no _actual_ CFG is run at inference time.\n",
"# When generating a dialog, list two voices instead of one.\n",
"condition_attributes = tts_model.make_condition_attributes([voice_path], cfg_coef=2.0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4",
"metadata": {},
"outputs": [],
"source": [
"print(\"Generating audio...\")\n",
"\n",
"# Decoded audio chunks, appended in generation order by _on_frame.\n",
"pcms = []\n",
"\n",
"\n",
"def _on_frame(frame):\n",
"    \"\"\"Decode one generated frame and append its audio chunk to `pcms`.\n",
"\n",
"    Frames that still contain the value -1 anywhere are skipped\n",
"    (presumably -1 marks entries that are not generated yet -- TODO confirm).\n",
"    \"\"\"\n",
"    print(\"Step\", len(pcms), end=\"\\r\")\n",
"    if (frame != -1).all():\n",
"        # Index 0 along dim 1 is dropped before decoding\n",
"        # (presumably the text stream -- TODO confirm).\n",
"        pcm = tts_model.mimi.decode(frame[:, 1:, :]).cpu().numpy()\n",
"        # pcm[0, 0] keeps index 0 of the first two axes (presumably batch and channel),\n",
"        # clamped to [-1, 1].\n",
"        pcms.append(np.clip(pcm[0, 0], -1, 1))\n",
"\n",
"\n",
"# You could also generate multiple audios at once by extending the following lists.\n",
"# NOTE(review): _on_frame only collects pcm[0, 0], so it would need to be extended\n",
"# as well to keep the audio of the additional items.\n",
"all_entries = [entries]\n",
"all_condition_attributes = [condition_attributes]\n",
"with tts_model.mimi.streaming(len(all_entries)):\n",
"    result = tts_model.generate(all_entries, all_condition_attributes, on_frame=_on_frame)\n",
"\n",
"print(\"Done generating.\")\n",
"if not pcms:\n",
"    # np.concatenate on an empty list fails with a generic \"need at least one array\" error.\n",
"    raise ValueError(\"No audio frames were generated; check the input text.\")\n",
"audio = np.concatenate(pcms, axis=-1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5",
"metadata": {},
"outputs": [],
"source": [
"# Render an inline player for the generated waveform, using mimi's sample rate.\n",
"display(Audio(audio, rate=tts_model.mimi.sample_rate, autoplay=True))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}