2025-07-02 15:51:27 +00:00
|
|
|
{
|
|
|
|
|
"cells": [
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
2025-07-03 09:05:06 +00:00
|
|
|
"id": "0",
|
2025-07-02 15:51:27 +00:00
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
2025-07-03 11:05:00 +00:00
|
|
|
"%pip install \"git+https://git@github.com/kyutai-labs/moshi#egg=moshi&subdirectory=moshi\""
|
2025-07-02 15:51:27 +00:00
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2025-07-03 09:05:06 +00:00
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "1",
|
2025-07-02 15:51:27 +00:00
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"import argparse\n",
|
|
|
|
|
"import sys\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"import numpy as np\n",
|
|
|
|
|
"import sphn\n",
|
|
|
|
|
"import torch\n",
|
|
|
|
|
"from moshi.models.loaders import CheckpointInfo\n",
|
|
|
|
|
"from moshi.models.tts import DEFAULT_DSM_TTS_REPO, DEFAULT_DSM_TTS_VOICE_REPO, TTSModel\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"from IPython.display import display, Audio"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2025-07-03 09:05:06 +00:00
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "2",
|
2025-07-02 15:51:27 +00:00
|
|
|
"metadata": {},
|
2025-07-03 09:05:06 +00:00
|
|
|
"outputs": [],
|
2025-07-02 15:51:27 +00:00
|
|
|
"source": [
|
|
|
|
|
"# Configuration\n",
|
|
|
|
|
"text = \"Hey there! How are you? I had the craziest day today.\"\n",
|
|
|
|
|
"voice = \"expresso/ex03-ex01_happy_001_channel1_334s.wav\"\n",
|
2025-07-03 10:48:04 +00:00
|
|
|
"print(f\"See https://huggingface.co/{DEFAULT_DSM_TTS_VOICE_REPO} for available voices.\")"
|
2025-07-02 15:51:27 +00:00
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2025-07-03 09:05:06 +00:00
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "3",
|
2025-07-02 15:51:27 +00:00
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Set everything up\n",
|
|
|
|
|
"checkpoint_info = CheckpointInfo.from_hf_repo(DEFAULT_DSM_TTS_REPO)\n",
|
|
|
|
|
"tts_model = TTSModel.from_checkpoint_info(\n",
|
|
|
|
|
" checkpoint_info, n_q=32, temp=0.6, device=torch.device(\"cuda\"), dtype=torch.half\n",
|
|
|
|
|
")\n",
|
2025-07-03 11:05:00 +00:00
|
|
|
"tts_model.mimi.streaming_forever(1)\n",
|
2025-07-02 15:51:27 +00:00
|
|
|
"\n",
|
|
|
|
|
"# prepare_script() takes the turns of ONE script; to batch several audios, build one script per text and pass them all to generate().\n",
|
|
|
|
|
"entries = tts_model.prepare_script([text], padding_between=1)\n",
|
|
|
|
|
"voice_path = tts_model.get_voice_path(voice)\n",
|
|
|
|
|
"# CFG coef goes here because the model was trained with CFG distillation,\n",
|
|
|
|
|
"# so it's not _actually_ doing CFG at inference time.\n",
|
|
|
|
|
"condition_attributes = tts_model.make_condition_attributes(\n",
|
|
|
|
|
" [voice_path], cfg_coef=2.0\n",
|
|
|
|
|
")"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2025-07-03 09:05:06 +00:00
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "4",
|
2025-07-02 15:51:27 +00:00
|
|
|
"metadata": {},
|
2025-07-03 09:05:06 +00:00
|
|
|
"outputs": [],
|
2025-07-02 15:51:27 +00:00
|
|
|
"source": [
|
|
|
|
|
"print(\"Generating audio...\")\n",
|
|
|
|
|
"\n",
|
2025-07-03 10:56:00 +00:00
|
|
|
"pcms = []\n",
|
|
|
|
|
"def _on_frame(frame):\n",
|
|
|
|
|
" print(\"Step\", len(pcms), end=\"\\r\")\n",
|
|
|
|
|
" if (frame != -1).all():\n",
|
|
|
|
|
" pcm = tts_model.mimi.decode(frame[:, 1:, :]).cpu().numpy()\n",
|
|
|
|
|
" pcms.append(np.clip(pcm[0, 0], -1, 1))\n",
|
2025-07-02 15:51:27 +00:00
|
|
|
"\n",
|
2025-07-03 10:56:00 +00:00
|
|
|
"result = tts_model.generate([entries], [condition_attributes], on_frame=_on_frame)\n",
|
2025-07-02 15:51:27 +00:00
|
|
|
"\n",
|
2025-07-03 10:56:00 +00:00
|
|
|
"print(\"Done generating.\")\n",
|
2025-07-03 10:57:22 +00:00
|
|
|
"audio = np.concatenate(pcms, axis=-1)"
|
2025-07-02 15:51:27 +00:00
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2025-07-03 09:05:06 +00:00
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "5",
|
2025-07-02 15:51:27 +00:00
|
|
|
"metadata": {},
|
2025-07-03 09:05:06 +00:00
|
|
|
"outputs": [],
|
2025-07-02 15:51:27 +00:00
|
|
|
"source": [
|
|
|
|
|
"display(\n",
|
|
|
|
|
" Audio(audio, rate=tts_model.mimi.sample_rate, autoplay=True)\n",
|
|
|
|
|
")"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
2025-07-03 09:05:06 +00:00
|
|
|
"id": "6",
|
2025-07-02 15:51:27 +00:00
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": []
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"metadata": {
|
|
|
|
|
"kernelspec": {
|
|
|
|
|
"display_name": "Python 3 (ipykernel)",
|
|
|
|
|
"language": "python",
|
|
|
|
|
"name": "python3"
|
|
|
|
|
},
|
|
|
|
|
"language_info": {
|
|
|
|
|
"codemirror_mode": {
|
|
|
|
|
"name": "ipython",
|
|
|
|
|
"version": 3
|
|
|
|
|
},
|
|
|
|
|
"file_extension": ".py",
|
|
|
|
|
"mimetype": "text/x-python",
|
|
|
|
|
"name": "python",
|
|
|
|
|
"nbconvert_exporter": "python",
|
|
|
|
|
"pygments_lexer": "ipython3",
|
|
|
|
|
"version": "3.13.2"
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
"nbformat": 4,
|
|
|
|
|
"nbformat_minor": 5
|
|
|
|
|
}
|