2025-07-02 15:51:27 +00:00
|
|
|
{
|
|
|
|
|
"cells": [
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
2025-07-03 09:05:06 +00:00
|
|
|
"id": "0",
|
2025-07-02 15:51:27 +00:00
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# The URL must be quoted: an unquoted '&' is a shell control operator, so\n",
"# pip would never receive the '&subdirectory=moshi' part of the fragment.\n",
"%pip install \"git+https://git@github.com/kyutai-labs/moshi#egg=moshi&subdirectory=moshi\""
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2025-07-03 09:05:06 +00:00
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "1",
|
2025-07-02 15:51:27 +00:00
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"import argparse\n",
|
|
|
|
|
"import sys\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"import numpy as np\n",
|
|
|
|
|
"import sphn\n",
|
|
|
|
|
"import torch\n",
|
|
|
|
|
"from moshi.models.loaders import CheckpointInfo\n",
|
|
|
|
|
"from moshi.models.tts import DEFAULT_DSM_TTS_REPO, DEFAULT_DSM_TTS_VOICE_REPO, TTSModel\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"from IPython.display import display, Audio"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2025-07-03 09:05:06 +00:00
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "2",
|
2025-07-02 15:51:27 +00:00
|
|
|
"metadata": {},
|
2025-07-03 09:05:06 +00:00
|
|
|
"outputs": [],
|
2025-07-02 15:51:27 +00:00
|
|
|
"source": [
|
|
|
|
|
"# Configuration\n",
"# The voice repository lists every voice that can be used for `voice` below.\n",
"print(f\"See https://huggingface.co/datasets/{DEFAULT_DSM_TTS_VOICE_REPO} for available voices.\")\n",
"\n",
"# Sentence to synthesize.\n",
"text = \"Hey there! How are you? I had the craziest day today.\"\n",
"# Reference voice; resolved later through tts_model.get_voice_path().\n",
"voice = \"expresso/ex03-ex01_happy_001_channel1_334s.wav\""
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2025-07-03 09:05:06 +00:00
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "3",
|
2025-07-02 15:51:27 +00:00
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Set everything up\n",
"# Fetch the checkpoint info for DEFAULT_DSM_TTS_REPO and build the model from it.\n",
"checkpoint_info = CheckpointInfo.from_hf_repo(DEFAULT_DSM_TTS_REPO)\n",
"tts_model = TTSModel.from_checkpoint_info(\n",
"    checkpoint_info,\n",
"    n_q=32,\n",
"    temp=0.6,\n",
"    device=torch.device(\"cuda\"),\n",
"    dtype=torch.half,\n",
")\n",
"\n",
"# You could also generate multiple audios at once by passing a list of texts.\n",
"entries = tts_model.prepare_script([text], padding_between=1)\n",
"voice_path = tts_model.get_voice_path(voice)\n",
"\n",
"# The CFG coefficient is supplied as a conditioning attribute: the model was trained\n",
"# with CFG distillation, so it's not _actually_ doing CFG at inference time.\n",
"condition_attributes = tts_model.make_condition_attributes([voice_path], cfg_coef=2.0)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2025-07-03 09:05:06 +00:00
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "4",
|
2025-07-02 15:51:27 +00:00
|
|
|
"metadata": {},
|
2025-07-03 09:05:06 +00:00
|
|
|
"outputs": [],
|
2025-07-02 15:51:27 +00:00
|
|
|
"source": [
|
|
|
|
|
"print(\"Generating audio...\")\n",
"\n",
"# This doesn't do streaming generation: the whole result is generated first\n",
"# and only then decoded to audio in a single pass below.\n",
"result = tts_model.generate([entries], [condition_attributes])\n",
"\n",
"# Concatenate the generated frames along the last dimension, then keep only\n",
"# indices from lm.audio_offset onward on dim 1 and from delay_steps onward on dim 2.\n",
"frames = torch.cat(result.frames, dim=-1)\n",
"audio_start = tts_model.lm.audio_offset\n",
"step_start = tts_model.delay_steps\n",
"audio_tokens = frames[:, audio_start:, step_start:]\n",
"\n",
"# Decoding needs no gradients.\n",
"with torch.no_grad():\n",
"    audios = tts_model.mimi.decode(audio_tokens)\n",
"\n",
"# Take the first (and only) item of the batch and move it to host memory as a NumPy array.\n",
"audio = audios[0].cpu().numpy()"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2025-07-03 09:05:06 +00:00
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "5",
|
2025-07-02 15:51:27 +00:00
|
|
|
"metadata": {},
|
2025-07-03 09:05:06 +00:00
|
|
|
"outputs": [],
|
2025-07-02 15:51:27 +00:00
|
|
|
"source": [
|
|
|
|
|
"# Show an inline player for the decoded audio at Mimi's sample rate; playback starts automatically.\n",
"display(Audio(audio, rate=tts_model.mimi.sample_rate, autoplay=True))"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
2025-07-03 09:05:06 +00:00
|
|
|
"id": "6",
|
2025-07-02 15:51:27 +00:00
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": []
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"metadata": {
|
|
|
|
|
"kernelspec": {
|
|
|
|
|
"display_name": "Python 3 (ipykernel)",
|
|
|
|
|
"language": "python",
|
|
|
|
|
"name": "python3"
|
|
|
|
|
},
|
|
|
|
|
"language_info": {
|
|
|
|
|
"codemirror_mode": {
|
|
|
|
|
"name": "ipython",
|
|
|
|
|
"version": 3
|
|
|
|
|
},
|
|
|
|
|
"file_extension": ".py",
|
|
|
|
|
"mimetype": "text/x-python",
|
|
|
|
|
"name": "python",
|
|
|
|
|
"nbconvert_exporter": "python",
|
|
|
|
|
"pygments_lexer": "ipython3",
|
|
|
|
|
"version": "3.13.2"
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
"nbformat": 4,
|
|
|
|
|
"nbformat_minor": 5
|
|
|
|
|
}
|