{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quote the requirement: an unquoted `&` is read by the shell as the background\n",
    "# operator, which cuts `subdirectory=moshi` off the URL handed to pip.\n",
    "# `%pip` (rather than `!pip`) targets the environment of the running kernel.\n",
    "%pip install \"git+https://git@github.com/kyutai-labs/moshi#egg=moshi&subdirectory=moshi\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import argparse\n",
    "import sys\n",
    "\n",
    "import numpy as np\n",
    "import sphn\n",
    "import torch\n",
    "from moshi.models.loaders import CheckpointInfo\n",
    "from moshi.models.tts import DEFAULT_DSM_TTS_REPO, DEFAULT_DSM_TTS_VOICE_REPO, TTSModel\n",
    "\n",
    "from IPython.display import display, Audio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration: the text to speak and the voice used for conditioning.\n",
    "text = \"Hey there! How are you? I had the craziest day today.\"\n",
    "voice = \"expresso/ex03-ex01_happy_001_channel1_334s.wav\"\n",
    "print(f\"See https://huggingface.co/{DEFAULT_DSM_TTS_VOICE_REPO} for available voices.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set everything up: load the checkpoint and build the TTS model.\n",
    "# NOTE: the device is hard-coded to CUDA, so this cell needs a GPU.\n",
    "checkpoint_info = CheckpointInfo.from_hf_repo(DEFAULT_DSM_TTS_REPO)\n",
    "tts_model = TTSModel.from_checkpoint_info(\n",
    "    checkpoint_info, n_q=32, temp=0.6, device=torch.device(\"cuda\"), dtype=torch.half\n",
    ")\n",
    "\n",
    "# You could also generate multiple audios at once by passing a list of texts.\n",
    "entries = tts_model.prepare_script([text], padding_between=1)\n",
    "voice_path = tts_model.get_voice_path(voice)\n",
    "# CFG coef goes here because the model was trained with CFG distillation,\n",
    "# so it's not _actually_ doing CFG at inference time.\n",
    "condition_attributes = tts_model.make_condition_attributes(\n",
    "    [voice_path], cfg_coef=2.0\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Generating audio...\")\n",
    "\n",
    "# Decoded audio chunks, appended by `_on_frame` in generation order.\n",
    "pcms = []\n",
    "\n",
    "\n",
    "def _on_frame(frame):\n",
    "    \"\"\"Decode one generated frame with Mimi and append the audio to `pcms`.\n",
    "\n",
    "    Passed to `tts_model.generate` as the `on_frame` callback. Frames that\n",
    "    still contain a -1 entry are skipped (presumably -1 marks tokens that\n",
    "    have not been produced yet; TODO confirm against the moshi TTS code).\n",
    "    \"\"\"\n",
    "    print(\"Step\", len(pcms), end=\"\\r\")\n",
    "    if (frame != -1).all():\n",
    "        # Index 0 along dim 1 is left out of the decode (presumably the text\n",
    "        # stream rather than an audio codebook; verify).\n",
    "        pcm = tts_model.mimi.decode(frame[:, 1:, :]).cpu().numpy()\n",
    "        # Keep element [0, 0] (assumed: first batch item, first channel;\n",
    "        # confirm) and clip the samples to [-1, 1].\n",
    "        pcms.append(np.clip(pcm[0, 0], -1, 1))\n",
    "\n",
    "\n",
    "result = tts_model.generate([entries], [condition_attributes], on_frame=_on_frame)\n",
    "\n",
    "print(\"Done generating.\")\n",
    "# np.concatenate raises an opaque ValueError on an empty list, so fail with a\n",
    "# clear message when no frame was decoded.\n",
    "if not pcms:\n",
    "    raise RuntimeError(\"No audio frames were generated.\")\n",
    "audio = np.concatenate(pcms, axis=-1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Play the generated audio inline at the sample rate reported by Mimi.\n",
    "display(\n",
    "    Audio(audio, rate=tts_model.mimi.sample_rate, autoplay=True)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}