kyutai/scripts/tts_pytorch.py

# /// script
# requires-python = ">=3.12"
# dependencies = [
# "moshi==0.2.8",
# "torch",
# "sphn",
# "sounddevice",
# ]
# ///
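# Example invocation (a sketch; assumes `uv` is installed, so the inline
# dependency metadata above is resolved automatically):
#
#     uv run tts_pytorch.py input.txt output.wav
#     echo "Hello, world!" | uv run tts_pytorch.py - -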
import argparse
import queue
import sys
import time

import numpy as np
import sphn
import torch
from moshi.models.loaders import CheckpointInfo
from moshi.models.tts import DEFAULT_DSM_TTS_REPO, DEFAULT_DSM_TTS_VOICE_REPO, TTSModel


def main():
    parser = argparse.ArgumentParser(
        description="Run Kyutai TTS using the PyTorch implementation"
    )
    parser.add_argument("inp", type=str, help="Input file, use - for stdin.")
    parser.add_argument(
        "out", type=str, help="Output file to generate, use - for playing the audio"
    )
    parser.add_argument(
        "--hf-repo",
        type=str,
        default=DEFAULT_DSM_TTS_REPO,
        help="HF repo in which to look for the pretrained models.",
    )
    parser.add_argument(
        "--voice-repo",
        default=DEFAULT_DSM_TTS_VOICE_REPO,
        help="HF repo in which to look for pre-computed voice embeddings.",
    )
    parser.add_argument(
        "--voice",
        default="expresso/ex03-ex01_happy_001_channel1_334s.wav",
        help="The voice to use, relative to the voice repo root. "
        f"See {DEFAULT_DSM_TTS_VOICE_REPO}",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda",
        help="Device on which to run, defaults to 'cuda'.",
    )
    args = parser.parse_args()
print("Loading model...")
checkpoint_info = CheckpointInfo.from_hf_repo(args.hf_repo)
tts_model = TTSModel.from_checkpoint_info(
checkpoint_info, n_q=32, temp=0.6, device=args.device
)
if args.inp == "-":
if sys.stdin.isatty(): # Interactive
print("Enter text to synthesize (Ctrl+D to end input):")
text = sys.stdin.read().strip()
else:
with open(args.inp, "r") as fobj:
text = fobj.read().strip()
# If you want to make a dialog, you can pass more than one turn [text_speaker_1, text_speaker_2, text_2_speaker_1, ...]
entries = tts_model.prepare_script([text], padding_between=1)
voice_path = tts_model.get_voice_path(args.voice)
# CFG coef goes here because the model was trained with CFG distillation,
# so it's not _actually_ doing CFG at inference time.
# Also, if you are generating a dialog, you should have two voices in the list.
condition_attributes = tts_model.make_condition_attributes(
[voice_path], cfg_coef=2.0
)
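    # A minimal two-speaker sketch (hypothetical turns; `second_voice` stands for
    # any other voice file available in the voice repo):
    #
    # turns = ["Hi there!", "Hello, how are you?", "Doing great, thanks."]
    # entries = tts_model.prepare_script(turns, padding_between=1)
    # condition_attributes = tts_model.make_condition_attributes(
    #     [voice_path, tts_model.get_voice_path(second_voice)], cfg_coef=2.0
    # )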
    if args.out == "-":
        # Stream the audio to the speakers using sounddevice.
        import sounddevice as sd

        pcms = queue.Queue()

        def _on_frame(frame):
            # Frames still containing the -1 placeholder are not ready yet.
            if (frame != -1).all():
                pcm = tts_model.mimi.decode(frame[:, 1:, :]).cpu().numpy()
                pcms.put_nowait(np.clip(pcm[0, 0], -1, 1))

        def audio_callback(outdata, _a, _b, _c):
            try:
                pcm_data = pcms.get(block=False)
                outdata[:, 0] = pcm_data
            except queue.Empty:
                # Underrun: play silence until the next frame arrives.
                outdata[:] = 0

        with sd.OutputStream(
            samplerate=tts_model.mimi.sample_rate,
            blocksize=1920,
            channels=1,
            callback=audio_callback,
        ):
            with tts_model.mimi.streaming(1):
                tts_model.generate(
                    [entries], [condition_attributes], on_frame=_on_frame
                )
            # Keep the output stream alive until the queued audio has played out.
            time.sleep(3)
            while True:
                if pcms.qsize() == 0:
                    break
                time.sleep(1)
    else:
        result = tts_model.generate([entries], [condition_attributes])
        with tts_model.mimi.streaming(1), torch.no_grad():
            pcms = []
            # The first `delay_steps` frames precede the audio because of the
            # model's acoustic delay, so decoding starts after them.
            for frame in result.frames[tts_model.delay_steps :]:
                pcm = tts_model.mimi.decode(frame[:, 1:, :]).cpu().numpy()
                pcms.append(np.clip(pcm[0, 0], -1, 1))
            pcm = np.concatenate(pcms, axis=-1)
        sphn.write_wav(args.out, pcm, tts_model.mimi.sample_rate)


if __name__ == "__main__":
    main()