In [None]:
!pip install git+https://git@github.com/kyutai-labs/moshi#egg=moshi&subdirectory=moshi

In [None]:
import argparse
import sys

import numpy as np
import sphn
import torch
from moshi.models.loaders import CheckpointInfo
from moshi.models.tts import DEFAULT_DSM_TTS_REPO, DEFAULT_DSM_TTS_VOICE_REPO, TTSModel

from IPython.display import display, Audio

In [None]:
# Configuration: pick the voice to use and the text to speak.
print(f"See https://huggingface.co/datasets/{DEFAULT_DSM_TTS_VOICE_REPO} for available voices.")
voice = "expresso/ex03-ex01_happy_001_channel1_334s.wav"
text = "Hey there! How are you? I had the craziest day today."

In [None]:
# Set everything up: load the checkpoint info for the default TTS repo,
# then build the model from it.
checkpoint_info = CheckpointInfo.from_hf_repo(DEFAULT_DSM_TTS_REPO)
tts_model = TTSModel.from_checkpoint_info(
    checkpoint_info,
    n_q=32,
    temp=0.6,
    device=torch.device("cuda"),  # hard-coded: this cell expects a CUDA device
    dtype=torch.half,
)

# Voice conditioning. The CFG coefficient is given here, with the conditioning,
# because the model was trained with CFG distillation, so it's not _actually_
# doing CFG at inference time.
voice_path = tts_model.get_voice_path(voice)
condition_attributes = tts_model.make_condition_attributes(
    [voice_path], cfg_coef=2.0
)

# Only one text is prepared here, but you could also generate multiple audios
# at once by passing a list of texts.
entries = tts_model.prepare_script([text], padding_between=1)

In [None]:
print("Generating audio...")

# This doesn't do streaming generation: `generate` is called once for the whole
# script, and all the resulting frames are decoded together below.
result = tts_model.generate([entries], [condition_attributes])

with torch.no_grad():
    # Join the generated frames along the last dimension.
    frames = torch.cat(result.frames, dim=-1)
    # Keep only the entries from `audio_offset` onwards along dimension 1, and skip
    # the first `delay_steps` positions along the last dimension.
    audio_tokens = frames[:, tts_model.lm.audio_offset :, tts_model.delay_steps :]
    audios = tts_model.mimi.decode(audio_tokens)

# A single script was generated, so take the first (only) item of the batch.
audio = audios[0].cpu().numpy()

In [None]:
# Render an audio player for the generated waveform; playback starts automatically.
display(Audio(audio, rate=tts_model.mimi.sample_rate, autoplay=True))