In [None]:
!pip install "moshi==0.2.7"

In [None]:
import argparse
import sys

import numpy as np
import sphn
import torch
from moshi.models.loaders import CheckpointInfo
from moshi.models.tts import DEFAULT_DSM_TTS_REPO, DEFAULT_DSM_TTS_VOICE_REPO, TTSModel

from IPython.display import display, Audio

In [None]:
# Configuration: edit these two values to change what is spoken and by whom.
# `voice` is later handed to `tts_model.get_voice_path`; the link printed here
# points at the repository to browse for alternative voices.
print(f"See https://huggingface.co/{DEFAULT_DSM_TTS_VOICE_REPO} for available voices.")
voice = "expresso/ex03-ex01_happy_001_channel1_334s.wav"
text = "Hey there! How are you? I had the craziest day today."

In [None]:
# Set everything up
# Load the checkpoint description from the default TTS repo and build the
# model on the CUDA device in half precision.
checkpoint_info = CheckpointInfo.from_hf_repo(DEFAULT_DSM_TTS_REPO)
tts_model = TTSModel.from_checkpoint_info(
    checkpoint_info,
    n_q=32,
    temp=0.6,
    device=torch.device("cuda"),
    dtype=torch.float16,
)
# Leave the Mimi codec in streaming mode from here on
# (the argument is presumably the batch size -- TODO confirm).
tts_model.mimi.streaming_forever(1)

# Voice conditioning.
# The CFG coefficient is supplied here because the model was trained with CFG
# distillation, so it is not _actually_ running CFG at inference time.
voice_path = tts_model.get_voice_path(voice)
condition_attributes = tts_model.make_condition_attributes(
    [voice_path],
    cfg_coef=2.0,
)

# Script preparation: the text is passed as a one-element list.
# NOTE(review): the previous comment here said that passing a list of several
# texts generates multiple audios at once -- confirm against `prepare_script`
# before relying on that.
entries = tts_model.prepare_script([text], padding_between=1)

In [None]:
print("Generating audio...")

# Decoded audio chunks in generation order; filled in by the callback below.
pcms = []


def _on_frame(frame):
    """Decode one generated frame to audio and append it to `pcms`.

    Passed to `tts_model.generate` as its `on_frame` callback. A frame that
    contains the value -1 anywhere is skipped. Index 0 along the second axis
    is dropped before decoding (presumably a non-audio stream -- TODO confirm).
    The decoded samples are clipped to [-1, 1] before being stored.
    """
    # Progress indicator; the carriage return keeps it on a single line.
    print("Step", len(pcms), end="\r")
    if (frame == -1).any():
        return
    decoded = tts_model.mimi.decode(frame[:, 1:, :])
    # Keep only the first item / first channel of the decoded output.
    chunk = decoded.cpu().numpy()[0, 0]
    pcms.append(np.clip(chunk, -1, 1))


result = tts_model.generate(
    [entries],
    [condition_attributes],
    on_frame=_on_frame,
)

print("Done generating.")
# Join the per-frame chunks end to end along the last axis.
audio = np.concatenate(pcms, axis=-1)

In [None]:
# Render an inline player for the generated waveform at the codec's sample
# rate; playback starts automatically.
display(Audio(audio, rate=tts_model.mimi.sample_rate, autoplay=True))