Fix the pytorch tts streaming example. (#84)

* Fix the pytorch tts streaming example.

* Edit the readme too.
This commit is contained in:
Laurent Mazare 2025-07-16 21:07:02 +02:00 committed by GitHub
parent 66a33c989f
commit a2f031deb5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 28 additions and 4 deletions

View File

@ -237,6 +237,14 @@ echo "Hey, how are you?" | python scripts/tts_pytorch.py - -
python scripts/tts_pytorch.py text_to_say.txt audio_output.wav
```
The `tts_pytorch.py` script waits for all the text to be available before
starting the audio generation. A fully streaming implementation is available in
the `tts_pytorch_streaming.py` script, which can be used as follows:
```bash
echo "Hey, how are you?" | python scripts/tts_pytorch_streaming.py audio_output.wav
```
This requires the [moshi package](https://pypi.org/project/moshi/), which can be installed via pip.
If you have [uv](https://docs.astral.sh/uv/) installed, you can skip the installation step
and just prefix the command above with `uvx --with moshi`.

View File

@ -21,10 +21,24 @@ from moshi.models.loaders import CheckpointInfo
from moshi.conditioners import dropout_all_conditions
from moshi.models.lm import LMGen
from moshi.models.tts import (
Entry,
DEFAULT_DSM_TTS_REPO,
DEFAULT_DSM_TTS_VOICE_REPO,
TTSModel,
ConditionAttributes,
script_to_entries,
)
def prepare_script(model: TTSModel, script: str, first_turn: bool) -> list[Entry]:
    """Convert one chunk of script text into TTS entries.

    Args:
        model: TTS model supplying the tokenizer, the machine token ids, the
            mimi frame rate and the ``multi_speaker`` flag.
        script: Text to convert; handed to ``script_to_entries`` as a
            single-element list.
        first_turn: Whether this is the first chunk fed to the generator.

    Returns:
        The entries produced by ``script_to_entries`` for ``script``.
    """
    # Multi-speaker mode is requested only for the first chunk, and only when
    # the model's ``multi_speaker`` flag is set.
    # NOTE(review): presumably this keeps the start-of-stream marker from
    # being emitted again on later lines -- confirm in script_to_entries.
    use_multi_speaker = first_turn and model.multi_speaker

    tokenizer = model.tokenizer
    token_ids = model.machine.token_ids
    frame_rate = model.mimi.frame_rate
    entries = script_to_entries(
        tokenizer,
        token_ids,
        frame_rate,
        [script],
        multi_speaker=use_multi_speaker,
        padding_between=1,
    )
    return entries
@ -206,9 +220,10 @@ def main():
channels=1,
callback=audio_callback,
) and tts_model.mimi.streaming(1):
first_turn = True
for line in sys.stdin:
# TODO: Fix the following to only include bos on the first line.
entries = tts_model.prepare_script([line.strip()], padding_between=1)
entries = prepare_script(tts_model, line.strip(), first_turn=first_turn)
first_turn = False
for entry in entries:
gen.append_entry(entry)
gen.process()
@ -227,9 +242,10 @@ def main():
gen = TTSGen(tts_model, [condition_attributes], on_frame=_on_frame)
with tts_model.mimi.streaming(1):
first_turn = True
for line in sys.stdin:
# TODO: Fix the following to only include bos on the first line.
entries = tts_model.prepare_script([line.strip()], padding_between=1)
entries = prepare_script(tts_model, line.strip(), first_turn=first_turn)
first_turn = False
for entry in entries:
gen.append_entry(entry)
gen.process()