From ae575a04c693fa8a3d10b204faa3f27538c33737 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Wed, 2 Jul 2025 18:59:04 +0200 Subject: [PATCH 1/5] Handle stdin in the mlx tts example. (#31) --- scripts/tts_mlx.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/scripts/tts_mlx.py b/scripts/tts_mlx.py index 050fce5..70dca65 100644 --- a/scripts/tts_mlx.py +++ b/scripts/tts_mlx.py @@ -10,8 +10,8 @@ import argparse import json -from pathlib import Path import queue +import sys import time import numpy as np @@ -105,8 +105,13 @@ def main(): mimi = tts_model.mimi log("info", f"reading input from {args.inp}") - with open(args.inp, "r") as fobj: - text_to_tts = fobj.read().strip() + if args.inp == "-": + if sys.stdin.isatty(): # Interactive + print("Enter text to synthesize (Ctrl+D to end input):") + text_to_tts = sys.stdin.read().strip() + else: + with open(args.inp, "r") as fobj: + text_to_tts = fobj.read().strip() all_entries = [tts_model.prepare_script([text_to_tts])] if tts_model.multi_speaker: From 20cf8d7365352d1336e152037b88623696e8556f Mon Sep 17 00:00:00 2001 From: laurent Date: Thu, 3 Jul 2025 07:43:56 +0200 Subject: [PATCH 2/5] Collapsible sections. --- README.md | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 364d6c2..73df270 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,8 @@ Here is how to choose which one to use: MLX is Apple's ML framework that allows you to use hardware acceleration on Apple silicon. If you want to run the model on a Mac or an iPhone, choose the MLX implementation. -### PyTorch implementation +
+PyTorch implementation
 
 Hugging Face
 
 
@@ -99,8 +100,10 @@ In the heart of an ancient forest, where the trees whispered secrets of the past
 
 Apart from nudging the model for a specific spelling of a word, other potential use cases include speaker adaptation and steering the model towards a specific formatting style or even a language. However, please bear in mind that this is an experimental feature and its behavior is very sensitive to the prompt provided.
+
-### Rust server +
+Rust server
 
 Hugging Face
 
 
@@ -143,8 +146,10 @@ The script limits the decoding speed to simulate real-time processing of the au
 
 Faster processing can be triggered by setting the real-time factor, e.g. `--rtf 1000` will process the data as fast as possible.
+
-### Rust standalone +
+Rust standalone Hugging Face @@ -157,8 +162,10 @@ cargo run --features cuda -r -- audio/bria.mp3 ``` You can get the timestamps by adding the `--timestamps` flag, and see the output of the semantic VAD by adding the `--vad` flag. +
-### MLX implementation +
+MLX implementation
 
 Hugging Face
 
 
@@ -187,6 +194,7 @@ python scripts/stt_from_mic_mlx.py
 
 The MLX models can also be used in Swift using the [moshi-swift
 codebase](https://github.com/kyutai-labs/moshi-swift); the 1B model has been
 tested to work fine on an iPhone 16 Pro.
+
## Kyutai Text-to-Speech @@ -200,7 +208,8 @@ We provide different implementations of Kyutai TTS for different use cases. Here - Rust: for production. If you want to serve Kyutai TTS in a production setting, use our Rust server. Our robust Rust server provides streaming access to the model over websockets. We use this server to run Unmute. - MLX: for on-device inference on iPhone and Mac. MLX is Apple's ML framework that allows you to use hardware acceleration on Apple silicon. If you want to run the model on a Mac or an iPhone, choose the MLX implementation. -### PyTorch implementation +
+PyTorch implementation Open In Colab @@ -219,12 +228,16 @@ python scripts/tts_pytorch.py text_to_say.txt audio_output.wav This requires the [moshi package](https://pypi.org/project/moshi/), which can be installed via pip. If you have [uv](https://docs.astral.sh/uv/) installed, you can skip the installation step and just prefix the command above with `uvx --with moshi`. +
-### Rust server +
+Rust server Example coming soon. +
-### MLX implementation +
+MLX implementation [MLX](https://ml-explore.github.io/mlx/build/html/index.html) is Apple's ML framework that allows you to use hardware acceleration on Apple silicon. @@ -243,6 +256,7 @@ python scripts/tts_mlx.py text_to_say.txt audio_output.wav This requires the [moshi-mlx package](https://pypi.org/project/moshi-mlx/), which can be installed via pip. If you have [uv](https://docs.astral.sh/uv/) installed, you can skip the installation step and just prefix the command above with `uvx --with moshi-mlx`. +
## License @@ -262,4 +276,4 @@ pip install pre-commit pre-commit install ``` -If you're using `uv`, you can replace the two commands with `uvx pre-commit install`. \ No newline at end of file +If you're using `uv`, you can replace the two commands with `uvx pre-commit install`. From 236df522b848476a5aeca8617ae9ed43d0cae107 Mon Sep 17 00:00:00 2001 From: laurent Date: Thu, 3 Jul 2025 07:47:16 +0200 Subject: [PATCH 3/5] Add some links. --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 73df270..a397045 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,9 @@ a flexible formulation for streaming, multimodal sequence-to-sequence learning.
Hugging Face
 
+
+  Open In Colab
+
 
 **More details can be found on the [project page](https://kyutai.org/next/stt).**
 
@@ -198,6 +201,9 @@ tested to work fine on an iPhone 16 Pro.
 
 ## Kyutai Text-to-Speech
 
+
+  Hugging Face
+
 
 Open In Colab
 

From 6c1e9f12cf5a84972a2cab225f32e680d9833a80 Mon Sep 17 00:00:00 2001
From: laurent
Date: Thu, 3 Jul 2025 07:52:27 +0200
Subject: [PATCH 4/5] Mention the MLX quantization.

---
 README.md | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index a397045..7954612 100644
--- a/README.md
+++ b/README.md
@@ -250,10 +250,13 @@ hardware acceleration on Apple silicon.
 
 Use our example script to run Kyutai TTS on MLX.
 The script takes text from stdin or a file and can output to a file or stream
 the resulting audio.
+When streaming the output, if the model is not fast enough to keep up with
+real-time, you can use the `--quantize 8` or `--quantize 4` flags to quantize
+the model, resulting in faster inference.
 
 ```bash
 # From stdin, plays audio immediately
-echo "Hey, how are you?" | python scripts/tts_mlx.py - -
+echo "Hey, how are you?" | python scripts/tts_mlx.py - - --quantize 8
 
 # From text file to audio file
 python scripts/tts_mlx.py text_to_say.txt audio_output.wav

From d92e4c26954b4bc8642f37fb0450f7be5033bba7 Mon Sep 17 00:00:00 2001
From: Laurent Mazare
Date: Thu, 3 Jul 2025 09:29:04 +0200
Subject: [PATCH 5/5] Use the on_frame callback in the mlx tts example. (#34)

---
 scripts/tts_mlx.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/scripts/tts_mlx.py b/scripts/tts_mlx.py
index 70dca65..9d89295 100644
--- a/scripts/tts_mlx.py
+++ b/scripts/tts_mlx.py
@@ -2,7 +2,7 @@
 # requires-python = ">=3.12"
 # dependencies = [
 #     "huggingface_hub",
-#     "moshi_mlx>=0.2.8",
+#     "moshi_mlx @ git+https://git@github.com/kyutai-labs/moshi#egg=moshi_mlx&subdirectory=moshi_mlx",
 #     "numpy",
 #     "sounddevice",
 # ]
@@ -121,10 +121,10 @@ def main():
     all_attributes = [tts_model.make_condition_attributes(voices, cfg_coef_conditioning)]
 
     wav_frames = queue.Queue()
 
-    def _on_audio_hook(audio_tokens):
-        if (audio_tokens == -1).any():
+    def _on_frame(frame):
+        if (frame == -1).any():
             return
-        _pcm = tts_model.mimi.decode_step(audio_tokens[None, :, None])
+        _pcm = tts_model.mimi.decode_step(frame[:, :, None])
         _pcm = np.array(mx.clip(_pcm[0, 0], -1, 1))
         wav_frames.put_nowait(_pcm)
 
@@ -136,7 +136,7 @@ def main():
         all_attributes,
         cfg_is_no_prefix=cfg_is_no_prefix,
         cfg_is_no_text=cfg_is_no_text,
-        on_audio_hook=_on_audio_hook,
+        on_frame=_on_frame,
     )
     frames = mx.concat(result.frames, axis=-1)
     total_duration = frames.shape[0] * frames.shape[-1] / mimi.frame_rate
@@ -163,6 +163,7 @@ def main():
             break
         time.sleep(1)
     else:
+        run()
         frames = []
        while True:
             try:
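
In PATCH 5/5 above, the `_on_frame` callback pushes one decoded PCM chunk onto the `wav_frames` queue per generated frame, and the streaming branch of the script drains that queue for live playback. The sketch below illustrates that consumer pattern only; it assumes a 24 kHz sample rate and 1920-sample (80 ms) frames, and `audio_callback` is an illustrative name, not part of the patch.

```python
import queue

import sounddevice as sd

wav_frames: queue.Queue = queue.Queue()  # filled by _on_frame via put_nowait(_pcm)


def audio_callback(outdata, frames, time_info, status):
    # Copy one decoded mimi frame into the output block, or play silence
    # if generation has fallen behind real-time.
    try:
        outdata[:, 0] = wav_frames.get(block=False)
    except queue.Empty:
        outdata[:] = 0.0


# blocksize matches the assumed frame size: 24000 Hz at 12.5 frames per second.
with sd.OutputStream(samplerate=24000, blocksize=1920, channels=1,
                     callback=audio_callback):
    pass  # generation (the run() call added in this patch) would happen here
```

If playback still underruns, the `--quantize 8` / `--quantize 4` flags described in PATCH 4/5 may speed up inference enough to keep the queue filled on slower machines.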