From ae575a04c693fa8a3d10b204faa3f27538c33737 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Wed, 2 Jul 2025 18:59:04 +0200 Subject: [PATCH 1/5] Handle stdin in the mlx tts example. (#31) --- scripts/tts_mlx.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/scripts/tts_mlx.py b/scripts/tts_mlx.py index 050fce5..70dca65 100644 --- a/scripts/tts_mlx.py +++ b/scripts/tts_mlx.py @@ -10,8 +10,8 @@ import argparse import json -from pathlib import Path import queue +import sys import time import numpy as np @@ -105,8 +105,13 @@ def main(): mimi = tts_model.mimi log("info", f"reading input from {args.inp}") - with open(args.inp, "r") as fobj: - text_to_tts = fobj.read().strip() + if args.inp == "-": + if sys.stdin.isatty(): # Interactive + print("Enter text to synthesize (Ctrl+D to end input):") + text_to_tts = sys.stdin.read().strip() + else: + with open(args.inp, "r") as fobj: + text_to_tts = fobj.read().strip() all_entries = [tts_model.prepare_script([text_to_tts])] if tts_model.multi_speaker: From 20cf8d7365352d1336e152037b88623696e8556f Mon Sep 17 00:00:00 2001 From: laurent Date: Thu, 3 Jul 2025 07:43:56 +0200 Subject: [PATCH 2/5] Collapsible sections. --- README.md | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 364d6c2..73df270 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,8 @@ Here is how to choose which one to use: MLX is Apple's ML framework that allows you to use hardware acceleration on Apple silicon. If you want to run the model on a Mac or an iPhone, choose the MLX implementation. -### PyTorch implementation +
+PyTorch implementation
 
 Hugging Face
 
 
@@ -99,8 +100,10 @@ In the heart of an ancient forest, where the trees whispered secrets of the past
 
 Apart from nudging the model for a specific spelling of a word, other potential use cases include speaker adaptation and steering the model towards a specific formatting style or even a language. However, please bear in mind that this is an experimental feature and its behavior is very sensitive to the prompt provided.
+
-### Rust server +
+Rust server
 
 Hugging Face
 
 
@@ -143,8 +146,10 @@ The script limits the decoding speed to simulate real-time processing of the au
 
 Faster processing can be triggered by setting the real-time factor, e.g. `--rtf 1000` will process the data as fast as possible.
+
-### Rust standalone +
+Rust standalone Hugging Face @@ -157,8 +162,10 @@ cargo run --features cuda -r -- audio/bria.mp3 ``` You can get the timestamps by adding the `--timestamps` flag, and see the output of the semantic VAD by adding the `--vad` flag. +
-### MLX implementation +
+MLX implementation
 
 Hugging Face
 
 
@@ -187,6 +194,7 @@ python scripts/stt_from_mic_mlx.py
 
 The MLX models can also be used in Swift using the [moshi-swift
 codebase](https://github.com/kyutai-labs/moshi-swift); the 1B model has been
 tested to work fine on an iPhone 16 Pro.
+
## Kyutai Text-to-Speech @@ -200,7 +208,8 @@ We provide different implementations of Kyutai TTS for different use cases. Here - Rust: for production. If you want to serve Kyutai TTS in a production setting, use our Rust server. Our robust Rust server provides streaming access to the model over websockets. We use this server to run Unmute. - MLX: for on-device inference on iPhone and Mac. MLX is Apple's ML framework that allows you to use hardware acceleration on Apple silicon. If you want to run the model on a Mac or an iPhone, choose the MLX implementation. -### PyTorch implementation +
+PyTorch implementation Open In Colab @@ -219,12 +228,16 @@ python scripts/tts_pytorch.py text_to_say.txt audio_output.wav This requires the [moshi package](https://pypi.org/project/moshi/), which can be installed via pip. If you have [uv](https://docs.astral.sh/uv/) installed, you can skip the installation step and just prefix the command above with `uvx --with moshi`. +
-### Rust server +
+Rust server Example coming soon. +
-### MLX implementation +
+MLX implementation [MLX](https://ml-explore.github.io/mlx/build/html/index.html) is Apple's ML framework that allows you to use hardware acceleration on Apple silicon. @@ -243,6 +256,7 @@ python scripts/tts_mlx.py text_to_say.txt audio_output.wav This requires the [moshi-mlx package](https://pypi.org/project/moshi-mlx/), which can be installed via pip. If you have [uv](https://docs.astral.sh/uv/) installed, you can skip the installation step and just prefix the command above with `uvx --with moshi-mlx`. +
## License @@ -262,4 +276,4 @@ pip install pre-commit pre-commit install ``` -If you're using `uv`, you can replace the two commands with `uvx pre-commit install`. \ No newline at end of file +If you're using `uv`, you can replace the two commands with `uvx pre-commit install`. From 236df522b848476a5aeca8617ae9ed43d0cae107 Mon Sep 17 00:00:00 2001 From: laurent Date: Thu, 3 Jul 2025 07:47:16 +0200 Subject: [PATCH 3/5] Add some links. --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 73df270..a397045 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,9 @@ a flexible formulation for streaming, multimodal sequence-to-sequence learning.
Hugging Face
 
+
+  Open In Colab
+
 
 **More details can be found on the [project page](https://kyutai.org/next/stt).**
 
@@ -198,6 +201,9 @@ tested to work fine on an iPhone 16 Pro.
 
 ## Kyutai Text-to-Speech
 
+
+  Hugging Face
+
 
 Open In Colab
 

From 6c1e9f12cf5a84972a2cab225f32e680d9833a80 Mon Sep 17 00:00:00 2001
From: laurent
Date: Thu, 3 Jul 2025 07:52:27 +0200
Subject: [PATCH 4/5] Mention the MLX quantization.

---
 README.md | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index a397045..7954612 100644
--- a/README.md
+++ b/README.md
@@ -250,10 +250,13 @@ hardware acceleration on Apple silicon.
 
 Use our example script to run Kyutai TTS on MLX.
 The script takes text from stdin or a file and can output to a file or stream
 the resulting audio.
+When streaming the output, if the model is not fast enough to keep up with
+real-time, you can use the `--quantize 8` or `--quantize 4` flags to quantize
+the model, resulting in faster inference.
 
 ```bash
 # From stdin, plays audio immediately
-echo "Hey, how are you?" | python scripts/tts_mlx.py - -
+echo "Hey, how are you?" | python scripts/tts_mlx.py - - --quantize 8
 
 # From text file to audio file
 python scripts/tts_mlx.py text_to_say.txt audio_output.wav

From d92e4c26954b4bc8642f37fb0450f7be5033bba7 Mon Sep 17 00:00:00 2001
From: Laurent Mazare
Date: Thu, 3 Jul 2025 09:29:04 +0200
Subject: [PATCH 5/5] Use the on_frame callback in the mlx tts example. (#34)

---
 scripts/tts_mlx.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/scripts/tts_mlx.py b/scripts/tts_mlx.py
index 70dca65..9d89295 100644
--- a/scripts/tts_mlx.py
+++ b/scripts/tts_mlx.py
@@ -2,7 +2,7 @@
 # requires-python = ">=3.12"
 # dependencies = [
 #     "huggingface_hub",
-#     "moshi_mlx>=0.2.8",
+#     "moshi_mlx @ git+https://git@github.com/kyutai-labs/moshi#egg=moshi_mlx&subdirectory=moshi_mlx",
 #     "numpy",
 #     "sounddevice",
 # ]
@@ -121,10 +121,10 @@ def main():
     all_attributes = [tts_model.make_condition_attributes(voices, cfg_coef_conditioning)]
 
     wav_frames = queue.Queue()
 
-    def _on_audio_hook(audio_tokens):
-        if (audio_tokens == -1).any():
+    def _on_frame(frame):
+        if (frame == -1).any():
             return
-        _pcm = tts_model.mimi.decode_step(audio_tokens[None, :, None])
+        _pcm = tts_model.mimi.decode_step(frame[:, :, None])
         _pcm = np.array(mx.clip(_pcm[0, 0], -1, 1))
         wav_frames.put_nowait(_pcm)
 
@@ -136,7 +136,7 @@ def main():
         all_attributes,
         cfg_is_no_prefix=cfg_is_no_prefix,
         cfg_is_no_text=cfg_is_no_text,
-        on_audio_hook=_on_audio_hook,
+        on_frame=_on_frame,
     )
     frames = mx.concat(result.frames, axis=-1)
     total_duration = frames.shape[0] * frames.shape[-1] / mimi.frame_rate
@@ -163,6 +163,7 @@ def main():
             break
         time.sleep(1)
     else:
+        run()
         frames = []
        while True:
             try:
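
In PATCH 5/5 above, the `_on_frame` callback pushes one decoded PCM chunk onto the `wav_frames` queue per generated frame, and the streaming branch of the script drains that queue for live playback. The sketch below illustrates that consumer pattern only; it assumes a 24 kHz sample rate and 1920-sample (80 ms) frames, and `audio_callback` is an illustrative name, not part of the patch.

```python
import queue

import sounddevice as sd

wav_frames: queue.Queue = queue.Queue()  # filled by _on_frame via put_nowait(_pcm)


def audio_callback(outdata, frames, time_info, status):
    # Copy one decoded mimi frame into the output block, or play silence
    # if generation has fallen behind real-time.
    try:
        outdata[:, 0] = wav_frames.get(block=False)
    except queue.Empty:
        outdata[:] = 0.0


# blocksize matches the assumed frame size: 24000 Hz at 12.5 frames per second.
with sd.OutputStream(samplerate=24000, blocksize=1920, channels=1,
                     callback=audio_callback):
    pass  # generation (the run() call added in this patch) would happen here
```

If playback still underruns, the `--quantize 8` / `--quantize 4` flags described in PATCH 4/5 may speed up inference enough to keep the queue filled on slower machines.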