Merge branch 'main' of github.com:kyutai-labs/delayed-streams-modeling into vv/rust-example
commit 1dfae2747f

README.md | 41
@@ -11,6 +11,9 @@ a flexible formulation for streaming, multimodal sequence-to-sequence learning.
 <a href="https://huggingface.co/collections/kyutai/speech-to-text-685403682cf8a23ab9466886" target="_blank" style="margin: 2px;">
   <img alt="Hugging Face" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-KyutaiSTT-blue" style="display: inline-block; vertical-align: middle;"/>
 </a>
+<a target="_blank" href="https://colab.research.google.com/github/kyutai-labs/delayed-streams-modeling/blob/main/stt_pytorch.ipynb">
+  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
+</a>
 
 **More details can be found on the [project page](https://kyutai.org/next/stt).**
 
@@ -44,7 +47,8 @@ Here is how to choose which one to use:
 MLX is Apple's ML framework that allows you to use hardware acceleration on Apple silicon.
 If you want to run the model on a Mac or an iPhone, choose the MLX implementation.
 
-### PyTorch implementation
+<details>
+<summary>PyTorch implementation</summary>
 <a href="https://huggingface.co/kyutai/stt-2.6b-en" target="_blank" style="margin: 2px;">
   <img alt="Hugging Face" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-blue" style="display: inline-block; vertical-align: middle;"/>
 </a>
@@ -99,8 +103,10 @@ In the heart of an ancient forest, where the trees whispered secrets of the past
 
 Apart from nudging the model towards a specific spelling of a word, other potential use cases include speaker adaptation and steering the model towards a specific formatting style, or even a different language.
 However, please bear in mind that this is an experimental feature and its behavior is very sensitive to the prompt provided.
+</details>
 
-### Rust server
+<details>
+<summary>Rust server</summary>
 
 <a href="https://huggingface.co/kyutai/stt-2.6b-en-candle" target="_blank" style="margin: 2px;">
   <img alt="Hugging Face" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-blue" style="display: inline-block; vertical-align: middle;"/>
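For concreteness, prompt-based steering might look like the sketch below; both the script name and the flag are hypothetical placeholders (neither is shown in this diff), so check the actual script's `--help` for the real option:

```bash
# Hypothetical sketch: <stt_script.py> and --prompt are placeholders only.
# The idea: a text prefix nudges the model towards a spelling, format, or language.
python <stt_script.py> audio/bria.mp3 --prompt "Kyutai, Moshi, Mimi"
```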
@@ -143,8 +149,10 @@ The script limits the decoding speed to simulate real-time processing of the audio.
 Faster processing can be triggered by setting
 the real-time factor, e.g. `--rtf 1000` will process
 the data as fast as possible.
+</details>
 
-### Rust standalone
+<details>
+<summary>Rust standalone</summary>
 <a href="https://huggingface.co/kyutai/stt-2.6b-en-candle" target="_blank" style="margin: 2px;">
   <img alt="Hugging Face" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-blue" style="display: inline-block; vertical-align: middle;"/>
 </a>
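As a sketch of the `--rtf` option described above (the transcription script's name is not visible in this hunk, so the placeholder is an assumption):

```bash
# Sketch: --rtf 1000 processes the input as fast as possible
# instead of simulating real-time decoding.
python <stt_script.py> audio/bria.mp3 --rtf 1000
```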
@@ -157,8 +165,10 @@ cargo run --features cuda -r -- audio/bria.mp3
 ```
 You can get the timestamps by adding the `--timestamps` flag, and see the output
 of the semantic VAD by adding the `--vad` flag.
+</details>
 
-### MLX implementation
+<details>
+<summary>MLX implementation</summary>
 <a href="https://huggingface.co/kyutai/stt-2.6b-en-mlx" target="_blank" style="margin: 2px;">
   <img alt="Hugging Face" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-blue" style="display: inline-block; vertical-align: middle;"/>
 </a>
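Combining the base command from this hunk's header with the two flags it documents (passing both in one run is assumed to be supported):

```bash
# Standalone Rust decoder with word timestamps and semantic VAD output.
cargo run --features cuda -r -- audio/bria.mp3 --timestamps --vad
```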
@@ -187,9 +197,13 @@ python scripts/stt_from_mic_mlx.py
 The MLX models can also be used in Swift using the [moshi-swift
 codebase](https://github.com/kyutai-labs/moshi-swift); the 1B model has been
 tested to work fine on an iPhone 16 Pro.
+</details>
 
 ## Kyutai Text-to-Speech
 
+<a href="https://huggingface.co/collections/kyutai/text-to-speech-6866192e7e004ed04fd39e29" target="_blank" style="margin: 2px;">
+  <img alt="Hugging Face" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-KyutaiTTS-blue" style="display: inline-block; vertical-align: middle;"/>
+</a>
 <a target="_blank" href="https://colab.research.google.com/github/kyutai-labs/delayed-streams-modeling/blob/main/stt_pytorch.ipynb">
   <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
 </a>
@@ -200,7 +214,8 @@ We provide different implementations of Kyutai TTS for different use cases. Here
 - Rust: for production. If you want to serve Kyutai TTS in a production setting, use our robust Rust server, which provides streaming access to the model over websockets. We use this server to run Unmute.
 - MLX: for on-device inference on iPhone and Mac. MLX is Apple's ML framework that allows you to use hardware acceleration on Apple silicon. If you want to run the model on a Mac or an iPhone, choose the MLX implementation.
 
-### PyTorch implementation
+<details>
+<summary>PyTorch implementation</summary>
 
 <a target="_blank" href="https://colab.research.google.com/github/kyutai-labs/delayed-streams-modeling/blob/main/tts_pytorch.ipynb">
   <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
@@ -219,8 +234,10 @@ python scripts/tts_pytorch.py text_to_say.txt audio_output.wav
 This requires the [moshi package](https://pypi.org/project/moshi/), which can be installed via pip.
 If you have [uv](https://docs.astral.sh/uv/) installed, you can skip the installation step
 and just prefix the command above with `uvx --with moshi`.
+</details>
 
-### Rust server
+<details>
+<summary>Rust server</summary>
 
 
 The Rust implementation provides a server that can process multiple streaming
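Spelling out the uv shortcut mentioned in the hunk above:

```bash
# Same PyTorch TTS command, run through uvx without installing moshi first.
uvx --with moshi python scripts/tts_pytorch.py text_to_say.txt audio_output.wav
```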
@@ -251,18 +268,23 @@ echo "Hey, how are you?" | python scripts/tts_rust_server.py - -
 # From text file to audio file
 python scripts/tts_rust_server.py text_to_say.txt audio_output.wav
 ```
+</details>
 
-### MLX implementation
+<details>
+<summary>MLX implementation</summary>
 
 [MLX](https://ml-explore.github.io/mlx/build/html/index.html) is Apple's ML framework that allows you to use
 hardware acceleration on Apple silicon.
 
 Use our example script to run Kyutai TTS on MLX.
 The script takes text from stdin or a file and can output to a file or stream the resulting audio.
+When streaming the output, if the model is not fast enough to keep up with
+real-time, you can use the `--quantize 8` or `--quantize 4` flags to quantize
+the model, resulting in faster inference.
 
 ```bash
 # From stdin, plays audio immediately
-echo "Hey, how are you?" | python scripts/tts_mlx.py - -
+echo "Hey, how are you?" | python scripts/tts_mlx.py - - --quantize 8
 
 # From text file to audio file
 python scripts/tts_mlx.py text_to_say.txt audio_output.wav
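The 4-bit setting mentioned in the new text follows the same pattern:

```bash
# 4-bit quantization: faster and smaller than 8-bit, potentially at some cost in quality.
echo "Hey, how are you?" | python scripts/tts_mlx.py - - --quantize 4
```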
@@ -271,6 +293,7 @@ python scripts/tts_mlx.py text_to_say.txt audio_output.wav
 This requires the [moshi-mlx package](https://pypi.org/project/moshi-mlx/), which can be installed via pip.
 If you have [uv](https://docs.astral.sh/uv/) installed, you can skip the installation step
 and just prefix the command above with `uvx --with moshi-mlx`.
+</details>
 
 ## License
 
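As with the PyTorch variant, the uv form of the command above is:

```bash
# MLX TTS via uvx, no prior moshi-mlx installation needed.
uvx --with moshi-mlx python scripts/tts_mlx.py text_to_say.txt audio_output.wav
```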
@@ -290,4 +313,4 @@ pip install pre-commit
 pre-commit install
 ```
 
 If you're using `uv`, you can replace the two commands with `uvx pre-commit install`.
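Spelled out, the two equivalent setups are:

```bash
# Without uv:
pip install pre-commit
pre-commit install

# With uv, as a single command:
uvx pre-commit install
```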
@@ -2,7 +2,7 @@
 # requires-python = ">=3.12"
 # dependencies = [
 #     "huggingface_hub",
-#     "moshi_mlx>=0.2.8",
+#     "moshi_mlx @ git+https://git@github.com/kyutai-labs/moshi#egg=moshi_mlx&subdirectory=moshi_mlx",
 #     "numpy",
 #     "sounddevice",
 # ]
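The lines above are PEP 723 inline script metadata, which uv resolves automatically when the script is run directly. A sketch, assuming the file is `scripts/tts_mlx.py` (the filename is not shown in this diff):

```bash
# uv reads the inline dependency block and fetches the git version of
# moshi_mlx before running. The script path here is an assumption.
uv run scripts/tts_mlx.py text_to_say.txt audio_output.wav
```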
@@ -142,10 +142,10 @@ def main():
 
     wav_frames = queue.Queue()
 
-    def _on_audio_hook(audio_tokens):
-        if (audio_tokens == -1).any():
+    def _on_frame(frame):
+        if (frame == -1).any():
             return
-        _pcm = tts_model.mimi.decode_step(audio_tokens[None, :, None])
+        _pcm = tts_model.mimi.decode_step(frame[:, :, None])
         _pcm = np.array(mx.clip(_pcm[0, 0], -1, 1))
         wav_frames.put_nowait(_pcm)
 
@@ -157,7 +157,7 @@ def main():
         all_attributes,
         cfg_is_no_prefix=cfg_is_no_prefix,
         cfg_is_no_text=cfg_is_no_text,
-        on_audio_hook=_on_audio_hook,
+        on_frame=_on_frame,
     )
     frames = mx.concat(result.frames, axis=-1)
     total_duration = frames.shape[0] * frames.shape[-1] / mimi.frame_rate
@@ -188,6 +188,7 @@ def main():
                 break
             time.sleep(1)
     else:
+        run()
         frames = []
         while True:
            try: