Config tweaks.

laurent 2025-06-17 11:59:33 +02:00
parent a786ad8a0b
commit f87b8f1e6f
3 changed files with 11 additions and 11 deletions

File 1 of 3: README

@@ -7,7 +7,7 @@ Delayed Streams Modeling (DSM) is a flexible formulation for streaming, multimod
 The main model handles English only; it has ~2.6B parameters.
 #### PyTorch implementation
-[[Hugging Face]](https://huggingface.co/kyutai/stt-2.6B-en)
+[[Hugging Face]](https://huggingface.co/kyutai/stt-2.6b-en)
 ```bash
 # wget https://github.com/metavoiceio/metavoice-src/raw/main/assets/bria.mp3
@@ -15,15 +15,15 @@ python -m moshi.run_inference --hf-repo kyutai/stt-2.6B-en bria.mp3
 ```
 #### MLX implementation
-[[Hugging Face]](https://huggingface.co/kyutai/stt-2.6B-en-mlx)
+[[Hugging Face]](https://huggingface.co/kyutai/stt-2.6b-en-mlx)
 ```bash
 # wget https://github.com/metavoiceio/metavoice-src/raw/main/assets/bria.mp3
-python -m moshi_mlx.run_inference --hf-repo kyutai/stt-2.6B-en-mlx bria.mp3 --temp 0
+python -m moshi_mlx.run_inference --hf-repo kyutai/stt-2.6b-en-mlx bria.mp3 --temp 0
 ```
 #### Rust implementation
-[[Hugging Face]](https://huggingface.co/kyutai/stt-2.6B-en-candle)
+[[Hugging Face]](https://huggingface.co/kyutai/stt-2.6b-en-candle)
 The Rust implementation provides a server that can process multiple streaming
 queries in parallel. Depending on the amount of memory on your GPU, you may
@@ -59,7 +59,7 @@ the data as fast as possible.
 This model has ~1B parameters and supports both English and French.
 #### Rust implementation
-[[Hugging Face]](https://huggingface.co/kyutai/stt-1B-en_fr-candle)
+[[Hugging Face]](https://huggingface.co/kyutai/stt-1b-en_fr-candle)
 The only difference from the English-only model is the config file used when
 launching the server.
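The Rust server referenced above is what the two config files below feed. A quick way to sanity-check this commit is to launch the server against one of the updated configs. This is a minimal sketch only: the `moshi-server` crate name, its `worker` subcommand, and the config path are assumptions, since the actual file names are not shown in this diff view.

```bash
# Sketch, not a confirmed invocation: the crate, subcommand, and config path
# below are assumptions; substitute the repo's real config file.
cargo install moshi-server
moshi-server worker --config configs/config-stt-en_fr-hf.toml
```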

File 2 of 3: server config (TOML) for the 1B en/fr model

@@ -6,9 +6,9 @@ authorized_ids = ["open_token"]
 [modules.asr]
 path = "/api/asr-streaming"
 type = "BatchedAsr"
-lm_model_file = "hf://kyutai/stt-1B-en_fr-candle/model.safetensors"
-text_tokenizer_file = "hf://kyutai/stt-1B-en_fr-candle/tokenizer_en_fr_audio_8000.model"
-audio_tokenizer_file = "hf://kyutai/stt-1B-en_fr-candle/mimi-pytorch-e351c8d8@125.safetensors"
+lm_model_file = "hf://kyutai/stt-1b-en_fr-candle/model.safetensors"
+text_tokenizer_file = "hf://kyutai/stt-1b-en_fr-candle/tokenizer_en_fr_audio_8000.model"
+audio_tokenizer_file = "hf://kyutai/stt-1b-en_fr-candle/mimi-pytorch-e351c8d8@125.safetensors"
 asr_delay_in_tokens = 6
 batch_size = 64
 conditioning_learnt_padding = true
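The unchanged `asr_delay_in_tokens = 6` line is the "delayed streams" knob from the DSM formulation: the text stream trails the audio stream by six tokens. As a back-of-the-envelope check, assuming the 12.5 Hz frame rate of the Mimi audio codec these models build on (an assumption not stated in this diff), that works out to roughly half a second of lookahead:

```bash
# 6 tokens of delay at an assumed 12.5 tokens per second of audio.
python -c "print(f'{6 / 12.5:.2f} s')"  # -> 0.48 s
```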

File 3 of 3: server config (TOML) for the 2.6B en model

@@ -6,9 +6,9 @@ authorized_ids = ["open_token"]
 [modules.asr]
 path = "/api/asr-streaming"
 type = "BatchedAsr"
-lm_model_file = "hf://kyutai/stt-2.6B-en-candle/model.safetensors"
-text_tokenizer_file = "hf://kyutai/stt-2.6B-en-candle/tokenizer_en_audio_4000.model"
-audio_tokenizer_file = "hf://kyutai/stt-2.6B-en-candle/mimi-pytorch-e351c8d8@125.safetensors"
+lm_model_file = "hf://kyutai/stt-2.6b-en-candle/model.safetensors"
+text_tokenizer_file = "hf://kyutai/stt-2.6b-en-candle/tokenizer_en_audio_4000.model"
+audio_tokenizer_file = "hf://kyutai/stt-2.6b-en-candle/mimi-pytorch-e351c8d8@125.safetensors"
 asr_delay_in_tokens = 6
 batch_size = 16
 conditioning_learnt_padding = true
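Aside from the repo names, the only substantive difference between the two configs is `batch_size`: 64 for the ~1B model versus 16 for the ~2.6B model, consistent with the README's note that GPU memory bounds how many streaming queries the server can batch in parallel. Once a server is running, a connectivity check against the endpoint might look like the sketch below; the `path` and token come straight from the configs above, while the host, port, auth header name, and the use of websocat are all assumptions.

```bash
# Hypothetical smoke test: host, port, and the auth header name are guesses;
# the path and token are taken from the configs above.
websocat -H "kyutai-api-key: open_token" "ws://localhost:8080/api/asr-streaming"
```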