Add the en-fr config.

parent ad618fd64d
commit a786ad8a0b

README.md (17 lines changed)
@@ -3,8 +3,8 @@ Delayed Streams Modeling (DSM) is a flexible formulation for streaming, multimodal
 
 ## Speech To Text
 
-### Leaderboard model
-The leaderboard model handles english only, it has ~2.6B parameters.
+### English only model
+The main model handles english only, it has ~2.6B parameters.
 
 #### PyTorch implementation
 [[Hugging Face]](https://huggingface.co/kyutai/stt-2.6B-en)
@@ -55,6 +55,19 @@ The script simulates some real-time processing of the audio. Faster processing
 can be triggered by setting the real-time factor, e.g. `--rtf 500` will process
 the data as fast as possible.
 
+### English + French model
+
+This model has ~1B parameters and supports both English and French.
+
+#### Rust implementation
+[[Hugging Face]](https://huggingface.co/kyutai/stt-1B-en_fr-candle)
+
+The only difference with the en only model is the config file used when
+launching the server.
+```bash
+moshi-server worker --config configs/config-stt-enfr-hf.toml
+```
+
 ## Text To Speech
 
 We're in the process of open-sourcing our TTS models. Check back for updates!
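The hunk above launches the server with the new config via `moshi-server worker --config configs/config-stt-enfr-hf.toml`. As a side note (not part of this commit), the weights that config points at via `hf://` URIs live in the `kyutai/stt-1B-en_fr-candle` repo on Hugging Face; below is a minimal sketch for pre-fetching them into the local cache with `huggingface_hub`, assuming the server resolves those URIs to the same repo and filenames at startup.

```python
# Sketch: pre-download the files that config-stt-enfr-hf.toml references with hf:// URIs.
# Assumes `pip install huggingface_hub`; the server resolves these URIs itself, so this
# step is optional and shown only for illustration.
from huggingface_hub import hf_hub_download

REPO = "kyutai/stt-1B-en_fr-candle"
FILES = [
    "model.safetensors",
    "tokenizer_en_fr_audio_8000.model",
    "mimi-pytorch-e351c8d8@125.safetensors",
]

for name in FILES:
    local_path = hf_hub_download(repo_id=REPO, filename=name)
    print(f"{name} -> {local_path}")
```

If the server shares the standard Hugging Face cache layout, the first start can then skip the download; otherwise this is purely illustrative.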
configs/config-stt-enfr-hf.toml (new file, 46 lines)

@@ -0,0 +1,46 @@
+static_dir = "./static/"
+log_dir = "$HOME/tmp/tts-logs"
+instance_name = "tts"
+authorized_ids = ["open_token"]
+
+[modules.asr]
+path = "/api/asr-streaming"
+type = "BatchedAsr"
+lm_model_file = "hf://kyutai/stt-1B-en_fr-candle/model.safetensors"
+text_tokenizer_file = "hf://kyutai/stt-1B-en_fr-candle/tokenizer_en_fr_audio_8000.model"
+audio_tokenizer_file = "hf://kyutai/stt-1B-en_fr-candle/mimi-pytorch-e351c8d8@125.safetensors"
+asr_delay_in_tokens = 6
+batch_size = 64
+conditioning_learnt_padding = true
+temperature = 0.0
+
+[modules.asr.model]
+audio_vocab_size = 2049
+text_in_vocab_size = 8001
+text_out_vocab_size = 8000
+audio_codebooks = 32
+
+[modules.asr.model.transformer]
+d_model = 2048
+num_heads = 16
+num_layers = 16
+dim_feedforward = 8192
+causal = true
+norm_first = true
+bias_ff = false
+bias_attn = false
+context = 750
+max_period = 100000
+use_conv_block = false
+use_conv_bias = true
+gating = "silu"
+norm = "RmsNorm"
+positional_embedding = "Rope"
+conv_layout = false
+conv_kernel_size = 3
+kv_repeat = 1
+max_seq_len = 40960
+
+[modules.asr.model.extra_heads]
+num_heads = 4
+dim = 6
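The README hunk above describes this model as having ~1B parameters. Below is a rough sanity check of that figure against the transformer and embedding sizes in this config; it is only an order-of-magnitude estimate, since the exact feed-forward gating factorization, the extra heads, and any other components are not modeled.

```python
# Rough parameter-count estimate from the values in configs/config-stt-enfr-hf.toml.
# Approximation only: gating and the extra heads are ignored, so this is just a
# sanity check against the "~1B parameters" figure in the README.
import tomllib  # Python 3.11+

with open("configs/config-stt-enfr-hf.toml", "rb") as f:
    cfg = tomllib.load(f)

model = cfg["modules"]["asr"]["model"]
tr = model["transformer"]

attn_per_layer = 4 * tr["d_model"] ** 2                    # q, k, v and output projections
ffn_per_layer = 2 * tr["d_model"] * tr["dim_feedforward"]  # up/down projections (gating shifts this slightly)
transformer = tr["num_layers"] * (attn_per_layer + ffn_per_layer)

embeddings = (
    model["audio_codebooks"] * model["audio_vocab_size"] * tr["d_model"]  # audio token embeddings
    + model["text_in_vocab_size"] * tr["d_model"]                         # text input embeddings
    + model["text_out_vocab_size"] * tr["d_model"]                        # text output head
)

total = transformer + embeddings
print(f"transformer ~ {transformer/1e6:.0f}M, embeddings ~ {embeddings/1e6:.0f}M, total ~ {total/1e9:.2f}B")
```

With the values above this comes out to roughly 0.8B for the transformer stack plus about 0.17B for the token embeddings and output head, around 0.97B in total, consistent with the ~1B figure.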