diff --git a/README.md b/README.md index c67a138..da15108 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,8 @@ Delayed Streams Modeling (DSM) is a flexible formulation for streaming, multimod ## Speech To Text -### Leaderboard model -The leaderboard model handles english only, it has ~2.6B parameters. +### English-only model +The main model handles English only; it has ~2.6B parameters. #### PyTorch implementation [[Hugging Face]](https://huggingface.co/kyutai/stt-2.6B-en) @@ -55,6 +55,19 @@ The script simulates some real-time processing of the audio. Faster processing can be triggered by setting the real-time factor, e.g. `--rtf 500` will process the data as fast as possible. +### English + French model +This model has ~1B parameters and supports both English and French. + +#### Rust implementation +[[Hugging Face]](https://huggingface.co/kyutai/stt-1B-en_fr-candle) + +The only difference from the English-only model is the config file used when +launching the server. +```bash +moshi-server worker --config configs/config-stt-enfr-hf.toml +``` + + ## Text To Speech We're in the process of open-sourcing our TTS models. Check back for updates! 
diff --git a/configs/config-stt-enfr-hf.toml b/configs/config-stt-enfr-hf.toml
new file mode 100644
index 0000000..52cf40b
--- /dev/null
+++ b/configs/config-stt-enfr-hf.toml
@@ -0,0 +1,60 @@
+# Server configuration for the English + French speech-to-text model
+# (hf://kyutai/stt-1B-en_fr-candle). Start it with:
+#   moshi-server worker --config configs/config-stt-enfr-hf.toml
+
+# NOTE(review): the "tts" names below look carried over from a TTS config
+# although this file sets up ASR — confirm before renaming them.
+instance_name = "tts"
+# TOML does not expand "$HOME" itself; presumably the server does — verify.
+log_dir = "$HOME/tmp/tts-logs"
+static_dir = "./static/"
+# NOTE(review): "open_token" looks like a placeholder credential — confirm
+# it is replaced before the server is exposed publicly.
+authorized_ids = ["open_token"]
+
+[modules.asr]
+type = "BatchedAsr"
+path = "/api/asr-streaming"
+# Model weights and both tokenizers come from the same Hugging Face repo.
+lm_model_file = "hf://kyutai/stt-1B-en_fr-candle/model.safetensors"
+text_tokenizer_file = "hf://kyutai/stt-1B-en_fr-candle/tokenizer_en_fr_audio_8000.model"
+audio_tokenizer_file = "hf://kyutai/stt-1B-en_fr-candle/mimi-pytorch-e351c8d8@125.safetensors"
+batch_size = 64
+asr_delay_in_tokens = 6
+conditioning_learnt_padding = true
+temperature = 0.0
+
+[modules.asr.model]
+audio_codebooks = 32
+audio_vocab_size = 2049
+text_in_vocab_size = 8001
+text_out_vocab_size = 8000
+
+[modules.asr.model.transformer]
+# Sizes
+d_model = 2048
+dim_feedforward = 8192
+num_layers = 16
+num_heads = 16
+kv_repeat = 1
+# Layer structure
+causal = true
+norm_first = true
+norm = "RmsNorm"
+gating = "silu"
+bias_ff = false
+bias_attn = false
+# Positions and sequence limits
+positional_embedding = "Rope"
+max_period = 100_000
+context = 750
+max_seq_len = 40_960
+# Convolution options; use_conv_block is disabled here.
+use_conv_block = false
+use_conv_bias = true
+conv_layout = false
+conv_kernel_size = 3
+
+[modules.asr.model.extra_heads]
+dim = 6
+num_heads = 4