Add the en-fr config.
This commit is contained in:
parent
ad618fd64d
commit
a786ad8a0b
17
README.md
17
README.md
|
|
@ -3,8 +3,8 @@ Delayed Streams Modeling (DSM) is a flexible formulation for streaming, multimod
|
|||
|
||||
## Speech To Text
|
||||
|
||||
### Leaderboard model
|
||||
The leaderboard model handles english only, it has ~2.6B parameters.
|
||||
### English-only model
|
||||
The main model handles English only; it has ~2.6B parameters.
|
||||
|
||||
#### PyTorch implementation
|
||||
[[Hugging Face]](https://huggingface.co/kyutai/stt-2.6B-en)
|
||||
|
|
@ -55,6 +55,19 @@ The script simulates some real-time processing of the audio. Faster processing
|
|||
can be triggered by setting the real-time factor, e.g. `--rtf 500` will process
|
||||
the data as fast as possible.
|
||||
|
||||
### English + French model
|
||||
This model has ~1B parameters and supports both English and French.
|
||||
|
||||
#### Rust implementation
|
||||
[[Hugging Face]](https://huggingface.co/kyutai/stt-1B-en_fr-candle)
|
||||
|
||||
The only difference from the English-only model is the config file used when
|
||||
launching the server.
|
||||
```bash
|
||||
moshi-server worker --config configs/config-stt-enfr-hf.toml
|
||||
```
|
||||
|
||||
|
||||
## Text To Speech
|
||||
|
||||
We're in the process of open-sourcing our TTS models. Check back for updates!
|
||||
|
|
|
|||
46
configs/config-stt-enfr-hf.toml
Normal file
46
configs/config-stt-enfr-hf.toml
Normal file
|
|
@ -0,0 +1,46 @@
|
|||
# Top-level server settings (root table) for the English + French
# speech-to-text configuration.
# NOTE(review): log_dir and instance_name both say "tts" although this file
# configures the STT model — presumably carried over from a TTS config; confirm
# whether that is intentional before renaming.
static_dir = "./static/"
|
||||
# NOTE(review): TOML performs no variable expansion, so "$HOME" is a literal
# string here; presumably the server substitutes it at load time — confirm.
log_dir = "$HOME/tmp/tts-logs"
|
||||
instance_name = "tts"
|
||||
authorized_ids = ["open_token"]
|
||||
|
||||
# ASR module definition: a "BatchedAsr" module attached to the
# /api/asr-streaming path. The model weights and both tokenizers are referenced
# through hf:// URIs that all point into the kyutai/stt-1B-en_fr-candle
# repository.
[modules.asr]
|
||||
path = "/api/asr-streaming"
|
||||
type = "BatchedAsr"
|
||||
lm_model_file = "hf://kyutai/stt-1B-en_fr-candle/model.safetensors"
|
||||
text_tokenizer_file = "hf://kyutai/stt-1B-en_fr-candle/tokenizer_en_fr_audio_8000.model"
|
||||
audio_tokenizer_file = "hf://kyutai/stt-1B-en_fr-candle/mimi-pytorch-e351c8d8@125.safetensors"
|
||||
# Delay expressed in tokens (not seconds); the token rate is not stated in this file.
asr_delay_in_tokens = 6
|
||||
batch_size = 64
|
||||
conditioning_learnt_padding = true
|
||||
# NOTE(review): a temperature of 0.0 presumably selects greedy decoding — confirm.
temperature = 0.0
|
||||
|
||||
# Vocabulary sizes and audio codebook count for the ASR model.
# NOTE(review): text_in_vocab_size is one larger than text_out_vocab_size —
# presumably an extra input-only token; confirm against the model definition.
[modules.asr.model]
|
||||
audio_vocab_size = 2049
|
||||
text_in_vocab_size = 8001
|
||||
text_out_vocab_size = 8000
|
||||
audio_codebooks = 32
|
||||
|
||||
# Transformer hyperparameters for the ASR model: 16 layers, 16 heads,
# d_model 2048, feed-forward width 8192, RMS norm, rotary positional embedding.
# NOTE(review): use_conv_block is false, so use_conv_bias / conv_kernel_size are
# presumably unused — confirm before removing them.
[modules.asr.model.transformer]
|
||||
d_model = 2048
|
||||
num_heads = 16
|
||||
num_layers = 16
|
||||
dim_feedforward = 8192
|
||||
causal = true
|
||||
norm_first = true
|
||||
bias_ff = false
|
||||
bias_attn = false
|
||||
context = 750
|
||||
max_period = 100000
|
||||
use_conv_block = false
|
||||
use_conv_bias = true
|
||||
gating = "silu"
|
||||
norm = "RmsNorm"
|
||||
positional_embedding = "Rope"
|
||||
conv_layout = false
|
||||
conv_kernel_size = 3
|
||||
kv_repeat = 1
|
||||
max_seq_len = 40960
|
||||
|
||||
# Extra output heads for the ASR model: 4 heads, each of dimension 6.
# NOTE(review): what these heads predict is not visible in this file — confirm
# against the model code.
[modules.asr.model.extra_heads]
|
||||
num_heads = 4
|
||||
dim = 6
|
||||
Loading…
Reference in New Issue
Block a user