# Top-level server settings (root table).

static_dir = "./static/"

# NOTE(review): TOML performs no environment-variable expansion, so "$HOME" is
# stored as a literal string; this only works if the consuming application
# substitutes it — confirm.
# NOTE(review): the "tts" in this path and in instance_name looks carried over
# from a TTS config, while the only module configured in this file is ASR.
# Values are left unchanged here; confirm before renaming, since existing log
# locations may depend on them.
log_dir = "$HOME/tmp/tts-logs"
instance_name = "tts"

# NOTE(review): "public_token" looks like a placeholder credential — presumably
# it should be replaced for any non-local deployment; confirm.
authorized_ids = ["public_token"]

# ASR module: route, module type, model artifacts, and decoding settings.
[modules.asr]
path = "/api/asr-streaming"
type = "BatchedAsr"

# Model artifacts, all referenced from the kyutai/stt-2.6b-en-candle repository
# via "hf://" URIs (presumably resolved and downloaded by the consuming
# application — confirm).
lm_model_file = "hf://kyutai/stt-2.6b-en-candle/model.safetensors"
text_tokenizer_file = "hf://kyutai/stt-2.6b-en-candle/tokenizer_en_audio_4000.model"
audio_tokenizer_file = "hf://kyutai/stt-2.6b-en-candle/mimi-pytorch-e351c8d8@125.safetensors"

# Decoding / batching settings.
# NOTE(review): the delay is expressed in tokens, not seconds; the wall-clock
# equivalent depends on the model's token rate, which this file does not state.
asr_delay_in_tokens = 32
batch_size = 16
conditioning_learnt_padding = true
# NOTE(review): written as an integer; presumably read as a float by the
# consumer (0 would then mean greedy decoding) — confirm.
temperature = 0

# Vocabulary and codebook dimensions for the ASR language model.
[modules.asr.model]
audio_vocab_size = 2049
# The text input vocabulary is one entry larger than the output vocabulary
# (4001 vs 4000) — presumably an extra input-only special token; confirm
# against the model definition before changing either value.
text_in_vocab_size = 4001
text_out_vocab_size = 4000
audio_codebooks = 32

# Transformer hyperparameters for the ASR language model.
# NOTE(review): these presumably must match the checkpoint referenced by
# lm_model_file in [modules.asr]; do not tune them independently — confirm.
[modules.asr.model.transformer]
# Core dimensions.
d_model = 2048
num_heads = 32
num_layers = 48
dim_feedforward = 8192

# Layer behaviour.
causal = true
norm_first = true
bias_ff = false
bias_attn = false
gating = "silu"
norm = "RmsNorm"

# Attention window and positional encoding.
context = 375
positional_embedding = "Rope"
max_period = 100000
kv_repeat = 1
max_seq_len = 40960

# Convolution-related options. use_conv_block is false, so use_conv_bias and
# conv_kernel_size are presumably unused here — confirm against the consumer.
use_conv_block = false
use_conv_bias = true
conv_layout = false
conv_kernel_size = 3