# Server configuration exposing a single batched streaming ASR module
# (see [modules.asr] below).

# Root-level server settings.
static_dir = "./static/"
# TOML performs no variable expansion: "$HOME" is stored literally and must be
# expanded by the consuming application.
# NOTE(review): log_dir and instance_name both say "tts" although the only
# module configured in this file is ASR - confirm this naming is intentional.
log_dir = "$HOME/tmp/tts-logs"
instance_name = "tts"
# NOTE(review): "public_token" looks like a placeholder credential - confirm
# before exposing the server outside a trusted network.
authorized_ids = ["public_token"]

[modules.asr]
path = "/api/asr-streaming"
type = "BatchedAsr"
# The three model assets below use "hf://" URIs that all point at the same
# repository, kyutai/stt-1b-en_fr-candle.
# NOTE(review): presumably fetched from the Hugging Face Hub by the server -
# confirm against the loader.
lm_model_file = "hf://kyutai/stt-1b-en_fr-candle/model.safetensors"
text_tokenizer_file = "hf://kyutai/stt-1b-en_fr-candle/tokenizer_en_fr_audio_8000.model"
audio_tokenizer_file = "hf://kyutai/stt-1b-en_fr-candle/mimi-pytorch-e351c8d8@125.safetensors"
asr_delay_in_tokens = 6
batch_size = 64
conditioning_learnt_padding = true
# NOTE(review): 0.0 presumably selects deterministic (greedy) decoding - confirm.
temperature = 0.0

[modules.asr.model]
audio_vocab_size = 2049
# The text input vocabulary is one entry larger than the output vocabulary.
# NOTE(review): presumably an extra input-only special token - confirm.
text_in_vocab_size = 8001
text_out_vocab_size = 8000
audio_codebooks = 32

[modules.asr.model.transformer]
d_model = 2048
num_heads = 16
num_layers = 16
dim_feedforward = 8192
causal = true
norm_first = true
bias_ff = false
bias_attn = false
context = 750
max_period = 100000
use_conv_block = false
use_conv_bias = true
gating = "silu"
norm = "RmsNorm"
positional_embedding = "Rope"
conv_layout = false
conv_kernel_size = 3
kv_repeat = 1
max_seq_len = 40960

# This num_heads is a separate key from transformer.num_heads above; the two
# live in different tables and do not conflict.
[modules.asr.model.extra_heads]
num_heads = 4
dim = 6