STT example w/ prompting (#26)

* STT example w/ prompting
* Text-audio prompt example into README.md + cutting prompt transcript.
* A line in README
* formatting in README

---------

Co-authored-by: Eugene <eugene@kyutai.org>

parent 395eaeae95
commit c4ef93770a

README.md (17 lines changed)
@@ -87,6 +87,23 @@ uv run scripts/evaluate_on_dataset.py \
    --hf-repo kyutai/stt-2.6b-en
```

Another example shows how one can provide a text-only, audio-only, or combined text-audio prompt to our STT model:

```bash
uv run scripts/transcribe_from_file_via_pytorch_with_prompt.py \
  --hf-repo kyutai/stt-2.6b-en \
  --file bria.mp3 \
  --prompt_file ./audio/loonah.mp3 \
  --prompt_text "Loonah" \
  --cut-prompt-transcript
```

This produces a transcript of `bria.mp3` that uses the `Loonah` spelling for the name, instead of the `Luna` spelling used when no prompt is given:

```
In the heart of an ancient forest, where the trees whispered secrets of the past, there lived a peculiar rabbit named Loonah (...)
```

Apart from nudging the model towards a specific spelling of a word, other potential use-cases include speaker adaptation and steering the model towards a specific formatting style, or even another language. However, please bear in mind that this is an experimental feature and its behavior is very sensitive to the prompt provided.

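For instance, a text-only prompt (passing `--prompt_text` without `--prompt_file`) can hint at the expected style. A minimal sketch along those lines, reusing the flags from the example above; the prompt text here is purely illustrative:

```bash
uv run scripts/transcribe_from_file_via_pytorch_with_prompt.py \
  --hf-repo kyutai/stt-2.6b-en \
  --file bria.mp3 \
  --prompt_text "Hello, how are you?"
```

Note that, as the script is written, `--cut-prompt-transcript` only takes effect when a prompt audio file is given, so with a text-only prompt the forced prompt text itself appears at the start of the output.
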
### Rust server

<a href="https://huggingface.co/kyutai/stt-2.6b-en-candle" target="_blank" style="margin: 2px;">
audio/loona.mp3 (new binary file, not shown)
scripts/transcribe_from_file_via_pytorch_with_prompt.py (new file, 196 lines)

@@ -0,0 +1,196 @@
"""An example script that illustrates how one can prompt Kyutai STT models."""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import dataclasses
|
||||||
|
import itertools
|
||||||
|
import math
|
||||||
|
from collections import deque
|
||||||
|
|
||||||
|
import julius
|
||||||
|
import moshi.models
|
||||||
|
import sphn
|
||||||
|
import torch
|
||||||
|
import tqdm
|
||||||
|
|
||||||
|
|
||||||
|
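# PromptHook forces the decoded text to begin with the prompt text: on_logits
# masks the text logits so that only padding tokens or the next prompt token
# can be sampled, and on_token advances through the prompt as it is emitted.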
class PromptHook:
    def __init__(self, tokenizer, prefix, padding_tokens=(0, 3)):
        self.tokenizer = tokenizer
        self.prefix_enforce = deque(self.tokenizer.encode(prefix))
        self.padding_tokens = padding_tokens

    def on_token(self, token):
        if not self.prefix_enforce:
            return

        token = token.item()

        if token in self.padding_tokens:
            pass
        elif token == self.prefix_enforce[0]:
            self.prefix_enforce.popleft()
        else:
            # on_logits masks everything else out, so this is unreachable
            assert False

    def on_logits(self, logits):
        if not self.prefix_enforce:
            return

        # allow only the padding tokens or the next token of the prompt
        mask = torch.zeros_like(logits, dtype=torch.bool)
        for t in self.padding_tokens:
            mask[..., t] = True
        mask[..., self.prefix_enforce[0]] = True

        logits[:] = torch.where(mask, logits, float("-inf"))


def main(args):
    info = moshi.models.loaders.CheckpointInfo.from_hf_repo(
        args.hf_repo,
        moshi_weights=args.moshi_weight,
        mimi_weights=args.mimi_weight,
        tokenizer=args.tokenizer,
        config_path=args.config_path,
    )

    mimi = info.get_mimi(device=args.device)
    tokenizer = info.get_text_tokenizer()
    lm = info.get_moshi(device=args.device, dtype=torch.bfloat16)

    if args.prompt_text:
        prompt_hook = PromptHook(tokenizer, args.prompt_text)
        lm_gen = moshi.models.LMGen(
            lm,
            temp=0,
            temp_text=0.0,
            on_text_hook=prompt_hook.on_token,
            on_text_logits_hook=prompt_hook.on_logits,
        )
    else:
        lm_gen = moshi.models.LMGen(lm, temp=0, temp_text=0.0)

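    # The model expects some leading silence, and its text output lags the
    # audio by audio_delay_seconds, so the input is padded on both sides.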
    audio_silence_prefix_seconds = info.stt_config.get(
        "audio_silence_prefix_seconds", 1.0
    )
    audio_delay_seconds = info.stt_config.get("audio_delay_seconds", 5.0)
    padding_token_id = info.raw_config.get("text_padding_token_id", 3)

    def _load_and_process(path):
        # load the file, downmix to mono, resample to Mimi's sample rate,
        # and zero-pad to a whole number of frames
        audio, input_sample_rate = sphn.read(path)
        audio = torch.from_numpy(audio).to(args.device).mean(axis=0, keepdim=True)
        audio = julius.resample_frac(audio, input_sample_rate, mimi.sample_rate)
        if audio.shape[-1] % mimi.frame_size != 0:
            to_pad = mimi.frame_size - audio.shape[-1] % mimi.frame_size
            audio = torch.nn.functional.pad(audio, (0, to_pad))
        return audio

    n_prefix_chunks = math.ceil(audio_silence_prefix_seconds * mimi.frame_rate)
    n_suffix_chunks = math.ceil(audio_delay_seconds * mimi.frame_rate)
    silence_chunk = torch.zeros(
        (1, 1, mimi.frame_size), dtype=torch.float32, device=args.device
    )

    audio = _load_and_process(args.file)
    if args.prompt_file:
        audio_prompt = _load_and_process(args.prompt_file)
    else:
        audio_prompt = None

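    # Build the input stream: leading silence, then (optionally) the audio
    # prompt plus a short silence separator, then the audio to transcribe,
    # then trailing silence so the delayed text stream can flush.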
    chain = [itertools.repeat(silence_chunk, n_prefix_chunks)]

    if audio_prompt is not None:
        chain.append(torch.split(audio_prompt[:, None, :], mimi.frame_size, dim=-1))
        # add a bit (0.8s) of silence to separate the prompt and the actual audio
        chain.append(itertools.repeat(silence_chunk, 10))

    chain += [
        torch.split(audio[:, None, :], mimi.frame_size, dim=-1),
        itertools.repeat(silence_chunk, n_suffix_chunks),
    ]

    chunks = itertools.chain(*chain)

    text_tokens_accum = []
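    # Stream chunk by chunk: Mimi encodes each frame into audio tokens, and
    # the LM consumes them, emitting text tokens once output is available.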
    with mimi.streaming(1), lm_gen.streaming(1):
        for audio_chunk in tqdm.tqdm(chunks):
            audio_tokens = mimi.encode(audio_chunk)
            text_tokens = lm_gen.step(audio_tokens)
            if text_tokens is not None:
                text_tokens_accum.append(text_tokens)

    utterance_tokens = torch.concat(text_tokens_accum, dim=-1)
    text_tokens = utterance_tokens.cpu().view(-1)

    # If we have an audio prompt and don't want it in the transcript, we cut
    # the corresponding number of frames from the output tokens. However,
    # some padding also happens before it due to the silence prefix and the
    # audio delay. Normally it is ignored during detokenization, but here we
    # must account for it to find the position of the prompt transcript.
    if args.cut_prompt_transcript and audio_prompt is not None:
        prompt_frames = audio_prompt.shape[1] // mimi.frame_size
        no_prompt_offset_seconds = audio_delay_seconds + audio_silence_prefix_seconds
        no_prompt_offset = int(no_prompt_offset_seconds * mimi.frame_rate)
        text_tokens = text_tokens[prompt_frames + no_prompt_offset:]

    text = tokenizer.decode(
        text_tokens[text_tokens > padding_token_id].numpy().tolist()
    )

    print(text)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Example streaming STT w/ a prompt.")
    parser.add_argument(
        "--file",
        required=True,
        help="File to transcribe.",
    )
    parser.add_argument(
        "--prompt_file",
        required=False,
        help="Audio of the prompt.",
    )
    parser.add_argument(
        "--prompt_text",
        required=False,
        help="Text of the prompt.",
    )
    parser.add_argument(
        "--cut-prompt-transcript",
        action="store_true",
        help="Cut the prompt from the output transcript.",
    )
    parser.add_argument(
        "--hf-repo", type=str, help="HF repo to load the STT model from."
    )
    parser.add_argument("--tokenizer", type=str, help="Path to a local tokenizer file.")
    parser.add_argument(
        "--moshi-weight", type=str, help="Path to a local checkpoint file."
    )
    parser.add_argument(
        "--mimi-weight", type=str, help="Path to a local checkpoint file for Mimi."
    )
    parser.add_argument(
        "--config-path", type=str, help="Path to a local config file.", default=None
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda",
        help="Device on which to run, defaults to 'cuda'.",
    )
    args = parser.parse_args()

    main(args)