From 433dca3751a2a21a95a6d7ca1fd2a44c516a729c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=A1clav=20Volhejn?= <8401624+vvolhejn@users.noreply.github.com> Date: Wed, 2 Jul 2025 18:21:42 +0200 Subject: [PATCH] Shorter names for STT scripts (#30) --- README.md | 16 ++++++++-------- ..._on_dataset.py => stt_evaluate_on_dataset.py} | 0 ...e_via_pytorch.py => stt_from_file_pytorch.py} | 0 ...st_server.py => stt_from_file_rust_server.py} | 0 ...t.py => stt_from_file_with_prompt_pytorch.py} | 12 ++---------- ...e_from_mic_via_mlx.py => stt_from_mic_mlx.py} | 0 ...ust_server.py => stt_from_mic_rust_server.py} | 0 ...scribe_via_pytorch.ipynb => stt_pytorch.ipynb | 6 ++---- 8 files changed, 12 insertions(+), 22 deletions(-) rename scripts/{evaluate_on_dataset.py => stt_evaluate_on_dataset.py} (100%) rename scripts/{transcribe_from_file_via_pytorch.py => stt_from_file_pytorch.py} (100%) rename scripts/{transcribe_from_file_via_rust_server.py => stt_from_file_rust_server.py} (100%) rename scripts/{transcribe_from_file_via_pytorch_with_prompt.py => stt_from_file_with_prompt_pytorch.py} (97%) rename scripts/{transcribe_from_mic_via_mlx.py => stt_from_mic_mlx.py} (100%) rename scripts/{transcribe_from_mic_via_rust_server.py => stt_from_mic_rust_server.py} (100%) rename transcribe_via_pytorch.ipynb => stt_pytorch.ipynb (99%) diff --git a/README.md b/README.md index e36cc37..364d6c2 100644 --- a/README.md +++ b/README.md @@ -48,12 +48,12 @@ Here is how to choose which one to use: Hugging Face - + Open In Colab For an example of how to use the model in a way where you can directly stream in PyTorch tensors, -[see our Colab notebook](https://colab.research.google.com/github/kyutai-labs/delayed-streams-modeling/blob/main/transcribe_via_pytorch.ipynb). +[see our Colab notebook](https://colab.research.google.com/github/kyutai-labs/delayed-streams-modeling/blob/main/stt_pytorch.ipynb). 
This requires the [moshi package](https://pypi.org/project/moshi/) with version 0.2.6 or later, which can be installed via pip. @@ -71,7 +71,7 @@ Additionally, we provide two scripts that highlight different usage scenarios. T ```bash uv run \ - scripts/transcribe_from_file_via_pytorch.py \ + scripts/stt_from_file_pytorch.py \ --hf-repo kyutai/stt-2.6b-en \ --file audio/bria.mp3 ``` @@ -85,7 +85,7 @@ uv run scripts/evaluate_on_dataset.py \ Another example shows how one can provide a text-, audio-, or text-audio prompt to our STT model: ```bash -uv run scripts/transcribe_from_file_via_pytorch_with_prompt.py \ +uv run scripts/stt_from_file_with_prompt_pytorch.py \ --hf-repo kyutai/stt-2.6b-en \ --file bria.mp3 \ --prompt_file ./audio/loonah.mp3 \ @@ -131,12 +131,12 @@ moshi-server worker --config configs/config-stt-en_fr-hf.toml Once the server has started you can transcribe audio from your microphone with the following script. ```bash -uv run scripts/transcribe_from_mic_via_rust_server.py +uv run scripts/stt_from_mic_rust_server.py ``` We also provide a script for transcribing from an audio file. ```bash -uv run scripts/transcribe_from_file_via_rust_server.py audio/bria.mp3 +uv run scripts/stt_from_file_rust_server.py audio/bria.mp3 ``` The script limits the decoding speed to simulates real-time processing of the audio. @@ -181,7 +181,7 @@ and just prefix the command above with `uvx --with moshi-mlx`. If you want to transcribe audio from your microphone, use: ```bash -python scripts/transcribe_from_mic_via_mlx.py +python scripts/stt_from_mic_mlx.py ``` The MLX models can also be used in swift using the [moshi-swift @@ -190,7 +190,7 tested to work fine on an iPhone 16 Pro. 
## Kyutai Text-to-Speech - + Open In Colab diff --git a/scripts/evaluate_on_dataset.py b/scripts/stt_evaluate_on_dataset.py similarity index 100% rename from scripts/evaluate_on_dataset.py rename to scripts/stt_evaluate_on_dataset.py diff --git a/scripts/transcribe_from_file_via_pytorch.py b/scripts/stt_from_file_pytorch.py similarity index 100% rename from scripts/transcribe_from_file_via_pytorch.py rename to scripts/stt_from_file_pytorch.py diff --git a/scripts/transcribe_from_file_via_rust_server.py b/scripts/stt_from_file_rust_server.py similarity index 100% rename from scripts/transcribe_from_file_via_rust_server.py rename to scripts/stt_from_file_rust_server.py diff --git a/scripts/transcribe_from_file_via_pytorch_with_prompt.py b/scripts/stt_from_file_with_prompt_pytorch.py similarity index 97% rename from scripts/transcribe_from_file_via_pytorch_with_prompt.py rename to scripts/stt_from_file_with_prompt_pytorch.py index 5861116..63fe748 100644 --- a/scripts/transcribe_from_file_via_pytorch_with_prompt.py +++ b/scripts/stt_from_file_with_prompt_pytorch.py @@ -14,15 +14,7 @@ import tqdm class PromptHook: - def __init__( - self, - tokenizer, - prefix, - padding_tokens=( - 0, - 3, - ), - ): + def __init__(self, tokenizer, prefix, padding_tokens=(0, 3)): self.tokenizer = tokenizer self.prefix_enforce = deque(self.tokenizer.encode(prefix)) self.padding_tokens = padding_tokens @@ -141,7 +133,7 @@ def main(args): prompt_frames = audio_prompt.shape[1] // mimi.frame_size no_prompt_offset_seconds = audio_delay_seconds + audio_silence_prefix_seconds no_prompt_offset = int(no_prompt_offset_seconds * mimi.frame_rate) - text_tokens = text_tokens[prompt_frames + no_prompt_offset:] + text_tokens = text_tokens[prompt_frames + no_prompt_offset :] text = tokenizer.decode( text_tokens[text_tokens > padding_token_id].numpy().tolist() diff --git a/scripts/transcribe_from_mic_via_mlx.py b/scripts/stt_from_mic_mlx.py similarity index 100% rename from 
scripts/transcribe_from_mic_via_mlx.py rename to scripts/stt_from_mic_mlx.py diff --git a/scripts/transcribe_from_mic_via_rust_server.py b/scripts/stt_from_mic_rust_server.py similarity index 100% rename from scripts/transcribe_from_mic_via_rust_server.py rename to scripts/stt_from_mic_rust_server.py diff --git a/transcribe_via_pytorch.ipynb b/stt_pytorch.ipynb similarity index 99% rename from transcribe_via_pytorch.ipynb rename to stt_pytorch.ipynb index 4210d64..acad8e5 100644 --- a/transcribe_via_pytorch.ipynb +++ b/stt_pytorch.ipynb @@ -228,11 +228,9 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", + "language": "python", "name": "python3" - }, - "language_info": { - "name": "python" } }, "nbformat": 4,