From 433dca3751a2a21a95a6d7ca1fd2a44c516a729c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=A1clav=20Volhejn?= <8401624+vvolhejn@users.noreply.github.com> Date: Wed, 2 Jul 2025 18:21:42 +0200 Subject: [PATCH] Shorter names for STT scripts (#30) --- README.md | 16 ++++++++-------- ..._on_dataset.py => stt_evaluate_on_dataset.py} | 0 ...e_via_pytorch.py => stt_from_file_pytorch.py} | 0 ...st_server.py => stt_from_file_rust_server.py} | 0 ...t.py => stt_from_file_with_prompt_pytorch.py} | 12 ++---------- ...e_from_mic_via_mlx.py => stt_from_mic_mlx.py} | 0 ...ust_server.py => stt_from_mic_rust_server.py} | 0 ...scribe_via_pytorch.ipynb => stt_pytorch.ipynb | 6 ++---- 8 files changed, 12 insertions(+), 22 deletions(-) rename scripts/{evaluate_on_dataset.py => stt_evaluate_on_dataset.py} (100%) rename scripts/{transcribe_from_file_via_pytorch.py => stt_from_file_pytorch.py} (100%) rename scripts/{transcribe_from_file_via_rust_server.py => stt_from_file_rust_server.py} (100%) rename scripts/{transcribe_from_file_via_pytorch_with_prompt.py => stt_from_file_with_prompt_pytorch.py} (97%) rename scripts/{transcribe_from_mic_via_mlx.py => stt_from_mic_mlx.py} (100%) rename scripts/{transcribe_from_mic_via_rust_server.py => stt_from_mic_rust_server.py} (100%) rename transcribe_via_pytorch.ipynb => stt_pytorch.ipynb (99%) diff --git a/README.md b/README.md index e36cc37..364d6c2 100644 --- a/README.md +++ b/README.md @@ -48,12 +48,12 @@ Here is how to choose which one to use: Hugging Face - + Open In Colab For an example of how to use the model in a way where you can directly stream in PyTorch tensors, -[see our Colab notebook](https://colab.research.google.com/github/kyutai-labs/delayed-streams-modeling/blob/main/transcribe_via_pytorch.ipynb). +[see our Colab notebook](https://colab.research.google.com/github/kyutai-labs/delayed-streams-modeling/blob/main/stt_pytorch.ipynb). 
This requires the [moshi package](https://pypi.org/project/moshi/) with version 0.2.6 or later, which can be installed via pip. @@ -71,7 +71,7 @@ Additionally, we provide two scripts that highlight different usage scenarios. T ```bash uv run \ - scripts/transcribe_from_file_via_pytorch.py \ + scripts/stt_from_file_pytorch.py \ --hf-repo kyutai/stt-2.6b-en \ --file audio/bria.mp3 ``` @@ -85,7 +85,7 @@ uv run scripts/evaluate_on_dataset.py \ Another example shows how one can provide a text-, audio-, or text-audio prompt to our STT model: ```bash -uv run scripts/transcribe_from_file_via_pytorch_with_prompt.py \ +uv run scripts/stt_from_file_with_prompt_pytorch.py \ --hf-repo kyutai/stt-2.6b-en \ --file bria.mp3 \ --prompt_file ./audio/loonah.mp3 \ @@ -131,12 +131,12 @@ moshi-server worker --config configs/config-stt-en_fr-hf.toml Once the server has started you can transcribe audio from your microphone with the following script. ```bash -uv run scripts/transcribe_from_mic_via_rust_server.py +uv run scripts/stt_from_mic_rust_server.py ``` We also provide a script for transcribing from an audio file. ```bash -uv run scripts/transcribe_from_file_via_rust_server.py audio/bria.mp3 +uv run scripts/stt_from_file_rust_server.py audio/bria.mp3 ``` The script limits the decoding speed to simulates real-time processing of the audio. @@ -181,7 +181,7 @@ and just prefix the command above with `uvx --with moshi-mlx`. If you want to transcribe audio from your microphone, use: ```bash -python scripts/transcribe_from_mic_via_mlx.py +python scripts/stt_from_mic_mlx.py ``` The MLX models can also be used in swift using the [moshi-swift @@ -190,7 +190,7 tested to work fine on an iPhone 16 Pro. 
## Kyutai Text-to-Speech - + Open In Colab diff --git a/scripts/evaluate_on_dataset.py b/scripts/stt_evaluate_on_dataset.py similarity index 100% rename from scripts/evaluate_on_dataset.py rename to scripts/stt_evaluate_on_dataset.py diff --git a/scripts/transcribe_from_file_via_pytorch.py b/scripts/stt_from_file_pytorch.py similarity index 100% rename from scripts/transcribe_from_file_via_pytorch.py rename to scripts/stt_from_file_pytorch.py diff --git a/scripts/transcribe_from_file_via_rust_server.py b/scripts/stt_from_file_rust_server.py similarity index 100% rename from scripts/transcribe_from_file_via_rust_server.py rename to scripts/stt_from_file_rust_server.py diff --git a/scripts/transcribe_from_file_via_pytorch_with_prompt.py b/scripts/stt_from_file_with_prompt_pytorch.py similarity index 97% rename from scripts/transcribe_from_file_via_pytorch_with_prompt.py rename to scripts/stt_from_file_with_prompt_pytorch.py index 5861116..63fe748 100644 --- a/scripts/transcribe_from_file_via_pytorch_with_prompt.py +++ b/scripts/stt_from_file_with_prompt_pytorch.py @@ -14,15 +14,7 @@ import tqdm class PromptHook: - def __init__( - self, - tokenizer, - prefix, - padding_tokens=( - 0, - 3, - ), - ): + def __init__(self, tokenizer, prefix, padding_tokens=(0, 3)): self.tokenizer = tokenizer self.prefix_enforce = deque(self.tokenizer.encode(prefix)) self.padding_tokens = padding_tokens @@ -141,7 +133,7 @@ def main(args): prompt_frames = audio_prompt.shape[1] // mimi.frame_size no_prompt_offset_seconds = audio_delay_seconds + audio_silence_prefix_seconds no_prompt_offset = int(no_prompt_offset_seconds * mimi.frame_rate) - text_tokens = text_tokens[prompt_frames + no_prompt_offset:] + text_tokens = text_tokens[prompt_frames + no_prompt_offset :] text = tokenizer.decode( text_tokens[text_tokens > padding_token_id].numpy().tolist() diff --git a/scripts/transcribe_from_mic_via_mlx.py b/scripts/stt_from_mic_mlx.py similarity index 100% rename from 
scripts/transcribe_from_mic_via_mlx.py rename to scripts/stt_from_mic_mlx.py diff --git a/scripts/transcribe_from_mic_via_rust_server.py b/scripts/stt_from_mic_rust_server.py similarity index 100% rename from scripts/transcribe_from_mic_via_rust_server.py rename to scripts/stt_from_mic_rust_server.py diff --git a/transcribe_via_pytorch.ipynb b/stt_pytorch.ipynb similarity index 99% rename from transcribe_via_pytorch.ipynb rename to stt_pytorch.ipynb index 4210d64..acad8e5 100644 --- a/transcribe_via_pytorch.ipynb +++ b/stt_pytorch.ipynb @@ -228,11 +228,9 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", + "language": "python", "name": "python3" - }, - "language_info": { - "name": "python" } }, "nbformat": 4,