diff --git a/README.md b/README.md
index b546f3b..e3c8a18 100644
--- a/README.md
+++ b/README.md
@@ -59,18 +59,17 @@ Here is how to choose which one to use:
 For an example of how to use the model in a way where you can directly stream in PyTorch tensors,
 [see our Colab notebook](https://colab.research.google.com/github/kyutai-labs/delayed-streams-modeling/blob/main/transcribe_via_pytorch.ipynb).
 
-If you just want to run the model on a file, you can use `moshi.run_inference`. This requires the [moshi package](https://pypi.org/project/moshi/) with version 0.2.6 or later, which can be installed via pip.
+If you just want to run the model on a file, you can use `moshi.run_inference`.
+
 ```bash
 python -m moshi.run_inference --hf-repo kyutai/stt-2.6b-en audio/bria.mp3
 ```
 
-If you have [uv](https://docs.astral.sh/uv/) installed, you can skip the installation step and run directly:
-```bash
-uvx --with moshi python -m moshi.run_inference --hf-repo kyutai/stt-2.6b-en audio/bria.mp3
-```
+If you have [uv](https://docs.astral.sh/uv/) installed, you can skip the installation step
+and just prefix the command above with `uvx --with moshi`.
 
 Additionally, we provide two scripts that highlight different usage scenarios. The first script
 illustrates how to extract word-level timestamps from the model's outputs:
@@ -157,15 +156,20 @@ hardware acceleration on Apple silicon.
 This requires the [moshi-mlx package](https://pypi.org/project/moshi-mlx/)
 with version 0.2.6 or later, which can be installed via pip.
 
+If you just want to run the model on a file, you can use `moshi_mlx.run_inference`:
+
 ```bash
 python -m moshi_mlx.run_inference --hf-repo kyutai/stt-2.6b-en-mlx audio/bria.mp3 --temp 0
 ```
 
-If you have [uv](https://docs.astral.sh/uv/) installed, you can skip the installation step and run directly:
+If you have [uv](https://docs.astral.sh/uv/) installed, you can skip the installation step
+and just prefix the command above with `uvx --with moshi-mlx`.
+
+If you want to transcribe audio from your microphone, use:
+
 ```bash
-uvx --with moshi-mlx python -m moshi_mlx.run_inference --hf-repo kyutai/stt-2.6b-en-mlx audio/bria.mp3 --temp 0
+python scripts/transcribe_from_mic_via_mlx.py
 ```
-It will install the moshi package in a temporary environment and run the speech-to-text.
 
 The MLX models can also be used in swift using the [moshi-swift
 codebase](https://github.com/kyutai-labs/moshi-swift), the 1b model has been
diff --git a/scripts/evaluate_on_dataset.py b/scripts/evaluate_on_dataset.py
index 3bef8aa..684fe5c 100644
--- a/scripts/evaluate_on_dataset.py
+++ b/scripts/evaluate_on_dataset.py
@@ -14,14 +14,6 @@ Example implementation of the streaming STT example. Here we group
 test utterances in batches (pre- and post-padded with silence) and
 and then feed these batches into the streaming STT model
 frame-by-frame.
-
-Example command:
-```
-uv run scripts/streaming_stt.py \
-    --dataset meanwhile \
-    --hf-repo kyutai/stt-2.6b-en
-```
-
 """
 
 # The outputs I get on my H100 using this code with the 2.6B model,
@@ -365,7 +357,7 @@ if __name__ == "__main__":
     )
 
     parser.add_argument(
-        "--hf-repo", type=str, help="HF repo to load the STT model from. "
+        "--hf-repo", type=str, help="HF repo to load the STT model from."
     )
     parser.add_argument("--tokenizer", type=str, help="Path to a local tokenizer file.")
     parser.add_argument(
diff --git a/scripts/transcribe_from_file_via_pytorch.py b/scripts/transcribe_from_file_via_pytorch.py
index e941da8..f113b3a 100644
--- a/scripts/transcribe_from_file_via_pytorch.py
+++ b/scripts/transcribe_from_file_via_pytorch.py
@@ -10,13 +10,6 @@
 """An example script that illustrates how one can get per-word
 timestamps from Kyutai STT models.
-
-Usage:
-```
-uv run scripts/streaming_stt_timestamps.py \
-    --hf-repo kyutai/stt-2.6b-en \
-    --file bria.mp3
-```
 """
 
 import argparse
@@ -185,6 +178,8 @@ def main(args):
         if text_tokens is not None:
             text_tokens_accum.append(text_tokens)
+            print(tokenizer.decode(text_tokens.numpy().tolist()))
+
 
     utterance_tokens = torch.concat(text_tokens_accum, dim=-1)
     timed_text = tokens_to_timestamped_text(
         utterance_tokens,
@@ -201,11 +196,7 @@ def main(args):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Example streaming STT w/ timestamps.")
-    parser.add_argument(
-        "--file",
-        required=True,
-        help="File to transcribe.",
-    )
+    parser.add_argument("in_file", help="The file to transcribe.")
     parser.add_argument(
         "--hf-repo", type=str, help="HF repo to load the STT model from. "
diff --git a/scripts/transcribe_from_mic_via_mlx.py b/scripts/transcribe_from_mic_via_mlx.py
index e8792e2..8f82af6 100644
--- a/scripts/transcribe_from_mic_via_mlx.py
+++ b/scripts/transcribe_from_mic_via_mlx.py
@@ -70,7 +70,7 @@ if __name__ == "__main__":
     def audio_callback(indata, _frames, _time, _status):
         block_queue.put(indata.copy())
 
-    print("start recording the user input")
+    print("recording audio from microphone, speak to get your words transcribed")
     with sd.InputStream(
         channels=1,
         dtype="float32",
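For reference, here is what the prefixed invocations that the updated README now describes in prose look like when written out in full. Both commands are taken verbatim from the README lines removed in this diff; only the grouping into a single snippet is new.

```bash
# PyTorch backend: uvx resolves the moshi package into a throwaway environment
# and runs the module there, so no prior `pip install moshi` is needed.
uvx --with moshi python -m moshi.run_inference --hf-repo kyutai/stt-2.6b-en audio/bria.mp3

# MLX backend on Apple silicon: same pattern with the moshi-mlx package.
uvx --with moshi-mlx python -m moshi_mlx.run_inference --hf-repo kyutai/stt-2.6b-en-mlx audio/bria.mp3 --temp 0
```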