From 6f4ef1eae85fee9c4987e247564e7c8eb7e59b46 Mon Sep 17 00:00:00 2001 From: Gabriel de Marmiesse Date: Wed, 18 Jun 2025 12:45:33 +0200 Subject: [PATCH] Add uv instructions and ignore the sample audio files (#1) * Add uv instructions and ignore the sample audio file * Add french sample * Clarify real-time * Remove empty space --- .gitignore | 4 +++- README.md | 17 +++++++++++++++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 7b004e5..013ebc7 100644 --- a/.gitignore +++ b/.gitignore @@ -191,4 +191,6 @@ cython_debug/ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data # refer to https://docs.cursor.com/context/ignore-files .cursorignore -.cursorindexingignore \ No newline at end of file +.cursorindexingignore +bria.mp3 +sample_fr_hibiki_crepes.mp3 diff --git a/README.md b/README.md index c8bc0be..21ad38e 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,12 @@ with version 0.2.5 or later, which can be installed via pip. python -m moshi.run_inference --hf-repo kyutai/stt-2.6b-en bria.mp3 ``` +If you have `uv` installed, you can skip the installation step and run directly: +```bash +uvx --with moshi python -m moshi.run_inference --hf-repo kyutai/stt-2.6b-en bria.mp3 +``` +It will install the moshi package in a temporary environment and run the speech-to-text. + ### MLX implementation Hugging Face @@ -48,6 +54,12 @@ with version 0.2.5 or later, which can be installed via pip. python -m moshi_mlx.run_inference --hf-repo kyutai/stt-2.6b-en-mlx bria.mp3 --temp 0 ``` +If you have `uv` installed, you can skip the installation step and run directly: +```bash +uvx --with moshi-mlx python -m moshi_mlx.run_inference --hf-repo kyutai/stt-2.6b-en-mlx bria.mp3 --temp 0 +``` +It will install the moshi-mlx package in a temporary environment and run the speech-to-text. + ### Rust implementation Hugging Face @@ -91,8 +103,9 @@ script. 
uv run scripts/asr-streaming-query.py bria.mp3 ``` -The script simulates some real-time processing of the audio. Faster processing -can be triggered by setting the real-time factor, e.g. `--rtf 500` will process +The script limits the decoding speed to simulate real-time processing of the audio. +Faster processing can be triggered by setting +the real-time factor, e.g. `--rtf 500` will process the data as fast as possible. ## Text-to-Speech