From 35f133508fb72e4dc61ed8a21ed7ee67cfcb401c Mon Sep 17 00:00:00 2001 From: laurent Date: Thu, 31 Jul 2025 17:47:46 +0200 Subject: [PATCH] Use the proper repos when vad is on. --- scripts/stt_from_file_mlx.py | 7 ++++++- scripts/stt_from_file_pytorch.py | 3 +++ scripts/stt_from_mic_mlx.py | 7 ++++++- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/scripts/stt_from_file_mlx.py b/scripts/stt_from_file_mlx.py index 4065952..26222f6 100644 --- a/scripts/stt_from_file_mlx.py +++ b/scripts/stt_from_file_mlx.py @@ -24,13 +24,18 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("in_file", help="The file to transcribe.") parser.add_argument("--max-steps", default=4096) - parser.add_argument("--hf-repo", default="kyutai/stt-1b-en_fr-mlx") + parser.add_argument("--hf-repo") parser.add_argument( "--vad", action="store_true", help="Enable VAD (Voice Activity Detection)." ) args = parser.parse_args() audio, _ = sphn.read(args.in_file, sample_rate=24000) + if args.hf_repo is None: + if args.vad: + args.hf_repo = "kyutai/stt-1b-en_fr-candle" + else: + args.hf_repo = "kyutai/stt-1b-en_fr-mlx" lm_config = hf_hub_download(args.hf_repo, "config.json") with open(lm_config, "r") as fobj: lm_config = json.load(fobj) diff --git a/scripts/stt_from_file_pytorch.py b/scripts/stt_from_file_pytorch.py index 7069761..cf3fb05 100644 --- a/scripts/stt_from_file_pytorch.py +++ b/scripts/stt_from_file_pytorch.py @@ -128,6 +128,9 @@ def tokens_to_timestamped_text( def main(args): + if args.vad and args.hf_repo is None: + args.hf_repo = "kyutai/stt-1b-en_fr-candle" + info = moshi.models.loaders.CheckpointInfo.from_hf_repo( args.hf_repo, moshi_weights=args.moshi_weight, diff --git a/scripts/stt_from_mic_mlx.py b/scripts/stt_from_mic_mlx.py index e068585..589a987 100644 --- a/scripts/stt_from_mic_mlx.py +++ b/scripts/stt_from_mic_mlx.py @@ -25,12 +25,17 @@ from moshi_mlx import models, utils if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--max-steps", default=4096) - parser.add_argument("--hf-repo", default="kyutai/stt-1b-en_fr-mlx") + parser.add_argument("--hf-repo") parser.add_argument( "--vad", action="store_true", help="Enable VAD (Voice Activity Detection)." ) args = parser.parse_args() + if args.hf_repo is None: + if args.vad: + args.hf_repo = "kyutai/stt-1b-en_fr-candle" + else: + args.hf_repo = "kyutai/stt-1b-en_fr-mlx" lm_config = hf_hub_download(args.hf_repo, "config.json") with open(lm_config, "r") as fobj: lm_config = json.load(fobj)