From 26b048ed44afdde9fa6c4042718ec8328e4a9bfd Mon Sep 17 00:00:00 2001
From: Pierre-Hugues Husson
Date: Thu, 3 Jul 2025 15:13:08 +0200
Subject: [PATCH] Fix stt_from_file_pytorch

1. argparse declares in_file, but the code reads args.file.
2. text_tokens.numpy().tolist() is a list of list of list of int instead of
   the supported list of list of int. Since this is only a debugging print,
   just drop it.
---
 scripts/stt_from_file_pytorch.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/scripts/stt_from_file_pytorch.py b/scripts/stt_from_file_pytorch.py
index f113b3a..fd10c67 100644
--- a/scripts/stt_from_file_pytorch.py
+++ b/scripts/stt_from_file_pytorch.py
@@ -150,7 +150,7 @@ def main(args):
     audio_delay_seconds = info.stt_config.get("audio_delay_seconds", 5.0)
     padding_token_id = info.raw_config.get("text_padding_token_id", 3)
 
-    audio, input_sample_rate = sphn.read(args.file)
+    audio, input_sample_rate = sphn.read(args.in_file)
     audio = torch.from_numpy(audio).to(args.device)
     audio = julius.resample_frac(audio, input_sample_rate, mimi.sample_rate)
     if audio.shape[-1] % mimi.frame_size != 0:
@@ -178,8 +178,6 @@ def main(args):
         if text_tokens is not None:
             text_tokens_accum.append(text_tokens)
 
-            print(tokenizer.decode(text_tokens.numpy().tolist()))
-
     utterance_tokens = torch.concat(text_tokens_accum, dim=-1)
     timed_text = tokens_to_timestamped_text(
         utterance_tokens,
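
Note (not part of the patch): a minimal sketch of why the dropped debug print failed, assuming text_tokens is a 3-D torch tensor such as [batch, codebook, steps]; the exact shape and the tokenizer's accepted input types here are assumptions for illustration.

    import torch

    # Hypothetical 3-D token tensor, e.g. shape [1, 1, 3].
    text_tokens = torch.tensor([[[3, 42, 7]]])

    nested = text_tokens.numpy().tolist()   # [[[3, 42, 7]]] -> list of list of list of int
    flat = text_tokens[0].numpy().tolist()  # [[3, 42, 7]]   -> list of list of int

    print(nested)
    print(flat)

A decoder that only accepts list of int or list of list of int would reject the triply nested value, which is why the print was broken rather than merely noisy.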