From a4e9676e46c89002b42e7d056f40106a584bca0d Mon Sep 17 00:00:00 2001 From: laurent Date: Thu, 31 Jul 2025 12:42:35 +0200 Subject: [PATCH] Allow for using local voices in the pytorch examples. --- scripts/tts_pytorch.py | 5 ++++- scripts/tts_pytorch_streaming.py | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/scripts/tts_pytorch.py b/scripts/tts_pytorch.py index 4091100..1648638 100644 --- a/scripts/tts_pytorch.py +++ b/scripts/tts_pytorch.py @@ -68,7 +68,10 @@ def main(): # If you want to make a dialog, you can pass more than one turn [text_speaker_1, text_speaker_2, text_2_speaker_1, ...] entries = tts_model.prepare_script([text], padding_between=1) - voice_path = tts_model.get_voice_path(args.voice) + if args.voice.endswith(".safetensors"): + voice_path = args.voice + else: + voice_path = tts_model.get_voice_path(args.voice) # CFG coef goes here because the model was trained with CFG distillation, # so it's not _actually_ doing CFG at inference time. # Also, if you are generating a dialog, you should have two voices in the list. diff --git a/scripts/tts_pytorch_streaming.py b/scripts/tts_pytorch_streaming.py index f06ea8c..312cc34 100644 --- a/scripts/tts_pytorch_streaming.py +++ b/scripts/tts_pytorch_streaming.py @@ -183,7 +183,10 @@ def main(): checkpoint_info, n_q=32, temp=0.6, device=args.device ) - voice_path = tts_model.get_voice_path(args.voice) + if args.voice.endswith(".safetensors"): + voice_path = args.voice + else: + voice_path = tts_model.get_voice_path(args.voice) # CFG coef goes here because the model was trained with CFG distillation, # so it's not _actually_ doing CFG at inference time. # Also, if you are generating a dialog, you should have two voices in the list.