From 09468c239a7b1686f49bec9411508b2253c77d13 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Mon, 4 Aug 2025 09:24:31 +0200 Subject: [PATCH] Print the duration of the audio generated so far. (#107) --- scripts/tts_pytorch.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/scripts/tts_pytorch.py b/scripts/tts_pytorch.py index 1648638..424d600 100644 --- a/scripts/tts_pytorch.py +++ b/scripts/tts_pytorch.py @@ -78,6 +78,7 @@ def main(): condition_attributes = tts_model.make_condition_attributes( [voice_path], cfg_coef=2.0 ) + _frames_cnt = 0 if args.out == "-": # Stream the audio to the speakers using sounddevice. @@ -86,9 +87,12 @@ def main(): pcms = queue.Queue() def _on_frame(frame): + nonlocal _frames_cnt if (frame != -1).all(): pcm = tts_model.mimi.decode(frame[:, 1:, :]).cpu().numpy() pcms.put_nowait(np.clip(pcm[0, 0], -1, 1)) + _frames_cnt += 1 + print(f"generated {_frames_cnt / 12.5:.2f}s", end="\r", flush=True) def audio_callback(outdata, _a, _b, _c): try: @@ -113,7 +117,16 @@ def main(): break time.sleep(1) else: - result = tts_model.generate([entries], [condition_attributes]) + + def _on_frame(frame): + nonlocal _frames_cnt + if (frame != -1).all(): + _frames_cnt += 1 + print(f"generated {_frames_cnt / 12.5:.2f}s", end="\r", flush=True) + + result = tts_model.generate( + [entries], [condition_attributes], on_frame=_on_frame + ) with tts_model.mimi.streaming(1), torch.no_grad(): pcms = [] for frame in result.frames[tts_model.delay_steps :]: