From 09468c239a7b1686f49bec9411508b2253c77d13 Mon Sep 17 00:00:00 2001
From: Laurent Mazare <laurent.mazare@gmail.com>
Date: Mon, 4 Aug 2025 09:24:31 +0200
Subject: [PATCH] Print the duration of the audio generated so far. (#107)

---
 scripts/tts_pytorch.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/scripts/tts_pytorch.py b/scripts/tts_pytorch.py
index 1648638..424d600 100644
--- a/scripts/tts_pytorch.py
+++ b/scripts/tts_pytorch.py
@@ -78,6 +78,7 @@ def main():
     condition_attributes = tts_model.make_condition_attributes(
         [voice_path], cfg_coef=2.0
     )
+    _frames_cnt = 0
 
     if args.out == "-":
         # Stream the audio to the speakers using sounddevice.
@@ -86,9 +87,12 @@ def main():
         pcms = queue.Queue()
 
         def _on_frame(frame):
+            nonlocal _frames_cnt  # rebinds the enclosing function's counter
             if (frame != -1).all():
                 pcm = tts_model.mimi.decode(frame[:, 1:, :]).cpu().numpy()
                 pcms.put_nowait(np.clip(pcm[0, 0], -1, 1))
+                _frames_cnt += 1  # 12.5 below: presumably frames/s, TODO confirm
+                print(f"generated {_frames_cnt / 12.5:.2f}s", end="\r", flush=True)
 
         def audio_callback(outdata, _a, _b, _c):
             try:
@@ -113,7 +117,16 @@ def main():
                     break
                 time.sleep(1)
     else:
-        result = tts_model.generate([entries], [condition_attributes])
+
+        def _on_frame(frame):
+            nonlocal _frames_cnt  # rebinds the enclosing function's counter
+            if (frame != -1).all():  # count only frames with no -1 entries
+                _frames_cnt += 1  # 12.5 below: presumably frames/s, TODO confirm
+                print(f"generated {_frames_cnt / 12.5:.2f}s", end="\r", flush=True)
+
+        result = tts_model.generate(
+            [entries], [condition_attributes], on_frame=_on_frame
+        )
         with tts_model.mimi.streaming(1), torch.no_grad():
             pcms = []
             for frame in result.frames[tts_model.delay_steps :]: