From 0ee2354176c6f33b51797507e077e662e5b409a0 Mon Sep 17 00:00:00 2001 From: laurent Date: Thu, 3 Jul 2025 12:56:00 +0200 Subject: [PATCH] Chunk decoding in the pth notebook. --- tts_pytorch.ipynb | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tts_pytorch.ipynb b/tts_pytorch.ipynb index f987132..cd4caec 100644 --- a/tts_pytorch.ipynb +++ b/tts_pytorch.ipynb @@ -74,15 +74,17 @@ "source": [ "print(\"Generating audio...\")\n", "\n", - "# This doesn't do streaming generation,\n", - "result = tts_model.generate([entries], [condition_attributes])\n", + "pcms = []\n", + "def _on_frame(frame):\n", + " print(\"Step\", len(pcms), end=\"\\r\")\n", + " if (frame != -1).all():\n", + " pcm = tts_model.mimi.decode(frame[:, 1:, :]).cpu().numpy()\n", + " pcms.append(np.clip(pcm[0, 0], -1, 1))\n", "\n", - "frames = torch.cat(result.frames, dim=-1)\n", - "audio_tokens = frames[:, tts_model.lm.audio_offset :, tts_model.delay_steps :]\n", - "with torch.no_grad():\n", - " audios = tts_model.mimi.decode(audio_tokens)\n", + "result = tts_model.generate([entries], [condition_attributes], on_frame=_on_frame)\n", "\n", - "audio = audios[0].cpu().numpy()" + "print(\"Done generating.\")\n", + "audio = np.concatenate(pcms, axis=-1)" ] }, {