diff --git a/scripts/stt_from_file_with_prompt_pytorch.py b/scripts/stt_from_file_with_prompt_pytorch.py
index 63fe748..6345303 100644
--- a/scripts/stt_from_file_with_prompt_pytorch.py
+++ b/scripts/stt_from_file_with_prompt_pytorch.py
@@ -1,7 +1,6 @@
 """An example script that illustrates how one can prompt Kyutai STT models."""
 
 import argparse
-import dataclasses
 import itertools
 import math
 from collections import deque
diff --git a/scripts/tts_pytorch.py b/scripts/tts_pytorch.py
index 5478078..515860e 100644
--- a/scripts/tts_pytorch.py
+++ b/scripts/tts_pytorch.py
@@ -73,6 +73,7 @@ def main():
     if args.out == "-":
         # Stream the audio to the speakers using sounddevice.
         import sounddevice as sd
+
         pcms = queue.Queue()
 
         def _on_frame(frame):
@@ -86,10 +87,13 @@ def main():
                 outdata[:, 0] = pcm_data
             except queue.Empty:
                 outdata[:] = 0
-        with sd.OutputStream(samplerate=tts_model.mimi.sample_rate,
-                             blocksize=1920,
-                             channels=1,
-                             callback=audio_callback):
+
+        with sd.OutputStream(
+            samplerate=tts_model.mimi.sample_rate,
+            blocksize=1920,
+            channels=1,
+            callback=audio_callback,
+        ):
             tts_model.generate([entries], [condition_attributes], on_frame=_on_frame)
             time.sleep(3)
             while True:
@@ -100,7 +104,7 @@ def main():
         result = tts_model.generate([entries], [condition_attributes])
         with torch.no_grad():
             pcms = []
-            for frame in result.frames[tts_model.delay_steps:]:
+            for frame in result.frames[tts_model.delay_steps :]:
                 pcm = tts_model.mimi.decode(frame[:, 1:, :]).cpu().numpy()
                 pcms.append(np.clip(pcm[0, 0], -1, 1))
             pcm = np.concatenate(pcms, axis=-1)
diff --git a/tts_pytorch.ipynb b/tts_pytorch.ipynb
index 9c89892..e62dcd1 100644
--- a/tts_pytorch.ipynb
+++ b/tts_pytorch.ipynb
@@ -3,7 +3,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "0b7eed16",
+   "id": "0",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -12,8 +12,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
-   "id": "353b9498",
+   "execution_count": null,
+   "id": "1",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -31,18 +31,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
-   "id": "8846418a",
+   "execution_count": null,
+   "id": "2",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "See https://huggingface.co/datasets/kyutai/tts-voices for available voices.\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Configuration\n",
     "text = \"Hey there! How are you? I had the craziest day today.\"\n",
@@ -52,8 +44,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
-   "id": "b9f022ec",
+   "execution_count": null,
+   "id": "3",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -75,18 +67,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
-   "id": "f4f76c73",
+   "execution_count": null,
+   "id": "4",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Generating audio...\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
    "print(\"Generating audio...\")\n",
    "\n",
@@ -103,28 +87,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
-   "id": "732e4b4b",
+   "execution_count": null,
+   "id": "5",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "\n",
-       "      "
-      ],
-      "text/plain": [
-       "<IPython.lib.display.Audio object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
    "source": [
    "display(\n",
    "    Audio(audio, rate=tts_model.mimi.sample_rate, autoplay=True)\n",
@@ -134,7 +100,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "2dbdd275",
+   "id": "6",
    "metadata": {},
    "outputs": [],
    "source": []
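
Not part of the patch, but for context: below is a minimal, self-contained sketch of the queue-plus-callback playback pattern that the reformatted `sd.OutputStream` block in `scripts/tts_pytorch.py` relies on. The sine-wave producer and the 24 kHz `SAMPLE_RATE` constant are illustrative stand-ins for `tts_model.generate`/`_on_frame` and `tts_model.mimi.sample_rate`; only the `OutputStream(samplerate=..., blocksize=1920, channels=1, callback=...)` shape mirrors the script.

```python
# Sketch only, not the repository's code. Stand-ins: SAMPLE_RATE replaces
# tts_model.mimi.sample_rate, produce_sine() replaces tts_model.generate/_on_frame.
import queue
import time

import numpy as np
import sounddevice as sd

SAMPLE_RATE = 24_000  # assumed sample rate for this sketch
BLOCK = 1920          # same block size as the patched script

pcms: "queue.Queue[np.ndarray]" = queue.Queue()


def audio_callback(outdata, frames, time_info, status):
    """Pull one block from the queue; emit silence if the producer is behind."""
    try:
        outdata[:, 0] = pcms.get(block=False)
    except queue.Empty:
        outdata[:] = 0


def produce_sine(seconds=2.0, freq=440.0):
    """Stand-in producer: push `seconds` of a sine tone, one block at a time."""
    t = np.arange(int(seconds * SAMPLE_RATE)) / SAMPLE_RATE
    tone = 0.2 * np.sin(2 * np.pi * freq * t).astype(np.float32)
    for start in range(0, len(tone) - BLOCK + 1, BLOCK):
        pcms.put(tone[start : start + BLOCK])


with sd.OutputStream(
    samplerate=SAMPLE_RATE,
    blocksize=BLOCK,
    channels=1,
    callback=audio_callback,
):
    produce_sine()
    # Keep the stream open until the queue drains, much like the script's
    # trailing sleep/loop after generate().
    while not pcms.empty():
        time.sleep(0.1)
    time.sleep(0.5)
```

The queue decouples generation speed from the audio device clock: the callback runs on sounddevice's (PortAudio) audio thread once per 1920-sample block and simply outputs silence whenever the producer has not caught up yet.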