diff --git a/scripts/stt_from_file_with_prompt_pytorch.py b/scripts/stt_from_file_with_prompt_pytorch.py
index 63fe748..6345303 100644
--- a/scripts/stt_from_file_with_prompt_pytorch.py
+++ b/scripts/stt_from_file_with_prompt_pytorch.py
@@ -1,7 +1,6 @@
"""An example script that illustrates how one can prompt Kyutai STT models."""
import argparse
-import dataclasses
import itertools
import math
from collections import deque
diff --git a/scripts/tts_pytorch.py b/scripts/tts_pytorch.py
index 5478078..515860e 100644
--- a/scripts/tts_pytorch.py
+++ b/scripts/tts_pytorch.py
@@ -73,6 +73,7 @@ def main():
if args.out == "-":
# Stream the audio to the speakers using sounddevice.
import sounddevice as sd
+
         pcms = queue.Queue()
 
         def _on_frame(frame):
@@ -86,10 +87,13 @@ def main():
outdata[:, 0] = pcm_data
except queue.Empty:
outdata[:] = 0
- with sd.OutputStream(samplerate=tts_model.mimi.sample_rate,
- blocksize=1920,
- channels=1,
- callback=audio_callback):
+
+ with sd.OutputStream(
+ samplerate=tts_model.mimi.sample_rate,
+ blocksize=1920,
+ channels=1,
+ callback=audio_callback,
+ ):
tts_model.generate([entries], [condition_attributes], on_frame=_on_frame)
time.sleep(3)
while True:
@@ -100,7 +104,7 @@ def main():
result = tts_model.generate([entries], [condition_attributes])
with torch.no_grad():
pcms = []
- for frame in result.frames[tts_model.delay_steps:]:
+ for frame in result.frames[tts_model.delay_steps :]:
pcm = tts_model.mimi.decode(frame[:, 1:, :]).cpu().numpy()
pcms.append(np.clip(pcm[0, 0], -1, 1))
pcm = np.concatenate(pcms, axis=-1)
diff --git a/tts_pytorch.ipynb b/tts_pytorch.ipynb
index 9c89892..e62dcd1 100644
--- a/tts_pytorch.ipynb
+++ b/tts_pytorch.ipynb
@@ -3,7 +3,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "0b7eed16",
+ "id": "0",
"metadata": {},
"outputs": [],
"source": [
@@ -12,8 +12,8 @@
},
{
"cell_type": "code",
- "execution_count": 4,
- "id": "353b9498",
+ "execution_count": null,
+ "id": "1",
"metadata": {},
"outputs": [],
"source": [
@@ -31,18 +31,10 @@
},
{
"cell_type": "code",
- "execution_count": 13,
- "id": "8846418a",
+ "execution_count": null,
+ "id": "2",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "See https://huggingface.co/datasets/kyutai/tts-voices for available voices.\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# Configuration\n",
"text = \"Hey there! How are you? I had the craziest day today.\"\n",
@@ -52,8 +44,8 @@
},
{
"cell_type": "code",
- "execution_count": 14,
- "id": "b9f022ec",
+ "execution_count": null,
+ "id": "3",
"metadata": {},
"outputs": [],
"source": [
@@ -75,18 +67,10 @@
},
{
"cell_type": "code",
- "execution_count": 15,
- "id": "f4f76c73",
+ "execution_count": null,
+ "id": "4",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Generating audio...\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"print(\"Generating audio...\")\n",
"\n",
@@ -103,28 +87,10 @@
},
{
"cell_type": "code",
- "execution_count": 16,
- "id": "732e4b4b",
+ "execution_count": null,
+ "id": "5",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
-        "<IPython.lib.display.Audio object>"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"display(\n",
" Audio(audio, rate=tts_model.mimi.sample_rate, autoplay=True)\n",
@@ -134,7 +100,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "2dbdd275",
+ "id": "6",
"metadata": {},
"outputs": [],
"source": []