Another formatting tweak.

Formatting.
Use a streaming input in the rust example.
2025-07-31 17:40:35 +02:00 · 2025-07-31 17:38:34 +02:00 · 2025-07-31 17:37:43 +02:00
10 changed files with 3 additions and 537 deletions
--- a/app/api_server.py
+++ b/app/api_server.py
@ -1,298 +0,0 @@
 #!/usr/bin/env python3
 """
 OpenAI-Compatible Kyutai TTS API Server with Model Caching
 Improved version that loads the model once and keeps it in memory
 """
 import os
 import io
 import time
 import asyncio
 import subprocess
 from pathlib import Path
 from typing import Optional, Literal
 import logging
 import torch
 import soundfile as sf
 from fastapi import FastAPI, HTTPException
 from fastapi.responses import Response
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, Field
 import uvicorn
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 # Global model variables - loaded once at startup
 tts_model = None
 device = None
 sample_rate = None
 class SpeechRequest(BaseModel):
    model: Literal["tts-1", "tts-1-hd"] = Field("tts-1", description="TTS model to use")
    input: str = Field(..., min_length=1, max_length=4096, description="Text to generate audio for")
    voice: Literal["alloy", "echo", "fable", "onyx", "nova", "shimmer"] = Field("alloy", description="Voice to use")
    response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] = Field("mp3", description="Audio format")
    speed: Optional[float] = Field(1.0, ge=0.25, le=4.0, description="Speed of generated audio")
 app = FastAPI(
    title="OpenAI-Compatible TTS API (Cached)",
    description="OpenAI Audio Speech API compatible endpoint using Kyutai TTS with model caching",
    version="2.0.0"
 )
 app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
 )
 OUTPUT_DIR = Path("/app/api_output")
 OUTPUT_DIR.mkdir(exist_ok=True)
 def load_tts_model():
    """Load TTS model once at startup and keep in memory"""
    global tts_model, device, sample_rate
    if tts_model is not None:
        logger.info("TTS model already loaded")
        return
    try:
        logger.info("🚀 Loading Kyutai TTS model (one-time initialization)...")
        # Import Kyutai TTS modules
        from moshi.models.loaders import CheckpointInfo
        from moshi.models.tts import DEFAULT_DSM_TTS_REPO, TTSModel
        # Set device
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        logger.info(f"Using device: {device}")
        # Load the TTS model
        checkpoint_info = CheckpointInfo.from_hf_repo(DEFAULT_DSM_TTS_REPO)
        tts_model = TTSModel.from_checkpoint_info(
            checkpoint_info, 
            n_q=32, 
            temp=0.6, 
            device=device
        )
        # Get sample rate
        sample_rate = tts_model.mimi.sample_rate
        logger.info(f"✅ TTS model loaded successfully!")
        logger.info(f"   Model: {DEFAULT_DSM_TTS_REPO}")
        logger.info(f"   Device: {device}")
        logger.info(f"   Sample Rate: {sample_rate}")
    except Exception as e:
        logger.error(f"❌ Failed to load TTS model: {e}")
        raise
 def generate_audio_fast(text: str, voice: str = "alloy", speed: float = 1.0) -> bytes:
    """Generate audio using cached TTS model"""
    global tts_model, device, sample_rate
    if tts_model is None:
        raise HTTPException(status_code=500, detail="TTS model not loaded")
    try:
        logger.info(f"🎵 Generating audio for: '{text[:50]}{'...' if len(text) > 50 else ''}'")
        # Prepare the script (text input)
        entries = tts_model.prepare_script([text], padding_between=1)
        # Voice mapping for OpenAI compatibility
        voice_mapping = {
            "alloy": "expresso/ex03-ex01_happy_001_channel1_334s.wav",
            "echo": "expresso/ex04-ex01_happy_001_channel1_334s.wav",
            "fable": "expresso/ex05-ex01_happy_001_channel1_334s.wav", 
            "onyx": "expresso/ex06-ex01_happy_001_channel1_334s.wav",
            "nova": "expresso/ex07-ex01_happy_001_channel1_334s.wav",
            "shimmer": "expresso/ex08-ex01_happy_001_channel1_334s.wav"
        }
        selected_voice = voice_mapping.get(voice, voice_mapping["alloy"])
        try:
            voice_path = tts_model.get_voice_path(selected_voice)
        except:
            # Fallback to default if voice not found
            voice_path = tts_model.get_voice_path("expresso/ex03-ex01_happy_001_channel1_334s.wav")
        # Prepare condition attributes
        condition_attributes = tts_model.make_condition_attributes(
            [voice_path], cfg_coef=2.0
        )
        # Generate audio
        pcms = []
        def on_frame(frame):
            if (frame != -1).all():
                pcm = tts_model.mimi.decode(frame[:, 1:, :]).cpu().numpy()
                pcms.append(torch.clamp(torch.from_numpy(pcm[0, 0]), -1, 1).numpy())
        all_entries = [entries]
        all_condition_attributes = [condition_attributes]
        with tts_model.mimi.streaming(len(all_entries)):
            result = tts_model.generate(all_entries, all_condition_attributes, on_frame=on_frame)
        # Concatenate all audio frames
        if pcms:
            import numpy as np
            audio = np.concatenate(pcms, axis=-1)
            # Apply speed adjustment if needed
            if speed != 1.0:
                # Simple speed adjustment by resampling
                from scipy import signal
                audio_length = len(audio)
                new_length = int(audio_length / speed)
                audio = signal.resample(audio, new_length)
            # Convert to bytes
            audio_bytes = io.BytesIO()
            sf.write(audio_bytes, audio, samplerate=sample_rate, format='WAV')
            audio_bytes.seek(0)
            logger.info(f"✅ Audio generated successfully ({len(audio)/sample_rate:.2f}s)")
            return audio_bytes.read()
        else:
            raise Exception("No audio frames generated")
    except Exception as e:
        logger.error(f"❌ TTS generation error: {e}")
        raise HTTPException(status_code=500, detail=f"Audio generation failed: {str(e)}")
 def convert_audio_format(audio_wav_bytes: bytes, output_format: str) -> bytes:
    """Convert WAV audio to requested format using ffmpeg"""
    try:
        if output_format == "wav":
            return audio_wav_bytes
        # Use ffmpeg to convert
        cmd = ["ffmpeg", "-f", "wav", "-i", "pipe:0", "-f", output_format, "pipe:1"]
        result = subprocess.run(
            cmd, 
            input=audio_wav_bytes, 
            capture_output=True, 
            check=True
        )
        return result.stdout
    except subprocess.CalledProcessError as e:
        logger.error(f"Audio conversion failed: {e}")
        raise HTTPException(status_code=500, detail=f"Audio conversion failed: {e}")
@app.post("/v1/audio/speech")
 async def create_speech(request: SpeechRequest):
    """
    OpenAI-compatible audio speech endpoint
    Uses cached TTS model for fast generation
    """
    try:
        start_time = time.time()
        # Generate audio with cached model
        audio_wav_bytes = generate_audio_fast(
            text=request.input,
            voice=request.voice,
            speed=request.speed
        )
        # Convert to requested format
        audio_data = convert_audio_format(audio_wav_bytes, request.response_format)
        generation_time = time.time() - start_time
        logger.info(f"⚡ Total generation time: {generation_time:.2f}s")
        # Set appropriate content type
        content_types = {
            "mp3": "audio/mpeg",
            "opus": "audio/opus", 
            "aac": "audio/aac",
            "flac": "audio/flac",
            "wav": "audio/wav",
            "pcm": "audio/pcm"
        }
        return Response(
            content=audio_data,
            media_type=content_types.get(request.response_format, "audio/wav"),
            headers={
                "Content-Disposition": f"attachment; filename=speech.{request.response_format}",
                "X-Generation-Time": str(generation_time)
            }
        )
    except Exception as e:
        logger.error(f"Speech generation failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
@app.get("/v1/models")
 async def list_models():
    """List available models (OpenAI-compatible)"""
    return {
        "object": "list",
        "data": [
            {
                "id": "tts-1",
                "object": "model",
                "created": 1677610602,
                "owned_by": "kyutai",
                "permission": [],
                "root": "tts-1",
                "parent": None
            },
            {
                "id": "tts-1-hd", 
                "object": "model",
                "created": 1677610602,
                "owned_by": "kyutai",
                "permission": [],
                "root": "tts-1-hd",
                "parent": None
            }
        ]
    }
@app.get("/health")
 async def health_check():
    """Health check endpoint with model status"""
    model_loaded = tts_model is not None
    return {
        "status": "healthy" if model_loaded else "loading",
        "model_loaded": model_loaded,
        "cuda_available": torch.cuda.is_available(),
        "device": str(device) if device else None,
        "service": "kyutai-tts-openai-compatible-cached"
    }
@app.get("/reload-model")
 async def reload_model():
    """Reload the TTS model (admin endpoint)"""
    global tts_model
    try:
        tts_model = None
        load_tts_model()
        return {"status": "success", "message": "Model reloaded successfully"}
    except Exception as e:
        return {"status": "error", "message": str(e)}
@app.on_event("startup")
 async def startup_event():
    """Load model on startup"""
    logger.info("🚀 Starting TTS API server with model caching...")
    load_tts_model()
 if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
--- a/app/dependency_check.py
+++ b/app/dependency_check.py
@ -1,67 +0,0 @@
 #!/usr/bin/env python3
 """
 Check if all Kyutai TTS dependencies are properly installed
 """
 import sys
 def check_dependencies():
    print("🔍 Checking Kyutai TTS Dependencies")
    print("=" * 40)
    dependencies = [
        "torch",
        "numpy", 
        "einops",
        "transformers",
        "accelerate",
        "soundfile",
        "librosa",
        "huggingface_hub",
        "moshi",
        "sphn"
    ]
    missing = []
    installed = []
    for dep in dependencies:
        try:
            __import__(dep)
            installed.append(dep)
            print(f"✓ {dep}")
        except ImportError as e:
            missing.append((dep, str(e)))
            print(f"✗ {dep}: {e}")
    print(f"\n📊 Summary:")
    print(f"✓ Installed: {len(installed)}")
    print(f"✗ Missing: {len(missing)}")
    if missing:
        print(f"\n🔧 To fix missing dependencies:")
        for dep, error in missing:
            print(f"pip install {dep}")
    print(f"\n🧪 Testing Kyutai TTS imports:")
    try:
        from moshi.models.loaders import CheckpointInfo
        print("✓ CheckpointInfo import successful")
    except Exception as e:
        print(f"✗ CheckpointInfo import failed: {e}")
    try:
        from moshi.models.tts import DEFAULT_DSM_TTS_REPO, DEFAULT_DSM_TTS_VOICE_REPO, TTSModel
        print("✓ TTSModel imports successful")
    except Exception as e:
        print(f"✗ TTSModel imports failed: {e}")
    return len(missing) == 0
 if __name__ == "__main__":
    success = check_dependencies()
    if success:
        print("\n🎉 All dependencies are installed correctly!")
    else:
        print("\n❌ Some dependencies are missing. Please install them first.")
        sys.exit(1)
--- a/app/scripts/tts_runner.py
+++ b/app/scripts/tts_runner.py
@ -1,59 +0,0 @@
 #!/usr/bin/env python3
 """
 Kyutai TTS PyTorch Runner
 Dockerized implementation for text-to-speech generation
 """
 import sys
 import os
 import argparse
 import torch
 from pathlib import Path
 def main():
    parser = argparse.ArgumentParser(description='Kyutai TTS PyTorch Runner')
    parser.add_argument('input_file', help='Input text file or "-" for stdin')
    parser.add_argument('output_file', help='Output audio file')
    parser.add_argument('--model', default='kyutai/tts-1.6b-en_fr', help='TTS model to use')
    parser.add_argument('--device', default='cuda' if torch.cuda.is_available() else 'cpu', help='Device to use')
    args = parser.parse_args()
    print(f"Using device: {args.device}")
    print(f"CUDA available: {torch.cuda.is_available()}")
    # Handle stdin input
    if args.input_file == '-':
        # Read from stdin and create temporary file
        text = sys.stdin.read().strip()
        temp_file = '/tmp/temp_input.txt'
        with open(temp_file, 'w') as f:
            f.write(text)
        input_file = temp_file
    else:
        input_file = args.input_file
    # Check if the original TTS script exists
    tts_script = Path('/app/scripts/tts_pytorch.py')
    if tts_script.exists():
        print("Using original TTS script from Kyutai repository")
        import subprocess
        cmd = ['python', str(tts_script), input_file, args.output_file]
        subprocess.run(cmd, check=True)
    else:
        print("Using moshi package for TTS generation")
        import subprocess
        cmd = [
            'python', '-m', 'moshi.run_inference', 
            '--hf-repo', args.model,
            input_file,
            args.output_file
        ]
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            print(f"Error: {result.stderr}")
            sys.exit(1)
        print(f"Audio generated: {args.output_file}")
 if __name__ == '__main__':
    main()
 EOF
--- a/install.sh
+++ b/install.sh
@ -1,78 +0,0 @@
 # Set environment variables
 export DEBIAN_FRONTEND=noninteractive
 export PYTHONUNBUFFERED=1
 export CUDA_VISIBLE_DEVICES=0
 # Install system dependencies
 apt-get update && apt-get install -y \
    wget \
    curl \
    git \
    build-essential \
    libsndfile1 \
    ffmpeg \
    sox \
    alsa-utils \
    pulseaudio \
    && rm -rf /var/lib/apt/lists/*
 # Install Python dependencies first (for better caching)
 pip install --no-cache-dir --upgrade pip
 # Create virtual environment
 apt install python3.12-venv python3.12-dev
 python3.12 -m venv ~/venv-tts-kyutai
 source ~/venv-tts-kyutai/bin/activate
 # Install Python dependencies first (for better caching)
 pip install --no-cache-dir --upgrade pip
 # Install PyTorch with CUDA support for Python 3.12
 pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
 # Install core dependencies
 pip install --no-cache-dir \
    numpy \
    scipy \
    librosa \
    soundfile \
    huggingface_hub \
    einops \
    transformers \
    accelerate
 # Install API dependencies
 pip install --no-cache-dir \
    fastapi \
    uvicorn[standard] \
    python-multipart \
    pydantic
 # Install moshi package with all dependencies (following Colab notebook)
 pip install --no-cache-dir 'sphn<0.2'
 pip install --no-cache-dir "moshi==0.2.8"
 # Create directories for input/output
 mkdir -p /app/input /app/output /app/scripts /app/api_output
 # Download the Kyutai delayed-streams-modeling repository
 #git clone https://github.com/kyutai-labs/delayed-streams-modeling.git /app/kyutai-repo
 # Copy the TTS script from the repository
 cp /app/kyutai-repo/scripts/tts_pytorch.py /app/scripts/ || echo "TTS script not found, will create custom one"
 # Create directories for input/output
 mkdir -p /app/input /app/output /app/scripts /app/api_output
 # Download the Kyutai delayed-streams-modeling repository
 #git clone https://github.com/kyutai-labs/delayed-streams-modeling.git /app/kyutai-repo
 # Copy the TTS script from the repository
 cp scripts/tts_pytorch.py /app/scripts/ || echo "TTS script not found, will create custom one"
 # Create directories for input/output
 mkdir -p /app/input /app/output /app/scripts /app/api_output
 # Start TTS-Server
 python /app/api_server.py
--- a/scripts/stt_from_file_mlx.py
+++ b/scripts/stt_from_file_mlx.py
@ -24,18 +24,13 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("in_file", help="The file to transcribe.")
    parser.add_argument("--max-steps", default=4096)
-    parser.add_argument("--hf-repo")
+    parser.add_argument("--hf-repo", default="kyutai/stt-1b-en_fr-mlx")
    parser.add_argument(
        "--vad", action="store_true", help="Enable VAD (Voice Activity Detection)."
    )
    args = parser.parse_args()
    audio, _ = sphn.read(args.in_file, sample_rate=24000)
    if args.hf_repo is None:
        if args.vad:
            args.hf_repo = "kyutai/stt-1b-en_fr-candle"
        else:
            args.hf_repo = "kyutai/stt-1b-en_fr-mlx"
    lm_config = hf_hub_download(args.hf_repo, "config.json")
    with open(lm_config, "r") as fobj:
        lm_config = json.load(fobj)
--- a/scripts/stt_from_file_pytorch.py
+++ b/scripts/stt_from_file_pytorch.py
@ -128,9 +128,6 @@ def tokens_to_timestamped_text(
 def main(args):
    if args.vad and args.hf_repo is None:
        args.hf_repo = "kyutai/stt-1b-en_fr-candle"
    info = moshi.models.loaders.CheckpointInfo.from_hf_repo(
        args.hf_repo,
        moshi_weights=args.moshi_weight,
--- a/scripts/stt_from_mic_mlx.py
+++ b/scripts/stt_from_mic_mlx.py
@ -25,17 +25,12 @@ from moshi_mlx import models, utils
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--max-steps", default=4096)
-    parser.add_argument("--hf-repo")
+    parser.add_argument("--hf-repo", default="kyutai/stt-1b-en_fr-mlx")
    parser.add_argument(
        "--vad", action="store_true", help="Enable VAD (Voice Activity Detection)."
    )
    args = parser.parse_args()
    if args.hf_repo is None:
        if args.vad:
            args.hf_repo = "kyutai/stt-1b-en_fr-candle"
        else:
            args.hf_repo = "kyutai/stt-1b-en_fr-mlx"
    lm_config = hf_hub_download(args.hf_repo, "config.json")
    with open(lm_config, "r") as fobj:
        lm_config = json.load(fobj)
--- a/scripts/tts_mlx.py
+++ b/scripts/tts_mlx.py
@ -76,9 +76,6 @@ def main():
    moshi_weights = hf_get(moshi_name, args.hf_repo)
    tokenizer = hf_get(raw_config["tokenizer_name"], args.hf_repo)
    lm_config = models.LmConfig.from_config_dict(raw_config)
    # There is a bug in moshi_mlx <= 0.3.0 handling of the ring kv cache.
    # The following line gets around it for now.
    lm_config.transformer.max_seq_len = lm_config.transformer.context
    model = models.Lm(lm_config)
    model.set_dtype(mx.bfloat16)
--- a/scripts/tts_mlx_streaming.py
+++ b/scripts/tts_mlx_streaming.py
@ -205,9 +205,6 @@ def main():
    moshi_weights = hf_get(moshi_name, args.hf_repo)
    tokenizer = hf_get(raw_config["tokenizer_name"], args.hf_repo)
    lm_config = models.LmConfig.from_config_dict(raw_config)
    # There is a bug in moshi_mlx <= 0.3.0 handling of the ring kv cache.
    # The following line gets around it for now.
    lm_config.transformer.max_seq_len = lm_config.transformer.context
    model = models.Lm(lm_config)
    model.set_dtype(mx.bfloat16)
--- a/scripts/tts_pytorch.py
+++ b/scripts/tts_pytorch.py
@ -78,7 +78,6 @@ def main():
    condition_attributes = tts_model.make_condition_attributes(
        [voice_path], cfg_coef=2.0
    )
    _frames_cnt = 0
    if args.out == "-":
        # Stream the audio to the speakers using sounddevice.
@ -87,12 +86,9 @@ def main():
        pcms = queue.Queue()
        def _on_frame(frame):
            nonlocal _frames_cnt
            if (frame != -1).all():
                pcm = tts_model.mimi.decode(frame[:, 1:, :]).cpu().numpy()
                pcms.put_nowait(np.clip(pcm[0, 0], -1, 1))
                _frames_cnt += 1
                print(f"generated {_frames_cnt / 12.5:.2f}s", end="\r", flush=True)
        def audio_callback(outdata, _a, _b, _c):
            try:
@ -117,16 +113,7 @@ def main():
                    break
                time.sleep(1)
    else:
-
+        result = tts_model.generate([entries], [condition_attributes])
        def _on_frame(frame):
            nonlocal _frames_cnt
            if (frame != -1).all():
                _frames_cnt += 1
                print(f"generated {_frames_cnt / 12.5:.2f}s", end="\r", flush=True)
        result = tts_model.generate(
            [entries], [condition_attributes], on_frame=_on_frame
        )
        with tts_model.mimi.streaming(1), torch.no_grad():
            pcms = []
            for frame in result.frames[tts_model.delay_steps :]:
Author	SHA1	Message	Date
laurent	a1af3a78cb	Another formatting tweak.	2025-07-31 17:40:35 +02:00
laurent	ecc002f7af	Formatting.	2025-07-31 17:38:34 +02:00
laurent	94e271b69a	Use a streaming input in the rust example.	2025-07-31 17:37:43 +02:00