vibe-bot/vibe_bot/tts.py

"""Text-to-speech engine using Kokoro TTS."""

from __future__ import annotations

import logging
from io import BytesIO

import numpy as np
import soundfile as sf  # type: ignore[import-untyped]
from kokoro_tts import (  # type: ignore[import-untyped]
    Kokoro,
    chunk_text,
    process_chunk_sequential,
)

logger = logging.getLogger(__name__)

# Default voice settings
DEFAULT_VOICE = "af_sarah"
DEFAULT_SPEED = 1.0
DEFAULT_LANG = "en-us"


class TTSEngine:
    """Text-to-speech engine wrapper around Kokoro TTS."""

    def __init__(self, model_path: str, voices_path: str) -> None:
        """Initialize the TTS engine with model and voices paths.

        Args:
            model_path: Path to the Kokoro model file.
            voices_path: Path to the voices file.

        """
        self.model_path = model_path
        self.voices_path = voices_path
        self.kokoro = Kokoro(model_path, voices_path)
        logger.info("Kokoro TTS engine initialized")

    def generate_audio(
        self,
        text: str,
        voice: str = DEFAULT_VOICE,
        speed: float = DEFAULT_SPEED,
        lang: str = DEFAULT_LANG,
    ) -> BytesIO:
        """Convert text to audio and return as BytesIO (MP3 format)."""
        all_samples: list[np.ndarray] = []
        sample_rate: int | None = None

        chunks: list[str] = list(chunk_text(text))
        logger.info("Split text into %d chunks", len(chunks))

        for i, chunk in enumerate(chunks):
            try:
                samples, sr = process_chunk_sequential(
                    chunk,
                    self.kokoro,
                    voice,
                    speed,
                    lang,
                )
                if samples is not None:
                    if sample_rate is None:
                        sample_rate = sr
                    all_samples.append(np.asarray(samples))
                    logger.info("Processed chunk %d/%d", i + 1, len(chunks))
            except Exception:
                logger.exception("Error processing chunk %d", i + 1)
                continue

        if not all_samples:
            msg = "No audio samples generated - text may be invalid or too long"
            raise ValueError(msg)

        combined = np.concatenate(all_samples)

        buffer = BytesIO()
        sf.write(  # pyright: ignore[reportUnknownMemberType]
            buffer,
            combined,
            sample_rate,
            format="MP3",
            subtype="MPEG_LAYER_III",
        )
        buffer.seek(0)

        logger.info(
            "Generated MP3 audio: %d samples at %dHz",
            len(combined),
            sample_rate or 0,
        )
        return buffer