"""Thin wrapper around the Kokoro TTS model that renders text to MP3 bytes."""

import logging
import os
from io import BytesIO

import numpy as np
import soundfile as sf
from kokoro_tts import Kokoro, chunk_text, process_chunk_sequential

logger = logging.getLogger(__name__)

# Default voice settings
DEFAULT_VOICE = "af_sarah"
DEFAULT_SPEED = 1.0
DEFAULT_LANG = "en-us"


class TTSEngine:
    """Holds a loaded ``Kokoro`` model and turns text into MP3 audio."""

    def __init__(self, model_path: str, voices_path: str) -> None:
        """Load the Kokoro model.

        Args:
            model_path: Path passed to ``Kokoro`` as the model file.
            voices_path: Path passed to ``Kokoro`` as the voices file.
        """
        self.model_path = model_path
        self.voices_path = voices_path
        self.kokoro = Kokoro(model_path, voices_path)
        logger.info("Kokoro TTS engine initialized")

    def generate_audio(
        self,
        text: str,
        voice: str = DEFAULT_VOICE,
        speed: float = DEFAULT_SPEED,
        lang: str = DEFAULT_LANG,
    ) -> BytesIO:
        """Convert text to audio and return as BytesIO (MP3 format).

        The text is split with ``chunk_text`` and each chunk is synthesized
        independently. Synthesis is best-effort: a chunk that raises, yields
        no samples, or comes back at a different sample rate than the first
        successful chunk is logged and left out of the result.

        Args:
            text: Text to synthesize.
            voice: Voice identifier forwarded to the synthesizer.
            speed: Speed value forwarded to the synthesizer.
            lang: Language code forwarded to the synthesizer.

        Returns:
            A ``BytesIO`` positioned at offset 0 containing the MP3 data.

        Raises:
            ValueError: If no chunk produced any audio samples.
        """
        all_samples = []
        sample_rate = None

        chunks = chunk_text(text)
        total = len(chunks)
        logger.info("Split text into %d chunks", total)

        for index, chunk in enumerate(chunks, start=1):
            try:
                samples, sr = process_chunk_sequential(
                    chunk, self.kokoro, voice, speed, lang
                )
            except Exception as e:
                # Best-effort: one bad chunk must not abort the whole request.
                # logger.exception keeps the traceback for diagnosis.
                logger.exception("Error processing chunk %d: %s", index, e)
                continue

            if samples is None:
                # The synthesizer signalled "nothing produced" without
                # raising; record it so missing audio is traceable.
                logger.warning(
                    "Chunk %d/%d produced no audio; skipping", index, total
                )
                continue

            if sample_rate is None:
                sample_rate = sr
            elif sr != sample_rate:
                # The encoder takes a single rate for the whole buffer, so a
                # chunk at another rate cannot be concatenated as-is.
                logger.warning(
                    "Chunk %d/%d has sample rate %s, expected %s; skipping",
                    index,
                    total,
                    sr,
                    sample_rate,
                )
                continue

            all_samples.append(samples)
            logger.info("Processed chunk %d/%d", index, total)

        if not all_samples:
            raise ValueError(
                "No audio samples generated - text may be invalid or too long"
            )

        combined = np.concatenate(all_samples)

        buffer = BytesIO()
        sf.write(
            buffer, combined, sample_rate, format="MP3", subtype="MPEG_LAYER_III"
        )
        # Rewind so callers can read the encoded bytes from the start.
        buffer.seek(0)

        logger.info(
            "Generated MP3 audio: %s samples at %sHz", len(combined), sample_rate
        )
        return buffer