54 lines
1.8 KiB
Python
54 lines
1.8 KiB
Python
import numpy as np
|
|
import soundfile as sf
|
|
from io import BytesIO
|
|
import os
|
|
import logging
|
|
from kokoro_tts import Kokoro, chunk_text, process_chunk_sequential
|
|
|
|
# Module-level logger, named after this module so handlers can filter on it.
logger: logging.Logger = logging.getLogger(__name__)

# Fallback synthesis settings, used as the keyword defaults of
# TTSEngine.generate_audio.
DEFAULT_VOICE: str = "af_sarah"
DEFAULT_SPEED: float = 1.0
DEFAULT_LANG: str = "en-us"
|
|
|
|
|
|
class TTSEngine:
    """Render text to MP3 audio with a single, reusable Kokoro instance.

    The Kokoro object is created once in the constructor and shared by every
    ``generate_audio`` call made on this engine.
    """

    def __init__(self, model_path: str, voices_path: str):
        """Create the underlying Kokoro instance.

        Args:
            model_path: Passed to ``Kokoro`` as its first positional argument.
            voices_path: Passed to ``Kokoro`` as its second positional argument.
        """
        self.model_path = model_path
        self.voices_path = voices_path
        self.kokoro = Kokoro(model_path, voices_path)
        logger.info("Kokoro TTS engine initialized")

    def generate_audio(
        self,
        text: str,
        voice: str = DEFAULT_VOICE,
        speed: float = DEFAULT_SPEED,
        lang: str = DEFAULT_LANG,
    ) -> BytesIO:
        """Convert text to audio and return as BytesIO (MP3 format).

        The text is split with ``chunk_text`` and each chunk is synthesized
        with ``process_chunk_sequential``. Synthesis is best-effort: a chunk
        that raises, yields no samples, or reports a sample rate different
        from the first successful chunk is logged and skipped, and the
        remaining chunks are still joined into one stream.

        Args:
            text: Text to synthesize.
            voice: Voice identifier forwarded to ``process_chunk_sequential``.
            speed: Speed value forwarded to ``process_chunk_sequential``.
            lang: Language code forwarded to ``process_chunk_sequential``.

        Returns:
            A ``BytesIO`` holding the MP3 data, positioned at offset 0.

        Raises:
            ValueError: If ``text`` is empty/whitespace, or if no chunk
                produced any samples.
        """
        # Reject obviously empty input before invoking the model at all.
        if not text or not text.strip():
            raise ValueError("No audio samples generated - text is empty")

        chunks = chunk_text(text)
        total = len(chunks)
        logger.info("Split text into %d chunks", total)

        all_samples = []
        sample_rate = None

        for index, chunk in enumerate(chunks, start=1):
            # Keep the try body down to the one call that can actually fail.
            try:
                samples, sr = process_chunk_sequential(
                    chunk, self.kokoro, voice, speed, lang
                )
            except Exception as exc:
                # Best-effort: one bad chunk must not abort the whole request,
                # but keep the traceback so the failure can be diagnosed.
                logger.exception("Error processing chunk %d: %s", index, exc)
                continue

            if samples is None:
                # NOTE(review): None presumably signals a failure handled
                # inside process_chunk_sequential - confirm against kokoro_tts.
                logger.warning(
                    "Chunk %d/%d produced no audio; skipping", index, total
                )
                continue

            if sample_rate is None:
                # The first successful chunk fixes the rate of the output.
                sample_rate = sr
            elif sr != sample_rate:
                # Samples at a different rate would play back at the wrong
                # speed/pitch once concatenated, so drop them instead.
                logger.error(
                    "Chunk %d/%d has sample rate %s, expected %s; skipping",
                    index,
                    total,
                    sr,
                    sample_rate,
                )
                continue

            all_samples.append(samples)
            logger.info("Processed chunk %d/%d", index, total)

        if not all_samples:
            raise ValueError(
                "No audio samples generated - text may be invalid or too long"
            )

        combined = np.concatenate(all_samples)

        buffer = BytesIO()
        sf.write(buffer, combined, sample_rate, format="MP3", subtype="MPEG_LAYER_III")
        # Rewind so callers can read the encoded bytes from the start.
        buffer.seek(0)

        logger.info(
            "Generated MP3 audio: %d samples at %sHz", len(combined), sample_rate
        )
        return buffer
|