add custom voices

2026-05-24 00:44:39 -04:00
parent 833927c66e
commit 083b1fd43a
3 changed files with 166 additions and 12 deletions
@@ -52,10 +52,13 @@ Once you create a custom bot, interact with it by prefixing your message with th
 ### Text-to-Speech
-| Command                    | Description                             | Example Usage                   |
+| Command                              | Description                                           | Example Usage                              |
-| -------------------------- | --------------------------------------- | ------------------------------- |
+| ------------------------------------ | ----------------------------------------------------- | ------------------------------------------ |
-| `!speak <text>`            | Convert text to speech (MP3 attachment) | `!speak hello world`            |
+| `!speak <text>`                      | Convert text to speech (MP3 attachment)               | `!speak hello world`                       |
-| `!speak <bot_name> <text>` | Have a custom bot respond and speak     | `!speak alfred what time is it` |
+| `!speak <text> --voice <voice>`      | Convert text to speech with a specific voice          | `!speak hello world --voice af_bella`      |
 | `!speak <bot_name> <text>`           | Have a custom bot respond and speak                   | `!speak alfred what time is it`            |
 | `!speak <bot_name> <text> --voice <voice>` | Have a custom bot respond and speak with a specific voice | `!speak alfred what time is it --voice am_puck` |
 | `!voices`                            | List all available TTS voices by category             | `!voices`                                  |
 ### Image Commands
@@ -101,6 +101,83 @@ TTS_VOICES_PATH: str = os.getenv("TTS_VOICES_PATH", "voices-v1.0.bin")
 TTS_VOICE: str = os.getenv("TTS_VOICE", "af_sarah")
 TTS_SPEED: float = float(os.getenv("TTS_SPEED", "1.0"))
 # Available voices organized by category
 VOICES_LIST: dict[str, dict[str, str | list[str]]] = {
    "🇺🇸 👩": {
        "language": "en-us",
        "voices": [
            "af_alloy",
            "af_aoede",
            "af_bella",
            "af_heart",
            "af_jessica",
            "af_kore",
            "af_nicole",
            "af_nova",
            "af_river",
            "af_sarah",
            "af_sky",
        ],
    },
    "🇺🇸 👨": {
        "language": "en-us",
        "voices": [
            "am_adam",
            "am_echo",
            "am_eric",
            "am_fenrir",
            "am_liam",
            "am_michael",
            "am_onyx",
            "am_puck",
        ],
    },
    "🇬🇧": {
        "language": "en-gb",
        "voices": [
            "bf_alice",
            "bf_emma",
            "bf_isabella",
            "bf_lily",
            "bm_daniel",
            "bm_fable",
            "bm_george",
            "bm_lewis",
        ],
    },
    "🇫🇷": {
        "language": "fr-fr",
        "voices": ["ff_siwis"],
    },
    "🇮🇹": {
        "language": "it",
        "voices": ["if_sara", "im_nicola"],
    },
    "🇯🇵": {
        "language": "ja",
        "voices": [
            "jf_alpha",
            "jf_gongitsune",
            "jf_nezumi",
            "jf_tebukuro",
            "jm_kumo",
        ],
    },
    "🇨🇳": {
        "language": "cmn",
        "voices": [
            "zf_xiaobei",
            "zf_xiaoni",
            "zf_xiaoxiao",
            "zf_xiaoyi",
            "zm_yunjian",
            "zm_yunxi",
            "zm_yunxia",
            "zm_yunyang",
        ],
    },
 }
 logger.info("CHAT_ENDPOINT set to %s", CHAT_ENDPOINT)
 logger.info("COMPLETION_ENDPOINT set to %s", COMPLETION_ENDPOINT)
 logger.info("IMAGE_GEN_ENDPOINT set to %s", IMAGE_GEN_ENDPOINT)
@@ -13,6 +13,7 @@ from discord import Message
 from discord.ext import commands
 from vibe_bot import llama_wrapper, tts
 from vibe_bot.tts import DEFAULT_LANG
 from vibe_bot.config import (
    CHAT_ENDPOINT,
    CHAT_ENDPOINT_KEY,
@@ -29,6 +30,7 @@ from vibe_bot.config import (
    TTS_SPEED,
    TTS_VOICE,
    TTS_VOICES_PATH,
    VOICES_LIST,
 )
 from vibe_bot.database import CustomBotManager, get_database
@@ -326,14 +328,47 @@ async def on_message(message: Message) -> None:
    await bot.process_commands(message)
@bot.command(name="voices")
async def voices(ctx: CommandsContext[Bot]) -> None:
    """Send the catalogue of TTS voices to the invoking context.

    Voices are listed per category from ``VOICES_LIST`` together with the
    category's language code, followed by a usage hint. The listing is sent
    as one or more messages of at most 1900 characters each, breaking on a
    newline where one is available (presumably to stay under Discord's
    message length limit -- confirm against the target API).
    """
    # Assemble the full listing in one pass, then join once.
    parts: list[str] = ["Available Voices:\n\n"]
    for category, info in VOICES_LIST.items():
        parts.append(f"{category} ({info['language']}):\n")
        parts.extend(f"  - {v}\n" for v in info["voices"])
        parts.append("\n")
    parts.append("Use `!speak <text> --voice <voice_name>` to choose a voice.")
    remaining = "".join(parts)

    limit = 1900
    pieces: list[str] = []
    # Peel off leading pieces while the remainder is still too long to send.
    while len(remaining) > limit:
        cut = remaining.rfind("\n", 0, limit)
        if cut == -1:
            # No newline inside the window: fall back to a hard cut.
            cut = limit
        pieces.append(remaining[:cut])
        # Drop the newline(s) we split on so the next piece starts with text.
        remaining = remaining[cut:].lstrip("\n")
    if remaining:
        pieces.append(remaining)

    for piece in pieces:
        await ctx.send(piece)
@bot.command(name="speak")
-async def speak(ctx: CommandsContext[Bot], *, message: str) -> None:
+async def speak(
    ctx: CommandsContext[Bot],
    *,
    message: str,
 ) -> None:
    """Have the bot speak the given text using Kokoro TTS, or have a custom bot speak.
-    Usage: !speak <text> - plain text to speech
+    Usage: !speak <text> --voice <voice_name> - plain text to speech
-    Usage: !speak <bot_name> <text> - have a custom bot respond and speak
+    Usage: !speak <bot_name> <text> --voice <voice_name> - have a custom bot respond and speak
    Example: !speak hello world
-    Example: !speak alfred what time is it
+    Example: !speak hello world --voice af_bella
    Example: !speak alfred what time is it --voice am_puck
    """
    if tts_engine is None:
        await ctx.send(
@@ -342,19 +377,37 @@ async def speak(ctx: CommandsContext[Bot], *, message: str) -> None:
        )
        return
    # Parse --voice flag from the message
    voice = None
    voice_match = message.rsplit("--voice ", 1)
    if len(voice_match) == 2:
        voice = voice_match[1].strip()
        message = voice_match[0].rstrip()
    if not message or not message.strip():
        await ctx.send("Please provide text to speak.")
        return
    # Validate voice if provided
    if voice:
        all_voices = [v for cat in VOICES_LIST.values() for v in cat["voices"]]
        if voice not in all_voices:
            await ctx.send(
                f"Unknown voice '{voice}'. Use `!voices` to see available voices."
            )
            return
    custom_bot_manager = CustomBotManager()
    custom_bots = custom_bot_manager.list_custom_bots()
    bot_names = [b[0] for b in custom_bots]
    first_word = message.split(maxsplit=1)[0] if message.split() else ""
    if first_word in bot_names:
-        await _speak_with_bot(ctx, first_word, message, tts_engine, custom_bot_manager)
+        await _speak_with_bot(
            ctx, first_word, message, tts_engine, custom_bot_manager, voice
        )
    else:
-        await _speak_plain(ctx, message, tts_engine)
+        await _speak_plain(ctx, message, tts_engine, voice)
 async def _speak_with_bot(
@@ -363,6 +416,7 @@ async def _speak_with_bot(
    message: str,
    engine: tts.TTSEngine,
    custom_bot_manager: CustomBotManager,
    voice: str | None = None,
 ) -> None:
    """Handle speak command for a custom bot."""
    text_to_speak = message[len(bot_name) :].lstrip()
@@ -380,6 +434,14 @@ async def _speak_with_bot(
    _, system_prompt, _, _ = bot_info
    system_prompt_edit = f"{system_prompt}\nKeep your responses under 2-3 sentences."
    # Determine language for the chosen voice
    chosen_voice = voice or TTS_VOICE
    lang = DEFAULT_LANG
    for cat in VOICES_LIST.values():
        if chosen_voice in cat["voices"]:
            lang = str(cat["language"])
            break
    try:
        db = get_database()
        context = db.get_conversation_context(
@@ -429,8 +491,9 @@ async def _speak_with_bot(
        await ctx.send(f"Generating speech for **{bot_name}**...")
        audio_buffer = engine.generate_audio(
            bot_response,
-            voice=TTS_VOICE,
+            voice=chosen_voice,
            speed=TTS_SPEED,
            lang=lang,
        )
        audio_file = discord.File(audio_buffer, filename="speech.mp3")
@@ -447,14 +510,25 @@ async def _speak_plain(
    ctx: CommandsContext[Bot],
    message: str,
    engine: tts.TTSEngine,
    voice: str | None = None,
 ) -> None:
    """Handle speak command for plain text."""
    chosen_voice = voice or TTS_VOICE
    # Determine language for the chosen voice
    lang = DEFAULT_LANG
    for cat in VOICES_LIST.values():
        if chosen_voice in cat["voices"]:
            lang = str(cat["language"])
            break
    try:
        await ctx.send("Generating speech...")
        audio_buffer = engine.generate_audio(
            message,
-            voice=TTS_VOICE,
+            voice=chosen_voice,
            speed=TTS_SPEED,
            lang=lang,
        )
        audio_file = discord.File(audio_buffer, filename="speech.mp3")