add custom voices

2026-05-24 00:44:39 -04:00
parent 833927c66e
commit 083b1fd43a
3 changed files with 166 additions and 12 deletions
@@ -52,10 +52,13 @@ Once you create a custom bot, interact with it by prefixing your message with th

 ### Text-to-Speech

-| Command                    | Description                             | Example Usage                   |
-| -------------------------- | --------------------------------------- | ------------------------------- |
-| `!speak <text>`            | Convert text to speech (MP3 attachment) | `!speak hello world`            |
-| `!speak <bot_name> <text>` | Have a custom bot respond and speak     | `!speak alfred what time is it` |
+| Command                              | Description                                           | Example Usage                              |
+| ------------------------------------ | ----------------------------------------------------- | ------------------------------------------ |
+| `!speak <text>`                      | Convert text to speech (MP3 attachment)               | `!speak hello world`                       |
+| `!speak <text> --voice <voice>`      | Convert text to speech with a specific voice          | `!speak hello world --voice af_bella`      |
+| `!speak <bot_name> <text>`           | Have a custom bot respond and speak                   | `!speak alfred what time is it`            |
+| `!speak <bot_name> <text> --voice <voice>` | Have a custom bot respond and speak with a specific voice | `!speak alfred what time is it --voice am_puck` |
+| `!voices`                            | List all available TTS voices by category             | `!voices`                                  |

 ### Image Commands

@@ -101,6 +101,83 @@ TTS_VOICES_PATH: str = os.getenv("TTS_VOICES_PATH", "voices-v1.0.bin")
 TTS_VOICE: str = os.getenv("TTS_VOICE", "af_sarah")
 TTS_SPEED: float = float(os.getenv("TTS_SPEED", "1.0"))

+# Available voices organized by category
+VOICES_LIST: dict[str, dict[str, str | list[str]]] = {
+    "🇺🇸 👩": {
+        "language": "en-us",
+        "voices": [
+            "af_alloy",
+            "af_aoede",
+            "af_bella",
+            "af_heart",
+            "af_jessica",
+            "af_kore",
+            "af_nicole",
+            "af_nova",
+            "af_river",
+            "af_sarah",
+            "af_sky",
+        ],
+    },
+    "🇺🇸 👨": {
+        "language": "en-us",
+        "voices": [
+            "am_adam",
+            "am_echo",
+            "am_eric",
+            "am_fenrir",
+            "am_liam",
+            "am_michael",
+            "am_onyx",
+            "am_puck",
+        ],
+    },
+    "🇬🇧": {
+        "language": "en-gb",
+        "voices": [
+            "bf_alice",
+            "bf_emma",
+            "bf_isabella",
+            "bf_lily",
+            "bm_daniel",
+            "bm_fable",
+            "bm_george",
+            "bm_lewis",
+        ],
+    },
+    "🇫🇷": {
+        "language": "fr-fr",
+        "voices": ["ff_siwis"],
+    },
+    "🇮🇹": {
+        "language": "it",
+        "voices": ["if_sara", "im_nicola"],
+    },
+    "🇯🇵": {
+        "language": "ja",
+        "voices": [
+            "jf_alpha",
+            "jf_gongitsune",
+            "jf_nezumi",
+            "jf_tebukuro",
+            "jm_kumo",
+        ],
+    },
+    "🇨🇳": {
+        "language": "cmn",
+        "voices": [
+            "zf_xiaobei",
+            "zf_xiaoni",
+            "zf_xiaoxiao",
+            "zf_xiaoyi",
+            "zm_yunjian",
+            "zm_yunxi",
+            "zm_yunxia",
+            "zm_yunyang",
+        ],
+    },
+}
+
 logger.info("CHAT_ENDPOINT set to %s", CHAT_ENDPOINT)
 logger.info("COMPLETION_ENDPOINT set to %s", COMPLETION_ENDPOINT)
 logger.info("IMAGE_GEN_ENDPOINT set to %s", IMAGE_GEN_ENDPOINT)
@@ -13,6 +13,7 @@ from discord import Message
 from discord.ext import commands

 from vibe_bot import llama_wrapper, tts
+from vibe_bot.tts import DEFAULT_LANG
 from vibe_bot.config import (
    CHAT_ENDPOINT,
    CHAT_ENDPOINT_KEY,
@@ -29,6 +30,7 @@ from vibe_bot.config import (
    TTS_SPEED,
    TTS_VOICE,
    TTS_VOICES_PATH,
+    VOICES_LIST,
 )
 from vibe_bot.database import CustomBotManager, get_database

@@ -326,14 +328,47 @@ async def on_message(message: Message) -> None:
    await bot.process_commands(message)


+@bot.command(name="voices")
+async def voices(ctx: CommandsContext[Bot]) -> None:
+    """List all available TTS voices organized by category."""
+    voice_list = "Available Voices:\n\n"
+    for category, info in VOICES_LIST.items():
+        voice_list += f"{category} ({info['language']}):\n"
+        for v in info["voices"]:
+            voice_list += f"  - {v}\n"
+        voice_list += "\n"
+    voice_list += "Use `!speak <text> --voice <voice_name>` to choose a voice."
+
+    chunk_size = 1900
+    chunks: list[str] = []
+    current_chunk = voice_list
+    while current_chunk:
+        if len(current_chunk) <= chunk_size:
+            chunks.append(current_chunk)
+            break
+        split_pos = current_chunk.rfind("\n", 0, chunk_size)
+        if split_pos == -1:
+            split_pos = chunk_size
+        chunks.append(current_chunk[:split_pos])
+        current_chunk = current_chunk[split_pos:].lstrip("\n")
+
+    for chunk in chunks:
+        await ctx.send(chunk)
+
+
@bot.command(name="speak")
-async def speak(ctx: CommandsContext[Bot], *, message: str) -> None:
+async def speak(
+    ctx: CommandsContext[Bot],
+    *,
+    message: str,
+) -> None:
    """Have the bot speak the given text using Kokoro TTS, or have a custom bot speak.

-    Usage: !speak <text> - plain text to speech
-    Usage: !speak <bot_name> <text> - have a custom bot respond and speak
+    Usage: !speak <text> --voice <voice_name> - plain text to speech
+    Usage: !speak <bot_name> <text> --voice <voice_name> - have a custom bot respond and speak
    Example: !speak hello world
-    Example: !speak alfred what time is it
+    Example: !speak hello world --voice af_bella
+    Example: !speak alfred what time is it --voice am_puck
    """
    if tts_engine is None:
        await ctx.send(
@@ -342,19 +377,37 @@ async def speak(ctx: CommandsContext[Bot], *, message: str) -> None:
        )
        return

+    # Parse --voice flag from the message
+    voice = None
+    voice_match = message.rsplit("--voice ", 1)
+    if len(voice_match) == 2:
+        voice = voice_match[1].strip()
+        message = voice_match[0].rstrip()
+
    if not message or not message.strip():
        await ctx.send("Please provide text to speak.")
        return

+    # Validate voice if provided
+    if voice:
+        all_voices = [v for cat in VOICES_LIST.values() for v in cat["voices"]]
+        if voice not in all_voices:
+            await ctx.send(
+                f"Unknown voice '{voice}'. Use `!voices` to see available voices."
+            )
+            return
+
    custom_bot_manager = CustomBotManager()
    custom_bots = custom_bot_manager.list_custom_bots()
    bot_names = [b[0] for b in custom_bots]

    first_word = message.split(maxsplit=1)[0] if message.split() else ""
    if first_word in bot_names:
-        await _speak_with_bot(ctx, first_word, message, tts_engine, custom_bot_manager)
+        await _speak_with_bot(
+            ctx, first_word, message, tts_engine, custom_bot_manager, voice
+        )
    else:
-        await _speak_plain(ctx, message, tts_engine)
+        await _speak_plain(ctx, message, tts_engine, voice)


 async def _speak_with_bot(
@@ -363,6 +416,7 @@ async def _speak_with_bot(
    message: str,
    engine: tts.TTSEngine,
    custom_bot_manager: CustomBotManager,
+    voice: str | None = None,
 ) -> None:
    """Handle speak command for a custom bot."""
    text_to_speak = message[len(bot_name) :].lstrip()
@@ -380,6 +434,14 @@ async def _speak_with_bot(
    _, system_prompt, _, _ = bot_info
    system_prompt_edit = f"{system_prompt}\nKeep your responses under 2-3 sentences."

+    # Determine language for the chosen voice
+    chosen_voice = voice or TTS_VOICE
+    lang = DEFAULT_LANG
+    for cat in VOICES_LIST.values():
+        if chosen_voice in cat["voices"]:
+            lang = str(cat["language"])
+            break
+
    try:
        db = get_database()
        context = db.get_conversation_context(
@@ -429,8 +491,9 @@ async def _speak_with_bot(
        await ctx.send(f"Generating speech for **{bot_name}**...")
        audio_buffer = engine.generate_audio(
            bot_response,
-            voice=TTS_VOICE,
+            voice=chosen_voice,
            speed=TTS_SPEED,
+            lang=lang,
        )

        audio_file = discord.File(audio_buffer, filename="speech.mp3")
@@ -447,14 +510,25 @@ async def _speak_plain(
    ctx: CommandsContext[Bot],
    message: str,
    engine: tts.TTSEngine,
+    voice: str | None = None,
 ) -> None:
    """Handle speak command for plain text."""
+    chosen_voice = voice or TTS_VOICE
+
+    # Determine language for the chosen voice
+    lang = DEFAULT_LANG
+    for cat in VOICES_LIST.values():
+        if chosen_voice in cat["voices"]:
+            lang = str(cat["language"])
+            break
+
    try:
        await ctx.send("Generating speech...")
        audio_buffer = engine.generate_audio(
            message,
-            voice=TTS_VOICE,
+            voice=chosen_voice,
            speed=TTS_SPEED,
+            lang=lang,
        )

        audio_file = discord.File(audio_buffer, filename="speech.mp3")