diff --git a/README.md b/README.md index 511416a..3b8918d 100644 --- a/README.md +++ b/README.md @@ -52,10 +52,13 @@ Once you create a custom bot, interact with it by prefixing your message with th ### Text-to-Speech -| Command | Description | Example Usage | -| -------------------------- | --------------------------------------- | ------------------------------- | -| `!speak <text>` | Convert text to speech (MP3 attachment) | `!speak hello world` | -| `!speak <bot_name> <text>` | Have a custom bot respond and speak | `!speak alfred what time is it` | +| Command | Description | Example Usage | +| ------------------------------------ | ----------------------------------------------------- | ------------------------------------------ | +| `!speak <text>` | Convert text to speech (MP3 attachment) | `!speak hello world` | +| `!speak <text> --voice <voice>` | Convert text to speech with a specific voice | `!speak hello world --voice af_bella` | +| `!speak <bot_name> <text>` | Have a custom bot respond and speak | `!speak alfred what time is it` | +| `!speak <bot_name> <text> --voice <voice>` | Have a custom bot respond and speak with a voice | `!speak alfred what time is it --voice am_puck` | +| `!voices` | List all available TTS voices by category | `!voices` | ### Image Commands diff --git a/vibe_bot/config.py b/vibe_bot/config.py index 0288e55..90cbcd4 100644 --- a/vibe_bot/config.py +++ b/vibe_bot/config.py @@ -101,6 +101,83 @@ TTS_VOICES_PATH: str = os.getenv("TTS_VOICES_PATH", "voices-v1.0.bin") TTS_VOICE: str = os.getenv("TTS_VOICE", "af_sarah") TTS_SPEED: float = float(os.getenv("TTS_SPEED", "1.0")) +# Available voices organized by category +VOICES_LIST: dict[str, dict[str, str | list[str]]] = { + "🇺🇸 👩": { + "language": "en-us", + "voices": [ + "af_alloy", + "af_aoede", + "af_bella", + "af_heart", + "af_jessica", + "af_kore", + "af_nicole", + "af_nova", + "af_river", + "af_sarah", + "af_sky", + ], + }, + "🇺🇸 👨": { + "language": "en-us", + "voices": [ + "am_adam", + "am_echo", + "am_eric", + "am_fenrir", + "am_liam", + "am_michael", + "am_onyx", 
+ "am_puck", + ], + }, + "🇬🇧": { + "language": "en-gb", + "voices": [ + "bf_alice", + "bf_emma", + "bf_isabella", + "bf_lily", + "bm_daniel", + "bm_fable", + "bm_george", + "bm_lewis", + ], + }, + "🇫🇷": { + "language": "fr-fr", + "voices": ["ff_siwis"], + }, + "🇮🇹": { + "language": "it", + "voices": ["if_sara", "im_nicola"], + }, + "🇯🇵": { + "language": "ja", + "voices": [ + "jf_alpha", + "jf_gongitsune", + "jf_nezumi", + "jf_tebukuro", + "jm_kumo", + ], + }, + "🇨🇳": { + "language": "cmn", + "voices": [ + "zf_xiaobei", + "zf_xiaoni", + "zf_xiaoxiao", + "zf_xiaoyi", + "zm_yunjian", + "zm_yunxi", + "zm_yunxia", + "zm_yunyang", + ], + }, +} + logger.info("CHAT_ENDPOINT set to %s", CHAT_ENDPOINT) logger.info("COMPLETION_ENDPOINT set to %s", COMPLETION_ENDPOINT) logger.info("IMAGE_GEN_ENDPOINT set to %s", IMAGE_GEN_ENDPOINT) diff --git a/vibe_bot/main.py b/vibe_bot/main.py index 8bac2ac..f4f34c1 100644 --- a/vibe_bot/main.py +++ b/vibe_bot/main.py @@ -13,6 +13,7 @@ from discord import Message from discord.ext import commands from vibe_bot import llama_wrapper, tts +from vibe_bot.tts import DEFAULT_LANG from vibe_bot.config import ( CHAT_ENDPOINT, CHAT_ENDPOINT_KEY, @@ -29,6 +30,7 @@ from vibe_bot.config import ( TTS_SPEED, TTS_VOICE, TTS_VOICES_PATH, + VOICES_LIST, ) from vibe_bot.database import CustomBotManager, get_database @@ -326,14 +328,47 @@ async def on_message(message: Message) -> None: await bot.process_commands(message) +@bot.command(name="voices") +async def voices(ctx: CommandsContext[Bot]) -> None: + """List all available TTS voices organized by category.""" + voice_list = "Available Voices:\n\n" + for category, info in VOICES_LIST.items(): + voice_list += f"{category} ({info['language']}):\n" + for v in info["voices"]: + voice_list += f" - {v}\n" + voice_list += "\n" + voice_list += "Use `!speak --voice ` to choose a voice." 
+ + chunk_size = 1900 + chunks: list[str] = [] + current_chunk = voice_list + while current_chunk: + if len(current_chunk) <= chunk_size: + chunks.append(current_chunk) + break + split_pos = current_chunk.rfind("\n", 0, chunk_size) + if split_pos == -1: + split_pos = chunk_size + chunks.append(current_chunk[:split_pos]) + current_chunk = current_chunk[split_pos:].lstrip("\n") + + for chunk in chunks: + await ctx.send(chunk) + + @bot.command(name="speak") -async def speak(ctx: CommandsContext[Bot], *, message: str) -> None: +async def speak( + ctx: CommandsContext[Bot], + *, + message: str, +) -> None: """Have the bot speak the given text using Kokoro TTS, or have a custom bot speak. - Usage: !speak - plain text to speech - Usage: !speak - have a custom bot respond and speak + Usage: !speak --voice - plain text to speech + Usage: !speak --voice - have a custom bot respond and speak Example: !speak hello world - Example: !speak alfred what time is it + Example: !speak hello world --voice af_bella + Example: !speak alfred what time is it --voice am_puck """ if tts_engine is None: await ctx.send( @@ -342,19 +377,37 @@ async def speak(ctx: CommandsContext[Bot], *, message: str) -> None: ) return + # Parse --voice flag from the message + voice = None + voice_match = message.rsplit("--voice ", 1) + if len(voice_match) == 2: + voice = voice_match[1].strip() + message = voice_match[0].rstrip() + if not message or not message.strip(): await ctx.send("Please provide text to speak.") return + # Validate voice if provided + if voice: + all_voices = [v for cat in VOICES_LIST.values() for v in cat["voices"]] + if voice not in all_voices: + await ctx.send( + f"Unknown voice '{voice}'. Use `!voices` to see available voices." 
+ ) + return + custom_bot_manager = CustomBotManager() custom_bots = custom_bot_manager.list_custom_bots() bot_names = [b[0] for b in custom_bots] first_word = message.split(maxsplit=1)[0] if message.split() else "" if first_word in bot_names: - await _speak_with_bot(ctx, first_word, message, tts_engine, custom_bot_manager) + await _speak_with_bot( + ctx, first_word, message, tts_engine, custom_bot_manager, voice + ) else: - await _speak_plain(ctx, message, tts_engine) + await _speak_plain(ctx, message, tts_engine, voice) async def _speak_with_bot( @@ -363,6 +416,7 @@ async def _speak_with_bot( message: str, engine: tts.TTSEngine, custom_bot_manager: CustomBotManager, + voice: str | None = None, ) -> None: """Handle speak command for a custom bot.""" text_to_speak = message[len(bot_name) :].lstrip() @@ -380,6 +434,14 @@ async def _speak_with_bot( _, system_prompt, _, _ = bot_info system_prompt_edit = f"{system_prompt}\nKeep your responses under 2-3 sentences." + # Determine language for the chosen voice + chosen_voice = voice or TTS_VOICE + lang = DEFAULT_LANG + for cat in VOICES_LIST.values(): + if chosen_voice in cat["voices"]: + lang = str(cat["language"]) + break + try: db = get_database() context = db.get_conversation_context( @@ -429,8 +491,9 @@ async def _speak_with_bot( await ctx.send(f"Generating speech for **{bot_name}**...") audio_buffer = engine.generate_audio( bot_response, - voice=TTS_VOICE, + voice=chosen_voice, speed=TTS_SPEED, + lang=lang, ) audio_file = discord.File(audio_buffer, filename="speech.mp3") @@ -447,14 +510,25 @@ async def _speak_plain( ctx: CommandsContext[Bot], message: str, engine: tts.TTSEngine, + voice: str | None = None, ) -> None: """Handle speak command for plain text.""" + chosen_voice = voice or TTS_VOICE + + # Determine language for the chosen voice + lang = DEFAULT_LANG + for cat in VOICES_LIST.values(): + if chosen_voice in cat["voices"]: + lang = str(cat["language"]) + break + try: await ctx.send("Generating speech...") 
audio_buffer = engine.generate_audio( message, - voice=TTS_VOICE, + voice=chosen_voice, speed=TTS_SPEED, + lang=lang, ) audio_file = discord.File(audio_buffer, filename="speech.mp3")