add custom voices

This commit is contained in:
2026-05-24 00:44:39 -04:00
parent 833927c66e
commit 083b1fd43a
3 changed files with 166 additions and 12 deletions
+7 -4
View File
@@ -52,10 +52,13 @@ Once you create a custom bot, interact with it by prefixing your message with th
### Text-to-Speech
| Command | Description | Example Usage |
| -------------------------- | --------------------------------------- | ------------------------------- |
| `!speak <text>` | Convert text to speech (MP3 attachment) | `!speak hello world` |
| `!speak <bot_name> <text>` | Have a custom bot respond and speak | `!speak alfred what time is it` |
| Command | Description | Example Usage |
| ------------------------------------ | ----------------------------------------------------- | ------------------------------------------ |
| `!speak <text>` | Convert text to speech (MP3 attachment) | `!speak hello world` |
| `!speak <text> --voice <voice>` | Convert text to speech with a specific voice | `!speak hello world --voice af_bella` |
| `!speak <bot_name> <text>` | Have a custom bot respond and speak | `!speak alfred what time is it` |
| `!speak <bot_name> <text> --voice <voice>` | Have a custom bot respond and speak with a specific voice | `!speak alfred what time is it --voice am_puck` |
| `!voices` | List all available TTS voices by category | `!voices` |
### Image Commands
+77
View File
@@ -101,6 +101,83 @@ TTS_VOICES_PATH: str = os.getenv("TTS_VOICES_PATH", "voices-v1.0.bin")
TTS_VOICE: str = os.getenv("TTS_VOICE", "af_sarah")
TTS_SPEED: float = float(os.getenv("TTS_SPEED", "1.0"))
# Available voices organized by category
VOICES_LIST: dict[str, dict[str, str | list[str]]] = {
"🇺🇸 👩": {
"language": "en-us",
"voices": [
"af_alloy",
"af_aoede",
"af_bella",
"af_heart",
"af_jessica",
"af_kore",
"af_nicole",
"af_nova",
"af_river",
"af_sarah",
"af_sky",
],
},
"🇺🇸 👨": {
"language": "en-us",
"voices": [
"am_adam",
"am_echo",
"am_eric",
"am_fenrir",
"am_liam",
"am_michael",
"am_onyx",
"am_puck",
],
},
"🇬🇧": {
"language": "en-gb",
"voices": [
"bf_alice",
"bf_emma",
"bf_isabella",
"bf_lily",
"bm_daniel",
"bm_fable",
"bm_george",
"bm_lewis",
],
},
"🇫🇷": {
"language": "fr-fr",
"voices": ["ff_siwis"],
},
"🇮🇹": {
"language": "it",
"voices": ["if_sara", "im_nicola"],
},
"🇯🇵": {
"language": "ja",
"voices": [
"jf_alpha",
"jf_gongitsune",
"jf_nezumi",
"jf_tebukuro",
"jm_kumo",
],
},
"🇨🇳": {
"language": "cmn",
"voices": [
"zf_xiaobei",
"zf_xiaoni",
"zf_xiaoxiao",
"zf_xiaoyi",
"zm_yunjian",
"zm_yunxi",
"zm_yunxia",
"zm_yunyang",
],
},
}
logger.info("CHAT_ENDPOINT set to %s", CHAT_ENDPOINT)
logger.info("COMPLETION_ENDPOINT set to %s", COMPLETION_ENDPOINT)
logger.info("IMAGE_GEN_ENDPOINT set to %s", IMAGE_GEN_ENDPOINT)
+82 -8
View File
@@ -13,6 +13,7 @@ from discord import Message
from discord.ext import commands
from vibe_bot import llama_wrapper, tts
from vibe_bot.tts import DEFAULT_LANG
from vibe_bot.config import (
CHAT_ENDPOINT,
CHAT_ENDPOINT_KEY,
@@ -29,6 +30,7 @@ from vibe_bot.config import (
TTS_SPEED,
TTS_VOICE,
TTS_VOICES_PATH,
VOICES_LIST,
)
from vibe_bot.database import CustomBotManager, get_database
@@ -326,14 +328,47 @@ async def on_message(message: Message) -> None:
await bot.process_commands(message)
@bot.command(name="voices")
async def voices(ctx: CommandsContext[Bot]) -> None:
    """Send the list of available TTS voices, grouped by category.

    The listing is built from ``VOICES_LIST``: one heading per category
    (label plus language code) followed by one bullet per voice, and a
    closing usage hint.  Text longer than the per-message budget is split
    into several messages, preferably at a line break.

    Args:
        ctx: Invocation context; every message is sent through ``ctx.send``.
    """
    # NOTE(review): 1900 presumably leaves headroom under Discord's message
    # length cap -- confirm against the API limit.
    max_len = 1900

    # Assemble the full listing from fragments and join once at the end.
    fragments: list[str] = ["Available Voices:\n\n"]
    for label, entry in VOICES_LIST.items():
        fragments.append(f"{label} ({entry['language']}):\n")
        fragments.extend(f" - {name}\n" for name in entry["voices"])
        fragments.append("\n")
    fragments.append("Use `!speak <text> --voice <voice_name>` to choose a voice.")
    remaining = "".join(fragments)

    # Peel off message-sized pieces until the remainder fits in one message.
    pieces: list[str] = []
    while len(remaining) > max_len:
        cut = remaining.rfind("\n", 0, max_len)
        if cut == -1:
            # No line break inside the window: fall back to a hard cut.
            cut = max_len
        pieces.append(remaining[:cut])
        # Drop the newline(s) at the cut so the next piece starts on text.
        remaining = remaining[cut:].lstrip("\n")
    if remaining:
        pieces.append(remaining)

    for piece in pieces:
        await ctx.send(piece)
@bot.command(name="speak")
async def speak(ctx: CommandsContext[Bot], *, message: str) -> None:
async def speak(
ctx: CommandsContext[Bot],
*,
message: str,
) -> None:
"""Have the bot speak the given text using Kokoro TTS, or have a custom bot speak.
Usage: !speak <text> - plain text to speech
Usage: !speak <bot_name> <text> - have a custom bot respond and speak
Usage: !speak <text> --voice <voice_name> - plain text to speech
Usage: !speak <bot_name> <text> --voice <voice_name> - have a custom bot respond and speak
Example: !speak hello world
Example: !speak alfred what time is it
Example: !speak hello world --voice af_bella
Example: !speak alfred what time is it --voice am_puck
"""
if tts_engine is None:
await ctx.send(
@@ -342,19 +377,37 @@ async def speak(ctx: CommandsContext[Bot], *, message: str) -> None:
)
return
# Parse --voice flag from the message
voice = None
voice_match = message.rsplit("--voice ", 1)
if len(voice_match) == 2:
voice = voice_match[1].strip()
message = voice_match[0].rstrip()
if not message or not message.strip():
await ctx.send("Please provide text to speak.")
return
# Validate voice if provided
if voice:
all_voices = [v for cat in VOICES_LIST.values() for v in cat["voices"]]
if voice not in all_voices:
await ctx.send(
f"Unknown voice '{voice}'. Use `!voices` to see available voices."
)
return
custom_bot_manager = CustomBotManager()
custom_bots = custom_bot_manager.list_custom_bots()
bot_names = [b[0] for b in custom_bots]
first_word = message.split(maxsplit=1)[0] if message.split() else ""
if first_word in bot_names:
await _speak_with_bot(ctx, first_word, message, tts_engine, custom_bot_manager)
await _speak_with_bot(
ctx, first_word, message, tts_engine, custom_bot_manager, voice
)
else:
await _speak_plain(ctx, message, tts_engine)
await _speak_plain(ctx, message, tts_engine, voice)
async def _speak_with_bot(
@@ -363,6 +416,7 @@ async def _speak_with_bot(
message: str,
engine: tts.TTSEngine,
custom_bot_manager: CustomBotManager,
voice: str | None = None,
) -> None:
"""Handle speak command for a custom bot."""
text_to_speak = message[len(bot_name) :].lstrip()
@@ -380,6 +434,14 @@ async def _speak_with_bot(
_, system_prompt, _, _ = bot_info
system_prompt_edit = f"{system_prompt}\nKeep your responses under 2-3 sentences."
# Determine language for the chosen voice
chosen_voice = voice or TTS_VOICE
lang = DEFAULT_LANG
for cat in VOICES_LIST.values():
if chosen_voice in cat["voices"]:
lang = str(cat["language"])
break
try:
db = get_database()
context = db.get_conversation_context(
@@ -429,8 +491,9 @@ async def _speak_with_bot(
await ctx.send(f"Generating speech for **{bot_name}**...")
audio_buffer = engine.generate_audio(
bot_response,
voice=TTS_VOICE,
voice=chosen_voice,
speed=TTS_SPEED,
lang=lang,
)
audio_file = discord.File(audio_buffer, filename="speech.mp3")
@@ -447,14 +510,25 @@ async def _speak_plain(
ctx: CommandsContext[Bot],
message: str,
engine: tts.TTSEngine,
voice: str | None = None,
) -> None:
"""Handle speak command for plain text."""
chosen_voice = voice or TTS_VOICE
# Determine language for the chosen voice
lang = DEFAULT_LANG
for cat in VOICES_LIST.values():
if chosen_voice in cat["voices"]:
lang = str(cat["language"])
break
try:
await ctx.send("Generating speech...")
audio_buffer = engine.generate_audio(
message,
voice=TTS_VOICE,
voice=chosen_voice,
speed=TTS_SPEED,
lang=lang,
)
audio_file = discord.File(audio_buffer, filename="speech.mp3")