add custom voices
This commit is contained in:
@@ -52,10 +52,13 @@ Once you create a custom bot, interact with it by prefixing your message with th
|
|||||||
|
|
||||||
### Text-to-Speech
|
### Text-to-Speech
|
||||||
|
|
||||||
| Command | Description | Example Usage |
|
| Command | Description | Example Usage |
|
||||||
| -------------------------- | --------------------------------------- | ------------------------------- |
|
| ------------------------------------ | ----------------------------------------------------- | ------------------------------------------ |
|
||||||
| `!speak <text>` | Convert text to speech (MP3 attachment) | `!speak hello world` |
|
| `!speak <text>` | Convert text to speech (MP3 attachment) | `!speak hello world` |
|
||||||
| `!speak <bot_name> <text>` | Have a custom bot respond and speak | `!speak alfred what time is it` |
|
| `!speak <text> --voice <voice>` | Convert text to speech with a specific voice | `!speak hello world --voice af_bella` |
|
||||||
|
| `!speak <bot_name> <text>` | Have a custom bot respond and speak | `!speak alfred what time is it` |
|
||||||
|
| `!speak <bot_name> <text> --voice <voice>` | Have a custom bot respond and speak with a specific voice | `!speak alfred what time is it --voice am_puck` |
|
||||||
|
| `!voices` | List all available TTS voices by category | `!voices` |
|
||||||
|
|
||||||
### Image Commands
|
### Image Commands
|
||||||
|
|
||||||
|
|||||||
@@ -101,6 +101,83 @@ TTS_VOICES_PATH: str = os.getenv("TTS_VOICES_PATH", "voices-v1.0.bin")
|
|||||||
TTS_VOICE: str = os.getenv("TTS_VOICE", "af_sarah")
|
TTS_VOICE: str = os.getenv("TTS_VOICE", "af_sarah")
|
||||||
TTS_SPEED: float = float(os.getenv("TTS_SPEED", "1.0"))
|
TTS_SPEED: float = float(os.getenv("TTS_SPEED", "1.0"))
|
||||||
|
|
||||||
|
# Available voices organized by category
|
||||||
|
VOICES_LIST: dict[str, dict[str, str | list[str]]] = {
|
||||||
|
"🇺🇸 👩": {
|
||||||
|
"language": "en-us",
|
||||||
|
"voices": [
|
||||||
|
"af_alloy",
|
||||||
|
"af_aoede",
|
||||||
|
"af_bella",
|
||||||
|
"af_heart",
|
||||||
|
"af_jessica",
|
||||||
|
"af_kore",
|
||||||
|
"af_nicole",
|
||||||
|
"af_nova",
|
||||||
|
"af_river",
|
||||||
|
"af_sarah",
|
||||||
|
"af_sky",
|
||||||
|
],
|
||||||
|
},
|
||||||
|
"🇺🇸 👨": {
|
||||||
|
"language": "en-us",
|
||||||
|
"voices": [
|
||||||
|
"am_adam",
|
||||||
|
"am_echo",
|
||||||
|
"am_eric",
|
||||||
|
"am_fenrir",
|
||||||
|
"am_liam",
|
||||||
|
"am_michael",
|
||||||
|
"am_onyx",
|
||||||
|
"am_puck",
|
||||||
|
],
|
||||||
|
},
|
||||||
|
"🇬🇧": {
|
||||||
|
"language": "en-gb",
|
||||||
|
"voices": [
|
||||||
|
"bf_alice",
|
||||||
|
"bf_emma",
|
||||||
|
"bf_isabella",
|
||||||
|
"bf_lily",
|
||||||
|
"bm_daniel",
|
||||||
|
"bm_fable",
|
||||||
|
"bm_george",
|
||||||
|
"bm_lewis",
|
||||||
|
],
|
||||||
|
},
|
||||||
|
"🇫🇷": {
|
||||||
|
"language": "fr-fr",
|
||||||
|
"voices": ["ff_siwis"],
|
||||||
|
},
|
||||||
|
"🇮🇹": {
|
||||||
|
"language": "it",
|
||||||
|
"voices": ["if_sara", "im_nicola"],
|
||||||
|
},
|
||||||
|
"🇯🇵": {
|
||||||
|
"language": "ja",
|
||||||
|
"voices": [
|
||||||
|
"jf_alpha",
|
||||||
|
"jf_gongitsune",
|
||||||
|
"jf_nezumi",
|
||||||
|
"jf_tebukuro",
|
||||||
|
"jm_kumo",
|
||||||
|
],
|
||||||
|
},
|
||||||
|
"🇨🇳": {
|
||||||
|
"language": "cmn",
|
||||||
|
"voices": [
|
||||||
|
"zf_xiaobei",
|
||||||
|
"zf_xiaoni",
|
||||||
|
"zf_xiaoxiao",
|
||||||
|
"zf_xiaoyi",
|
||||||
|
"zm_yunjian",
|
||||||
|
"zm_yunxi",
|
||||||
|
"zm_yunxia",
|
||||||
|
"zm_yunyang",
|
||||||
|
],
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
logger.info("CHAT_ENDPOINT set to %s", CHAT_ENDPOINT)
|
logger.info("CHAT_ENDPOINT set to %s", CHAT_ENDPOINT)
|
||||||
logger.info("COMPLETION_ENDPOINT set to %s", COMPLETION_ENDPOINT)
|
logger.info("COMPLETION_ENDPOINT set to %s", COMPLETION_ENDPOINT)
|
||||||
logger.info("IMAGE_GEN_ENDPOINT set to %s", IMAGE_GEN_ENDPOINT)
|
logger.info("IMAGE_GEN_ENDPOINT set to %s", IMAGE_GEN_ENDPOINT)
|
||||||
|
|||||||
+82
-8
@@ -13,6 +13,7 @@ from discord import Message
|
|||||||
from discord.ext import commands
|
from discord.ext import commands
|
||||||
|
|
||||||
from vibe_bot import llama_wrapper, tts
|
from vibe_bot import llama_wrapper, tts
|
||||||
|
from vibe_bot.tts import DEFAULT_LANG
|
||||||
from vibe_bot.config import (
|
from vibe_bot.config import (
|
||||||
CHAT_ENDPOINT,
|
CHAT_ENDPOINT,
|
||||||
CHAT_ENDPOINT_KEY,
|
CHAT_ENDPOINT_KEY,
|
||||||
@@ -29,6 +30,7 @@ from vibe_bot.config import (
|
|||||||
TTS_SPEED,
|
TTS_SPEED,
|
||||||
TTS_VOICE,
|
TTS_VOICE,
|
||||||
TTS_VOICES_PATH,
|
TTS_VOICES_PATH,
|
||||||
|
VOICES_LIST,
|
||||||
)
|
)
|
||||||
from vibe_bot.database import CustomBotManager, get_database
|
from vibe_bot.database import CustomBotManager, get_database
|
||||||
|
|
||||||
@@ -326,14 +328,47 @@ async def on_message(message: Message) -> None:
|
|||||||
await bot.process_commands(message)
|
await bot.process_commands(message)
|
||||||
|
|
||||||
|
|
||||||
|
def _split_message(text: str, limit: int = 1900) -> list[str]:
    """Split *text* into pieces of at most *limit* characters.

    Pieces are cut at the last newline found before *limit* when there is
    one; otherwise the text is cut hard at *limit*. Newlines at the start of
    the remainder are dropped so a follow-up piece never begins with blank
    lines. Empty pieces are never produced.
    """
    pieces: list[str] = []
    remaining = text
    while remaining:
        if len(remaining) <= limit:
            pieces.append(remaining)
            break
        cut = remaining.rfind("\n", 0, limit)
        # -1: no newline in range; 0: newline is the very first character,
        # which would yield an empty piece. Fall back to a hard split.
        if cut <= 0:
            cut = limit
        pieces.append(remaining[:cut])
        remaining = remaining[cut:].lstrip("\n")
    return pieces


@bot.command(name="voices")
async def voices(ctx: CommandsContext[Bot]) -> None:
    """List all available TTS voices organized by category.

    Sends one or more messages to the invoking channel: a heading, then for
    each entry of ``VOICES_LIST`` its label, language code and voice names,
    followed by a usage hint. The text is split into pieces of at most 1900
    characters before sending.
    """
    lines: list[str] = ["Available Voices:", ""]
    for category, info in VOICES_LIST.items():
        lines.append(f"{category} ({info['language']}):")
        lines.extend(f" - {v}" for v in info["voices"])
        # Blank line between categories.
        lines.append("")
    lines.append("Use `!speak <text> --voice <voice_name>` to choose a voice.")

    for chunk in _split_message("\n".join(lines), 1900):
        await ctx.send(chunk)
|
||||||
|
|
||||||
|
|
||||||
@bot.command(name="speak")
|
@bot.command(name="speak")
|
||||||
async def speak(ctx: CommandsContext[Bot], *, message: str) -> None:
|
async def speak(
|
||||||
|
ctx: CommandsContext[Bot],
|
||||||
|
*,
|
||||||
|
message: str,
|
||||||
|
) -> None:
|
||||||
"""Have the bot speak the given text using Kokoro TTS, or have a custom bot speak.
|
"""Have the bot speak the given text using Kokoro TTS, or have a custom bot speak.
|
||||||
|
|
||||||
Usage: !speak <text> - plain text to speech
|
Usage: !speak <text> --voice <voice_name> - plain text to speech
|
||||||
Usage: !speak <bot_name> <text> - have a custom bot respond and speak
|
Usage: !speak <bot_name> <text> --voice <voice_name> - have a custom bot respond and speak
|
||||||
Example: !speak hello world
|
Example: !speak hello world
|
||||||
Example: !speak alfred what time is it
|
Example: !speak hello world --voice af_bella
|
||||||
|
Example: !speak alfred what time is it --voice am_puck
|
||||||
"""
|
"""
|
||||||
if tts_engine is None:
|
if tts_engine is None:
|
||||||
await ctx.send(
|
await ctx.send(
|
||||||
@@ -342,19 +377,37 @@ async def speak(ctx: CommandsContext[Bot], *, message: str) -> None:
|
|||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Parse --voice flag from the message
|
||||||
|
voice = None
|
||||||
|
voice_match = message.rsplit("--voice ", 1)
|
||||||
|
if len(voice_match) == 2:
|
||||||
|
voice = voice_match[1].strip()
|
||||||
|
message = voice_match[0].rstrip()
|
||||||
|
|
||||||
if not message or not message.strip():
|
if not message or not message.strip():
|
||||||
await ctx.send("Please provide text to speak.")
|
await ctx.send("Please provide text to speak.")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Validate voice if provided
|
||||||
|
if voice:
|
||||||
|
all_voices = [v for cat in VOICES_LIST.values() for v in cat["voices"]]
|
||||||
|
if voice not in all_voices:
|
||||||
|
await ctx.send(
|
||||||
|
f"Unknown voice '{voice}'. Use `!voices` to see available voices."
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
custom_bot_manager = CustomBotManager()
|
custom_bot_manager = CustomBotManager()
|
||||||
custom_bots = custom_bot_manager.list_custom_bots()
|
custom_bots = custom_bot_manager.list_custom_bots()
|
||||||
bot_names = [b[0] for b in custom_bots]
|
bot_names = [b[0] for b in custom_bots]
|
||||||
|
|
||||||
first_word = message.split(maxsplit=1)[0] if message.split() else ""
|
first_word = message.split(maxsplit=1)[0] if message.split() else ""
|
||||||
if first_word in bot_names:
|
if first_word in bot_names:
|
||||||
await _speak_with_bot(ctx, first_word, message, tts_engine, custom_bot_manager)
|
await _speak_with_bot(
|
||||||
|
ctx, first_word, message, tts_engine, custom_bot_manager, voice
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
await _speak_plain(ctx, message, tts_engine)
|
await _speak_plain(ctx, message, tts_engine, voice)
|
||||||
|
|
||||||
|
|
||||||
async def _speak_with_bot(
|
async def _speak_with_bot(
|
||||||
@@ -363,6 +416,7 @@ async def _speak_with_bot(
|
|||||||
message: str,
|
message: str,
|
||||||
engine: tts.TTSEngine,
|
engine: tts.TTSEngine,
|
||||||
custom_bot_manager: CustomBotManager,
|
custom_bot_manager: CustomBotManager,
|
||||||
|
voice: str | None = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Handle speak command for a custom bot."""
|
"""Handle speak command for a custom bot."""
|
||||||
text_to_speak = message[len(bot_name) :].lstrip()
|
text_to_speak = message[len(bot_name) :].lstrip()
|
||||||
@@ -380,6 +434,14 @@ async def _speak_with_bot(
|
|||||||
_, system_prompt, _, _ = bot_info
|
_, system_prompt, _, _ = bot_info
|
||||||
system_prompt_edit = f"{system_prompt}\nKeep your responses under 2-3 sentences."
|
system_prompt_edit = f"{system_prompt}\nKeep your responses under 2-3 sentences."
|
||||||
|
|
||||||
|
# Determine language for the chosen voice
|
||||||
|
chosen_voice = voice or TTS_VOICE
|
||||||
|
lang = DEFAULT_LANG
|
||||||
|
for cat in VOICES_LIST.values():
|
||||||
|
if chosen_voice in cat["voices"]:
|
||||||
|
lang = str(cat["language"])
|
||||||
|
break
|
||||||
|
|
||||||
try:
|
try:
|
||||||
db = get_database()
|
db = get_database()
|
||||||
context = db.get_conversation_context(
|
context = db.get_conversation_context(
|
||||||
@@ -429,8 +491,9 @@ async def _speak_with_bot(
|
|||||||
await ctx.send(f"Generating speech for **{bot_name}**...")
|
await ctx.send(f"Generating speech for **{bot_name}**...")
|
||||||
audio_buffer = engine.generate_audio(
|
audio_buffer = engine.generate_audio(
|
||||||
bot_response,
|
bot_response,
|
||||||
voice=TTS_VOICE,
|
voice=chosen_voice,
|
||||||
speed=TTS_SPEED,
|
speed=TTS_SPEED,
|
||||||
|
lang=lang,
|
||||||
)
|
)
|
||||||
|
|
||||||
audio_file = discord.File(audio_buffer, filename="speech.mp3")
|
audio_file = discord.File(audio_buffer, filename="speech.mp3")
|
||||||
@@ -447,14 +510,25 @@ async def _speak_plain(
|
|||||||
ctx: CommandsContext[Bot],
|
ctx: CommandsContext[Bot],
|
||||||
message: str,
|
message: str,
|
||||||
engine: tts.TTSEngine,
|
engine: tts.TTSEngine,
|
||||||
|
voice: str | None = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Handle speak command for plain text."""
|
"""Handle speak command for plain text."""
|
||||||
|
chosen_voice = voice or TTS_VOICE
|
||||||
|
|
||||||
|
# Determine language for the chosen voice
|
||||||
|
lang = DEFAULT_LANG
|
||||||
|
for cat in VOICES_LIST.values():
|
||||||
|
if chosen_voice in cat["voices"]:
|
||||||
|
lang = str(cat["language"])
|
||||||
|
break
|
||||||
|
|
||||||
try:
|
try:
|
||||||
await ctx.send("Generating speech...")
|
await ctx.send("Generating speech...")
|
||||||
audio_buffer = engine.generate_audio(
|
audio_buffer = engine.generate_audio(
|
||||||
message,
|
message,
|
||||||
voice=TTS_VOICE,
|
voice=chosen_voice,
|
||||||
speed=TTS_SPEED,
|
speed=TTS_SPEED,
|
||||||
|
lang=lang,
|
||||||
)
|
)
|
||||||
|
|
||||||
audio_file = discord.File(audio_buffer, filename="speech.mp3")
|
audio_file = discord.File(audio_buffer, filename="speech.mp3")
|
||||||
|
|||||||
Reference in New Issue
Block a user