Features:
- Bilingual support (English/Mandarin Chinese)
- Hotword detection: 'Hey Osiris' / '你好 Osiris'
- Music playback control (MP3, WAV, OGG, FLAC)
- OpenClaw integration for AI responses
- Google AIY Voice Kit V1 compatible
- Text-to-speech in both languages
- Voice command recognition
- Raspberry Pi ready with installation script
AI Now Inc - Del Mar Demo Unit 🏭
268 lines
7.8 KiB
Python
Executable File
268 lines
7.8 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Text-to-Speech Engine
|
|
Supports English and Mandarin Chinese with Google Cloud TTS and offline alternatives.
|
|
"""
|
|
|
|
import json
import logging
import os
import subprocess
import tempfile
from pathlib import Path
from typing import Optional, List
|
|
|
|
# Optional backends: each flag records whether the matching package imported.
try:
    from google.cloud import texttospeech
except ImportError:
    HAS_GOOGLE_CLOUD = False
else:
    HAS_GOOGLE_CLOUD = True

try:
    import pygame
except ImportError:
    HAS_PYGAME = False
else:
    HAS_PYGAME = True

# Module-level logger shared by everything in this file.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class TTSEngine:
    """
    Bilingual TTS engine supporting English and Mandarin Chinese.

    Audio is synthesized with Google Cloud TTS when a client could be
    created, and with the ``espeak`` command-line tool otherwise (or when
    the cloud request fails).  Playback goes through ``pygame.mixer``.
    """

    def __init__(self, config_path: str = "config.json"):
        """
        Load the configuration and set up synthesis and playback backends.

        Args:
            config_path: Path to a JSON configuration file.  A missing file
                is treated as an empty configuration.
        """
        self.config = self._load_config(config_path)

        # TTS configuration
        tts_config = self.config.get("tts", {})
        self.english_voice = tts_config.get("english_voice", "en-US-Standard-A")
        self.chinese_voice = tts_config.get("chinese_voice", "zh-CN-Standard-A")
        self.speed = tts_config.get("speed", 1.0)
        self.pitch = tts_config.get("pitch", 0)

        # Initialize Google Cloud client if available.
        # NOTE(review): the cloud client is gated on the "openclaw.enabled"
        # setting rather than on a key under "tts" - confirm this is intended.
        self.client = None
        if HAS_GOOGLE_CLOUD and self.config.get("openclaw", {}).get("enabled", True):
            try:
                self.client = texttospeech.TextToSpeechClient()
                logger.info("Google Cloud TTS initialized")
            except Exception as e:
                logger.warning("Google Cloud TTS not available: %s", e)

        # Initialize audio output.  A machine without a usable audio device
        # must not prevent the engine from being constructed; playback will
        # simply report failure later.
        if HAS_PYGAME:
            try:
                pygame.mixer.init()
            except Exception as e:
                logger.warning("Audio output not available: %s", e)

        logger.info("TTSEngine initialized")

    def _load_config(self, config_path: str) -> dict:
        """
        Load the JSON configuration.

        Args:
            config_path: Path of the JSON file to read.

        Returns:
            The parsed JSON document, or ``{"tts": {}}`` when the file does
            not exist.

        Raises:
            json.JSONDecodeError: If the file exists but is not valid JSON.
        """
        try:
            # Read as UTF-8 explicitly so the result does not depend on the
            # system locale.
            with open(config_path, "r", encoding="utf-8") as f:
                return json.load(f)
        except FileNotFoundError:
            return {"tts": {}}

    def speak(self, text: str, language: str = "en") -> bool:
        """
        Speak text in the specified language.

        Args:
            text: Text to speak
            language: 'en' for English, 'zh' for Chinese

        Returns:
            True if speech succeeded; False for empty text or on any error
        """
        try:
            # Nothing to say: skip synthesis and playback entirely.
            if not text or not text.strip():
                return False

            # Generate speech audio
            audio_data = self._synthesize(text, language)
            if audio_data:
                # Play audio
                return self._play_audio(audio_data)
            return False
        except Exception as e:
            logger.error("TTS error: %s", e)
            return False

    def _synthesize(self, text: str, language: str) -> Optional[bytes]:
        """
        Synthesize speech from text.

        Google Cloud TTS is tried first when a client exists; if it raises
        or returns no audio, the offline synthesizer is used instead.

        Args:
            text: Text to synthesize
            language: Language code

        Returns:
            Audio data or None
        """
        if self.client and HAS_GOOGLE_CLOUD:
            try:
                audio = self._google_synthesize(text, language)
            except Exception as e:
                logger.warning(
                    "Google Cloud TTS failed, falling back to offline TTS: %s", e
                )
            else:
                if audio:
                    return audio
        return self._offline_synthesize(text, language)

    def _google_synthesize(self, text: str, language: str) -> Optional[bytes]:
        """
        Use Google Cloud TTS.

        Args:
            text: Text to synthesize
            language: 'zh' selects the Chinese voice; anything else selects
                the English voice

        Returns:
            MP3-encoded audio, or None when no client is available
        """
        if not self.client:
            return None

        # Select voice based on language
        if language == "zh":
            voice_name, lang_code = self.chinese_voice, "zh-CN"
        else:
            voice_name, lang_code = self.english_voice, "en-US"

        # Configure synthesis
        voice = texttospeech.VoiceSelectionParams(
            language_code=lang_code,
            name=voice_name,
        )
        audio_config = texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3,
            speaking_rate=self.speed,
            pitch=self.pitch,
        )
        synthesis_input = texttospeech.SynthesisInput(text=text)

        # Perform synthesis
        response = self.client.synthesize_speech(
            request=texttospeech.SynthesizeSpeechRequest(
                input=synthesis_input,
                voice=voice,
                audio_config=audio_config,
            )
        )
        return response.audio_content

    def _offline_synthesize(self, text: str, language: str) -> Optional[bytes]:
        """
        Offline TTS fallback using the ``espeak`` command-line tool.

        This is a basic fallback - in production, you'd use a dedicated
        Chinese TTS engine for Mandarin.

        Args:
            text: Text to synthesize
            language: 'zh' selects espeak's ``zh`` voice; anything else uses
                espeak's default voice

        Returns:
            WAV audio written by espeak to stdout, or None on failure
        """
        logger.warning("Using offline TTS (limited quality)")

        command = ["espeak"]
        if language == "zh":
            command += ["-v", "zh"]
        # The text is passed as a single argv entry (no shell), and "--"
        # stops option parsing so text starting with "-" is still spoken.
        command += ["--stdout", "--", text]

        try:
            completed = subprocess.run(
                command,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False,
            )
        except (OSError, ValueError) as e:
            # OSError: espeak missing or not executable.
            # ValueError: text contains an embedded NUL byte.
            logger.error("Offline TTS failed: %s", e)
            return None

        if completed.returncode != 0 or not completed.stdout:
            logger.error(
                "Offline TTS failed: espeak exited with status %s: %s",
                completed.returncode,
                completed.stderr.decode("utf-8", errors="replace").strip(),
            )
            return None

        return completed.stdout

    def _play_audio(self, audio_data: bytes) -> bool:
        """
        Play audio data.

        The bytes are written to a private temporary file, played with
        ``pygame.mixer.music`` until playback finishes, and the file is
        removed afterwards.

        Args:
            audio_data: Audio bytes (MP3 or WAV)

        Returns:
            True if playback succeeded
        """
        if not HAS_PYGAME:
            logger.warning("Pygame not available for audio playback")
            return False

        temp_path = None
        try:
            # WAV data starts with a RIFF header; everything else is treated
            # as MP3 so the file suffix matches the content.
            suffix = ".wav" if audio_data[:4] == b"RIFF" else ".mp3"

            # Save to temp file
            with tempfile.NamedTemporaryFile(
                prefix="tts_audio_", suffix=suffix, delete=False
            ) as f:
                temp_path = f.name
                f.write(audio_data)

            # Load and play
            pygame.mixer.music.load(temp_path)
            pygame.mixer.music.play()

            # Wait for completion
            while pygame.mixer.music.get_busy():
                pygame.time.wait(100)

            return True
        except Exception as e:
            logger.error("Audio playback error: %s", e)
            return False
        finally:
            if temp_path is not None:
                # Best-effort cleanup: release the file if this pygame
                # version supports it, then delete it.  Failures here must
                # not mask the playback result.
                unload = getattr(pygame.mixer.music, "unload", None)
                if unload is not None:
                    try:
                        unload()
                    except Exception:
                        pass
                try:
                    os.remove(temp_path)
                except OSError:
                    pass

    def speak_sync(self, text: str, language: str = "en",
                   on_complete=None) -> bool:
        """
        Synchronous speech with optional callback.

        Args:
            text: Text to speak
            language: Language code
            on_complete: Callback invoked with the boolean result when done

        Returns:
            True if speech succeeded
        """
        result = self.speak(text, language)

        if on_complete:
            on_complete(result)

        return result

    def get_voices(self) -> List[dict]:
        """
        Get list of available voices.

        Returns:
            One dict per Google Cloud voice with the keys "name",
            "language" (list of language codes) and "gender"; an empty list
            when no cloud client is available.  If listing fails part-way,
            the voices collected so far are returned.
        """
        voices = []

        if not (self.client and HAS_GOOGLE_CLOUD):
            return voices

        try:
            response = self.client.list_voices()
            for voice in response.voices:
                voices.append({
                    "name": voice.name,
                    "language": list(voice.language_codes),
                    "gender": voice.ssml_gender,
                })
        except Exception as e:
            logger.error("Error listing voices: %s", e)

        return voices
|
|
|
|
|
|
def main():
    """Smoke-test the engine: speak one phrase per language, then list voices."""
    engine = TTSEngine()

    # (banner, phrase, language code) for each spoken check, in order.
    samples = (
        ("Testing English TTS...", "Hello! I am your voice assistant.", "en"),
        ("Testing Chinese TTS...", "你好!我是你的语音助手。", "zh"),
    )
    for banner, phrase, lang in samples:
        print(banner)
        engine.speak(phrase, lang)

    # Report how many voices exist, printing only the first five.
    available = engine.get_voices()
    print(f"\nAvailable voices: {len(available)}")
    for entry in available[:5]:
        languages = ', '.join(entry['language'])
        print(f" - {entry['name']} ({languages})")
|
|
|
|
|
|
if __name__ == "__main__":
    # Run as a script: enable INFO-level logging, then run the smoke test.
    logging.basicConfig(level=logging.INFO)
    main()
|