openclaw-voice-assistant/tts_engine.py
Claw - AI Now Inc 1662bc141a Initial commit: Bilingual Voice Assistant for Google AIY Voice Kit V1
Features:
- Bilingual support (English/Mandarin Chinese)
- Hotword detection: 'Hey Osiris' / '你好 Osiris'
- Music playback control (MP3, WAV, OGG, FLAC)
- OpenClaw integration for AI responses
- Google AIY Voice Kit V1 compatible
- Text-to-speech in both languages
- Voice command recognition
- Raspberry Pi ready with installation script

AI Now Inc - Del Mar Demo Unit 🏭
2026-03-01 00:02:49 -08:00

268 lines
7.8 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Text-to-Speech Engine
Supports English and Mandarin Chinese with Google Cloud TTS and offline alternatives.
"""
import json
import logging
import os
import subprocess
import tempfile
from pathlib import Path
from typing import Optional, List

try:
    from google.cloud import texttospeech
    HAS_GOOGLE_CLOUD = True
except ImportError:
    HAS_GOOGLE_CLOUD = False

try:
    import pygame
    HAS_PYGAME = True
except ImportError:
    HAS_PYGAME = False
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class TTSEngine:
    """
    Bilingual TTS engine supporting English and Mandarin Chinese.

    Speech is synthesized with Google Cloud Text-to-Speech when a client
    could be created, and with the ``espeak`` command-line tool otherwise.
    Playback goes through ``pygame.mixer.music``.
    """

    def __init__(self, config_path: str = "config.json"):
        """
        Load configuration and set up the synthesis and playback backends.

        Args:
            config_path: Path to a JSON configuration file. A missing or
                malformed file falls back to built-in defaults.
        """
        self.config = self._load_config(config_path)

        # TTS configuration
        tts_config = self.config.get("tts", {})
        self.english_voice = tts_config.get("english_voice", "en-US-Standard-A")
        self.chinese_voice = tts_config.get("chinese_voice", "zh-CN-Standard-A")
        self.speed = tts_config.get("speed", 1.0)
        self.pitch = tts_config.get("pitch", 0)

        # Initialize Google Cloud client if available.
        # NOTE(review): the cloud client is gated on the "openclaw.enabled"
        # setting rather than on a key under "tts" - confirm this is intended.
        self.client = None
        if HAS_GOOGLE_CLOUD and self.config.get("openclaw", {}).get("enabled", True):
            try:
                self.client = texttospeech.TextToSpeechClient()
                logger.info("Google Cloud TTS initialized")
            except Exception as e:
                logger.warning(f"Google Cloud TTS not available: {e}")

        # Initialize audio output. A machine without a usable audio device
        # must not prevent the engine from being constructed; playback will
        # simply report failure later.
        if HAS_PYGAME:
            try:
                pygame.mixer.init()
            except pygame.error as e:
                logger.warning(f"Audio output not available: {e}")

        logger.info("TTSEngine initialized")

    def _load_config(self, config_path: str) -> dict:
        """
        Load configuration from a JSON file.

        Args:
            config_path: Path to the configuration file.

        Returns:
            The parsed configuration, or ``{"tts": {}}`` when the file is
            missing or does not contain valid JSON.
        """
        try:
            # Read as UTF-8 explicitly so non-ASCII values do not depend on
            # the system locale.
            with open(config_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            return {"tts": {}}
        except json.JSONDecodeError as e:
            logger.warning(f"Invalid JSON in config file {config_path}: {e}")
            return {"tts": {}}

    def speak(self, text: str, language: str = "en") -> bool:
        """
        Speak text in the specified language.

        Args:
            text: Text to speak
            language: 'en' for English, 'zh' for Chinese

        Returns:
            True if speech succeeded
        """
        # Nothing to say: skip the synthesis backends entirely.
        if not text or not text.strip():
            return False

        try:
            # Generate speech audio
            audio_data = self._synthesize(text, language)
            if audio_data:
                # Play audio
                return self._play_audio(audio_data)
            return False
        except Exception as e:
            logger.error(f"TTS error: {e}")
            return False

    def _synthesize(self, text: str, language: str) -> Optional[bytes]:
        """
        Synthesize speech from text.

        Uses Google Cloud TTS when a client is available, otherwise the
        offline fallback.

        Args:
            text: Text to synthesize
            language: Language code

        Returns:
            Audio data or None
        """
        if self.client and HAS_GOOGLE_CLOUD:
            return self._google_synthesize(text, language)
        return self._offline_synthesize(text, language)

    def _google_synthesize(self, text: str, language: str) -> Optional[bytes]:
        """
        Synthesize speech with Google Cloud TTS.

        Args:
            text: Text to synthesize
            language: 'zh' selects the Chinese voice; anything else selects
                the English voice

        Returns:
            MP3-encoded audio bytes, or None when no client is configured
        """
        if not self.client:
            return None

        # Select voice based on language
        if language == "zh":
            voice_name = self.chinese_voice
            lang_code = "zh-CN"
        else:
            voice_name = self.english_voice
            lang_code = "en-US"

        # Configure synthesis
        voice = texttospeech.VoiceSelectionParams(
            language_code=lang_code,
            name=voice_name,
        )
        audio_config = texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3,
            speaking_rate=self.speed,
            pitch=self.pitch,
        )
        synthesis_input = texttospeech.SynthesisInput(text=text)

        # Perform synthesis
        response = self.client.synthesize_speech(
            request=texttospeech.SynthesizeSpeechRequest(
                input=synthesis_input,
                voice=voice,
                audio_config=audio_config,
            )
        )
        return response.audio_content

    def _offline_synthesize(self, text: str, language: str) -> Optional[bytes]:
        """
        Offline TTS fallback using the ``espeak`` command-line tool.

        The text is passed to espeak as a single argument (no shell), so
        quotes or shell metacharacters in the text cannot be interpreted as
        commands. The audio is read directly from espeak's standard output.

        Args:
            text: Text to synthesize
            language: 'zh' selects espeak's ``zh`` voice; anything else uses
                the default voice

        Returns:
            Audio bytes written by espeak, or None if espeak is unavailable,
            fails, or produces no output
        """
        logger.warning("Using offline TTS (limited quality)")

        command = ["espeak"]
        if language == "zh":
            # Chinese TTS (if available)
            command += ["-v", "zh"]
        # "--" ends option parsing so text beginning with "-" is still
        # treated as the text to speak.
        command += ["--stdout", "--", text]

        try:
            result = subprocess.run(
                command,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False,
            )
        except (OSError, ValueError) as e:
            # OSError: espeak missing or not executable.
            # ValueError: text cannot be passed as an argument (e.g. NUL byte).
            logger.error(f"Offline TTS failed: {e}")
            return None

        if result.returncode != 0:
            details = result.stderr.decode("utf-8", errors="replace").strip()
            logger.error(
                f"Offline TTS failed: espeak exited with {result.returncode}: {details}"
            )
            return None

        return result.stdout or None

    def _play_audio(self, audio_data: bytes) -> bool:
        """
        Play audio data and block until playback finishes.

        Args:
            audio_data: Audio bytes (MP3 or WAV)

        Returns:
            True if playback succeeded
        """
        if not HAS_PYGAME:
            logger.warning("Pygame not available for audio playback")
            return False

        # Pick the extension from the data itself: the mixer backend may use
        # the file extension as a format hint, and the offline path yields
        # WAV (RIFF) data rather than MP3.
        suffix = ".wav" if audio_data[:4] == b"RIFF" else ".mp3"

        temp_path = None
        try:
            # Unique temp file instead of a fixed, predictable /tmp path.
            with tempfile.NamedTemporaryFile(
                prefix="tts_audio_", suffix=suffix, delete=False
            ) as f:
                temp_path = f.name
                f.write(audio_data)

            # Load and play
            pygame.mixer.music.load(temp_path)
            pygame.mixer.music.play()

            # Wait for completion
            while pygame.mixer.music.get_busy():
                pygame.time.wait(100)
            return True
        except Exception as e:
            logger.error(f"Audio playback error: {e}")
            return False
        finally:
            # Best-effort cleanup; a leftover temp file is not an error.
            if temp_path is not None:
                try:
                    os.remove(temp_path)
                except OSError as e:
                    logger.debug(f"Could not remove temp audio file {temp_path}: {e}")

    def speak_sync(self, text: str, language: str = "en",
                   on_complete=None) -> bool:
        """
        Synchronous speech with optional callback.

        Args:
            text: Text to speak
            language: Language code
            on_complete: Callback function when done; it is called with the
                boolean result of the speech attempt

        Returns:
            True if speech succeeded
        """
        result = self.speak(text, language)
        if on_complete:
            on_complete(result)
        return result

    def get_voices(self) -> List[dict]:
        """
        Get list of available voices.

        Returns:
            One dict per Google Cloud voice with the keys "name", "language"
            and "gender". Empty when no cloud client is available or the
            listing request fails.
        """
        voices = []
        if not (self.client and HAS_GOOGLE_CLOUD):
            return voices

        try:
            response = self.client.list_voices()
            for voice in response.voices:
                voices.append({
                    "name": voice.name,
                    "language": voice.language_codes,
                    "gender": voice.ssml_gender,
                })
        except Exception as e:
            logger.error(f"Error listing voices: {e}")
        return voices
def main():
    """Run a manual smoke test: speak one phrase per language, then list voices."""
    engine = TTSEngine()

    # (banner printed first, phrase spoken, language code)
    samples = (
        ("Testing English TTS...", "Hello! I am your voice assistant.", "en"),
        ("Testing Chinese TTS...", "你好!我是你的语音助手。", "zh"),
    )
    for banner, phrase, lang in samples:
        print(banner)
        engine.speak(phrase, lang)

    # Report how many voices are available and show at most the first five.
    available = engine.get_voices()
    print(f"\nAvailable voices: {len(available)}")
    for entry in available[:5]:
        languages = ', '.join(entry['language'])
        print(f" - {entry['name']} ({languages})")
# Script entry point: runs only when this file is executed directly.
if __name__ == "__main__":
    # Show INFO-level log messages, then run the manual test routine.
    logging.basicConfig(level=logging.INFO)
    main()