# openclaw-voice-assistant/tts_engine.py

#!/usr/bin/env python3
"""
Text-to-Speech Engine
Supports English and Mandarin Chinese with Google Cloud TTS and offline alternatives.
"""

import json
import logging
import os
import subprocess
import tempfile
from pathlib import Path
from typing import Optional, List

try:
    from google.cloud import texttospeech
    HAS_GOOGLE_CLOUD = True
except ImportError:
    HAS_GOOGLE_CLOUD = False

try:
    import pygame
    HAS_PYGAME = True
except ImportError:
    HAS_PYGAME = False

logger = logging.getLogger(__name__)


class TTSEngine:
    """
    Bilingual TTS engine supporting English and Mandarin Chinese.

    Speech is synthesized with Google Cloud TTS when a client could be
    created, and with the ``espeak`` command-line tool otherwise (or when the
    cloud request fails). Playback goes through ``pygame.mixer`` when pygame
    is installed.
    """

    # Upper bound, in seconds, for a single offline ``espeak`` invocation.
    _ESPEAK_TIMEOUT = 30

    def __init__(self, config_path: str = "config.json"):
        """
        Load the configuration and set up synthesis and playback backends.

        Args:
            config_path: Path to a JSON configuration file. A missing file
                results in default settings.

        Raises:
            ValueError: If the configuration file is not a JSON object
                (this includes malformed JSON).
        """
        self.config = self._load_config(config_path)

        # TTS configuration ("or {}" tolerates an explicit null section)
        tts_config = self.config.get("tts") or {}
        self.english_voice = tts_config.get("english_voice", "en-US-Standard-A")
        self.chinese_voice = tts_config.get("chinese_voice", "zh-CN-Standard-A")
        self.speed = tts_config.get("speed", 1.0)
        self.pitch = tts_config.get("pitch", 0)

        # Initialize Google Cloud client if available
        self.client = None
        openclaw_config = self.config.get("openclaw") or {}
        if HAS_GOOGLE_CLOUD and openclaw_config.get("enabled", True):
            try:
                self.client = texttospeech.TextToSpeechClient()
                logger.info("Google Cloud TTS initialized")
            except Exception as e:
                logger.warning(f"Google Cloud TTS not available: {e}")

        # Initialize audio output. A machine without an audio device must not
        # make the whole engine unusable, so a failure is only logged;
        # _play_audio() retries the initialization on demand.
        if HAS_PYGAME:
            try:
                pygame.mixer.init()
            except pygame.error as e:
                logger.warning(f"Audio output not available: {e}")

        logger.info("TTSEngine initialized")

    def _load_config(self, config_path: str) -> dict:
        """
        Load the JSON configuration.

        Args:
            config_path: Path to the configuration file.

        Returns:
            The parsed configuration, or ``{"tts": {}}`` when the file does
            not exist.

        Raises:
            ValueError: If the file is malformed JSON (``json.JSONDecodeError``
                is a ``ValueError``) or its top level is not a JSON object.
        """
        try:
            # JSON is UTF-8 by specification; do not depend on the locale.
            with open(config_path, 'r', encoding="utf-8") as f:
                loaded = json.load(f)
        except FileNotFoundError:
            return {"tts": {}}

        if not isinstance(loaded, dict):
            raise ValueError(
                f"Configuration root must be a JSON object: {config_path}"
            )
        return loaded

    @staticmethod
    def _is_chinese(language: str) -> bool:
        """Return True when *language* denotes Chinese (``zh``, ``zh-CN``, ``ZH_tw``...)."""
        normalized = (language or "").strip().lower().replace("_", "-")
        return normalized.split("-")[0] == "zh"

    @staticmethod
    def _audio_suffix(audio_data: bytes) -> str:
        """Return ``.wav`` for RIFF/WAVE data and ``.mp3`` for everything else."""
        if audio_data[:4] == b"RIFF" and audio_data[8:12] == b"WAVE":
            return ".wav"
        return ".mp3"

    def speak(self, text: str, language: str = "en") -> bool:
        """
        Speak text in the specified language.

        Args:
            text: Text to speak
            language: 'en' for English, 'zh' (or a 'zh-*' tag) for Chinese

        Returns:
            True if speech succeeded; False for empty text or on any error
        """
        # Nothing to say: skip the backends rather than sending them an
        # empty request.
        if not text or not text.strip():
            return False

        try:
            audio_data = self._synthesize(text, language)
            if not audio_data:
                return False
            return self._play_audio(audio_data)
        except Exception as e:
            logger.error(f"TTS error: {e}")
            return False

    def _synthesize(self, text: str, language: str) -> Optional[bytes]:
        """
        Synthesize speech from text.

        Google Cloud TTS is tried first when a client exists; if the request
        raises or yields no audio, the offline engine is used instead.

        Args:
            text: Text to synthesize
            language: Language code

        Returns:
            Audio data or None
        """
        if self.client and HAS_GOOGLE_CLOUD:
            try:
                audio = self._google_synthesize(text, language)
            except Exception as e:
                logger.warning(
                    f"Google Cloud TTS failed, falling back to offline TTS: {e}"
                )
            else:
                if audio:
                    return audio

        return self._offline_synthesize(text, language)

    def _google_synthesize(self, text: str, language: str) -> Optional[bytes]:
        """
        Use Google Cloud TTS.

        Args:
            text: Text to synthesize
            language: Language code; Chinese tags select the Chinese voice,
                anything else selects the English voice

        Returns:
            MP3-encoded audio, or None when no client is available
        """
        if not self.client:
            return None

        # Select voice based on language
        if self._is_chinese(language):
            voice_name = self.chinese_voice
            lang_code = "zh-CN"
        else:
            voice_name = self.english_voice
            lang_code = "en-US"

        # Configure synthesis
        voice = texttospeech.VoiceSelectionParams(
            language_code=lang_code,
            name=voice_name,
        )

        audio_config = texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3,
            speaking_rate=self.speed,
            pitch=self.pitch,
        )

        synthesis_input = texttospeech.SynthesisInput(text=text)

        # Perform synthesis
        response = self.client.synthesize_speech(
            request=texttospeech.SynthesizeSpeechRequest(
                input=synthesis_input,
                voice=voice,
                audio_config=audio_config,
            )
        )

        return response.audio_content

    def _offline_synthesize(self, text: str, language: str) -> Optional[bytes]:
        """
        Offline TTS fallback using the ``espeak`` command-line tool.

        The text is passed to espeak on stdin and the WAV output is read from
        its stdout. No shell is involved, so the text cannot inject commands
        or options, and no shared temporary file can serve stale audio.

        Args:
            text: Text to synthesize
            language: Language code; Chinese tags select espeak's ``zh`` voice

        Returns:
            WAV audio data, or None if espeak is missing, fails or times out
        """
        logger.warning("Using offline TTS (limited quality)")

        command = ["espeak"]
        if self._is_chinese(language):
            command += ["-v", "zh"]
        command += ["--stdin", "--stdout"]

        try:
            completed = subprocess.run(
                command,
                input=text.encode("utf-8"),
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                timeout=self._ESPEAK_TIMEOUT,
                check=False,
            )
        except (OSError, subprocess.SubprocessError) as e:
            # OSError covers a missing espeak binary; SubprocessError the timeout.
            logger.error(f"Offline TTS failed: {e}")
            return None

        if completed.returncode != 0:
            details = completed.stderr.decode("utf-8", errors="replace").strip()
            logger.error(
                f"Offline TTS failed: espeak exited with status "
                f"{completed.returncode}: {details}"
            )
            return None

        return completed.stdout or None

    def _play_audio(self, audio_data: bytes) -> bool:
        """
        Play audio data and block until playback has finished.

        The data is written to a private temporary file (removed afterwards)
        because pygame loads music from a file.

        Args:
            audio_data: Audio bytes (MP3 or WAV)

        Returns:
            True if playback succeeded
        """
        if not audio_data:
            return False

        if not HAS_PYGAME:
            logger.warning("Pygame not available for audio playback")
            return False

        temp_path = None
        try:
            # The mixer may not be up if initialization failed at construction.
            if not pygame.mixer.get_init():
                pygame.mixer.init()

            # Save to a uniquely named temp file with a matching suffix
            fd, temp_path = tempfile.mkstemp(
                prefix="tts_audio_", suffix=self._audio_suffix(audio_data)
            )
            with os.fdopen(fd, 'wb') as f:
                f.write(audio_data)

            # Load and play
            pygame.mixer.music.load(temp_path)
            pygame.mixer.music.play()

            # Wait for completion
            while pygame.mixer.music.get_busy():
                pygame.time.wait(100)

            return True

        except Exception as e:
            logger.error(f"Audio playback error: {e}")
            return False

        finally:
            if temp_path is not None:
                try:
                    # Release the file handle first where pygame supports it
                    # (needed on Windows before the file can be deleted).
                    if hasattr(pygame.mixer.music, "unload"):
                        pygame.mixer.music.unload()
                except pygame.error:
                    # Mixer is not running, so there is nothing to release.
                    pass
                try:
                    os.remove(temp_path)
                except OSError as e:
                    logger.debug(f"Could not remove temporary audio file: {e}")

    def speak_sync(self, text: str, language: str = "en",
                   on_complete=None) -> bool:
        """
        Synchronous speech with optional callback.

        Args:
            text: Text to speak
            language: Language code
            on_complete: Callback invoked with the boolean result when done

        Returns:
            True if speech succeeded
        """
        result = self.speak(text, language)

        if on_complete:
            on_complete(result)

        return result

    def get_voices(self) -> List[dict]:
        """
        Get list of available voices.

        Returns:
            One dict per Google Cloud voice with the keys ``name``,
            ``language`` (list of language codes) and ``gender``; an empty
            list when no client is available or the request fails.
        """
        if not (self.client and HAS_GOOGLE_CLOUD):
            return []

        try:
            response = self.client.list_voices()
            return [
                {
                    "name": voice.name,
                    "language": list(voice.language_codes),
                    "gender": voice.ssml_gender,
                }
                for voice in response.voices
            ]
        except Exception as e:
            logger.error(f"Error listing voices: {e}")
            return []


def main():
    """Run a manual smoke test: speak one sample per language, then list voices."""
    engine = TTSEngine()

    # (banner printed before speaking, sentence to speak, language code)
    samples = (
        ("Testing English TTS...", "Hello! I am your voice assistant.", "en"),
        ("Testing Chinese TTS...", "你好！我是你的语音助手。", "zh"),
    )
    for banner, sentence, lang in samples:
        print(banner)
        engine.speak(sentence, lang)

    # Summarize the voice catalogue, showing at most the first five entries
    available = engine.get_voices()
    print(f"\nAvailable voices: {len(available)}")
    for entry in available[:5]:
        languages = ', '.join(entry['language'])
        print(f"  - {entry['name']} ({languages})")


# Script entry point: configure INFO-level logging so the engine's status
# messages are shown, then run the manual smoke test.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    main()