# openclaw-voice-assistant/speech_recognizer.py

#!/usr/bin/env python3
"""
Bilingual Speech Recognizer
Supports English and Mandarin Chinese with automatic language detection.
"""

import os
import json
import logging
from typing import Optional, Tuple
from pathlib import Path

try:
    import aiy.voice
    from aiy import speech
    HAS_AIY = True
except ImportError:
    HAS_AIY = False

try:
    from google.cloud import speech as speech_service
    HAS_GOOGLE_CLOUD = True
except ImportError:
    HAS_GOOGLE_CLOUD = False

try:
    from langdetect import detect
    HAS_LANG_DETECT = True
except ImportError:
    HAS_LANG_DETECT = False

logger = logging.getLogger(__name__)


class BilingualSpeechRecognizer:
    """
    Speech recognizer with automatic English/Mandarin detection.

    Recognition is attempted with Google Cloud Speech-to-Text first (when the
    client library is importable and ``speech.offline_mode`` is not enabled in
    the configuration) and then with the AIY recognizer, if one was created.
    The language of the resulting transcript is classified by
    ``_detect_language``.
    """

    # Fraction of characters that must be CJK ideographs for a text to be
    # classified as Chinese without consulting langdetect.
    _CHINESE_RATIO_THRESHOLD = 0.3

    def __init__(self, config_path: str = "config.json"):
        """
        Load the configuration and create the AIY recognizer when available.

        Args:
            config_path: Path to the JSON configuration file.
        """
        self.config = self._load_config(config_path)
        # Not read or written by any method of this class; kept so that
        # external code relying on the attribute keeps working.
        self.language_cache = {}

        self.aiy_recognizer = speech.Recognizer() if HAS_AIY else None

        logger.info("BilingualSpeechRecognizer initialized")

    @staticmethod
    def _default_config() -> dict:
        """Return a fresh copy of the built-in default configuration."""
        return {
            "speech": {
                "language": "auto",
                "recognition_timeout": 5
            }
        }

    def _load_config(self, config_path: str) -> dict:
        """
        Load configuration from JSON file.

        Args:
            config_path: Path to the JSON configuration file.

        Returns:
            The parsed configuration, or the built-in defaults when the file
            does not exist.

        Raises:
            json.JSONDecodeError: If the file exists but is not valid JSON.
        """
        try:
            # JSON is UTF-8 by specification; do not depend on the platform's
            # default encoding.
            with open(config_path, "r", encoding="utf-8") as f:
                return json.load(f)
        except FileNotFoundError:
            logger.warning("Config file %s not found, using defaults", config_path)
            return self._default_config()

    def _speech_settings(self) -> dict:
        """
        Return the ``speech`` section of the configuration.

        An empty dict is returned when the section is missing or is not a
        mapping (for example ``"speech": null``), so callers can use ``.get``
        unconditionally.
        """
        section = self.config.get("speech") if isinstance(self.config, dict) else None
        return section if isinstance(section, dict) else {}

    def recognize(self, audio_data: bytes, timeout: Optional[int] = None) -> Tuple[Optional[str], str]:
        """
        Recognize speech from audio data.

        Args:
            audio_data: Raw audio bytes
            timeout: Recognition timeout in seconds. When omitted, the
                ``speech.recognition_timeout`` config value is used (default 5).

        Returns:
            Tuple of (recognized_text, detected_language). When nothing could
            be recognized the result is ``(None, "unknown")``.
        """
        # Nothing to transcribe: skip the backends (and the network round trip).
        if not audio_data:
            return None, "unknown"

        settings = self._speech_settings()
        if timeout is None:
            timeout = settings.get("recognition_timeout", 5)

        # Try Google Cloud Speech first (if available). Truthiness is used
        # rather than ``is False`` so that values such as 0 or null in the
        # config do not silently disable cloud recognition.
        cloud_enabled = not settings.get("offline_mode", False)
        if cloud_enabled and HAS_GOOGLE_CLOUD:
            try:
                text = self._google_cloud_recognize(audio_data, timeout=timeout)
                if text:
                    return text, self._detect_language(text)
            except Exception as e:
                # Deliberately broad: any cloud failure falls through to the
                # local backend instead of aborting recognition.
                logger.warning("Google Cloud recognition failed: %s", e)

        # Fall back to AIY/local recognition
        if self.aiy_recognizer:
            try:
                text = self._aiy_recognize(audio_data)
                if text:
                    return text, self._detect_language(text)
            except Exception as e:
                logger.warning("AIY recognition failed: %s", e)

        # No backend produced a transcript.
        return None, "unknown"

    def _google_cloud_recognize(self, audio_data: bytes, timeout: Optional[float] = None) -> Optional[str]:
        """
        Use Google Cloud Speech-to-Text for recognition.

        Args:
            audio_data: Audio bytes; sent as 16 kHz LINEAR16.
            timeout: Optional request timeout in seconds. When ``None`` the
                client library's default is used.

        Returns:
            The transcript, or ``None`` when the library is unavailable or the
            service returned no usable result.
        """
        if not HAS_GOOGLE_CLOUD:
            return None

        client = speech_service.SpeechClient()

        # Bilingual recognition: the v1 RecognitionConfig takes one primary
        # ``language_code`` plus ``alternative_language_codes``.
        config = speech_service.RecognitionConfig(
            encoding=speech_service.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            language_code="en-US",
            alternative_language_codes=["zh-CN", "zh-TW"],
            enable_automatic_punctuation=True,
        )

        request_kwargs = {}
        if timeout is not None:
            request_kwargs["timeout"] = timeout

        response = client.recognize(
            config=config,
            audio=speech_service.RecognitionAudio(content=audio_data),
            **request_kwargs
        )

        # Each result covers a consecutive portion of the audio; concatenate
        # the top alternative of every portion so long utterances are not
        # truncated to their first segment.
        segments = [
            result.alternatives[0].transcript
            for result in response.results
            if result.alternatives
        ]
        transcript = "".join(segments).strip()
        return transcript or None

    def _aiy_recognize(self, audio_data: bytes) -> Optional[str]:
        """
        Use AIY Voice Kit for recognition.

        NOTE: this is a placeholder. No AIY call is made yet, so the method
        always returns ``None`` regardless of ``audio_data``.
        """
        # TODO: implement against the installed AIY version.
        return None

    def _detect_language(self, text: str) -> str:
        """
        Detect if text is English or Chinese.

        Returns:
            'zh' for Chinese, 'en' for English, and 'unknown' for empty or
            whitespace-only text. Text that is neither recognisably Chinese
            nor English defaults to 'en'.
        """
        if not text or not text.strip():
            return "unknown"

        # Simple heuristic: share of characters in the CJK Unified Ideographs
        # block (U+4E00..U+9FFF).
        chinese_chars = sum(1 for char in text if '\u4e00' <= char <= '\u9fff')
        if chinese_chars > len(text) * self._CHINESE_RATIO_THRESHOLD:
            return "zh"

        # Use langdetect if available
        if HAS_LANG_DETECT:
            try:
                detected = detect(text).lower()
            except Exception as e:
                # Detection is best-effort; fall through to the default, but
                # never swallow KeyboardInterrupt/SystemExit.
                logger.debug("Language detection failed: %s", e)
            else:
                if detected.startswith("zh"):
                    return "zh"
                if detected.startswith("en"):
                    return "en"

        # Default to English
        return "en"

    def listen_for_hotword(self, callback) -> None:
        """
        Listen for hotword activation.

        NOTE: this is a placeholder. It only logs a message; ``callback`` is
        currently never invoked.

        Args:
            callback: Function to call when hotword detected
        """
        if not HAS_AIY:
            logger.warning("AIY not available, hotword detection disabled")
            return

        # Implementation depends on AIY version
        # This is a placeholder for the actual hotword detection
        logger.info("Hotword detection enabled")


def main():
    """Run a quick language-detection smoke test on a few sample phrases."""
    detector = BilingualSpeechRecognizer()

    # Two English and two Chinese phrases to exercise both branches.
    samples = (
        "Hello, how are you?",
        "你好，你好吗？",
        "Play some music",
        "播放音乐",
    )

    for phrase in samples:
        print(f"'{phrase}' -> Language: {detector._detect_language(phrase)}")


# Script entry point: only runs when the file is executed directly, not on import.
if __name__ == "__main__":
    # Show INFO-level log messages on the console during the smoke test.
    logging.basicConfig(level=logging.INFO)
    main()