openclaw-voice-assistant/speech_recognizer.py
Claw - AI Now Inc 1662bc141a Initial commit: Bilingual Voice Assistant for Google AIY Voice Kit V1
Features:
- Bilingual support (English/Mandarin Chinese)
- Hotword detection: 'Hey Osiris' / '你好 Osiris'
- Music playback control (MP3, WAV, OGG, FLAC)
- OpenClaw integration for AI responses
- Google AIY Voice Kit V1 compatible
- Text-to-speech in both languages
- Voice command recognition
- Raspberry Pi ready with installation script

AI Now Inc - Del Mar Demo Unit 🏭
2026-03-01 00:02:49 -08:00

208 lines
6.2 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Bilingual Speech Recognizer
Supports English and Mandarin Chinese with automatic language detection.
"""
import os
import json
import logging
from typing import Optional, Tuple
from pathlib import Path
# Optional backends. Each import is attempted once at module load and the
# outcome is recorded in a module-level flag that the recognizer consults at
# call time, so a missing package disables a backend instead of crashing.
try:
    import aiy.voice
    from aiy import speech
except ImportError:
    HAS_AIY = False
else:
    HAS_AIY = True

try:
    from google.cloud import speech as speech_service
except ImportError:
    HAS_GOOGLE_CLOUD = False
else:
    HAS_GOOGLE_CLOUD = True

try:
    from langdetect import detect
except ImportError:
    HAS_LANG_DETECT = False
else:
    HAS_LANG_DETECT = True

logger = logging.getLogger(__name__)
class BilingualSpeechRecognizer:
    """
    Speech recognizer with automatic English/Mandarin detection.

    Recognition is attempted with Google Cloud Speech-to-Text first (when the
    client library imported successfully and ``speech.offline_mode`` is not
    enabled in the configuration) and then with the AIY recognizer, if one
    was created at construction time.
    """

    def __init__(self, config_path: str = "config.json"):
        """
        Load the configuration and create the AIY recognizer when available.

        Args:
            config_path: Path to the JSON configuration file.
        """
        self.config = self._load_config(config_path)
        # Not read by any method of this class; kept so the attribute
        # remains available to external code that may rely on it.
        self.language_cache = {}
        if HAS_AIY:
            self.aiy_recognizer = speech.Recognizer()
        else:
            self.aiy_recognizer = None
        logger.info("BilingualSpeechRecognizer initialized")

    def _load_config(self, config_path: str) -> dict:
        """
        Load configuration from JSON file.

        Args:
            config_path: Path to the JSON configuration file.

        Returns:
            The parsed configuration, or built-in defaults when the file
            does not exist.

        Raises:
            json.JSONDecodeError: If the file exists but is not valid JSON.
        """
        try:
            # JSON is read as UTF-8 explicitly so non-ASCII (e.g. Chinese)
            # values load the same way regardless of the system locale.
            with open(config_path, 'r', encoding="utf-8") as f:
                return json.load(f)
        except FileNotFoundError:
            logger.warning(f"Config file {config_path} not found, using defaults")
            return {
                "speech": {
                    "language": "auto",
                    "recognition_timeout": 5
                }
            }

    def recognize(self, audio_data: bytes, timeout: Optional[int] = None) -> Tuple[Optional[str], str]:
        """
        Recognize speech from audio data.

        Args:
            audio_data: Raw audio bytes
            timeout: Recognition timeout in seconds. Defaults to the
                ``speech.recognition_timeout`` config value (5 if unset).
                It is forwarded to the Google Cloud request only.

        Returns:
            Tuple of (recognized_text, detected_language). When no backend
            produced a transcript the result is ``(None, "unknown")``.
        """
        speech_config = self.config.get("speech", {})
        if timeout is None:
            timeout = speech_config.get("recognition_timeout", 5)

        # Try Google Cloud Speech first (if available). Any falsy
        # offline_mode value (False, null, 0, missing) means "online".
        if HAS_GOOGLE_CLOUD and not speech_config.get("offline_mode", False):
            try:
                text = self._google_cloud_recognize(audio_data, timeout=timeout)
                if text:
                    return text, self._detect_language(text)
            except Exception as e:
                # Best effort: a cloud failure must not prevent the fallback.
                logger.warning(f"Google Cloud recognition failed: {e}")

        # Fall back to AIY/local recognition
        if self.aiy_recognizer:
            try:
                text = self._aiy_recognize(audio_data)
                if text:
                    return text, self._detect_language(text)
            except Exception as e:
                logger.warning(f"AIY recognition failed: {e}")

        # No backend produced a transcript.
        return None, "unknown"

    def _google_cloud_recognize(self, audio_data: bytes, timeout: Optional[float] = None) -> Optional[str]:
        """
        Use Google Cloud Speech-to-Text for recognition.

        Args:
            audio_data: Audio bytes sent as LINEAR16 at 16 kHz (these values
                are fixed in the request config below).
            timeout: Optional request timeout in seconds; when None the
                client library's default is used.

        Returns:
            The top transcript of the first result, or None when the client
            library is unavailable or the response has no results.
        """
        if not HAS_GOOGLE_CLOUD:
            return None

        client = speech_service.SpeechClient()
        # The v1 RecognitionConfig takes one primary ``language_code`` plus
        # ``alternative_language_codes``; ``language_codes`` is not a v1 field.
        config = speech_service.RecognitionConfig(
            encoding=speech_service.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            language_code="en-US",
            alternative_language_codes=["zh-CN", "zh-TW"],
            enable_automatic_punctuation=True,
        )
        # Only pass a timeout when one was given so the library default
        # still applies otherwise.
        call_kwargs = {}
        if timeout is not None:
            call_kwargs["timeout"] = timeout
        response = client.recognize(
            config=config,
            audio=speech_service.RecognitionAudio(content=audio_data),
            **call_kwargs
        )

        if response.results:
            result = response.results[0]
            if result.alternatives:
                return result.alternatives[0].transcript
        return None

    def _aiy_recognize(self, audio_data: bytes) -> Optional[str]:
        """
        Use AIY Voice Kit for recognition.

        Placeholder: no AIY call is made yet, so this always returns None.

        Args:
            audio_data: Raw audio bytes (currently unused).

        Returns:
            Always None until an AIY-specific implementation is added.
        """
        if not self.aiy_recognizer:
            return None
        # TODO: implement using self.aiy_recognizer; the actual call depends
        # on the installed AIY version.
        return None

    def _detect_language(self, text: str) -> str:
        """
        Detect if text is English or Chinese.

        Args:
            text: Recognized text to classify.

        Returns:
            'zh' for Chinese, 'en' for English (also the default when the
            language cannot be determined), 'unknown' for empty or
            whitespace-only text.
        """
        if not text or not text.strip():
            return "unknown"

        # Simple heuristic: check for Chinese characters
        # (CJK Unified Ideographs, U+4E00..U+9FFF).
        chinese_chars = sum(1 for char in text if '\u4e00' <= char <= '\u9fff')
        if chinese_chars > len(text) * 0.3:  # 30% Chinese characters
            return "zh"

        # Use langdetect if available
        if HAS_LANG_DETECT:
            try:
                detected = detect(text)
            except Exception as e:
                # Detection is best effort; fall through to the default
                # without trapping KeyboardInterrupt/SystemExit.
                logger.debug("Language detection failed: %s", e)
            else:
                if detected in ("zh-cn", "zh-tw", "zh"):
                    return "zh"
                if detected in ("en", "en-us", "en-gb"):
                    return "en"

        # Default to English
        return "en"

    def listen_for_hotword(self, callback) -> None:
        """
        Listen for hotword activation.

        Placeholder: the callback is not invoked yet; the method only logs
        whether hotword detection would be enabled.

        Args:
            callback: Function to call when hotword detected
        """
        if not HAS_AIY:
            logger.warning("AIY not available, hotword detection disabled")
            return
        # Implementation depends on AIY version
        # This is a placeholder for the actual hotword detection
        logger.info("Hotword detection enabled")
def main():
    """Run a small language-detection smoke test and print each result."""
    recognizer = BilingualSpeechRecognizer()

    # Two English and two Chinese phrases to exercise the detector.
    samples = (
        "Hello, how are you?",
        "你好,你好吗?",
        "Play some music",
        "播放音乐",
    )
    for sample in samples:
        detected = recognizer._detect_language(sample)
        print(f"'{sample}' -> Language: {detected}")
# Script entry point: enable INFO logging, then run the smoke test.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    main()