Features:
- Bilingual support (English/Mandarin Chinese)
- Hotword detection: 'Hey Osiris' / '你好 Osiris'
- Music playback control (MP3, WAV, OGG, FLAC)
- OpenClaw integration for AI responses
- Google AIY Voice Kit V1 compatible
- Text-to-speech in both languages
- Voice command recognition
- Raspberry Pi ready with installation script
AI Now Inc - Del Mar Demo Unit 🏭
208 lines · 6.2 KiB · Python · Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Bilingual Speech Recognizer
|
|
Supports English and Mandarin Chinese with automatic language detection.
|
|
"""
|
|
|
|
import os
|
|
import json
|
|
import logging
|
|
from typing import Optional, Tuple
|
|
from pathlib import Path
|
|
|
|
try:
|
|
import aiy.voice
|
|
from aiy import speech
|
|
HAS_AIY = True
|
|
except ImportError:
|
|
HAS_AIY = False
|
|
|
|
try:
|
|
from google.cloud import speech as speech_service
|
|
HAS_GOOGLE_CLOUD = True
|
|
except ImportError:
|
|
HAS_GOOGLE_CLOUD = False
|
|
|
|
try:
|
|
from langdetect import detect
|
|
HAS_LANG_DETECT = True
|
|
except ImportError:
|
|
HAS_LANG_DETECT = False
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class BilingualSpeechRecognizer:
    """Speech recognizer with automatic English/Mandarin detection.

    Recognition is attempted with Google Cloud Speech-to-Text first (when the
    client library is importable and the config does not enable
    ``speech.offline_mode``), then with the AIY Voice Kit recognizer (when the
    ``aiy`` package is importable). The language of the recognized text is
    classified afterwards by :meth:`_detect_language`.
    """

    # Share of characters that must be CJK unified ideographs
    # (U+4E00..U+9FFF) for a text to be classified as Chinese outright.
    _CHINESE_RATIO_THRESHOLD = 0.3

    def __init__(self, config_path: str = "config.json"):
        """Load the configuration and create the AIY recognizer if available.

        Args:
            config_path: Path to a JSON configuration file. Built-in defaults
                are used when the file is missing or unusable.
        """
        self.config = self._load_config(config_path)
        # Not read anywhere in this class; kept because it is a public
        # attribute that other code may rely on.
        self.language_cache = {}
        self.aiy_recognizer = speech.Recognizer() if HAS_AIY else None

        logger.info("BilingualSpeechRecognizer initialized")

    def _load_config(self, config_path: str) -> dict:
        """Load configuration from a JSON file.

        Args:
            config_path: Path to the JSON configuration file.

        Returns:
            The parsed configuration dict, or the built-in defaults when the
            file is missing, is not valid JSON, or does not contain a JSON
            object at the top level.
        """
        defaults = {
            "speech": {
                "language": "auto",
                "recognition_timeout": 5,
            }
        }

        try:
            # JSON is UTF-8 by specification; do not depend on the locale.
            with open(config_path, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except FileNotFoundError:
            logger.warning(f"Config file {config_path} not found, using defaults")
            return defaults
        except json.JSONDecodeError as e:
            logger.error(
                f"Config file {config_path} is not valid JSON ({e}), using defaults"
            )
            return defaults

        # The rest of the class calls ``self.config.get(...)``, so anything
        # other than a JSON object would fail later with AttributeError.
        if not isinstance(loaded, dict):
            logger.error(
                f"Config file {config_path} must contain a JSON object, using defaults"
            )
            return defaults
        return loaded

    def recognize(self, audio_data: bytes, timeout: Optional[int] = None) -> Tuple[Optional[str], str]:
        """Recognize speech from audio data.

        Args:
            audio_data: Raw audio bytes.
            timeout: Recognition timeout in seconds. When omitted, the
                ``speech.recognition_timeout`` config value (default 5) is
                used. It is forwarded to the Google Cloud request.

        Returns:
            Tuple of ``(recognized_text, detected_language)``. When no backend
            produces text the result is ``(None, "unknown")``.
        """
        speech_cfg = self.config.get("speech", {})
        if timeout is None:
            timeout = speech_cfg.get("recognition_timeout", 5)

        # Try Google Cloud Speech first, unless offline mode is requested.
        # Truthiness is used so that 0/null in the config mean "online".
        if not speech_cfg.get("offline_mode", False) and HAS_GOOGLE_CLOUD:
            try:
                text = self._google_cloud_recognize(audio_data, timeout=timeout)
            except Exception as e:
                # Best effort: any cloud failure falls through to AIY.
                logger.warning(f"Google Cloud recognition failed: {e}")
            else:
                if text:
                    return text, self._detect_language(text)

        # Fall back to AIY/local recognition.
        if self.aiy_recognizer:
            try:
                text = self._aiy_recognize(audio_data)
            except Exception as e:
                logger.warning(f"AIY recognition failed: {e}")
            else:
                if text:
                    return text, self._detect_language(text)

        # No backend produced a transcript.
        return None, "unknown"

    def _google_cloud_recognize(
        self, audio_data: bytes, timeout: Optional[float] = None
    ) -> Optional[str]:
        """Use Google Cloud Speech-to-Text for recognition.

        Args:
            audio_data: Audio content sent as LINEAR16 at 16 kHz.
            timeout: Optional request timeout in seconds. When ``None`` the
                client library's own default is left in place.

        Returns:
            The transcript of the first alternative of the first result, or
            ``None`` when the library is unavailable or nothing was recognized.

        Raises:
            Exception: Whatever the client library raises (credentials,
                network, invalid request). ``recognize`` catches these.
        """
        if not HAS_GOOGLE_CLOUD:
            return None

        client = speech_service.SpeechClient()

        # The v1 RecognitionConfig takes ONE primary ``language_code`` plus
        # optional ``alternative_language_codes``; it has no
        # ``language_codes`` field, so passing one makes the constructor fail.
        config = speech_service.RecognitionConfig(
            encoding=speech_service.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            language_code="en-US",
            alternative_language_codes=["zh-CN", "zh-TW"],
            enable_automatic_punctuation=True,
        )

        # Only pass ``timeout`` when one was given, so that ``None`` does not
        # override the client's default timeout.
        call_kwargs = {} if timeout is None else {"timeout": timeout}
        response = client.recognize(
            config=config,
            audio=speech_service.RecognitionAudio(content=audio_data),
            **call_kwargs,
        )

        if response.results:
            result = response.results[0]
            if result.alternatives:
                return result.alternatives[0].transcript

        return None

    def _aiy_recognize(self, audio_data: bytes) -> Optional[str]:
        """Use AIY Voice Kit for recognition (placeholder).

        Args:
            audio_data: Raw audio bytes (currently unused).

        Returns:
            Always ``None``: no AIY recognition is implemented here yet.
        """
        if not self.aiy_recognizer:
            return None

        # TODO: implement against the installed AIY version. Until then this
        # backend never yields a transcript.
        return None

    def _detect_language(self, text: str) -> str:
        """Classify text as English or Chinese.

        Args:
            text: The text to classify.

        Returns:
            ``'unknown'`` for empty text; ``'zh'`` when more than 30% of the
            characters are CJK ideographs or ``langdetect`` reports Chinese;
            ``'en'`` in every other case (English is the default).
        """
        if not text:
            return "unknown"

        # Simple heuristic: share of characters in the CJK ideograph block.
        chinese_chars = sum(1 for ch in text if "\u4e00" <= ch <= "\u9fff")
        if chinese_chars > len(text) * self._CHINESE_RATIO_THRESHOLD:
            return "zh"

        # Use langdetect if available.
        if HAS_LANG_DETECT:
            try:
                detected = detect(text)
            except Exception as e:
                # Detection is best effort; fall through to the default.
                # ``Exception`` (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate.
                logger.debug(f"Language detection failed: {e}")
            else:
                if detected in ("zh-cn", "zh-tw", "zh"):
                    return "zh"
                if detected in ("en", "en-us", "en-gb"):
                    return "en"

        # Default to English.
        return "en"

    def listen_for_hotword(self, callback) -> None:
        """Listen for hotword activation (placeholder).

        Args:
            callback: Function to call when the hotword is detected. It is
                not invoked yet because detection is not implemented.
        """
        if not HAS_AIY:
            logger.warning("AIY not available, hotword detection disabled")
            return

        # TODO: actual hotword detection depends on the AIY version; for now
        # this only logs that detection would be enabled.
        logger.info("Hotword detection enabled")
|
|
|
|
|
|
def main():
    """Print the detected language for a few English and Chinese phrases."""
    samples = (
        "Hello, how are you?",
        "你好,你好吗?",
        "Play some music",
        "播放音乐",
    )

    # Only language detection is exercised; no audio is recognized here.
    classify = BilingualSpeechRecognizer()._detect_language
    for text in samples:
        lang = classify(text)
        print(f"'{text}' -> Language: {lang}")
|
|
|
|
|
|
# Script entry point: enable INFO-level logging, then run the demo.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    main()
|