# openclaw-voice-assistant/assistant.py

#!/usr/bin/env python3
"""
Bilingual Voice Assistant Core
Main logic for processing voice commands and generating responses.
"""

import json
import logging
import os
import random
import re
from datetime import datetime
from pathlib import Path
from typing import Optional, Dict, List, Tuple

from speech_recognizer import BilingualSpeechRecognizer
from music_player import MusicPlayer
from openclaw_client import OpenClawClient

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Emoji prefixes for the different response types, keyed by response category.
# NOTE: several values include a trailing space inside the string literal;
# callers interpolate these values directly into response text.
EMOJIS = {
    'greeting': '👋',
    'music_play': '🎵',
    'music_pause': '⏸️ ',
    'music_resume': '▶️ ',
    'music_stop': '⏹️ ',
    'music_next': '⏭️ ',
    'music_volume': '🔊',
    'time': '🕐',
    'question': '🤔',
    'answer': '💬',
    'error': '⚠️ ',
    'success': '✅',
    'thinking': '💭',
    'openclaw': '🏭',
}


class VoiceAssistant:
    """
    Main assistant class coordinating speech recognition,
    command processing, and responses.

    Recognized text is routed by keyword: music control, time queries and
    greetings are answered locally; everything else is forwarded to OpenClaw.
    English keywords are matched as whole words (so "hi" does not fire on
    "this"), while Chinese keywords are matched as substrings because Chinese
    text has no word separators.
    """

    # Phrases that explicitly address OpenClaw; stripped from the question.
    _CLAW_TRIGGER = re.compile(r"ask claw|问 claw", re.IGNORECASE)

    # First "play" keyword of an utterance; the text after it names the song.
    _PLAY_TRIGGER = re.compile(
        r"(?<![a-z0-9])play(?![a-z0-9])|播放", re.IGNORECASE
    )

    # Remainders after "play" that mean "anything" rather than a song title.
    _GENERIC_SONG_WORDS = frozenset({
        "music", "some music", "the music", "a song", "song", "songs",
        "something", "anything",
        "音乐", "歌", "歌曲", "一首歌",
    })

    def __init__(self, config_path: str = "config.json"):
        """
        Load the configuration and build the collaborating components.

        Args:
            config_path: Path of the JSON configuration file. The same path
                is handed to the recognizer, the music player and the
                OpenClaw client.
        """
        self.config_path = config_path
        self.config = self._load_config(config_path)

        # Initialize components
        self.speech_recognizer = BilingualSpeechRecognizer(config_path)
        self.music_player = MusicPlayer(config_path)
        self.openclaw_client = OpenClawClient(config_path)

        # Command patterns.
        # NOTE: "previous"/"shuffle"/"repeat" (and their Chinese equivalents)
        # are detected as music commands, but _handle_music_command has no
        # branch for them, so they are answered with "Command not recognized".
        self.music_commands = [
            "play", "pause", "resume", "stop", "next", "previous",
            "volume", "shuffle", "repeat"
        ]

        self.chinese_music_commands = [
            "播放", "暂停", "继续", "停止", "下一首", "上一首",
            "音量", "随机", "重复"
        ]

        logger.info("VoiceAssistant initialized")

    def _load_config(self, config_path: str) -> dict:
        """
        Load the JSON configuration.

        Args:
            config_path: Path of the configuration file.

        Returns:
            The parsed JSON content, or an empty dict when the file does not
            exist. Malformed JSON still raises ``json.JSONDecodeError``.
        """
        try:
            # Explicit UTF-8: the locale default encoding may not be able to
            # decode non-ASCII (e.g. Chinese) configuration values.
            with open(config_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            return {}

    @staticmethod
    def _has_keyword(text: str, keyword: str) -> bool:
        """
        Tell whether *keyword* occurs in *text*.

        ASCII keywords must not be directly preceded or followed by an ASCII
        letter or digit, so "play" matches "play jazz" but not "display".
        Keywords containing non-ASCII characters use plain substring matching.
        The comparison is case-insensitive for ASCII keywords.
        """
        if any(ord(ch) > 127 for ch in keyword):
            return keyword in text
        pattern = rf"(?<![a-z0-9]){re.escape(keyword)}(?![a-z0-9])"
        return re.search(pattern, text, re.IGNORECASE) is not None

    def _has_any(self, text: str, keywords) -> bool:
        """Return True if at least one of *keywords* occurs in *text*."""
        return any(self._has_keyword(text, keyword) for keyword in keywords)

    def process_command(self, text: str, language: str = "en") -> Tuple[str, str]:
        """
        Process a voice command and return response.

        Routing order: music command, time query, greeting, explicit
        "ask claw" request, and finally OpenClaw as the default.

        Args:
            text: Recognized text
            language: Detected language ('en' or 'zh')

        Returns:
            Tuple of (response_text, response_language)
        """
        text_lower = text.lower()

        # Music commands
        if self._is_music_command(text_lower, language):
            return self._handle_music_command(text_lower, language)

        # Time query
        if self._has_any(text_lower, ("what time", "time is it", "几点", "时间")):
            return self._get_time(language)

        # Greeting
        if self._has_any(text_lower, ("hello", "hi", "hey", "你好", "您好")):
            return self._get_greeting(language)

        # OpenClaw query: strip the trigger from the ORIGINAL text so the
        # question keeps the user's casing.
        if self._CLAW_TRIGGER.search(text):
            question = self._CLAW_TRIGGER.sub("", text).strip()
            return self._ask_openclaw(question, language)

        # Default: ask OpenClaw
        return self._ask_openclaw(text, language)

    def _is_music_command(self, text: str, language: str) -> bool:
        """
        Check if text is a music command.

        English keywords are consulted when *language* is "en"; the Chinese
        keyword list is used for every other language value.
        """
        if language == "en":
            commands = self.music_commands
        else:
            commands = self.chinese_music_commands
        return self._has_any(text, commands)

    def _handle_music_command(self, text: str, language: str) -> Tuple[str, str]:
        """
        Handle music playback commands.

        Args:
            text: Lower-cased command text.
            language: Detected language; anything other than "en" is
                answered in Chinese.

        Returns:
            Tuple of (response_text, response_language), with the text
            written in the response language.
        """
        resp_lang = "en" if language == "en" else "zh"
        has = self._has_keyword

        def reply(en: str, zh: str) -> Tuple[str, str]:
            # Pick the wording that matches the language tag we return.
            return (en if resp_lang == "en" else zh, resp_lang)

        # Play command
        if has(text, "play") or has(text, "播放"):
            song_name = self._extract_song_name(text)
            if song_name:
                matches = self.music_player.search_tracks(song_name)
                if not matches:
                    return reply("Song not found", "未找到歌曲")
                track = matches[0]
                self.music_player.play(track)
                return reply(f"Playing {track.name}", f"正在播放 {track.name}")

            # No specific song requested: play the first track of the library.
            library = self.music_player.music_library
            if not library:
                return reply("No music available", "没有可播放的音乐")
            first_track = next(iter(library.values()))
            self.music_player.play(first_track)
            return reply(f"{EMOJIS['music_play']} Playing music",
                         f"{EMOJIS['music_play']} 正在播放音乐")

        # Pause
        if has(text, "pause") or has(text, "暂停"):
            self.music_player.pause()
            return reply(f"{EMOJIS['music_pause']} Paused",
                         f"{EMOJIS['music_pause']} 已暂停")

        # Resume
        if has(text, "resume") or has(text, "继续"):
            self.music_player.resume()
            return reply(f"{EMOJIS['music_resume']} Resumed",
                         f"{EMOJIS['music_resume']} 已继续播放")

        # Stop
        if has(text, "stop") or has(text, "停止"):
            self.music_player.stop()
            return reply(f"{EMOJIS['music_stop']} Stopped",
                         f"{EMOJIS['music_stop']} 已停止")

        # Next
        if has(text, "next") or has(text, "下一首"):
            self.music_player.next()
            return reply(f"{EMOJIS['music_next']} Next track",
                         f"{EMOJIS['music_next']} 下一首")

        # Volume: step by 0.1 in the requested direction; with no direction
        # word the volume is left untouched.
        if has(text, "volume") or has(text, "音量"):
            if has(text, "up") or "大" in text:
                self.music_player.set_volume(self.music_player.volume + 0.1)
            elif has(text, "down") or "小" in text:
                self.music_player.set_volume(self.music_player.volume - 0.1)
            return reply(f"{EMOJIS['music_volume']} Volume adjusted",
                         f"{EMOJIS['music_volume']} 音量已调整")

        return reply("Command not recognized", "无法识别的指令")

    def _extract_song_name(self, text: str) -> Optional[str]:
        """
        Extract song name from command.

        The name is whatever follows the first "play" / "播放" keyword.

        Returns:
            The stripped song name, or None when there is no play keyword,
            nothing follows it, or what follows is only a generic word such
            as "music" / "音乐" (meaning "play anything").
        """
        match = self._PLAY_TRIGGER.search(text)
        if match is None:
            return None
        remainder = text[match.end():].strip()
        if not remainder or remainder.lower() in self._GENERIC_SONG_WORDS:
            return None
        return remainder

    def _get_time(self, language: str) -> Tuple[str, str]:
        """
        Get current time response.

        Returns:
            Tuple of (response_text, response_language); Chinese for "zh",
            English for every other language value.
        """
        now = datetime.now()
        emoji = EMOJIS['time']
        if language == "zh":
            # Built from the hour/minute fields instead of strftime: the
            # format string is handed to the platform C library, and literal
            # non-ASCII text in it may not be handled reliably on every
            # platform/locale.
            clock = f"{now.hour:02d}点{now.minute:02d}分"
            return (f"{emoji} 现在时间是 {clock}", "zh")
        return (f"{emoji} The current time is {now.strftime('%I:%M %p')}", "en")

    def _get_greeting(self, language: str) -> Tuple[str, str]:
        """
        Get greeting response.

        Returns:
            A randomly chosen greeting and its language; Chinese for "zh",
            English for every other language value.
        """
        greetings_en = [
            "Hello! How can I help you?",
            "Hi there! What can I do for you?",
            "Hey! Ready to assist you."
        ]
        greetings_zh = [
            "你好！有什么可以帮你的吗？",
            "您好！需要什么帮助？",
            "嗨！随时为您服务。"
        ]

        if language == "zh":
            return (random.choice(greetings_zh), "zh")
        return (random.choice(greetings_en), "en")

    def _ask_openclaw(self, question: str, language: str) -> Tuple[str, str]:
        """
        Send question to OpenClaw and get response.

        Args:
            question: Text forwarded to the OpenClaw client.
            language: Language of the request; passed to the client as the
                preferred language and used for local error messages.

        Returns:
            Tuple of (response_text, response_language). The language is
            switched to "zh" when the answer contains CJK ideographs.
        """
        if not self.openclaw_client.enabled:
            if language == "zh":
                return (f"{EMOJIS['openclaw']} OpenClaw 未启用", "zh")
            return (f"{EMOJIS['openclaw']} OpenClaw is not enabled", "en")

        # Add context about language preference
        context = {"preferred_language": language}

        response = self.openclaw_client.send_request(question, context)

        # A missing or non-mapping reply is treated like an explicit error
        # instead of failing on the lookups below.
        if response is None or not hasattr(response, "get") or "error" in response:
            if language == "zh":
                return (f"{EMOJIS['error']} 抱歉，暂时无法回答", "zh")
            return (f"{EMOJIS['error']} Sorry, I can't answer that right now", "en")

        # Extract response text; fall back to the whole payload when the
        # "response" key is absent.
        response_text = response.get("response", str(response))
        if not isinstance(response_text, str):
            response_text = str(response_text)

        # Detect response language
        response_lang = language  # Assume same language
        if any('\u4e00' <= char <= '\u9fff' for char in response_text):
            response_lang = "zh"

        return (response_text, response_lang)

    def get_status(self) -> Dict:
        """
        Get assistant status.

        Returns:
            Dict with a fixed "active" marker for the speech recognizer plus
            the status reported by the music player and the OpenClaw client.
        """
        return {
            "speech_recognizer": "active",
            "music_player": self.music_player.get_status(),
            "openclaw": self.openclaw_client.get_status()
        }


def main():
    """Run a fixed set of English and Chinese sample commands and print each reply."""
    assistant = VoiceAssistant()

    # (utterance, language code) pairs covering greeting, time and music.
    samples = (
        ("hello", "en"),
        ("what time is it", "en"),
        ("play music", "en"),
        ("你好", "zh"),
        ("现在几点", "zh"),
        ("播放音乐", "zh"),
    )
    separator = "-" * 40

    for utterance, lang_code in samples:
        reply, reply_lang = assistant.process_command(utterance, lang_code)
        print(f"Input: {utterance} ({lang_code})")
        print(f"Output: {reply} ({reply_lang})")
        print(separator)


# Script entry point: enable INFO-level logging, then run the sample commands.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    main()