diff --git a/display.py b/display.py
index 4e3e839..813526a 100644
--- a/display.py
+++ b/display.py
@@ -18,6 +18,7 @@ import math
 import threading
 import time as _time
 import argparse
+import subprocess
 import urllib.request
 import urllib.parse
 import xml.etree.ElementTree as ET
@@ -238,31 +239,80 @@ def _weather_worker(locations: list[tuple[float, float, str]]) -> None:
 # WAKE WORD
 # ═══════════════════════════════════════════════════════════════════════════════
-_OWW_MODEL = '/home/dfr84/Python/JARVIS/Jarvis.onnx'
-_OWW_THRESHOLD = 0.5
-_OWW_CHUNK = 1280
+_OWW_MODEL = '/home/dfr84/Python/JARVIS/Jarvis.onnx'
+_OWW_THRESHOLD = 0.6
+_OWW_DEBOUNCE = 2  # consecutive frames above threshold to trigger
+_OWW_CHUNK = 1280
+_LISTEN_SECONDS = 4.0
+_RESPONSE_HOLD = 3.0
+
+# state: 'idle' | 'listening' | 'processing' | 'positive' | 'negative'
+_wake: dict = {'state': 'idle', 'detected_at': 0.0, 'idle_since': 0.0}
+_WAKE_COOLDOWN = 4.0  # seconds after response before listening again
+
+_FACE_GREEN = ( 40, 200, 80)
+_FACE_BLUE = ( 40, 80, 220)
+_FACE_RED = (200, 50, 50)
+
+
+def _lerp_color(c1: tuple, c2: tuple, t: float) -> tuple:
+    return tuple(int(c1[i] + (c2[i] - c1[i]) * t) for i in range(3))
+
+
+def _speak(text: str) -> None:
+    subprocess.run(['espeak-ng', '-s', '150', text], capture_output=True)
+
+
+def _handle_command(raw: bytes, n_ch: int, native_hz: int) -> None:
+    try:
+        import speech_recognition as sr
+        import numpy as np
+
+        audio_np = np.frombuffer(raw, dtype=np.int16)
+        if n_ch > 1:
+            audio_np = audio_np.reshape(-1, n_ch)[:, 0]
+
+        target_hz = 16000
+        if native_hz != target_hz:
+            ratio = target_hz / native_hz
+            new_len = int(len(audio_np) * ratio)
+            indices = np.round(np.linspace(0, len(audio_np) - 1, new_len)).astype(int)
+            audio_np = audio_np[indices]
+
+        audio_data = sr.AudioData(audio_np.tobytes(), target_hz, 2)
+        text = sr.Recognizer().recognize_google(audio_data).lower()
+
+        if 'are you there' in text:
+            _wake['state'] = 'positive'
+            _speak('Yes, I am here')
+        else:
+            _wake['state'] = 'negative'
+            _speak("Sorry, I didn't understand")
+
+    except Exception:
+        _wake['state'] = 'negative'
+        _speak("Sorry, I didn't understand")
+
+    _time.sleep(_RESPONSE_HOLD)
+    _wake['idle_since'] = _time.time()
+    _wake['state'] = 'idle'
-# Shared wake state — written by audio thread, read by render thread
-_wake: dict = {'active': False, 'detected_at': 0.0}
 
 
 def _wake_worker() -> None:
     try:
         import pyaudio
         import numpy as np
         from openwakeword.model import Model
-    except ImportError as e:
-        print(f'[WAKE] Missing dependency: {e} — wake word disabled')
+    except ImportError:
         return
 
     try:
-        model = Model(wakeword_model_paths=[_OWW_MODEL])
-
-        audio = pyaudio.PyAudio()
-        dev_info = audio.get_device_info_by_index(_args.mic)
+        model = Model(wakeword_model_paths=[_OWW_MODEL])
+        audio = pyaudio.PyAudio()
+        dev_info = audio.get_device_info_by_index(_args.mic)
         n_ch = int(dev_info['maxInputChannels'])
         native_hz = int(dev_info['defaultSampleRate'])
         target_hz = 16000
 
-        # frames_per_buffer scaled so we always get ~_OWW_CHUNK samples at 16 kHz
         buf_frames = int(_OWW_CHUNK * native_hz / target_hz)
 
         stream = audio.open(
@@ -274,35 +324,49 @@ def _wake_worker() -> None:
             frames_per_buffer=buf_frames,
         )
 
+        cmd_frames: list[bytes] = []
+        hit_count = 0
+
         while True:
-            data = stream.read(buf_frames, exception_on_overflow=False)
-            audio_data = np.frombuffer(data, dtype=np.int16)
-            if n_ch > 1:
-                audio_data = audio_data.reshape(-1, n_ch)[:, 0]
-            # resample to 16 kHz
-            if native_hz != target_hz:
-                ratio = target_hz / native_hz
-                new_len = int(len(audio_data) * ratio)
-                indices = np.round(np.linspace(0, len(audio_data) - 1, new_len)).astype(int)
-                audio_data = audio_data[indices]
-            prediction = model.predict(audio_data)
-            for score in prediction.values():
-                if score >= _OWW_THRESHOLD:
-                    _wake['active'] = True
+            data = stream.read(buf_frames, exception_on_overflow=False)
+            state = _wake['state']
+
+            if state == 'idle':
+                audio_np = np.frombuffer(data, dtype=np.int16)
+                if n_ch > 1:
+                    audio_np = audio_np.reshape(-1, n_ch)[:, 0]
+                if native_hz != target_hz:
+                    ratio = target_hz / native_hz
+                    new_len = int(len(audio_np) * ratio)
+                    indices = np.round(np.linspace(0, len(audio_np) - 1, new_len)).astype(int)
+                    audio_np = audio_np[indices]
+                if _time.time() - _wake['idle_since'] < _WAKE_COOLDOWN:
+                    hit_count = 0
+                    continue
+                triggered = any(s >= _OWW_THRESHOLD for s in model.predict(audio_np).values())
+                if triggered:
+                    hit_count += 1
+                else:
+                    hit_count = 0
+                if hit_count >= _OWW_DEBOUNCE:
+                    hit_count = 0
+                    _wake['state'] = 'listening'
                     _wake['detected_at'] = _time.time()
-                    break
-    except Exception as e:
-        import traceback
-        print(f'[WAKE] {type(e).__name__}: {e}')
-        traceback.print_exc()
+                    cmd_frames.clear()
+            elif state == 'listening':
+                cmd_frames.append(data)
+                if _time.time() - _wake['detected_at'] >= _LISTEN_SECONDS:
+                    _wake['state'] = 'processing'
+                    threading.Thread(
+                        target=_handle_command,
+                        args=(b''.join(cmd_frames), n_ch, native_hz),
+                        daemon=True,
+                    ).start()
+                    cmd_frames.clear()
 
 
-def _lerp_color(c1: tuple, c2: tuple, t: float) -> tuple:
-    return tuple(int(c1[i] + (c2[i] - c1[i]) * t) for i in range(3))
-
-_FACE_GREEN = ( 40, 200, 80)
-_FACE_BLUE = ( 40, 80, 220)
-_WAKE_DURATION = 8.0  # seconds before returning to idle
+    except Exception:
+        pass
 
 
 # ═══════════════════════════════════════════════════════════════════════════════
@@ -596,14 +660,15 @@ def main() -> None:
             pygame.quit(); return
 
         # ── Wake-word face colour ─────────────────────────────────────────────
-        if _wake['active']:
+        _ws = _wake['state']
+        if _ws in ('listening', 'processing'):
             elapsed = _time.time() - _wake['detected_at']
-            if elapsed >= _WAKE_DURATION:
-                _wake['active'] = False
-                face_color = None
-            else:
-                t = (math.sin(elapsed * math.pi * 2.0) + 1.0) / 2.0  # 0→1, 1 Hz
-                face_color = _lerp_color(_FACE_GREEN, _FACE_BLUE, t)
+            t = (math.sin(elapsed * math.pi * 2.0) + 1.0) / 2.0
+            face_color = _lerp_color(_FACE_GREEN, _FACE_BLUE, t)
+        elif _ws == 'positive':
+            face_color = _FACE_GREEN
+        elif _ws == 'negative':
+            face_color = _FACE_RED
         else:
             face_color = None
 