voice_assistant/mqtt_audio_bridge.py

import paho.mqtt.client as mqtt
import requests
import wave
import io
import numpy as np
import openwakeword
from openwakeword.model import Model

# --- CONFIG ---
MQTT_BROKER = "192.168.20.30"
MQTT_USER = "mqtt-user"
MQTT_PASS = "sam4jo"
TOPIC_AUDIO = "homeassistant/voice/audio"
TOPIC_ACK = "homeassistant/voice/ack"
TOPIC_TEXT = "homeassistant/voice/text"
TOPIC_STATUS = "homeassistant/voice/status"
WHISPER_URL = "http://localhost:5000/transcribe"
WAKE_WORD_MODEL = "hey_jarvis"  # or 'alexa', 'hey_mycroft', 'timer', 'weather'

# --- STATE ---
# We buffer audio here:
# 1. We feed it to OpenWakeWord chunk by chunk.
# 2. If the wake word triggers, we KEEP recording for the command.
audio_buffer = bytearray()
is_command_mode = False
command_buffer = bytearray()

print("Loading OpenWakeWord...")
owwModel = Model(wakeword_models=[WAKE_WORD_MODEL])
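# On a fresh install the wake-word model files may not be cached yet;
# openwakeword.utils.download_models() can fetch them beforehand.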

def send_to_whisper(raw_data):
    """Wrap raw 16 kHz mono 16-bit PCM in a WAV container and POST it to Whisper."""
    print(f"Sending {len(raw_data)} bytes to Whisper...")
    wav_io = io.BytesIO()
    with wave.open(wav_io, "wb") as wav_file:
        wav_file.setnchannels(1)      # mono
        wav_file.setsampwidth(2)      # 16-bit samples
        wav_file.setframerate(16000)  # 16 kHz
        wav_file.writeframes(raw_data)
    wav_io.seek(0)
    try:
        files = {'audio': ('cmd.wav', wav_io, 'audio/wav')}
        res = requests.post(WHISPER_URL, files=files, timeout=30)
        if res.status_code == 200:
            text = res.json().get("transcript", "")
            print(f"COMMAND: {text}")
            client.publish(TOPIC_TEXT, text)
            client.publish(TOPIC_ACK, "OK")  # Rainbow LED
        else:
            print(f"Whisper error: HTTP {res.status_code}")
    except Exception as e:
        print(f"Whisper request failed: {e}")

def on_message(client, userdata, msg):
    global audio_buffer, is_command_mode, command_buffer

    if msg.topic == TOPIC_AUDIO:
        payload = msg.payload

        # 1. If we are already listening for a command, just accumulate.
        if is_command_mode:
            command_buffer.extend(payload)
            # If the buffer gets too big, cut it off (16000 Hz * 2 bytes * 5 s).
            if len(command_buffer) > 16000 * 2 * 5:
                print("Timeout. Sending...")
                send_to_whisper(command_buffer)
                is_command_mode = False
                command_buffer = bytearray()
            return

        # 2. If NOT in command mode, feed OpenWakeWord.
        # OWW expects 1280-sample chunks (2560 bytes, i.e. 80 ms at 16 kHz).
        audio_buffer.extend(payload)
        chunk_size = 1280 * 2  # 1280 samples * 2 bytes
        while len(audio_buffer) >= chunk_size:
            chunk = audio_buffer[:chunk_size]
            audio_buffer = audio_buffer[chunk_size:]
            # Convert to an int16 numpy array for OWW.
            audio_int16 = np.frombuffer(chunk, dtype=np.int16)
            # predict() returns {model_name: score}, scores in 0.0..1.0.
            prediction = owwModel.predict(audio_int16)
            if prediction[WAKE_WORD_MODEL] > 0.5:
                print(f"WAKE WORD DETECTED: {WAKE_WORD_MODEL}")
                is_command_mode = True
                # Start the command with any audio already queued, then stop
                # feeding OWW and reset it so it doesn't immediately re-trigger.
                command_buffer = bytearray(audio_buffer)
                audio_buffer = bytearray()
                owwModel.reset()
                # Optional: send an "awake" LED command to the ESP32 here.
                break

    elif msg.topic == TOPIC_STATUS and msg.payload.decode() == "processing":
        # The ESP32's VAD decided the utterance is over: flush the command.
        if is_command_mode:
            send_to_whisper(command_buffer)
            is_command_mode = False
            command_buffer = bytearray()

# NOTE: paho-mqtt 1.x style; with paho-mqtt 2.x, construct the client as
# mqtt.Client(mqtt.CallbackAPIVersion.VERSION1) to keep these callbacks.
client = mqtt.Client()
client.username_pw_set(MQTT_USER, MQTT_PASS)
client.on_message = on_message  # route incoming MQTT messages to the handler above
client.connect(MQTT_BROKER, 1883)
client.subscribe([(TOPIC_AUDIO, 0), (TOPIC_STATUS, 0)])
client.loop_forever()
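
# Quick manual test (hypothetical; run from a separate script while the bridge
# is up). Assumes a 16 kHz mono 16-bit "sample.wav" starting with the wake
# word: stream it to TOPIC_AUDIO in 80 ms chunks to mimic the ESP32, then
# publish "processing" to simulate the end-of-utterance signal.
#
#   import time, wave
#   import paho.mqtt.client as mqtt
#
#   pub = mqtt.Client()
#   pub.username_pw_set("mqtt-user", "sam4jo")
#   pub.connect("192.168.20.30", 1883)
#   pub.loop_start()
#   with wave.open("sample.wav", "rb") as w:
#       data = w.readframes(w.getnframes())
#   for i in range(0, len(data), 2560):  # 2560 bytes = 1280 samples = 80 ms
#       pub.publish("homeassistant/voice/audio", data[i:i + 2560])
#       time.sleep(0.08)
#   pub.publish("homeassistant/voice/status", "processing")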