voice_assistant/mqtt_audio_bridge.py

import paho.mqtt.client as mqtt
import requests
import wave
import io
import numpy as np
import openwakeword
from openwakeword.model import Model

# --- CONFIG ---
MQTT_BROKER = "192.168.20.30"
MQTT_USER = "mqtt-user"
MQTT_PASS = "sam4jo"
TOPIC_AUDIO = "homeassistant/voice/audio"
TOPIC_ACK = "homeassistant/voice/ack"
TOPIC_TEXT = "homeassistant/voice/text"
TOPIC_STATUS = "homeassistant/voice/status"
WHISPER_URL = "http://localhost:5000/transcribe"
WAKE_WORD_MODEL = "hey_jarvis"  # or 'alexa', 'hey_mycroft', 'timer', 'weather'

# --- STATE ---
# We buffer audio here:
# 1. We feed it to OpenWakeWord chunk by chunk.
# 2. If the wake word triggers, we KEEP recording for the command.
audio_buffer = bytearray()
is_command_mode = False
command_buffer = bytearray()

print("Loading OpenWakeWord...")
owwModel = Model(wakeword_models=[WAKE_WORD_MODEL])
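# On a fresh install the wake-word model files may not be cached yet;
# openwakeword.utils.download_models() can fetch them beforehand.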

def send_to_whisper(raw_data):
    """Wrap raw 16 kHz mono 16-bit PCM in a WAV container and POST it to Whisper."""
    print(f"Sending {len(raw_data)} bytes to Whisper...")
    wav_io = io.BytesIO()
    with wave.open(wav_io, "wb") as wav_file:
        wav_file.setnchannels(1)      # mono
        wav_file.setsampwidth(2)      # 16-bit samples
        wav_file.setframerate(16000)  # 16 kHz
        wav_file.writeframes(raw_data)
    wav_io.seek(0)
    try:
        files = {'audio': ('cmd.wav', wav_io, 'audio/wav')}
        res = requests.post(WHISPER_URL, files=files, timeout=30)
        if res.status_code == 200:
            text = res.json().get("transcript", "")
            print(f"COMMAND: {text}")
            client.publish(TOPIC_TEXT, text)
            client.publish(TOPIC_ACK, "OK")  # Rainbow LED
        else:
            print(f"Whisper error: HTTP {res.status_code}")
    except Exception as e:
        print(f"Whisper request failed: {e}")

def on_message(client, userdata, msg):
    global audio_buffer, is_command_mode, command_buffer

    if msg.topic == TOPIC_AUDIO:
        payload = msg.payload

        # 1. If we are already listening for a command, just accumulate.
        if is_command_mode:
            command_buffer.extend(payload)
            # If the buffer gets too big, cut it off (16000 Hz * 2 bytes * 5 s).
            if len(command_buffer) > 16000 * 2 * 5:
                print("Timeout. Sending...")
                send_to_whisper(command_buffer)
                is_command_mode = False
                command_buffer = bytearray()
            return

        # 2. If NOT in command mode, feed OpenWakeWord.
        # OWW expects 1280-sample chunks (2560 bytes, i.e. 80 ms at 16 kHz).
        audio_buffer.extend(payload)
        chunk_size = 1280 * 2  # 1280 samples * 2 bytes
        while len(audio_buffer) >= chunk_size:
            chunk = audio_buffer[:chunk_size]
            audio_buffer = audio_buffer[chunk_size:]
            # Convert to an int16 numpy array for OWW.
            audio_int16 = np.frombuffer(chunk, dtype=np.int16)
            # predict() returns {model_name: score}, scores in 0.0..1.0.
            prediction = owwModel.predict(audio_int16)
            if prediction[WAKE_WORD_MODEL] > 0.5:
                print(f"WAKE WORD DETECTED: {WAKE_WORD_MODEL}")
                is_command_mode = True
                # Start the command with any audio already queued, then stop
                # feeding OWW and reset it so it doesn't immediately re-trigger.
                command_buffer = bytearray(audio_buffer)
                audio_buffer = bytearray()
                owwModel.reset()
                # Optional: send an "awake" LED command to the ESP32 here.
                break

    elif msg.topic == TOPIC_STATUS and msg.payload.decode() == "processing":
        # The ESP32's VAD decided the utterance is over: flush the command.
        if is_command_mode:
            send_to_whisper(command_buffer)
            is_command_mode = False
            command_buffer = bytearray()

# NOTE: paho-mqtt 1.x style; with paho-mqtt 2.x, construct the client as
# mqtt.Client(mqtt.CallbackAPIVersion.VERSION1) to keep these callbacks.
client = mqtt.Client()
client.username_pw_set(MQTT_USER, MQTT_PASS)
client.on_message = on_message  # route incoming MQTT messages to the handler above
client.connect(MQTT_BROKER, 1883)
client.subscribe([(TOPIC_AUDIO, 0), (TOPIC_STATUS, 0)])
client.loop_forever()
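
# Quick manual test (hypothetical; run from a separate script while the bridge
# is up). Assumes a 16 kHz mono 16-bit "sample.wav" starting with the wake
# word: stream it to TOPIC_AUDIO in 80 ms chunks to mimic the ESP32, then
# publish "processing" to simulate the end-of-utterance signal.
#
#   import time, wave
#   import paho.mqtt.client as mqtt
#
#   pub = mqtt.Client()
#   pub.username_pw_set("mqtt-user", "sam4jo")
#   pub.connect("192.168.20.30", 1883)
#   pub.loop_start()
#   with wave.open("sample.wav", "rb") as w:
#       data = w.readframes(w.getnframes())
#   for i in range(0, len(data), 2560):  # 2560 bytes = 1280 samples = 80 ms
#       pub.publish("homeassistant/voice/audio", data[i:i + 2560])
#       time.sleep(0.08)
#   pub.publish("homeassistant/voice/status", "processing")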