# Copyright 2022 David Scripka. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Imports
import numpy as np
import onnxruntime as ort
import openwakeword
from openwakeword.utils import AudioFeatures

import wave
import os
import pickle
from collections import deque, defaultdict
from functools import partial
import time
from typing import List, Union, DefaultDict, Dict


# Define main model class
class Model():
    """
    The main model class for openWakeWord. Creates a model object with a shared audio pre-processor
    and support for arbitrarily many custom wake word/wake phrase models.
    """
    def __init__(
            self,
            wakeword_model_paths: List[str] = [],
            class_mapping_dicts: List[dict] = [],
            enable_speex_noise_suppression: bool = False,
            vad_threshold: float = 0,
            custom_verifier_models: dict = {},
            custom_verifier_threshold: float = 0.1,
            **kwargs
    ):
"""Initialize the openWakeWord model object.
|
|
|
|
Args:
|
|
wakeword_model_paths (List[str]): A list of paths of ONNX models to load into the openWakeWord model object.
|
|
If not provided, will load all of the pre-trained models.
|
|
class_mapping_dicts (List[dict]): A list of dictionaries with integer to string class mappings for
|
|
each model in the `wakeword_model_paths` arguments
|
|
(e.g., {"0": "class_1", "1": "class_2"})
|
|
enable_speex_noise_suppression (bool): Whether to use the noise suppresion from the SpeexDSP
|
|
library to pre-process all incoming audio. May increase
|
|
model performance when reasonably stationary background noise
|
|
is present in the environment where openWakeWord will be used.
|
|
It is very lightweight, so enabling it doesn't significantly
|
|
impact efficiency.
|
|
vad_threshold (float): Whether to use a voice activity detection model (VAD) from Silero
|
|
(https://github.com/snakers4/silero-vad) to filter predictions.
|
|
For every input audio frame, a VAD score is obtained and only those model predictions
|
|
with VAD scores above the threshold will be returned. The default value (0),
|
|
disables voice activity detection entirely.
|
|
custom_verifier_models (dict): A dictionary of paths to custom verifier models, where
|
|
the keys are the model names (corresponding to the openwakeword.models
|
|
attribute) and the values are the filepaths of the
|
|
custom verifier models.
|
|
custom_verifier_threshold (float): The score threshold to use a custom verifier model. If the score
|
|
from a model for a given frame is greater than this value, the
|
|
associated custom verifier model will also predict on that frame, and
|
|
the verifier score will be returned.
|
|
kwargs (dict): Any other keyword arguments to pass the the preprocessor instance
|
|
"""
|
|
|
|
        # Initialize the ONNX models and store them
        sessionOptions = ort.SessionOptions()
        sessionOptions.inter_op_num_threads = 1
        sessionOptions.intra_op_num_threads = 1

        # Get model paths for pre-trained models if user doesn't provide models to load
        if wakeword_model_paths == []:
            wakeword_model_paths = openwakeword.get_pretrained_model_paths()
            wakeword_model_names = list(openwakeword.models.keys())
        else:
            # Strip the ".onnx" extension to get each model's name
            wakeword_model_names = [os.path.basename(i[0:-5]) for i in wakeword_model_paths]

        # Create attributes to store models and metadata
        self.models = {}
        self.model_inputs = {}
        self.model_outputs = {}
        self.class_mapping = {}
        self.model_input_names = {}
        self.custom_verifier_models = {}
        self.custom_verifier_threshold = custom_verifier_threshold
        for mdl_path, mdl_name in zip(wakeword_model_paths, wakeword_model_names):
            # Load openwakeword models
            self.models[mdl_name] = ort.InferenceSession(mdl_path, sess_options=sessionOptions,
                                                         providers=["CPUExecutionProvider"])
            self.model_inputs[mdl_name] = self.models[mdl_name].get_inputs()[0].shape[1]
            self.model_outputs[mdl_name] = self.models[mdl_name].get_outputs()[0].shape[1]
            if class_mapping_dicts and class_mapping_dicts[wakeword_model_paths.index(mdl_path)].get(mdl_name, None):
                self.class_mapping[mdl_name] = class_mapping_dicts[wakeword_model_paths.index(mdl_path)]
            elif openwakeword.model_class_mappings.get(mdl_name, None):
                self.class_mapping[mdl_name] = openwakeword.model_class_mappings[mdl_name]
            else:
                self.class_mapping[mdl_name] = {str(i): str(i) for i in range(0, self.model_outputs[mdl_name])}
            self.model_input_names[mdl_name] = self.models[mdl_name].get_inputs()[0].name

            # Load custom verifier models
            if isinstance(custom_verifier_models, dict):
                if custom_verifier_models.get(mdl_name, False):
                    self.custom_verifier_models[mdl_name] = pickle.load(open(custom_verifier_models[mdl_name], 'rb'))

        if len(self.custom_verifier_models.keys()) < len(custom_verifier_models.keys()):
            raise ValueError(
                "Custom verifier models were provided, but some were not matched with a base model!"
                " Make sure that the keys provided in the `custom_verifier_models` dictionary argument"
                " exactly match that of the `.models` attribute of an instantiated openWakeWord Model object"
                " that has the same base models but doesn't have custom verifier models."
            )

        # Create buffer to store frame predictions
        self.prediction_buffer: DefaultDict[str, deque] = defaultdict(partial(deque, maxlen=30))

        # Initialize SpeexDSP noise canceller
        if enable_speex_noise_suppression:
            from speexdsp_ns import NoiseSuppression
            self.speex_ns = NoiseSuppression.create(160, 16000)
        else:
            self.speex_ns = None

        # Initialize Silero VAD
        self.vad_threshold = vad_threshold
        if vad_threshold > 0:
            self.vad = openwakeword.VAD()

        # Create AudioFeatures object
        self.preprocessor = AudioFeatures(**kwargs)

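    # A minimal construction sketch (hedged; assumes the default pre-trained
    # models are installed, and "hey_jarvis.onnx" is a hypothetical path):
    #
    #   oww = Model()                                          # all pre-trained models
    #   oww = Model(wakeword_model_paths=["hey_jarvis.onnx"])  # a single custom model
    #   oww = Model(vad_threshold=0.5, enable_speex_noise_suppression=True)
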
    def get_parent_model_from_label(self, label):
        """Gets the parent model associated with a given prediction label"""
        parent_model = ""
        for mdl in self.class_mapping.keys():
            if label in self.class_mapping[mdl].values():
                parent_model = mdl
            elif label in self.class_mapping.keys() and label == mdl:
                parent_model = mdl

        return parent_model

    def reset(self):
        """Reset the prediction buffer"""
        self.prediction_buffer = defaultdict(partial(deque, maxlen=30))

    def predict(self, x: np.ndarray, patience: dict = {}, threshold: dict = {}, timing: bool = False):
        """Predict with all of the wakeword models on the input audio frames

        Args:
            x (ndarray): The input audio data to predict on with the models. Should be multiples of 80 ms
                         (1280 samples), with longer lengths reducing overall CPU usage
                         but increasing detection latency.
            patience (dict): How many consecutive frames (of 1280 samples or 80 ms) above the threshold that must
                             be observed before the current frame will be returned as non-zero.
                             Must be provided as a dictionary where the keys are the
                             model names and the values are the number of frames. Can reduce false-positive
                             detections at the cost of a lower true-positive rate.
                             By default, this behavior is disabled.
            threshold (dict): The threshold values to use when the `patience` behavior is enabled.
                              Must be provided as a dictionary where the keys are the
                              model names and the values are the thresholds.
            timing (bool): Whether to return timing information of the models. Can be useful to debug and
                           assess how efficiently models are running on the current hardware.

        Returns:
            dict: A dictionary of scores between 0 and 1 for each model, where 0 indicates no
                  wake-word/wake-phrase detected. If the `timing` argument is true, returns a
                  tuple of dicts containing model predictions and timing information, respectively.
        """

        # Setup timing dict
        if timing:
            timing_dict: Dict[str, Dict] = {}
            timing_dict["models"] = {}
            feature_start = time.time()

        # Get audio features (optionally with Speex noise suppression)
        if self.speex_ns:
            self.preprocessor(self._suppress_noise_with_speex(x))
        else:
            self.preprocessor(x)

        if timing:
            timing_dict["models"]["preprocessor"] = time.time() - feature_start

        # Get predictions from model(s)
        predictions = {}
        for mdl in self.models.keys():
            input_name = self.model_input_names[mdl]

            if timing:
                model_start = time.time()

            # Run model to get predictions
            if len(x) > 1280:
                group_predictions = []
                # For inputs longer than one frame, score each 1280-sample frame
                # (from oldest to newest) and keep the maximum score
                for i in np.arange(len(x)//1280-1, -1, -1):
                    group_predictions.extend(
                        self.models[mdl].run(
                            None,
                            {input_name: self.preprocessor.get_features(
                                self.model_inputs[mdl],
                                start_ndx=-self.model_inputs[mdl] - i
                            )}
                        )
                    )
                prediction = np.array(group_predictions).max(axis=0)[None, ]
            else:
                prediction = self.models[mdl].run(
                    None,
                    {input_name: self.preprocessor.get_features(self.model_inputs[mdl])}
                )

            if self.model_outputs[mdl] == 1:
                predictions[mdl] = prediction[0][0][0]
            else:
                for int_label, cls in self.class_mapping[mdl].items():
                    predictions[cls] = prediction[0][0][int(int_label)]

            # Update scores based on custom verifier model
            if self.custom_verifier_models != {}:
                for cls in predictions.keys():
                    if predictions[cls] >= self.custom_verifier_threshold:
                        parent_model = self.get_parent_model_from_label(cls)
                        if self.custom_verifier_models.get(parent_model, False):
                            verifier_prediction = self.custom_verifier_models[parent_model].predict_proba(
                                self.preprocessor.get_features(self.model_inputs[mdl])
                            )[0][-1]
                            predictions[cls] = verifier_prediction

            # Update prediction buffer, and zero predictions for first 5 frames during model initialization
            for cls in predictions.keys():
                if len(self.prediction_buffer[cls]) < 5:
                    predictions[cls] = 0.0
                self.prediction_buffer[cls].append(predictions[cls])

            # Get timing information
            if timing:
                timing_dict["models"][mdl] = time.time() - model_start

        # Update scores based on thresholds or patience arguments
        if patience != {}:
            if threshold == {}:
                raise ValueError("Error! When using the `patience` argument, threshold "
                                 "values must be provided via the `threshold` argument!")
            for mdl in predictions.keys():
                parent_model = self.get_parent_model_from_label(mdl)
                if parent_model in patience.keys():
                    scores = np.array(self.prediction_buffer[mdl])[-patience[parent_model]:]
                    if (scores >= threshold[parent_model]).sum() < patience[parent_model]:
                        predictions[mdl] = 0.0

        # (optionally) get voice activity detection scores and update model scores
        if self.vad_threshold > 0:
            if timing:
                vad_start = time.time()

            self.vad(x)

            if timing:
                timing_dict["models"]["vad"] = time.time() - vad_start

            # Get frames from last 0.4 to 0.56 seconds (3 frames) before the current
            # frame and get max VAD score
            vad_frames = list(self.vad.prediction_buffer)[-7:-4]
            vad_max_score = np.max(vad_frames) if len(vad_frames) > 0 else 0
            for mdl in predictions.keys():
                if vad_max_score < self.vad_threshold:
                    predictions[mdl] = 0.0

        if timing:
            return predictions, timing_dict
        else:
            return predictions

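    # A minimal streaming sketch (hedged; `get_next_frame()` is a hypothetical
    # audio source yielding 1280 int16 samples per call, and "alexa" is one of
    # the pre-trained model names):
    #
    #   oww = Model()
    #   while True:
    #       frame = get_next_frame()
    #       scores = oww.predict(frame, patience={"alexa": 3}, threshold={"alexa": 0.5})
    #       if scores.get("alexa", 0) > 0:
    #           print("Wake word detected!")
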
    def predict_clip(self, clip: Union[str, np.ndarray], padding: int = 1, chunk_size=1280, **kwargs):
        """Predict on a full audio clip, simulating streaming prediction.
        The input clip must be a 16-bit, 16 kHz, single-channel WAV file.

        Args:
            clip (Union[str, np.ndarray]): The path to a 16-bit PCM, 16 kHz, single-channel WAV file,
                                           or a 1D array containing the same type of data
            padding (int): How many seconds of silence to pad the start/end of the clip with
                           to make sure that short clips can be processed correctly (default: 1)
            chunk_size (int): The size (in samples) of each chunk of audio to pass to the model
            kwargs: Any keyword arguments to pass to the class `predict` method

        Returns:
            list: A list containing the frame-level prediction dictionaries for the audio clip
        """
        if isinstance(clip, str):
            # Load audio clip as 16-bit PCM data
            with wave.open(clip, mode='rb') as f:
                # Load WAV clip frames
                data = np.frombuffer(f.readframes(f.getnframes()), dtype=np.int16)
        elif isinstance(clip, np.ndarray):
            data = clip

        if padding:
            data = np.concatenate(
                (
                    np.zeros(16000*padding).astype(np.int16),
                    data,
                    np.zeros(16000*padding).astype(np.int16)
                )
            )

        # Iterate through clip, getting predictions
        predictions = []
        step_size = chunk_size
        for i in range(0, data.shape[0]-step_size, step_size):
            predictions.append(self.predict(data[i:i+step_size], **kwargs))

        return predictions

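    # A minimal file-based sketch (hedged; "example.wav" is a hypothetical
    # 16-bit, 16 kHz mono WAV file):
    #
    #   oww = Model()
    #   for frame_scores in oww.predict_clip("example.wav"):
    #       for name, score in frame_scores.items():
    #           if score >= 0.5:
    #               print(f"{name}: {score:.2f}")
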
    def _get_positive_prediction_frames(
            self,
            file: str,
            threshold: float = 0.5,
            return_type: str = "features",
            **kwargs
    ):
        """
        Gets predictions for the input audio data, and returns the audio features (embeddings)
        or audio data for all of the frames with a score above the `threshold` argument.
        Can be a useful way to collect false-positive predictions.

        Args:
            file (str): The path to a 16-bit, 16 kHz WAV audio file to process
            threshold (float): The minimum score required for a frame of audio features
                               to be returned.
            return_type (str): The type of data to return when a positive prediction is
                               detected. Can be either 'features' or 'audio' to return
                               audio embeddings or raw audio data, respectively.
            kwargs: Any keyword arguments to pass to the class `predict` method

        Returns:
            dict: A dictionary with prediction labels as keys and N x M arrays as values,
                  where N is the number of examples and M is the number
                  of audio features, depending on the model input shape.
        """
        # Load audio clip as 16-bit PCM data
        with wave.open(file, mode='rb') as f:
            # Load WAV clip frames
            data = np.frombuffer(f.readframes(f.getnframes()), dtype=np.int16)

        # Iterate through clip, getting predictions
        positive_data = defaultdict(list)
        step_size = 1280
        for i in range(0, data.shape[0]-step_size, step_size):
            predictions = self.predict(data[i:i+step_size], **kwargs)
            for lbl in predictions.keys():
                if predictions[lbl] >= threshold:
                    mdl = self.get_parent_model_from_label(lbl)
                    features = self.preprocessor.get_features(self.model_inputs[mdl])
                    if return_type == 'features':
                        positive_data[lbl].append(features)
                    if return_type == 'audio':
                        # Keep 3 seconds of audio before the frame and 1 second after
                        context = data[max(0, i - 16000*3):i + 16000]
                        if len(context) == 16000*4:
                            positive_data[lbl].append(context)

        positive_data_combined = {}
        for lbl in positive_data.keys():
            positive_data_combined[lbl] = np.vstack(positive_data[lbl])

        return positive_data_combined

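    # A minimal sketch for harvesting false positives (hedged; "negatives.wav"
    # is a hypothetical clip known to contain no wake words):
    #
    #   oww = Model()
    #   fp_features = oww._get_positive_prediction_frames("negatives.wav", threshold=0.5)
    #   # fp_features maps each label to an N x M array of feature vectors that
    #   # could serve as negative examples when training a custom verifier model
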
    def _suppress_noise_with_speex(self, x: np.ndarray, frame_size: int = 160):
        """
        Runs the input audio through the SpeexDSP noise suppression algorithm.
        Note that this function updates the state of the existing Speex noise
        suppression object, and isn't intended to be called externally.

        Args:
            x (ndarray): The 16-bit, 16 kHz audio to process. Must always be an
                         integer multiple of `frame_size`.
            frame_size (int): The frame size to use for the Speex noise suppressor.
                              Must match the frame size specified during the
                              initialization of the noise suppressor.

        Returns:
            ndarray: The input audio with noise suppression applied
        """
        cleaned = []
        for i in range(0, x.shape[0], frame_size):
            chunk = x[i:i+frame_size]
            cleaned.append(self.speex_ns.process(chunk.tobytes()))

        cleaned_bytestring = b''.join(cleaned)
        cleaned_array = np.frombuffer(cleaned_bytestring, np.int16)

        return cleaned_array
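
# A minimal live-microphone sketch (hedged; requires `pip install pyaudio`
# and assumes the default pre-trained models are installed):
if __name__ == "__main__":
    import pyaudio

    audio = pyaudio.PyAudio()
    mic_stream = audio.open(format=pyaudio.paInt16, channels=1, rate=16000,
                            input=True, frames_per_buffer=1280)

    oww = Model()
    while True:
        # Read one 80 ms frame and score it with every loaded model
        frame = np.frombuffer(mic_stream.read(1280), dtype=np.int16)
        for name, score in oww.predict(frame).items():
            if score >= 0.5:
                print(f"Detected '{name}' (score: {score:.2f})")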