# Copyright 2022 David Scripka. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Imports
import numpy as np
import onnxruntime as ort
import openwakeword
from openwakeword.utils import AudioFeatures
import wave
import os
import pickle
from collections import deque, defaultdict
from functools import partial
import time
from typing import List, Optional, Union, DefaultDict, Dict


# Define main model class
class Model():
    """
    The main model class for openWakeWord. Creates a model object with the shared audio pre-processer
    and for arbitrarily many custom wake word/wake phrase models.
    """
    def __init__(
            self,
            wakeword_model_paths: Optional[List[str]] = None,
            class_mapping_dicts: Optional[List[dict]] = None,
            enable_speex_noise_suppression: bool = False,
            vad_threshold: float = 0,
            custom_verifier_models: Optional[dict] = None,
            custom_verifier_threshold: float = 0.1,
            **kwargs
            ):
        """Initialize the openWakeWord model object.

        Args:
            wakeword_model_paths (List[str]): A list of paths of ONNX models to load into the openWakeWord model object.
                                              If not provided, will load all of the pre-trained models.
            class_mapping_dicts (List[dict]): A list of dictionaries with integer to string class mappings for
                                              each model in the `wakeword_model_paths` arguments
                                              (e.g., {"0": "class_1", "1": "class_2"})
            enable_speex_noise_suppression (bool): Whether to use the noise suppresion from the SpeexDSP
                                                   library to pre-process all incoming audio. May increase model
                                                   performance when reasonably stationary background noise is present
                                                   in the environment where openWakeWord will be used. It is very
                                                   lightweight, so enabling it doesn't significantly impact efficiency.
            vad_threshold (float): Whether to use a voice activity detection model (VAD) from Silero
                                   (https://github.com/snakers4/silero-vad) to filter predictions.
                                   For every input audio frame, a VAD score is obtained and only those model predictions
                                   with VAD scores above the threshold will be returned. The default value (0),
                                   disables voice activity detection entirely.
            custom_verifier_models (dict): A dictionary of paths to custom verifier models, where the keys are the
                                           model names (corresponding to the openwakeword.models attribute) and the
                                           values are the filepaths of the custom verifier models.
            custom_verifier_threshold (float): The score threshold to use a custom verifier model. If the score from
                                               a model for a given frame is greater than this value, the associated
                                               custom verifier model will also predict on that frame, and the
                                               verifier score will be returned.
            kwargs (dict): Any other keyword arguments to pass to the preprocessor instance
        """
        # None sentinels replace mutable default arguments ([]/{}), which are shared
        # across calls in Python; behavior for callers passing [] or {} is unchanged.
        if wakeword_model_paths is None:
            wakeword_model_paths = []
        if class_mapping_dicts is None:
            class_mapping_dicts = []
        if custom_verifier_models is None:
            custom_verifier_models = {}

        # Initialize the ONNX models and store them
        sessionOptions = ort.SessionOptions()
        sessionOptions.inter_op_num_threads = 1
        sessionOptions.intra_op_num_threads = 1

        # Get model paths for pre-trained models if user doesn't provide models to load
        if wakeword_model_paths == []:
            wakeword_model_paths = openwakeword.get_pretrained_model_paths()
            wakeword_model_names = list(openwakeword.models.keys())
        else:
            # Strip the trailing ".onnx" (5 characters) to recover each model's name
            wakeword_model_names = [os.path.basename(i[0:-5]) for i in wakeword_model_paths]

        # Create attributes to store models and metadata
        self.models = {}
        self.model_inputs = {}
        self.model_outputs = {}
        self.class_mapping = {}
        self.model_input_names = {}
        self.custom_verifier_models = {}
        self.custom_verifier_threshold = custom_verifier_threshold
        for mdl_path, mdl_name in zip(wakeword_model_paths, wakeword_model_names):
            # Load openwakeword models
            self.models[mdl_name] = ort.InferenceSession(mdl_path, sess_options=sessionOptions,
                                                         providers=["CPUExecutionProvider"])
            self.model_inputs[mdl_name] = self.models[mdl_name].get_inputs()[0].shape[1]
            self.model_outputs[mdl_name] = self.models[mdl_name].get_outputs()[0].shape[1]
            if class_mapping_dicts and class_mapping_dicts[wakeword_model_paths.index(mdl_path)].get(mdl_name, None):
                self.class_mapping[mdl_name] = class_mapping_dicts[wakeword_model_paths.index(mdl_path)]
            elif openwakeword.model_class_mappings.get(mdl_name, None):
                self.class_mapping[mdl_name] = openwakeword.model_class_mappings[mdl_name]
            else:
                # Fall back to an identity mapping over the model's output indices
                self.class_mapping[mdl_name] = {str(i): str(i) for i in range(0, self.model_outputs[mdl_name])}
            self.model_input_names[mdl_name] = self.models[mdl_name].get_inputs()[0].name

            # Load custom verifier models
            if isinstance(custom_verifier_models, dict):
                if custom_verifier_models.get(mdl_name, False):
                    # Context manager ensures the pickle file handle is closed
                    # (the original `pickle.load(open(...))` leaked the handle)
                    with open(custom_verifier_models[mdl_name], 'rb') as verifier_file:
                        self.custom_verifier_models[mdl_name] = pickle.load(verifier_file)

        if len(self.custom_verifier_models.keys()) < len(custom_verifier_models.keys()):
            raise ValueError(
                "Custom verifier models were provided, but some were not matched with a base model!"
                " Make sure that the keys provided in the `custom_verifier_models` dictionary argument"
                " exactly match that of the `.models` attribute of an instantiated openWakeWord Model object"
                " that has the same base models but doesn't have custom verifier models."
            )

        # Create buffer to store frame predictions (last 30 scores per label)
        self.prediction_buffer: DefaultDict[str, deque] = defaultdict(partial(deque, maxlen=30))

        # Initialize SpeexDSP noise canceller (imported lazily since it is optional)
        if enable_speex_noise_suppression:
            from speexdsp_ns import NoiseSuppression
            self.speex_ns = NoiseSuppression.create(160, 16000)
        else:
            self.speex_ns = None

        # Initialize Silero VAD
        self.vad_threshold = vad_threshold
        if vad_threshold > 0:
            self.vad = openwakeword.VAD()

        # Create AudioFeatures object
        self.preprocessor = AudioFeatures(**kwargs)

    def get_parent_model_from_label(self, label):
        """Gets the parent model associated with a given prediction label"""
        parent_model = ""
        for mdl in self.class_mapping.keys():
            if label in self.class_mapping[mdl].values():
                parent_model = mdl
            elif label in self.class_mapping.keys() and label == mdl:
                parent_model = mdl

        return parent_model

    def reset(self):
        """Reset the prediction buffer"""
        self.prediction_buffer = defaultdict(partial(deque, maxlen=30))

    def predict(self, x: np.ndarray, patience: Optional[dict] = None,
                threshold: Optional[dict] = None, timing: bool = False):
        """Predict with all of the wakeword models on the input audio frames

        Args:
            x (Union[ndarray]): The input audio data to predict on with the models. Should be multiples of 80 ms
                                (1280 samples), with longer lengths reducing overall CPU usage
                                but decreasing detection latency.
            patience (dict): How many consecutive frames (of 1280 samples or 80 ms) above the threshold that must
                             be observed before the current frame will be returned as non-zero.
                             Must be provided as a dictionary where the keys are the
                             model names and the values are the number of frames. Can reduce false-positive
                             detections at the cost of a lower true-positive rate.
                             By default, this behavior is disabled.
            threshold (dict): The threshold values to use when the `patience` behavior is enabled.
                              Must be provided as a dictionary where the keys are the
                              model names and the values are the thresholds.
            timing (bool): Whether to return timing information of the models. Can be useful to debug and
                           assess how efficiently models are running on the current hardware.

        Returns:
            dict: A dictionary of scores between 0 and 1 for each model, where 0 indicates no
                  wake-word/wake-phrase detected. If the `timing` argument is true, returns a
                  tuple of dicts containing model predictions and timing information, respectively.
        """
        # None sentinels replace the mutable {} default arguments; downstream
        # checks (`patience != {}`, `threshold == {}`) behave exactly as before.
        if patience is None:
            patience = {}
        if threshold is None:
            threshold = {}

        # Setup timing dict
        if timing:
            timing_dict: Dict[str, Dict] = {}
            timing_dict["models"] = {}
            feature_start = time.time()

        # Get audio features (optionally with Speex noise suppression)
        if self.speex_ns:
            self.preprocessor(self._suppress_noise_with_speex(x))
        else:
            self.preprocessor(x)

        if timing:
            timing_dict["models"]["preprocessor"] = time.time() - feature_start

        # Get predictions from model(s)
        predictions = {}
        for mdl in self.models.keys():
            input_name = self.model_input_names[mdl]

            if timing:
                model_start = time.time()

            # Run model to get predictions; for inputs longer than one frame,
            # predict on each 1280-sample step and keep the per-class maximum
            if len(x) > 1280:
                group_predictions = []
                for i in np.arange(len(x)//1280-1, -1, -1):
                    group_predictions.extend(
                        self.models[mdl].run(
                            None,
                            {input_name: self.preprocessor.get_features(
                                    self.model_inputs[mdl],
                                    start_ndx=-self.model_inputs[mdl] - i
                                )}
                        )
                    )
                prediction = np.array(group_predictions).max(axis=0)[None, ]
            else:
                prediction = self.models[mdl].run(
                    None,
                    {input_name: self.preprocessor.get_features(self.model_inputs[mdl])}
                )

            if self.model_outputs[mdl] == 1:
                predictions[mdl] = prediction[0][0][0]
            else:
                for int_label, cls in self.class_mapping[mdl].items():
                    predictions[cls] = prediction[0][0][int(int_label)]

            # Update scores based on custom verifier model
            # NOTE(review): this loop (and the buffer update below) iterates ALL
            # predictions accumulated so far while still inside the per-model loop,
            # so with multiple models earlier labels are revisited once per later
            # model — confirm whether this is intended before restructuring.
            if self.custom_verifier_models != {}:
                for cls in predictions.keys():
                    if predictions[cls] >= self.custom_verifier_threshold:
                        parent_model = self.get_parent_model_from_label(cls)
                        if self.custom_verifier_models.get(parent_model, False):
                            verifier_prediction = self.custom_verifier_models[parent_model].predict_proba(
                                self.preprocessor.get_features(self.model_inputs[mdl])
                            )[0][-1]
                            predictions[cls] = verifier_prediction

            # Update prediction buffer, and zero predictions for first 5 frames during model initialization
            for cls in predictions.keys():
                if len(self.prediction_buffer[cls]) < 5:
                    predictions[cls] = 0.0
                self.prediction_buffer[cls].append(predictions[cls])

            # Get timing information
            if timing:
                timing_dict["models"][mdl] = time.time() - model_start

        # Update scores based on thresholds or patience arguments
        if patience != {}:
            if threshold == {}:
                raise ValueError("Error! When using the `patience` argument, threshold "
                                 "values must be provided via the `threshold` argument!")
            for mdl in predictions.keys():
                parent_model = self.get_parent_model_from_label(mdl)
                if parent_model in patience.keys():
                    scores = np.array(self.prediction_buffer[mdl])[-patience[parent_model]:]
                    if (scores >= threshold[parent_model]).sum() < patience[parent_model]:
                        predictions[mdl] = 0.0

        # (optionally) get voice activity detection scores and update model scores
        if self.vad_threshold > 0:
            if timing:
                vad_start = time.time()

            self.vad(x)

            if timing:
                timing_dict["models"]["vad"] = time.time() - vad_start

            # Get frames from last 0.4 to 0.56 seconds (3 frames) before the current
            # frame and get max VAD score
            vad_frames = list(self.vad.prediction_buffer)[-7:-4]
            vad_max_score = np.max(vad_frames) if len(vad_frames) > 0 else 0
            for mdl in predictions.keys():
                if vad_max_score < self.vad_threshold:
                    predictions[mdl] = 0.0

        if timing:
            return predictions, timing_dict
        else:
            return predictions

    def predict_clip(self, clip: Union[str, np.ndarray], padding: int = 1, chunk_size=1280, **kwargs):
        """Predict on a full audio clip, simulating streaming prediction.
        The input clip must be a 16-bit, 16 khz, single-channel WAV file.

        Args:
            clip (Union[str, np.ndarray]): The path to a 16-bit PCM, 16 khz, single-channel WAV file,
                                           or a 1D array containing the same type of data
            padding (int): How many seconds of silence to pad the start/end of the clip with
                            to make sure that short clips can be processed correctly (default: 1)
            chunk_size (int): The size (in samples) of each chunk of audio to pass to the model
            kwargs: Any keyword arguments to pass to the class `predict` method

        Returns:
            list: A list containing the frame-level prediction dictionaries for the audio clip
        """
        if isinstance(clip, str):
            # Load audio clip as 16-bit PCM data
            with wave.open(clip, mode='rb') as f:
                # Load WAV clip frames
                data = np.frombuffer(f.readframes(f.getnframes()), dtype=np.int16)
        elif isinstance(clip, np.ndarray):
            data = clip
        else:
            # Fail clearly instead of raising NameError on the unbound `data` below
            raise ValueError("The `clip` argument must be a file path (str) or a numpy array!")

        if padding:
            data = np.concatenate(
                (
                    np.zeros(16000*padding).astype(np.int16),
                    data,
                    np.zeros(16000*padding).astype(np.int16)
                )
            )

        # Iterate through clip, getting predictions
        predictions = []
        step_size = chunk_size
        for i in range(0, data.shape[0]-step_size, step_size):
            predictions.append(self.predict(data[i:i+step_size], **kwargs))

        return predictions

    def _get_positive_prediction_frames(
            self,
            file: str,
            threshold: float = 0.5,
            return_type: str = "features",
            **kwargs
            ):
        """
        Gets predictions for the input audio data, and returns the audio features (embeddings)
        or audio data for all of the frames with a score above the `threshold` argument.
        Can be a useful way to collect false-positive predictions.

        Args:
            file (str): The path to a 16-bit 16khz WAV audio file to process
            threshold (float): The minimum score required for a frame of audio features
                               to be returned.
            return_type (str): The type of data to return when a positive prediction is
                               detected. Can be either 'features' or 'audio' to return
                               audio embeddings or raw audio data, respectively.
            kwargs: Any keyword arguments to pass to the class `predict` method

        Returns:
            dict: A dictionary with filenames as keys and  N x M arrays as values,
                  where N is the number of examples and M is the number
                  of audio features, depending on the model input shape.
        """
        # Load audio clip as 16-bit PCM data
        with wave.open(file, mode='rb') as f:
            # Load WAV clip frames
            data = np.frombuffer(f.readframes(f.getnframes()), dtype=np.int16)

        # Iterate through clip, getting predictions
        positive_data = defaultdict(list)
        step_size = 1280
        for i in range(0, data.shape[0]-step_size, step_size):
            predictions = self.predict(data[i:i+step_size], **kwargs)
            for lbl in predictions.keys():
                if predictions[lbl] >= threshold:
                    mdl = self.get_parent_model_from_label(lbl)
                    features = self.preprocessor.get_features(self.model_inputs[mdl])
                    if return_type == 'features':
                        positive_data[lbl].append(features)
                    if return_type == 'audio':
                        # Keep 3 s of leading and 1 s of trailing context, but only
                        # when the full 4 s window is available
                        context = data[max(0, i - 16000*3):i + 16000]
                        if len(context) == 16000*4:
                            positive_data[lbl].append(context)

        positive_data_combined = {}
        for lbl in positive_data.keys():
            positive_data_combined[lbl] = np.vstack(positive_data[lbl])

        return positive_data_combined

    def _suppress_noise_with_speex(self, x: np.ndarray, frame_size: int = 160):
        """
        Runs the input audio through the SpeexDSP noise suppression algorithm.
        Note that this function updates the state of the existing Speex noise
        suppression object, and isn't intended to be called externally.

        Args:
            x (ndarray): The 16-bit, 16khz audio to process. Must always be an
                         integer multiple of `frame_size`.
            frame_size (int): The frame size to use for the Speex Noise suppressor.
                              Must match the frame size specified during the
                              initialization of the noise suppressor.

        Returns:
            ndarray: The input audio with noise suppression applied
        """
        cleaned = []
        for i in range(0, x.shape[0], frame_size):
            chunk = x[i:i+frame_size]
            cleaned.append(self.speex_ns.process(chunk.tobytes()))

        cleaned_bytestring = b''.join(cleaned)
        cleaned_array = np.frombuffer(cleaned_bytestring, np.int16)

        return cleaned_array