Add README

This commit is contained in:
2026-01-09 10:28:44 +11:00
commit edaf914b73
13417 changed files with 2952119 additions and 0 deletions

View File

@@ -0,0 +1,39 @@
import os
from openwakeword.model import Model
from openwakeword.vad import VAD
from openwakeword.custom_verifier_model import train_custom_verifier
__all__ = ['Model', 'VAD', 'train_custom_verifier']
# Base directory holding the pre-trained model files shipped with the package.
# Computed once instead of repeating dirname(abspath(__file__)) for every entry,
# and built with os.path.join throughout for cross-platform path separators.
_models_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources", "models")

# Pre-trained model names mapped to the on-disk paths of their ONNX files
models = {
    "alexa": {
        "model_path": os.path.join(_models_dir, "alexa_v0.1.onnx")
    },
    "hey_mycroft": {
        "model_path": os.path.join(_models_dir, "hey_mycroft_v0.1.onnx")
    },
    "hey_jarvis": {
        "model_path": os.path.join(_models_dir, "hey_jarvis_v0.1.onnx")
    },
    "timer": {
        "model_path": os.path.join(_models_dir, "timer_v0.1.onnx")
    },
    "weather": {
        "model_path": os.path.join(_models_dir, "weather_v0.1.onnx")
    }
}

# For multi-class models, maps the model's output class indices (as strings)
# to human-readable class names
model_class_mappings = {
    "timer": {
        "1": "1_minute_timer",
        "2": "5_minute_timer",
        "3": "10_minute_timer",
        "4": "20_minute_timer",
        "5": "30_minute_timer",
        "6": "1_hour_timer"
    }
}
def get_pretrained_model_paths():
    """Return the on-disk file paths of all bundled pre-trained wake word models."""
    return [entry["model_path"] for entry in models.values()]

View File

@@ -0,0 +1,174 @@
# Copyright 2022 David Scripka. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Imports
import os
from tqdm import tqdm
import collections
import openwakeword
import numpy as np
import scipy
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
# Define functions to prepare data for speaker dependent verifier model
def get_reference_clip_features(
    reference_clip: str,
    oww_model: openwakeword.Model,
    model_name: str,
    threshold: float = 0.5,
    N: int = 3,
    **kwargs
):
    """
    Processes input audio (a 16-bit, 16-khz single-channel WAV file path, or an array of
    raw samples) and gets the openWakeWord audio features that produce a prediction from
    the specified model greater than the threshold value.

    Args:
        reference_clip (str): The target audio file to get features from (a raw sample
                              array is also accepted)
        oww_model (openwakeword.Model): The openWakeWord model object used to get predictions
        model_name (str): The name of the model to get predictions from (should correspond to
                          a python dictionary key in the oww_model.models attribute)
        threshold (float): The minimum score from the model required to capture the associated features
        N (int): How many times to run feature extraction for a given clip, adding some slight variation
                 in the starting position each time to ensure that the features are not identical
        kwargs: Extra keyword arguments forwarded to `oww_model.predict`

    Returns:
        ndarray: A numpy array of shape N x M x L, where N is the number of examples, M is the number
                 of frames in the window, and L is the audio feature/embedding dimension.
    """
    # Explicit submodule import: the module-level `import scipy` does not reliably
    # expose `scipy.io.wavfile` on older SciPy versions.
    import scipy.io.wavfile

    # Collect the feature windows whose prediction scores cleared the threshold
    positive_data = collections.defaultdict(list)

    for _ in range(N):
        # Load the clip from disk, or use the provided raw samples directly
        # (isinstance instead of the previous `type(x) == str` comparison)
        if isinstance(reference_clip, str):
            sr, dat = scipy.io.wavfile.read(reference_clip)
        else:
            dat = reference_clip

        # Random starting offset (< one 1280-sample frame) so repeated passes
        # produce slightly different features
        if N != 1:
            dat = dat[np.random.randint(0, 1280):]

        # Stream the clip through the model in 1280-sample (80 ms) steps
        step_size = 1280
        for i in range(0, dat.shape[0]-step_size, step_size):
            predictions = oww_model.predict(dat[i:i+step_size], **kwargs)
            if predictions[model_name] >= threshold:
                features = oww_model.preprocessor.get_features(  # type: ignore[has-type]
                    oww_model.model_inputs[model_name]  # type: ignore[has-type]
                )
                positive_data[model_name].append(features)

    # If nothing cleared the threshold, return an empty (0, M, 96) array so
    # callers can still np.vstack the results
    if len(positive_data[model_name]) == 0:
        positive_data[model_name].append(
            np.empty((0, oww_model.model_inputs[model_name], 96)))  # type: ignore[has-type]

    return np.vstack(positive_data[model_name])
def flatten_features(x):
    """Flatten each array in `x` to 1D (used as a scikit-learn FunctionTransformer step)."""
    flattened = []
    for arr in x:
        flattened.append(arr.flatten())
    return flattened
def train_verifier_model(features: np.ndarray, labels: np.ndarray):
    """
    Train a logistic regression binary classifier model on the provided features and labels

    Args:
        features (ndarray): A N x M numpy array, where N is the number of examples and M
                            is the number of features
        labels (ndarray): A 1D numpy array where each value corresponds to the label of the Nth
                          example in the `features` argument

    Returns:
        The trained scikit-learn pipeline (flatten -> scale -> logistic regression)
    """
    # The C value matters a lot here depending on dataset size
    # (larger datasets seem to work better with larger C)
    classifier = LogisticRegression(C=0.001, max_iter=2000, random_state=0)
    verifier_pipeline = make_pipeline(
        FunctionTransformer(flatten_features),
        StandardScaler(),
        classifier,
    )
    verifier_pipeline.fit(features, labels)
    return verifier_pipeline
def train_custom_verifier(
    positive_reference_clips: str,
    negative_reference_clips: str,
    output_path: str,
    model_name: str,
    **kwargs
):
    """
    Trains a voice-specific custom verifier model on examples of wake word/phrase speech and other speech
    from a single user.

    Args:
        positive_reference_clips (str): The path to a directory containing single-channel 16khz, 16-bit WAV files
                                        of the target wake word/phrase.
        negative_reference_clips (str): The path to a directory containing single-channel 16khz, 16-bit WAV files
                                        of miscellaneous speech not containing the target wake word/phrase.
        output_path (str): The location to save the trained verifier model (as a pickled scikit-learn model)
        model_name (str): The name or path of the trained openWakeWord model that the verifier model will be
                          based on. If only a name, it must be one of the pre-trained models included in the
                          openWakeWord release.
        kwargs: Any other keyword arguments to pass to the openWakeWord model initialization

    Returns:
        None

    Raises:
        ValueError: If no features could be extracted from the positive reference clips
    """
    # Load target openWakeWord model
    if os.path.exists(model_name):
        oww = openwakeword.Model(
            wakeword_model_paths=[model_name],
            **kwargs
        )
        # Reduce a model file path to its base name without the extension
        # (splitext instead of the previous hard-coded [0:-5] strip, which
        # only worked for 5-character extensions like ".onnx")
        model_name = os.path.splitext(os.path.basename(model_name))[0]
    else:
        oww = openwakeword.Model(**kwargs)

    # Get features from positive reference clips
    positive_features = np.vstack(
        [get_reference_clip_features(i, oww, model_name, N=5)
         for i in tqdm(positive_reference_clips, desc="Processing positive reference clips")]
    )
    if positive_features.shape[0] == 0:
        # Previous message ("The positive features were created!") said the opposite of the problem
        raise ValueError("No positive features were created! Make sure that"
                         " the positive reference clips contain the appropriate audio"
                         " for the desired model")

    # Get features from negative reference clips (threshold=0.0 keeps every frame)
    negative_features = np.vstack(
        [get_reference_clip_features(i, oww, model_name, threshold=0.0, N=1)
         for i in tqdm(negative_reference_clips, desc="Processing negative reference clips")]
    )

    # Train logistic regression model on reference clip features
    print("Training and saving verifier model...")
    lr_model = train_verifier_model(
        np.vstack((positive_features, negative_features)),
        np.array([1]*positive_features.shape[0] + [0]*negative_features.shape[0])
    )

    # Save logistic regression model to the specified output location
    # (context manager ensures the file handle is closed)
    with open(output_path, "wb") as f:
        pickle.dump(lr_model, f)
    print("Done!")

View File

@@ -0,0 +1,712 @@
# Copyright 2022 David Scripka. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# imports
from multiprocessing.pool import ThreadPool
import os
from functools import partial
from pathlib import Path
import random
from tqdm import tqdm
from typing import List, Tuple
import numpy as np
import torch
from numpy.lib.format import open_memmap
from speechbrain.dataio.dataio import read_audio
from speechbrain.processing.signal_processing import reverberate
import torchaudio
import mutagen
import acoustics
# Load audio clips and structure into clips of the same length
def stack_clips(audio_data, clip_size=16000*2):
    """
    Takes an input list of 1D arrays (of different lengths), concatenates them together,
    and then extracts clips of a uniform size by dividing the combined array.

    Args:
        audio_data (List[ndarray]): A list of 1D numpy arrays to combine and stack
        clip_size (int): The desired total length of the uniform clip size (in samples)

    Returns:
        ndarray: A N by `clip_size` array of the input audio. The final chunk is
                 zero-padded to `clip_size`. (Note: no dtype conversion is performed;
                 the previous docstring's claim of 16-bit PCM output was incorrect.)
    """
    # Combine all clips into a single 1D array
    combined_data = np.hstack(audio_data)

    # Slice the combined audio into uniform chunks, zero-padding the last partial chunk
    new_examples = []
    for i in range(0, combined_data.shape[0], clip_size):
        chunk = combined_data[i:i+clip_size]
        if chunk.shape[0] != clip_size:
            chunk = np.hstack((chunk, np.zeros(clip_size - chunk.shape[0])))
        new_examples.append(chunk)

    return np.array(new_examples)
def load_audio_clips(files, clip_size=32000):
    """
    Takes the specified audio files and shapes them into an array of N by `clip_size`,
    where N is determined by the length of the audio files and `clip_size` at run time.
    Clips longer than `clip_size` are truncated and extended into the N+1 row.
    Clips shorter than `clip_size` are combined with the previous or next clip
    (except for the last clip in `files`, which is ignored if it is too short.)

    Args:
        files (List[str]): A list of filepaths
        clip_size (int): The number of samples (of 16khz audio) for all of the rows in the array

    Returns:
        ndarray: A N by `clip_size` array with the audio data, converted to 16-bit PCM
                 (assumes the loaded samples are floats in [-1, 1])
    """
    # Load audio files, skipping any that cannot be read
    audio_data = []
    for i in files:
        try:
            audio_data.append(read_audio(i))
        except ValueError:
            continue

    # Pre-allocate the output array; any trailing audio shorter than clip_size is dropped
    N = sum([i.shape[0] for i in audio_data])//clip_size
    X = np.empty((N, clip_size))

    # Fill rows, carrying any remainder of a clip over into the next row
    previous_row_remainder = None
    cnt = 0
    for row in audio_data:
        # Only prepend the remainder when one exists: np.hstack((None, row)) would
        # produce an object array containing None and break the float assignment below
        if previous_row_remainder is not None:
            row = np.hstack((previous_row_remainder, row))
        while row.shape[0] >= clip_size:
            X[cnt, :] = row[0:clip_size]
            row = row[clip_size:]
            cnt += 1
        previous_row_remainder = row if row.size > 0 else None

    # Convert to 16-bit PCM data
    X = (X*32767).astype(np.int16)
    return X
# Dato I/O utils
# Convert clips with sox
def _convert_clip(input_file, output_file, backend="ffmpeg"):
if backend == "sox":
cmd = f"sox \"{input_file}\" -G -r 16000 -c 1 \"{output_file}\""
elif backend == "ffmpeg":
cmd = f"ffmpeg -y -i \"{input_file}\" -ar 16000 \"{output_file}\""
os.system(cmd)
return None
def convert_clips(input_files, output_files, sr=16000, ncpu=1, backend="ffmpeg"):
    """
    Converts files in parallel with multithreading using Sox or ffmpeg.
    Intended to only convert input audio files into single-channel, 16 khz clips.

    Args:
        input_files (List[str]): A list of paths to input files
        output_files (List[str]): A list of paths to output files, corresponding 1:1 to the input files
        sr (int): The output sample rate of the converted clip
                  (NOTE: currently unused — the backend commands hard-code 16 khz;
                  kept for backward compatibility)
        ncpu (int): The number of CPUs to use for the conversion
        backend (str): The utility to use for conversion, "sox" or "ffmpeg"

    Returns:
        None
    """
    # Fix the backend argument for every conversion job
    f = partial(_convert_clip, backend=backend)

    # Use a context manager so the pool's worker threads are always released
    # (the previous implementation never closed the pool)
    with ThreadPool(processes=ncpu) as pool:
        pool.starmap(f, zip(input_files, output_files))
def filter_audio_paths(target_dirs, min_length_secs, max_length_secs, duration_method="size", glob_filter=None):
    """
    Gets the paths of wav files in flat target directories, automatically filtering
    out files below/above the specified length (in seconds). Assumes that all
    wav files are sampled at 16khz, are single channel, and have 16-bit PCM data.

    Uses `os.scandir` in Python for highly efficient file system exploration,
    and doesn't require loading the files into memory for length estimation.

    Args:
        target_dirs (List[str]): The target directories containing the audio files
        min_length_secs (float): The minimum length in seconds (otherwise the clip is skipped)
        max_length_secs (float): The maximum length in seconds (otherwise the clip is skipped)
        duration_method (str): Whether to use the file size ('size'), or header information ('header')
                               to estimate the duration of the audio file. 'size' is generally
                               much faster, but assumes that all files in the target directory
                               are the same type, sample rate, and bitrate. If None, durations are
                               not calculated (and no length filtering is performed).
        glob_filter (str): A pathlib glob filter string to select specific files within the target directory

    Returns:
        tuple: A list of strings corresponding to the paths of the wav files that met the length criteria,
               and a list of their durations (in seconds)
    """
    file_paths = []
    durations = []
    for target_dir in target_dirs:
        # Per-directory path/size lists: durations are estimated per directory so that
        # the 'size' method's single-file calibration applies to files of the same type
        sizes = []
        dir_paths = []
        if glob_filter:
            dir_paths = [str(i) for i in Path(target_dir).glob(glob_filter)]
            file_paths.extend(dir_paths)
            sizes.extend([os.path.getsize(i) for i in dir_paths])
        else:
            # os.scandir yields entries with cached stat info (no per-file open needed)
            for i in tqdm(os.scandir(target_dir)):
                dir_paths.append(i.path)
                file_paths.append(i.path)
                sizes.append(i.stat().st_size)
        if duration_method == "size":
            # Estimate durations from file sizes (calibrated on the first file of the directory)
            durations.extend(estimate_clip_duration(dir_paths, sizes))
        elif duration_method == "header":
            # Read each file's header for the exact frame count (slower but precise)
            durations.extend([get_clip_duration(i) for i in tqdm(dir_paths)])
    if durations != []:
        # Keep only files whose duration falls within [min_length_secs, max_length_secs]
        filtered = [(i, j) for i, j in zip(file_paths, durations) if j >= min_length_secs and j <= max_length_secs]
        return [i[0] for i in filtered], [i[1] for i in filtered]
    else:
        return file_paths, []
def estimate_clip_duration(audio_files: list, sizes: list):
    """Estimates the duration of each audio file in a list.

    Assumes that all of the audio files have the same audio format,
    bit depth, and sample rate.

    Args:
        audio_files (list): A list of audio file paths
        sizes (list): The size of each audio file in bytes

    Returns:
        list: A list of durations (in seconds) for the audio files
    """
    # Read metadata from the first file and compute a correction factor for
    # non-audio bytes (headers/tags); all remaining durations are then estimated
    # from file size alone. (A dead `torchaudio.info` call whose result was
    # immediately overwritten has been removed here.)
    details = mutagen.File(audio_files[0])
    correction = 8*os.path.getsize(audio_files[0]) - details.info.bitrate*details.info.length

    # duration ~= (audio bits) / bitrate
    durations = []
    for size in sizes:
        durations.append((size*8-correction)/details.info.bitrate)
    return durations
def estimate_mp3_duration(fpath):
    """Estimates the duration of an MP3 file from metadata and file-size.

    Is only accurate for 16000 khz sample rate audio with a relatively
    constant bit-rate.

    Args:
        fpath (str): The input path to the MP3 file

    Returns:
        float: The duration of the MP3 file in seconds (0 if the metadata cannot
               be read, or if the file is not 16 khz mono/stereo)
    """
    # Empirical seconds-per-byte factors for 16 khz MP3 audio
    conversion_factors = {
        "16_khz_single_channel": 0.000333318208471784,
        "16_khz_stereo": 0.000333318208471784/2
    }

    try:
        md = torchaudio.info(fpath)
    except RuntimeError:
        # Unreadable metadata: report zero duration
        return 0

    duration_seconds = 0
    nbytes = os.path.getsize(fpath)
    if md.sample_rate == 16000:
        if md.num_channels == 1:
            duration_seconds = nbytes*conversion_factors["16_khz_single_channel"]
        elif md.num_channels == 2:
            duration_seconds = nbytes*conversion_factors["16_khz_stereo"]
    return duration_seconds
def get_clip_duration(clip):
    """Gets the duration of an audio clip in seconds from file header information"""
    try:
        metadata = torchaudio.info(clip)
    except RuntimeError:
        # Metadata could not be read; treat the clip as zero-length
        return 0
    return metadata.num_frames/metadata.sample_rate
def get_wav_duration_from_filesize(size, nbytes=2):
    """
    Calculates the duration (in seconds) of a WAV file, assuming it contains 16 khz single-channel audio.
    The bit depth is user specified, and defaults to 2 for 16-bit PCM audio.

    Args:
        size (int): The file size in bytes
        nbytes (int): How many bytes for each data point in the audio (e.g., 16-bit is 2, 32-bit is 4, etc.)

    Returns:
        float: The duration of the audio file in seconds
    """
    header_bytes = 44   # standard RIFF/WAV header size
    sample_rate = 16000
    data_bytes = size - header_bytes
    return data_bytes / nbytes / sample_rate
# Data augmentation utility function
def mix_clips_batch(
    foreground_clips: List[str],
    background_clips: List[str],
    combined_size: int,
    labels: List[int] = [],
    batch_size: int = 32,
    snr_low: float = 0,
    snr_high: float = 0,
    start_index: List[int] = [],
    foreground_durations: List[float] = [],
    foreground_truncate_strategy: str = "random",
    rirs: List[str] = [],
    rir_probability: int = 1,
    volume_augmentation: bool = True,
    generated_noise_augmentation: float = 0.0,
    shuffle: bool = True,
    return_sequence_labels: bool = False,
    return_background_clips: bool = False,
    return_background_clips_delay: Tuple[int, int] = (0, 0),
    seed: int = 0
):
    """
    Mixes foreground and background clips at a random SNR level in batches.

    References: https://pytorch.org/audio/main/tutorials/audio_data_augmentation_tutorial.html and
    https://speechbrain.readthedocs.io/en/latest/API/speechbrain.processing.speech_augmentation.html#speechbrain.processing.speech_augmentation.AddNoise

    Args:
        foreground_clips (List[str]): A list of paths to the foreground clips
        background_clips (List[str]): A list of paths to the background clips (randomly selected for each
                                      foreground clip)
        combined_size (int): The total length (in samples) of the combined clip. If needed, the background
                             clips are duplicated or truncated to reach this length.
        labels (List[int]): A list of integer labels corresponding 1:1 for the foreground clips. Will be updated
                            as needed with foreground clips to ensure that mixed clips retain the proper labels.
        batch_size (int): The batch size
        snr_low (float): The low SNR level of the mixing in db
        snr_high (float): The high snr level of the mixing in db
        start_index (List[int]): The starting position (in samples) for the foreground clip to start in
                                 the background clip. If the foreground clip is longer than `combined_size`
                                 when starting at this point, the foreground clip will be truncated
                                 according to the `foreground_truncate_strategy` argument.
        foreground_durations (List[float]): The desired duration of each foreground clip (in seconds)
        foreground_truncate_strategy (str): The method used to truncate the foreground clip, if needed based on the
                                            `start_index`, `foreground_durations`, and `combined_size` arguments.
                                            See the options in the `truncate_clip` method.
        rirs (List[str]): A list of paths to room impulse response functions (RIR) to convolve with the
                          clips to simulate different recording environments. Applies a single random selection
                          from the list RIR file to the entire batch. If empty (the default), nothing is done.
        rir_probability (float): The probability (between 0 and 1) that the batch will be convolved with a RIR file.
        volume_augmentation (bool): Whether to randomly apply volume augmentation to the clips in the batch.
                                    This simply scales the data of each clip such that the maximum value is between
                                    0.02 and 1.0 (the floor shouldn't be zero as beyond a certain point the audio
                                    data is no longer valid).
        generated_noise_augmentation: The probability of further mixing the mixed clip with generated random noise.
                                      Will be either "white", "brown", "blue", "pink", or "violet" noise, mixed at a
                                      random SNR between `snr_low` and `snr_high`.
        return_background_clips (bool): Whether to return the segment of the background clip that was mixed with each
                                        foreground clip in the batch.
        return_background_clips_delay (Tuple(int)): The lower and upper bound of a random delay (in samples)
                                                    to apply to the segment of each returned background clip mixed
                                                    with each foreground clip in the batch. This is primarily
                                                    intended to simulate the drift between input and output channels
                                                    in audio devices, which means that the mixed audio is never
                                                    exactly aligned with the two source clips.
        shuffle (bool): Whether to shuffle the foreground clips before mixing (default: True)
        return_sequence_labels (bool): Whether to return sequence labels (i.e., frame-level labels) for each clip
                                       based on the start/end positions of the foreground clip.
        seed (int): A random seed

    Returns:
        generator: Returns a generator that yields batches of mixed foreground/background audio, labels, and the
                   background segments used for each audio clip (or None if the
                   `return_background_clips` argument is False)
    """
    # Set random seed, if needed (both numpy and stdlib `random` are used below)
    if seed:
        np.random.seed(seed)
        random.seed(seed)

    # Check and set start indices, if needed
    # NOTE(review): the default fills only `batch_size` entries, so with more than one
    # batch of foreground clips the later slices of `start_index` are empty — confirm
    # callers always pass a full-length `start_index` for multi-batch use.
    if not start_index:
        start_index = [0]*batch_size
    else:
        if min(start_index) < 0:
            raise ValueError("Error! At least one value of the `start_index` argument is <0. Check your inputs.")

    # Make dummy labels when none are provided
    if not labels:
        labels = [0]*len(foreground_clips)

    # Shuffle foreground clips together with their aligned metadata
    if shuffle:
        p = np.random.permutation(len(foreground_clips))
        foreground_clips = np.array(foreground_clips)[p].tolist()
        start_index = np.array(start_index)[p].tolist()
        labels = np.array(labels)[p].tolist()
        if foreground_durations:
            foreground_durations = np.array(foreground_durations)[p].tolist()

    for i in range(0, len(foreground_clips), batch_size):
        # Load foreground clips/start indices and truncate as needed
        sr = 16000  # assumed sample rate of all clips
        start_index_batch = start_index[i:i+batch_size]
        foreground_clips_batch = [read_audio(j) for j in foreground_clips[i:i+batch_size]]
        # Keep only the first channel of any multi-channel clip
        foreground_clips_batch = [j[0] if len(j.shape) > 1 else j for j in foreground_clips_batch]
        if foreground_durations:
            foreground_clips_batch = [truncate_clip(j, int(k*sr), foreground_truncate_strategy)
                                      for j, k in zip(foreground_clips_batch, foreground_durations[i:i+batch_size])]
        labels_batch = np.array(labels[i:i+batch_size])

        # Load random background clips and pad/truncate each to `combined_size`
        background_clips_batch = [read_audio(j) for j in random.sample(background_clips, batch_size)]
        background_clips_batch = [j[0] if len(j.shape) > 1 else j for j in background_clips_batch]
        background_clips_batch_delayed = []
        # Single random drift (in samples) applied to the whole batch's returned background segments
        delay = np.random.randint(return_background_clips_delay[0], return_background_clips_delay[1] + 1)
        for ndx, background_clip in enumerate(background_clips_batch):
            if background_clip.shape[0] < (combined_size + delay):
                # Tile short clips until they are long enough
                repeated = background_clip.repeat(
                    np.ceil((combined_size + delay)/background_clip.shape[0]).astype(np.int32)
                )
                background_clips_batch[ndx] = repeated[0:combined_size]
                background_clips_batch_delayed.append(repeated[0+delay:combined_size + delay].clone())
            elif background_clip.shape[0] > (combined_size + delay):
                # Take a random window from long clips
                r = np.random.randint(0, max(1, background_clip.shape[0] - combined_size - delay))
                background_clips_batch[ndx] = background_clip[r:r + combined_size]
                background_clips_batch_delayed.append(background_clip[r+delay:r + combined_size + delay].clone())
            # NOTE(review): a clip of exactly combined_size + delay samples matches neither
            # branch, so no delayed segment is appended for it — confirm this case cannot
            # occur (it would misalign `background_clips_batch_delayed`).

        # Mix clips at snr levels
        snrs_db = np.random.uniform(snr_low, snr_high, batch_size)
        mixed_clips = []
        sequence_labels = []
        for fg, bg, snr, start in zip(foreground_clips_batch, background_clips_batch,
                                      snrs_db, start_index_batch):
            if bg.shape[0] != combined_size:
                raise ValueError(bg.shape)
            # mix_clip mutates `bg` in place and returns the combined clip
            mixed_clip = mix_clip(fg, bg, snr, start)
            sequence_labels.append(get_frame_labels(combined_size, start, start+fg.shape[0]))
            # Optionally mix in generated colored noise at a random SNR from this batch
            if np.random.random() < generated_noise_augmentation:
                noise_color = ["white", "pink", "blue", "brown", "violet"]
                noise_clip = acoustics.generator.noise(combined_size, color=np.random.choice(noise_color))
                noise_clip = torch.from_numpy(noise_clip/noise_clip.max())
                mixed_clip = mix_clip(mixed_clip, noise_clip, np.random.choice(snrs_db), 0)
            mixed_clips.append(mixed_clip)

        mixed_clips_batch = torch.vstack(mixed_clips)
        sequence_labels_batch = torch.from_numpy(np.vstack(sequence_labels))

        # Apply reverberation to the batch (from a single RIR file)
        if rirs:
            if np.random.random() <= rir_probability:
                rir_waveform, sr = torchaudio.load(random.choice(rirs))
                if rir_waveform.shape[0] > 1:
                    # Pick one channel at random from a multi-channel RIR
                    rir_waveform = rir_waveform[random.randint(0, rir_waveform.shape[0]-1), :]
                mixed_clips_batch = reverberate(mixed_clips_batch, rir_waveform, rescale_amp="avg")

        # Apply volume augmentation (random per-clip peak between 0.02 and 1.0)
        if volume_augmentation:
            volume_levels = np.random.uniform(0.02, 1.0, mixed_clips_batch.shape[0])
            mixed_clips_batch = (volume_levels/mixed_clips_batch.max(axis=1)[0])[..., None]*mixed_clips_batch
        else:
            # Normalize clips only if max value is outside of [-1, 1]
            abs_max, _ = torch.max(
                torch.abs(mixed_clips_batch), dim=1, keepdim=True
            )
            mixed_clips_batch = mixed_clips_batch / abs_max.clamp(min=1.0)

        # Convert to 16-bit PCM audio
        mixed_clips_batch = (mixed_clips_batch.numpy()*32767).astype(np.int16)

        # Remove any clips that are silent (happens rarely when mixing/reverberating)
        error_index = np.where(mixed_clips_batch.max(axis=1) != 0)[0]
        mixed_clips_batch = mixed_clips_batch[error_index]
        labels_batch = labels_batch[error_index]
        sequence_labels_batch = sequence_labels_batch[error_index]

        if not return_background_clips:
            yield mixed_clips_batch, labels_batch if not return_sequence_labels else sequence_labels_batch, None
        else:
            background_clips_batch_delayed = (torch.vstack(background_clips_batch_delayed).numpy()
                                              * 32767).astype(np.int16)[error_index]
            yield (mixed_clips_batch,
                   labels_batch if not return_sequence_labels else sequence_labels_batch,
                   background_clips_batch_delayed)
def get_frame_labels(combined_size, start, end, buffer=1):
    """Build a per-frame binary label vector marking the start and end of a foreground clip.

    Frames are 1280 samples wide, with frame positions beginning at sample 12400.
    Two frames at the nearest start position and two frames around the nearest end
    position are set to 1.

    Args:
        combined_size (int): Total length of the mixed clip (in samples)
        start (int): Sample index where the foreground clip begins
        end (int): Sample index where the foreground clip ends
        buffer (int): Unused (kept for backward compatibility)

    Returns:
        ndarray: A 1D array of per-frame labels (0.0 or 1.0)
    """
    n_frames = np.ceil((combined_size-12400)/1280).astype(int)
    labels = np.zeros(n_frames)
    frame_positions = np.arange(12400, combined_size, 1280)
    nearest_start = np.argmin(abs(frame_positions - start))
    nearest_end = np.argmin(abs(frame_positions - end))
    labels[nearest_start:nearest_start+2] = 1
    labels[nearest_end-1:nearest_end+1] = 1
    return labels
def mix_clip(fg, bg, snr, start):
    """Mix a foreground clip into a background clip at the given SNR (in dB).

    The foreground is scaled relative to the background's L2 norm, added in place
    beginning at sample `start`, and the combined signal is halved to reduce clipping.

    Note: `bg` is modified in place.
    """
    snr_linear = 10 ** (snr / 20)
    fg_energy = fg.norm(p=2)
    bg_energy = bg.norm(p=2)
    gain = snr_linear * bg_energy / fg_energy
    segment = slice(start, start + fg.shape[0])
    bg[segment] = bg[segment] + gain * fg
    return bg / 2
def truncate_clip(x, max_size, method="truncate_start"):
    """
    Truncates an audio clip with the specified method

    Args:
        x (nd.array): An array of audio data
        max_size (int): The maximum size (in samples)
        method (str): Can be one of four options:
            - "truncate_start": Truncate the start of the clip
            - "truncate_end": Truncate the end of the clip
            - "truncate_both": Truncate both the start and end of the clip
            - "random": Randomly select a segment of the right size from the clip

    Returns:
        nd.array: The truncated audio data (returned unchanged if already <= max_size)
    """
    if x.shape[0] > max_size:
        if method == "truncate_start":
            x = x[x.shape[0] - max_size:]
        elif method == "truncate_end":
            x = x[0:max_size]
        elif method == "truncate_both":
            # Drop half the excess from the start and the remainder from the end.
            # (The previous implementation computed int(np.ceil(n)/2) and sliced
            # x[n:-n], which returned an EMPTY array whenever the excess was 1.)
            n = (x.shape[0] - max_size) // 2
            x = x[n:n + max_size]
        elif method == "random":
            rn = np.random.randint(0, x.shape[0] - max_size)
            x = x[rn:rn + max_size]
    return x
# Reverberation data augmentation function
def apply_reverb(x, rir_files):
    """
    Applies reverberation to the input audio clips

    Args:
        x (nd.array): A numpy array of shape (batch, audio_samples) containing the audio clips
        rir_files (Union[str, list]): Either a path to an RIR (room impulse response) file or a list
                                      of RIR files. If a list, one file will be randomly chosen
                                      to apply to `x`

    Returns:
        nd.array: The reverberated audio clips

    Raises:
        ValueError: If `rir_files` is neither a string nor a list
    """
    # Select and load the RIR file
    if isinstance(rir_files, str):
        # Previously this loaded `rir_files[0]` — the first *character* of the path
        rir_waveform, sr = torchaudio.load(rir_files)
    elif isinstance(rir_files, list):
        rir_waveform, sr = torchaudio.load(random.choice(rir_files))
    else:
        # Previously an unsupported type fell through to a NameError on `rir_waveform`
        raise ValueError("`rir_files` must be a file path (str) or a list of file paths")

    # If the RIR is multi-channel, pick one channel at random
    if rir_waveform.shape[0] > 1:
        rir_waveform = rir_waveform[random.randint(0, rir_waveform.shape[0]-1), :]

    # Apply reverberation to the batch (from the single selected RIR file)
    reverbed = reverberate(torch.from_numpy(x), rir_waveform, rescale_amp="avg")
    return reverbed.numpy()
# Load batches of data from mmaped numpy arrays
class mmap_batch_generator:
"""
A generator class designed to dynamically build batches from mmaped numpy arrays.
The generator will return tuples of (data, labels) with a batch size determined
by the `n_per_class` initialization argument. When a mmaped numpy array has been
fully interated over, it will restart at the zeroth index automatically.
"""
def __init__(self,
data_files: dict,
label_files: dict = {},
batch_size: int = 128,
n_per_class: dict = {},
data_transform_funcs: dict = {},
label_transform_funcs: dict = {}
):
"""
Initialize the generator object
Args:
data_files (dict): A dictionary of labels (as keys) and on-disk numpy array paths (as values).
Keys should be integer strings representing class labels.
label_files (dict): A dictionary where the keys are the class labels and the values are the per-example
labels. The values must be the same shape as the correponding numpy data arrays
from the `data_files` argument.
batch_size (int): The number of samples per batch
n_per_class (dict): A dictionary with integer string labels (as keys) and number of example per batch
(as values). If None (the default), batch sizes for each class will be
automatically calculated based on the the input dataframe shapes and transformation
functions.
data_transform_funcs (dict): A dictionary of transformation functions to apply to each batch of per class
data loaded from the mmaped array. For example, with an array of shape
(batch, timesteps, features), if the goal is to half the timesteps per example,
(effectively doubling the size of the batch) this function could be passed:
lambda x: np.vstack(
(x[:, 0:timesteps//2, :], x[:, timesteps//2:, :]
))
The user should incorporate the effect of any transform on the values of the
`n_per_class` argument accordingly, in order to end of with the desired
total batch size for each iteration of the generator.
label_transform_funcs (dict): A dictionary of transformation functions to apply to each batch of labels.
For example, strings can be mapped to integers or one-hot encoded,
groups of classes can be merged together into one, etc.
"""
# inputs
self.data_files = data_files
self.label_files = label_files
self.n_per_class = n_per_class
self.data_transform_funcs = data_transform_funcs
self.label_transform_funcs = label_transform_funcs
# Get array mmaps and store their shapes (but load files < 1 GB total size into memory)
self.data = {label: np.load(fl, mmap_mode='r') for label, fl in data_files.items()}
self.labels = {label: np.load(fl) for label, fl in label_files.items()}
self.data_counter = {label: 0 for label in data_files.keys()}
self.original_shapes = {label: self.data[label].shape for label in self.data.keys()}
self.shapes = {label: self.data[label].shape for label in self.data.keys()}
# # Update effective shape of mmap array based on user-provided transforms (currently broken)
# for lbl, f in self.data_transform_funcs.items():
# dummy_data = np.random.random((1, self.original_shapes[lbl][1], self.original_shapes[lbl][2]))
# new_shape = f(dummy_data).shape
# self.shapes[lbl] = (new_shape[0]*self.original_shapes[lbl][0], new_shape[1], new_shape[2])
# Calculate batch sizes, if the user didn't specify them
scale_factor = 1
if not self.n_per_class:
self.n_per_class = {}
for lbl, shape in self.shapes.items():
dummy_data = np.random.random((10, self.shapes[lbl][1], self.shapes[lbl][2]))
if self.data_transform_funcs.get(lbl, None):
scale_factor = self.data_transform_funcs.get(lbl, None)(dummy_data).shape[0]/10
ratio = self.shapes[lbl][0]/sum([i[0] for i in self.shapes.values()])
self.n_per_class[lbl] = max(1, int(int(batch_size*ratio)/scale_factor))
# Get estimated batches per epoch, including the effect of any user-provided transforms
batch_size = sum([val*scale_factor for val in self.n_per_class.values()])
batches_per_epoch = sum([i[0] for i in self.shapes.values()])//batch_size
self.batch_per_epoch = batches_per_epoch
print("Batches/steps per epoch:", batches_per_epoch)
def __iter__(self):
return self
def __next__(self):
    """Assemble and return the next training batch.

    Draws ``self.n_per_class[label]`` consecutive rows from each label's
    (possibly memory-mapped) data array, applies any user-provided data and
    label transforms, and returns the stacked batch. Each per-label cursor
    wraps back to zero when its array is exhausted, so iteration never stops.

    Returns:
        tuple: ``(X, y)`` where ``X`` is an ndarray of vertically stacked
        examples from all labels and ``y`` is an ndarray of the matching labels.
    """
    X, y = [], []
    for label, n in self.n_per_class.items():
        # Restart at the zeroth index if an array reaches the end
        if self.data_counter[label] >= self.shapes[label][0]:
            self.data_counter[label] = 0

        # Get data from the mmaped file, remembering the slice start so that
        # the labels below are taken from the *same* rows. (Advancing the
        # counter before slicing the labels would misalign data and labels
        # by one batch.)
        start = self.data_counter[label]
        x = self.data[label][start:start + n]
        n_rows = x.shape[0]  # may be < n at the end of the array
        self.data_counter[label] += n_rows

        # Transform data
        if self.data_transform_funcs and self.data_transform_funcs.get(label):
            x = self.data_transform_funcs[label](x)

        # Make labels for data (following whatever the current shape of `x` is)
        if self.label_files.get(label, None):
            # Labels aligned with the raw (pre-transform) rows just read
            y_batch = self.labels[label][start:start + n_rows]
        else:
            y_batch = [label] * x.shape[0]

        # Transform labels
        if self.label_transform_funcs and self.label_transform_funcs.get(label):
            y_batch = self.label_transform_funcs[label](y_batch)

        # Add data to batch
        X.append(x)
        y.extend(y_batch)

    return np.vstack(X), np.array(y)
# Function to remove empty rows from the end of a mmap array
def trim_mmap(mmap_path):
    """
    Trims blank rows from the end of a mmaped numpy array by creating a new mmap
    array without the blank rows.
    Note that a copy is created and disk usage will briefly double as the function runs.

    Args:
        mmap_path (str): The path to the mmap array file (a ``.npy`` file) to trim

    Returns:
        None
    """
    from numpy.lib.format import open_memmap  # local import; only needed here

    # Identify the last non-blank row in the mmaped file, guarding against an
    # all-zero array (which would otherwise scan past the first row and raise)
    mmap_file1 = np.load(mmap_path, mmap_mode='r')
    i = -1
    while -i <= mmap_file1.shape[0] and np.all(mmap_file1[i, :, :] == 0):
        i -= 1
    N_new = mmap_file1.shape[0] + i + 1

    # Build the temporary filename by replacing the ".npy" suffix.
    # (str.strip(".npy") removes *characters* from both ends and would corrupt
    # names such as "happy.npy" -> "happ".)
    if mmap_path.endswith(".npy"):
        output_file2 = mmap_path[:-len(".npy")] + "2.npy"
    else:
        output_file2 = mmap_path + "2.npy"

    # Create new mmap file and copy over data in batches
    mmap_file2 = open_memmap(output_file2, mode='w+', dtype=np.float32,
                             shape=(N_new, mmap_file1.shape[1], mmap_file1.shape[2]))
    for j in tqdm(range(0, N_new, 1024), total=int(np.ceil(N_new/1024))):
        mmap_file2[j:j+1024] = mmap_file1[j:min(j + 1024, N_new)].copy()
        mmap_file2.flush()

    # Release the read handle before deleting the file (required on Windows),
    # remove the old mmaped file, and rename the new file to match the original
    del mmap_file1
    os.remove(mmap_path)
    os.rename(output_file2, mmap_path)

View File

@@ -0,0 +1,100 @@
# Copyright 2022 David Scripka. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Imports
import re
from tqdm import tqdm
import numpy as np
from typing import List
# Define metric utility functions specific to the wakeword detection use-case
def get_false_positives(scores: List, threshold: float, grouping_window: int = 50):
    """
    Counts the number of false-positives based on a list of scores and a specified threshold.

    Args:
        scores (List): A list of predicted scores, between 0 and 1
        threshold (float): The threshold to use to determine false-positive predictions
        grouping_window (int): The size (in number of frames) for grouping scores above
                               the threshold into a single false positive for counting

    Returns:
        int: The number of false positive predictions in the list of scores
    """
    bin_pred = np.array(scores) >= threshold
    bin_pred_string = ''.join(["1" if i else "0" for i in bin_pred])

    # Each "01" substring marks the onset of a run of above-threshold scores.
    # For every onset, zero out the following `grouping_window` frames so that
    # an entire detection "event" is counted as a single false positive
    # (the onset frame itself is kept and counted by the final sum).
    transitions = list(re.finditer("01", bin_pred_string))
    n = grouping_window
    for t in transitions:
        # t.end() is the frame *after* the onset frame; guard against an onset
        # at the very end of the clip, which would index past the array
        if t.end() < len(bin_pred) and bin_pred[t.end()] != 0:
            # The window must be bounded by the length of the score array,
            # not by the number of transitions
            stop = min(t.end() + n, len(bin_pred))
            bin_pred[t.end():stop] = False
    return sum(bin_pred)
def generate_roc_curve_fprs(
        scores: list,
        n_points: int = 25,
        time_per_prediction: float = .08,
        **kwargs
        ):
    """
    Computes the false positive rate (fpr) per hour for the given predictions
    across a sweep of score thresholds. Assumes that all predictions should be
    below the threshold; any score at or above it is a false positive.

    Args:
        scores (List): A list of predicted scores, between 0 and 1
        n_points (int): The number of points to use when calculating false positive rates
        time_per_prediction (float): The time (in seconds) that each prediction represents
        kwargs (dict): Any other keyword arguments to pass to the `get_false_positives` function

    Returns:
        list: A list of false positive rates per hour at different score threshold levels
    """
    # Total duration represented by the scores, in hours
    total_hours = len(scores) * time_per_prediction / 3600

    # Sweep thresholds and convert false-positive counts into hourly rates
    thresholds = np.linspace(0.01, 0.99, num=n_points)
    return [
        get_false_positives(scores, threshold=t, **kwargs) / total_hours
        for t in tqdm(thresholds)
    ]
def generate_roc_curve_tprs(
        scores: list,
        n_points: int = 25
        ):
    """
    Generates the true positive rate (true accept rate) for the given predictions
    over a range of score thresholds. Assumes that all predictions are supposed to be equal to 1.

    Args:
        scores (list): A list of scores for each prediction
        n_points (int): The number of score thresholds to evaluate

    Returns:
        list: A list of true positive rates at different score threshold levels
    """
    # Convert to an ndarray so the vectorized comparison below also works when a
    # plain Python list is passed (as the `scores: list` annotation suggests)
    scores = np.asarray(scores)
    tprs = []
    for threshold in tqdm(np.linspace(0.01, 0.99, num=n_points)):
        tprs.append(sum(scores >= threshold)/len(scores))
    return tprs

View File

@@ -0,0 +1,402 @@
# Copyright 2022 David Scripka. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Imports
import numpy as np
import onnxruntime as ort
import openwakeword
from openwakeword.utils import AudioFeatures
import wave
import os
import pickle
from collections import deque, defaultdict
from functools import partial
import time
from typing import List, Union, DefaultDict, Dict
# Define main model class
class Model():
    """
    The main model class for openWakeWord. Creates a model object with the shared audio pre-processer
    and for arbitrarily many custom wake word/wake phrase models.
    """
    def __init__(
            self,
            wakeword_model_paths: List[str] = [],
            class_mapping_dicts: List[dict] = [],
            enable_speex_noise_suppression: bool = False,
            vad_threshold: float = 0,
            custom_verifier_models: dict = {},
            custom_verifier_threshold: float = 0.1,
            **kwargs
            ):
        """Initialize the openWakeWord model object.

        Args:
            wakeword_model_paths (List[str]): A list of paths of ONNX models to load into the openWakeWord model object.
                                              If not provided, will load all of the pre-trained models.
            class_mapping_dicts (List[dict]): A list of dictionaries with integer to string class mappings for
                                              each model in the `wakeword_model_paths` arguments
                                              (e.g., {"0": "class_1", "1": "class_2"})
            enable_speex_noise_suppression (bool): Whether to use the noise suppression from the SpeexDSP
                                                   library to pre-process all incoming audio. May increase
                                                   model performance when reasonably stationary background noise
                                                   is present in the environment where openWakeWord will be used.
                                                   It is very lightweight, so enabling it doesn't significantly
                                                   impact efficiency.
            vad_threshold (float): Whether to use a voice activity detection model (VAD) from Silero
                                   (https://github.com/snakers4/silero-vad) to filter predictions.
                                   For every input audio frame, a VAD score is obtained and only those model predictions
                                   with VAD scores above the threshold will be returned. The default value (0),
                                   disables voice activity detection entirely.
            custom_verifier_models (dict): A dictionary of paths to custom verifier models, where
                                           the keys are the model names (corresponding to the openwakeword.models
                                           attribute) and the values are the filepaths of the
                                           custom verifier models.
            custom_verifier_threshold (float): The score threshold to use a custom verifier model. If the score
                                               from a model for a given frame is greater than this value, the
                                               associated custom verifier model will also predict on that frame, and
                                               the verifier score will be returned.
            kwargs (dict): Any other keyword arguments to pass to the preprocessor instance
        """
        # Initialize the ONNX session options (single-threaded to keep CPU overhead low)
        sessionOptions = ort.SessionOptions()
        sessionOptions.inter_op_num_threads = 1
        sessionOptions.intra_op_num_threads = 1

        # Get model paths for pre-trained models if user doesn't provide models to load
        if wakeword_model_paths == []:
            wakeword_model_paths = openwakeword.get_pretrained_model_paths()
            wakeword_model_names = list(openwakeword.models.keys())
        else:
            # Model name is the filename without the trailing ".onnx" (5 characters)
            wakeword_model_names = [os.path.basename(i[0:-5]) for i in wakeword_model_paths]

        # Create attributes to store models and metadata
        self.models = {}              # model name -> ort.InferenceSession
        self.model_inputs = {}        # model name -> number of feature frames the model consumes
        self.model_outputs = {}       # model name -> number of output classes
        self.class_mapping = {}       # model name -> {class index (str): class label (str)}
        self.model_input_names = {}   # model name -> name of the ONNX graph's input tensor
        self.custom_verifier_models = {}
        self.custom_verifier_threshold = custom_verifier_threshold
        for mdl_path, mdl_name in zip(wakeword_model_paths, wakeword_model_names):
            # Load openwakeword models
            self.models[mdl_name] = ort.InferenceSession(mdl_path, sess_options=sessionOptions,
                                                         providers=["CPUExecutionProvider"])
            self.model_inputs[mdl_name] = self.models[mdl_name].get_inputs()[0].shape[1]
            self.model_outputs[mdl_name] = self.models[mdl_name].get_outputs()[0].shape[1]
            # Class-mapping priority: user-provided dict, then the package-level
            # defaults, then an identity mapping over the model's output indices
            if class_mapping_dicts and class_mapping_dicts[wakeword_model_paths.index(mdl_path)].get(mdl_name, None):
                self.class_mapping[mdl_name] = class_mapping_dicts[wakeword_model_paths.index(mdl_path)]
            elif openwakeword.model_class_mappings.get(mdl_name, None):
                self.class_mapping[mdl_name] = openwakeword.model_class_mappings[mdl_name]
            else:
                self.class_mapping[mdl_name] = {str(i): str(i) for i in range(0, self.model_outputs[mdl_name])}
            self.model_input_names[mdl_name] = self.models[mdl_name].get_inputs()[0].name

            # Load custom verifier models (pickled classifiers exposing predict_proba)
            if isinstance(custom_verifier_models, dict):
                if custom_verifier_models.get(mdl_name, False):
                    self.custom_verifier_models[mdl_name] = pickle.load(open(custom_verifier_models[mdl_name], 'rb'))

        # Every provided verifier model must have matched a loaded base model
        if len(self.custom_verifier_models.keys()) < len(custom_verifier_models.keys()):
            raise ValueError(
                "Custom verifier models were provided, but some were not matched with a base model!"
                " Make sure that the keys provided in the `custom_verifier_models` dictionary argument"
                " exactly match that of the `.models` attribute of an instantiated openWakeWord Model object"
                " that has the same base models but doesn't have custom verifier models."
            )

        # Create buffer to store frame predictions (per-class rolling window of the last 30 scores)
        self.prediction_buffer: DefaultDict[str, deque] = defaultdict(partial(deque, maxlen=30))

        # Initialize SpeexDSP noise canceller (160 samples per frame @ 16 kHz)
        if enable_speex_noise_suppression:
            from speexdsp_ns import NoiseSuppression
            self.speex_ns = NoiseSuppression.create(160, 16000)
        else:
            self.speex_ns = None

        # Initialize Silero VAD
        self.vad_threshold = vad_threshold
        if vad_threshold > 0:
            self.vad = openwakeword.VAD()

        # Create AudioFeatures object (shared melspectrogram/embedding pre-processor)
        self.preprocessor = AudioFeatures(**kwargs)

    def get_parent_model_from_label(self, label):
        """Gets the parent model associated with a given prediction label"""
        parent_model = ""
        for mdl in self.class_mapping.keys():
            # A label matches a model either through the model's class mapping
            # values or by being the model name itself
            if label in self.class_mapping[mdl].values():
                parent_model = mdl
            elif label in self.class_mapping.keys() and label == mdl:
                parent_model = mdl
        return parent_model

    def reset(self):
        """Reset the prediction buffer"""
        self.prediction_buffer = defaultdict(partial(deque, maxlen=30))

    def predict(self, x: np.ndarray, patience: dict = {}, threshold: dict = {}, timing: bool = False):
        """Predict with all of the wakeword models on the input audio frames

        Args:
            x (Union[ndarray]): The input audio data to predict on with the models. Should be multiples of 80 ms
                                (1280 samples), with longer lengths reducing overall CPU usage
                                but decreasing detection latency.
            patience (dict): How many consecutive frames (of 1280 samples or 80 ms) above the threshold that must
                             be observed before the current frame will be returned as non-zero.
                             Must be provided as an a dictionary where the keys are the
                             model names and the values are the number of frames. Can reduce false-positive
                             detections at the cost of a lower true-positive rate.
                             By default, this behavior is disabled.
            threshold (dict): The threshold values to use when the `patience` behavior is enabled.
                              Must be provided as an a dictionary where the keys are the
                              model names and the values are the thresholds.
            timing (bool): Whether to return timing information of the models. Can be useful to debug and
                           assess how efficiently models are running on the current hardware.

        Returns:
            dict: A dictionary of scores between 0 and 1 for each model, where 0 indicates no
                  wake-word/wake-phrase detected. If the `timing` argument is true, returns a
                  tuple of dicts containing model predictions and timing information, respectively.
        """
        # Setup timing dict
        if timing:
            timing_dict: Dict[str, Dict] = {}
            timing_dict["models"] = {}
            feature_start = time.time()

        # Get audio features (optionally with Speex noise suppression)
        if self.speex_ns:
            self.preprocessor(self._suppress_noise_with_speex(x))
        else:
            self.preprocessor(x)

        if timing:
            timing_dict["models"]["preprocessor"] = time.time() - feature_start

        # Get predictions from model(s)
        predictions = {}
        for mdl in self.models.keys():
            input_name = self.model_input_names[mdl]

            if timing:
                model_start = time.time()

            # Run model to get predictions
            if len(x) > 1280:
                # More than one 80 ms frame was provided: run the model once per
                # frame and keep the maximum score across the group
                group_predictions = []
                for i in np.arange(len(x)//1280-1, -1, -1):
                    group_predictions.extend(
                        self.models[mdl].run(
                            None,
                            {input_name: self.preprocessor.get_features(
                                 self.model_inputs[mdl],
                                 start_ndx=-self.model_inputs[mdl] - i
                            )}
                        )
                    )
                prediction = np.array(group_predictions).max(axis=0)[None, ]
            else:
                prediction = self.models[mdl].run(
                    None,
                    {input_name: self.preprocessor.get_features(self.model_inputs[mdl])}
                )

            # Single-output models report under the model name; multi-class
            # models report one score per mapped class label
            if self.model_outputs[mdl] == 1:
                predictions[mdl] = prediction[0][0][0]
            else:
                for int_label, cls in self.class_mapping[mdl].items():
                    predictions[cls] = prediction[0][0][int(int_label)]

            # Update scores based on custom verifier model
            if self.custom_verifier_models != {}:
                for cls in predictions.keys():
                    if predictions[cls] >= self.custom_verifier_threshold:
                        parent_model = self.get_parent_model_from_label(cls)
                        if self.custom_verifier_models.get(parent_model, False):
                            # NOTE(review): the feature width here uses the current loop
                            # variable `mdl` rather than `parent_model`; with multiple
                            # loaded models of different input sizes these can differ —
                            # confirm this is intended
                            verifier_prediction = self.custom_verifier_models[parent_model].predict_proba(
                                self.preprocessor.get_features(self.model_inputs[mdl])
                            )[0][-1]
                            predictions[cls] = verifier_prediction

            # Update prediction buffer, and zero predictions for first 5 frames during model initialization
            # NOTE(review): this iterates over *all* accumulated prediction keys on every
            # model's loop iteration, so with multiple models earlier classes are appended
            # to the buffer more than once per call — confirm this is intended
            for cls in predictions.keys():
                if len(self.prediction_buffer[cls]) < 5:
                    predictions[cls] = 0.0
                self.prediction_buffer[cls].append(predictions[cls])

            # Get timing information
            if timing:
                timing_dict["models"][mdl] = time.time() - model_start

        # Update scores based on thresholds or patience arguments
        if patience != {}:
            if threshold == {}:
                raise ValueError("Error! When using the `patience` argument, threshold "
                                 "values must be provided via the `threshold` argument!")
            for mdl in predictions.keys():
                parent_model = self.get_parent_model_from_label(mdl)
                if parent_model in patience.keys():
                    # Require `patience` consecutive buffered scores at/above the
                    # threshold before letting the current score through
                    scores = np.array(self.prediction_buffer[mdl])[-patience[parent_model]:]
                    if (scores >= threshold[parent_model]).sum() < patience[parent_model]:
                        predictions[mdl] = 0.0

        # (optionally) get voice activity detection scores and update model scores
        if self.vad_threshold > 0:
            if timing:
                vad_start = time.time()

            self.vad(x)

            if timing:
                timing_dict["models"]["vad"] = time.time() - vad_start

            # Get frames from last 0.4 to 0.56 seconds (3 frames) before the current
            # frame and get max VAD score
            vad_frames = list(self.vad.prediction_buffer)[-7:-4]
            vad_max_score = np.max(vad_frames) if len(vad_frames) > 0 else 0
            # Zero every wakeword score when no voice activity was detected
            for mdl in predictions.keys():
                if vad_max_score < self.vad_threshold:
                    predictions[mdl] = 0.0

        if timing:
            return predictions, timing_dict
        else:
            return predictions

    def predict_clip(self, clip: Union[str, np.ndarray], padding: int = 1, chunk_size=1280, **kwargs):
        """Predict on a full audio clip, simulating streaming prediction.
        The input clip must be a 16-bit, 16 khz, single-channel WAV file.

        Args:
            clip (Union[str, np.ndarray]): The path to a 16-bit PCM, 16 khz, single-channel WAV file,
                                           or an 1D array containing the same type of data
            padding (int): How many seconds of silence to pad the start/end of the clip with
                            to make sure that short clips can be processed correctly (default: 1)
            chunk_size (int): The size (in samples) of each chunk of audio to pass to the model
            kwargs: Any keyword arguments to pass to the class `predict` method

        Returns:
            list: A list containing the frame-level prediction dictionaries for the audio clip
        """
        if isinstance(clip, str):
            # Load audio clip as 16-bit PCM data
            with wave.open(clip, mode='rb') as f:
                # Load WAV clip frames
                data = np.frombuffer(f.readframes(f.getnframes()), dtype=np.int16)
        elif isinstance(clip, np.ndarray):
            data = clip
        # NOTE(review): `data` is unbound if `clip` is neither a str nor an ndarray,
        # which would raise a NameError below — consider validating the argument

        if padding:
            # Surround the clip with `padding` seconds of silence
            data = np.concatenate(
                (
                    np.zeros(16000*padding).astype(np.int16),
                    data,
                    np.zeros(16000*padding).astype(np.int16)
                )
            )

        # Iterate through clip, getting predictions
        predictions = []
        step_size = chunk_size
        for i in range(0, data.shape[0]-step_size, step_size):
            predictions.append(self.predict(data[i:i+step_size], **kwargs))

        return predictions

    def _get_positive_prediction_frames(
            self,
            file: str,
            threshold: float = 0.5,
            return_type: str = "features",
            **kwargs
            ):
        """
        Gets predictions for the input audio data, and returns the audio features (embeddings)
        or audio data for all of the frames with a score above the `threshold` argument.
        Can be a useful way to collect false-positive predictions.

        Args:
            file (str): The path to a 16-bit 16khz WAV audio file to process
            threshold (float): The minimum score required for a frame of audio features
                               to be returned.
            return_type (str): The type of data to return when a positive prediction is
                               detected. Can be either 'features' or 'audio' to return
                               audio embeddings or raw audio data, respectively.
            kwargs: Any keyword arguments to pass to the class `predict` method

        Returns:
            dict: A dictionary with filenames as keys and N x M arrays as values,
                  where N is the number of examples and M is the number
                  of audio features, depending on the model input shape.
        """
        # Load audio clip as 16-bit PCM data
        with wave.open(file, mode='rb') as f:
            # Load WAV clip frames
            data = np.frombuffer(f.readframes(f.getnframes()), dtype=np.int16)

        # Iterate through clip, getting predictions
        positive_data = defaultdict(list)
        step_size = 1280  # 80 ms per prediction frame @ 16 kHz
        for i in range(0, data.shape[0]-step_size, step_size):
            predictions = self.predict(data[i:i+step_size], **kwargs)
            for lbl in predictions.keys():
                if predictions[lbl] >= threshold:
                    mdl = self.get_parent_model_from_label(lbl)
                    features = self.preprocessor.get_features(self.model_inputs[mdl])
                    if return_type == 'features':
                        positive_data[lbl].append(features)
                    if return_type == 'audio':
                        # Keep 3 s of context before the detection and 1 s after;
                        # only complete 4 s windows are retained
                        context = data[max(0, i - 16000*3):i + 16000]
                        if len(context) == 16000*4:
                            positive_data[lbl].append(context)

        positive_data_combined = {}
        for lbl in positive_data.keys():
            positive_data_combined[lbl] = np.vstack(positive_data[lbl])

        return positive_data_combined

    def _suppress_noise_with_speex(self, x: np.ndarray, frame_size: int = 160):
        """
        Runs the input audio through the SpeexDSP noise suppression algorithm.
        Note that this function updates the state of the existing Speex noise
        suppression object, and isn't intended to be called externally.

        Args:
            x (ndarray): The 16-bit, 16khz audio to process. Must always be an
                         integer multiple of `frame_size`.
            frame_size (int): The frame size to use for the Speex Noise suppressor.
                              Must match the frame size specified during the
                              initialization of the noise suppressor.

        Returns:
            ndarray: The input audio with noise suppression applied
        """
        cleaned = []
        for i in range(0, x.shape[0], frame_size):
            chunk = x[i:i+frame_size]
            # Speex operates on raw 16-bit PCM bytes, one frame at a time
            cleaned.append(self.speex_ns.process(chunk.tobytes()))

        cleaned_bytestring = b''.join(cleaned)
        cleaned_array = np.frombuffer(cleaned_bytestring, np.int16)

        return cleaned_array

View File

@@ -0,0 +1,407 @@
# Copyright 2022 David Scripka. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Imports
import os
import onnxruntime as ort
import numpy as np
import pathlib
from collections import deque
from multiprocessing.pool import ThreadPool
from multiprocessing import Process, Queue
import time
import openwakeword
from typing import Union, List, Callable, Deque
# Base class for computing audio features using Google's speech_embedding
# model (https://tfhub.dev/google/speech_embedding/1)
class AudioFeatures():
"""
A class for creating audio features from audio data, including melspectograms and Google's
`speech_embedding` features.
"""
def __init__(self,
melspec_onnx_model_path: str = os.path.join(
pathlib.Path(__file__).parent.resolve(),
"resources", "models", "melspectrogram.onnx"
),
embedding_onnx_model_path: str = os.path.join(
pathlib.Path(__file__).parent.resolve(),
"resources", "models", "embedding_model.onnx"
),
sr: int = 16000,
ncpu: int = 1
):
"""
Initialize the AudioFeatures object.
Args:
melspec_onnx_model_path (str): The path to the ONNX model for computing melspectograms from audio data
embedding_onnx_model_path (str): The path to the ONNX model for Google's `speech_embedding` model
sr (int): The sample rate of the audio (default: 16000 khz)
ncpu (int): The number of CPUs to use when computing melspectrograms and audio features (default: 1)
"""
# Initialize the ONNX models
sessionOptions = ort.SessionOptions()
sessionOptions.inter_op_num_threads = ncpu
sessionOptions.intra_op_num_threads = ncpu
self.melspec_model = ort.InferenceSession(melspec_onnx_model_path, sess_options=sessionOptions,
providers=["CUDAExecutionProvider", "CPUExecutionProvider"])
self.embedding_model = ort.InferenceSession(embedding_onnx_model_path, sess_options=sessionOptions,
providers=["CUDAExecutionProvider", "CPUExecutionProvider"])
self.onnx_execution_provider = self.melspec_model.get_providers()[0]
# Create databuffers
self.raw_data_buffer: Deque = deque(maxlen=sr*10)
self.melspectrogram_buffer = np.ones((76, 32)) # n_frames x num_features
self.melspectrogram_max_len = 10*97 # 97 is the number of frames in 1 second of 16hz audio
self.accumulated_samples = 0 # the samples added to the buffer since the audio preprocessor was last called
self.feature_buffer = self._get_embeddings(np.zeros(160000).astype(np.int16)) # fill with blank data to start
self.feature_buffer_max_len = 120 # ~10 seconds of feature buffer history
def _get_melspectrogram(self, x: Union[np.ndarray, List], melspec_transform: Callable = lambda x: x/10 + 2):
"""
Function to compute the mel-spectrogram of the provided audio samples.
Args:
x (Union[np.ndarray, List]): The input audio data to compute the melspectrogram from
melspec_transform (Callable): A function to transform the computed melspectrogram. Defaults to a transform
that makes the ONNX melspectrogram model closer to the native Tensorflow
implementation from Google (https://tfhub.dev/google/speech_embedding/1).
Return:
np.ndarray: The computed melspectrogram of the input audio data
"""
# Get input data and adjust type/shape as needed
x = np.array(x).astype(np.int16) if isinstance(x, list) else x
if x.dtype != np.int16:
raise ValueError("Input data must be 16-bit integers (i.e., 16-bit PCM audio)."
f"You provided {x.dtype} data.")
x = x[None, ] if len(x.shape) < 2 else x
x = x.astype(np.float32) if x.dtype != np.float32 else x
# Get melspectrogram
outputs = self.melspec_model.run(None, {'input': x})
spec = np.squeeze(outputs[0])
# Arbitrary transform of melspectrogram
spec = melspec_transform(spec)
return spec
def _get_embeddings_from_melspec(self, melspec):
"""
Computes the Google `speech_embedding` features from a melspectrogram input
Args:
melspec (np.ndarray): The input melspectrogram
Returns:
np.ndarray: The computed audio features/embeddings
"""
if melspec.shape[0] != 1:
melspec = melspec[None, ]
embedding = self.embedding_model.run(None, {'input_1': melspec})[0].squeeze()
return embedding
def _get_embeddings(self, x: np.ndarray, window_size: int = 76, step_size: int = 8, **kwargs):
"""Function to compute the embeddings of the provide audio samples."""
spec = self._get_melspectrogram(x, **kwargs)
windows = []
for i in range(0, spec.shape[0], 8):
window = spec[i:i+window_size]
if window.shape[0] == window_size: # truncate short windows
windows.append(window)
batch = np.expand_dims(np.array(windows), axis=-1).astype(np.float32)
embedding = self.embedding_model.run(None, {'input_1': batch})[0].squeeze()
return embedding
def get_embedding_shape(self, audio_length: float, sr: int = 16000):
"""Function that determines the size of the output embedding array for a given audio clip length (in seconds)"""
x = (np.random.uniform(-1, 1, int(audio_length*sr))*32767).astype(np.int16)
return self._get_embeddings(x).shape
def _get_melspectrogram_batch(self, x, batch_size=128, ncpu=1):
"""
Compute the melspectrogram of the input audio samples in batches.
Note that the optimal performance will depend in the interaction between the device,
batch size, and ncpu (if a CPU device is used). The user is encouraged
to experiment with different values of these parameters to identify
which combination is best for their data, as often differences of 1-4x are seen.
Args:
x (ndarray): A numpy array of 16 khz input audio data in shape (N, samples).
Assumes that all of the audio data is the same length (same number of samples).
batch_size (int): The batch size to use when computing the melspectrogram
ncpu (int): The number of CPUs to use when computing the melspectrogram. This argument has
no effect if the underlying model is executing on a GPU.
Returns:
ndarray: A numpy array of shape (N, frames, melbins) containing the melspectrogram of
all N input audio examples
"""
# Prepare ThreadPool object, if needed for multithreading
pool = None
if "CPU" in self.onnx_execution_provider:
pool = ThreadPool(processes=ncpu)
# Make batches
n_frames = int(np.ceil(x.shape[1]/160-3))
mel_bins = 32 # fixed by melspectrogram model
melspecs = np.empty((x.shape[0], n_frames, mel_bins), dtype=np.float32)
for i in range(0, max(batch_size, x.shape[0]), batch_size):
batch = x[i:i+batch_size]
if "CUDA" in self.onnx_execution_provider:
result = self._get_melspectrogram(batch)
elif pool:
result = np.array(pool.map(self._get_melspectrogram,
batch, chunksize=batch.shape[0]//ncpu))
melspecs[i:i+batch_size, :, :] = result.squeeze()
# Cleanup ThreadPool
if pool:
pool.close()
return melspecs
def _get_embeddings_batch(self, x, batch_size=128, ncpu=1):
"""
Compute the embeddings of the input melspectrograms in batches.
Note that the optimal performance will depend in the interaction between the device,
batch size, and ncpu (if a CPU device is used). The user is encouraged
to experiment with different values of these parameters to identify
which combination is best for their data, as often differences of 1-4x are seen.
Args:
x (ndarray): A numpy array of melspectrograms of shape (N, frames, melbins).
Assumes that all of the melspectrograms have the same shape.
batch_size (int): The batch size to use when computing the embeddings
ncpu (int): The number of CPUs to use when computing the embeddings. This argument has
no effect if the underlying model is executing on a GPU.
Returns:
ndarray: A numpy array of shape (N, frames, embedding_dim) containing the embeddings of
all N input melspectrograms
"""
# Ensure input is the correct shape
if x.shape[1] < 76:
raise ValueError("Embedding model requires the input melspectrograms to have at least 76 frames")
# Prepare ThreadPool object, if needed for multithreading
pool = None
if "CPU" in self.onnx_execution_provider:
pool = ThreadPool(processes=ncpu)
# Calculate array sizes and make batches
n_frames = (x.shape[1] - 76)//8 + 1
embedding_dim = 96 # fixed by embedding model
embeddings = np.empty((x.shape[0], n_frames, embedding_dim), dtype=np.float32)
batch = []
ndcs = []
for ndx, melspec in enumerate(x):
window_size = 76
for i in range(0, melspec.shape[0], 8):
window = melspec[i:i+window_size]
if window.shape[0] == window_size: # ignore windows that are too short (truncates end of clip)
batch.append(window)
ndcs.append(ndx)
if len(batch) >= batch_size or ndx+1 == x.shape[0]:
batch = np.array(batch).astype(np.float32)
if "CUDA" in self.onnx_execution_provider:
result = self.embedding_model.run(None, {'input_1': batch})[0].squeeze()
elif pool:
result = np.array(pool.map(self._get_embeddings_from_melspec,
batch, chunksize=batch.shape[0]//ncpu))
for j, ndx2 in zip(range(0, result.shape[0], n_frames), ndcs):
embeddings[ndx2, :, :] = result[j:j+n_frames]
batch = []
ndcs = []
# Cleanup ThreadPool
if pool:
pool.close()
return embeddings
def embed_clips(self, x, batch_size=128, ncpu=1):
"""
Compute the embeddings of the input audio clips in batches.
Note that the optimal performance will depend in the interaction between the device,
batch size, and ncpu (if a CPU device is used). The user is encouraged
to experiment with different values of these parameters to identify
which combination is best for their data, as often differences of 1-4x are seen.
Args:
x (ndarray): A numpy array of 16 khz input audio data in shape (N, samples).
Assumes that all of the audio data is the same length (same number of samples).
batch_size (int): The batch size to use when computing the embeddings
ncpu (int): The number of CPUs to use when computing the melspectrogram. This argument has
no effect if the underlying model is executing on a GPU.
Returns:
ndarray: A numpy array of shape (N, frames, embedding_dim) containing the embeddings of
all N input audio clips
"""
# Compute melspectrograms
melspecs = self._get_melspectrogram_batch(x, batch_size=batch_size, ncpu=ncpu)
# Compute embeddings from melspectrograms
embeddings = self._get_embeddings_batch(melspecs[:, :, :, None], batch_size=batch_size, ncpu=ncpu)
return embeddings
def _streaming_melspectrogram(self, n_samples):
"""Note! There seem to be some slight numerical issues depending on the underlying audio data
such that the streaming method is not exactly the same as when the melspectrogram of the entire
clip is calculated. It's unclear if this difference is significant and will impact model performance.
In particular padding with 0 or very small values seems to demonstrate the differences well.
"""
self.melspectrogram_buffer = np.vstack(
(self.melspectrogram_buffer, self._get_melspectrogram(list(self.raw_data_buffer)[-n_samples-160*3:]))
)
if self.melspectrogram_buffer.shape[0] > self.melspectrogram_max_len:
self.melspectrogram_buffer = self.melspectrogram_buffer[-self.melspectrogram_max_len:, :]
def _buffer_raw_data(self, x):
"""
Adds raw audio data to the input buffer
"""
if len(x) < 400:
raise ValueError("The number of input frames must be at least 400 samples @ 16khz (25 ms)!")
self.raw_data_buffer.extend(x.tolist() if isinstance(x, np.ndarray) else x)
def _streaming_features(self, x):
    """Stream a frame of raw audio through the full feature pipeline.

    Buffers the incoming samples and, once at least 1280 samples
    (80 ms @ 16 khz) have accumulated, updates the streaming
    melspectrogram and appends a new embedding to `feature_buffer`
    for every complete 1280-sample step received.

    Args:
        x: A frame of 16 khz audio samples (list or 1-D ndarray).
    """
    # if len(x) != 1280:
    #     raise ValueError("You must provide input samples in frames of 1280 samples @ 1600khz."
    #                      f"Received a frame of {len(x)} samples.")

    # Add raw audio data to buffer
    self._buffer_raw_data(x)
    self.accumulated_samples += len(x)

    # Only calculate melspectrogram every ~0.5 seconds to significantly increase efficiency
    if self.accumulated_samples >= 1280:
        self._streaming_melspectrogram(self.accumulated_samples)

        # Calculate new audio embeddings/features based on update melspectrograms.
        # Each 1280-sample step appears to correspond to 8 melspectrogram frames;
        # iterate backwards over the accumulated steps so embeddings are appended
        # in chronological order.
        for i in np.arange(self.accumulated_samples//1280-1, -1, -1):
            # End index of the 76-frame window for this step; an index of 0
            # would produce an empty slice, so substitute the buffer length.
            ndx = -8*i
            ndx = ndx if ndx != 0 else len(self.melspectrogram_buffer)
            # Shape to (1, frames, mel_bins, 1) as the embedding model expects
            # -- presumably 76 frames x 32 mel bins; confirm against model input.
            x = self.melspectrogram_buffer[-76 + ndx:ndx].astype(np.float32)[None, :, :, None]
            # Only run the embedding model once a full 76-frame window exists
            if x.shape[1] == 76:
                self.feature_buffer = np.vstack((self.feature_buffer,
                                                self.embedding_model.run(None, {'input_1': x})[0].squeeze()))

        # Reset raw data buffer counter
        self.accumulated_samples = 0

    # Keep only the most recent `feature_buffer_max_len` embedding frames
    if self.feature_buffer.shape[0] > self.feature_buffer_max_len:
        self.feature_buffer = self.feature_buffer[-self.feature_buffer_max_len:, :]
def get_features(self, n_feature_frames: int = 16, start_ndx: int = -1):
    """Return a window of buffered audio features for model input.

    Args:
        n_feature_frames (int): The number of feature frames to return (default 16).
        start_ndx (int): Starting index into the feature buffer; the default of -1
            returns the most recent `n_feature_frames` frames.

    Returns:
        ndarray: A float32 array of shape (1, n_feature_frames, feature_dim).
    """
    if start_ndx == -1:
        # Default: take the most recent frames from the end of the buffer
        window = self.feature_buffer[int(-1*n_feature_frames):, :]
    else:
        # Explicit window; an end index of exactly 0 would produce an empty
        # slice, so substitute the buffer length in that case
        end_ndx = len(self.feature_buffer) if start_ndx + n_feature_frames == 0 \
            else start_ndx + int(n_feature_frames)
        window = self.feature_buffer[start_ndx:end_ndx, :]
    return window[None, ].astype(np.float32)
def __call__(self, x):
self._streaming_features(x)
# Bulk prediction function
def bulk_predict(
        file_paths: List[str],
        wakeword_model_paths: List[str],
        prediction_function: str = 'predict_clip',
        ncpu: int = 1,
        **kwargs
        ):
    """
    Bulk predict on the provided input files in parallel using multiprocessing
    with the specified model.

    Args:
        file_paths (List[str]): The list of input files to predict
        wakeword_model_paths (List[str]): The paths to the wakeword ONNX model files
        prediction_function (str): The name of the Model method used to predict on
                                   the input audio files (default is `predict_clip`)
        ncpu (int): How many processes to create (up to max of available CPUs)
        kwargs (dict): Any other keyword arguments to pass to the model
                       initialization or the specified prediction function

    Returns:
        dict: A dictionary containing the predictions for each file, with the
              filepath as the key
    """
    # Distribute the files round-robin across (at most) ncpu chunks. This avoids
    # the IndexError/duplicated-files bugs of remainder-based slicing when
    # len(file_paths) is not a multiple of ncpu (e.g. 3 files with ncpu=4).
    n_chunks = max(1, min(int(ncpu), len(file_paths)))
    chunks = [file_paths[i::n_chunks] for i in range(n_chunks)]

    # Keyword arguments accepted by the Model constructor (computed once; it is
    # the same for every worker)
    init_kwargs = {key: value for key, value in kwargs.items()
                   if key in openwakeword.Model.__init__.__code__.co_varnames}

    q: Queue = Queue()

    def f(clips):
        # Build the model inside the worker so each process owns its own model
        # instance. The previous implementation late-bound `mdls[-1]` in the
        # closure, so every worker referenced the *last* model created.
        oww = openwakeword.Model(
            wakeword_model_paths=wakeword_model_paths,
            **init_kwargs
        )
        func = getattr(oww, prediction_function)
        pred_kwargs = {key: value for key, value in kwargs.items()
                       if key in func.__code__.co_varnames}
        q.put([{clip: func(clip, **pred_kwargs)} for clip in clips])

    ps = [Process(target=f, args=(chunk,)) for chunk in chunks]

    # Submit jobs
    for p in ps:
        p.start()

    # Collect results (each worker puts exactly one list on the queue)
    results = []
    for p in ps:
        while q.empty():
            time.sleep(0.01)
        results.extend(q.get())

    # Consolidate results and return
    return {list(i.keys())[0]: list(i.values())[0] for i in results}

View File

@@ -0,0 +1,128 @@
# Copyright 2022 David Scripka. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#######################
# Silero VAD License
#######################
# MIT License
# Copyright (c) 2020-present Silero Team
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
########################################
# This file contains the implementation of a class for voice activity detection (VAD),
# based on the pre-trained model from Silero (https://github.com/snakers4/silero-vad).
# It can be used as with the openWakeWord library, or independently.
# Imports
import onnxruntime as ort
import numpy as np
import os
from collections import deque
class VAD():
    """
    A voice activity detection (VAD) model class based on Silero's
    pre-trained model: https://github.com/snakers4/silero-vad
    """
    def __init__(self,
                 model_path: str = os.path.join(
                     os.path.dirname(os.path.abspath(__file__)),
                     "resources",
                     "models",
                     "silero_vad.onnx"
                 )
                 ):
        """Initialize the VAD model object.

        Args:
            model_path (str): The path to the Silero VAD ONNX model.
        """
        # Single-threaded CPU inference keeps the VAD lightweight
        opts = ort.SessionOptions()
        opts.inter_op_num_threads = 1
        opts.intra_op_num_threads = 1
        self.model = ort.InferenceSession(model_path, sess_options=opts,
                                          providers=["CPUExecutionProvider"])

        # Rolling buffer of per-call scores; 125 entries is a buffer length
        # of 10 seconds at the default frame size
        self.prediction_buffer: deque = deque(maxlen=125)

        # Silero expects the sample rate as an int64 scalar input
        self.sample_rate = np.array(16000).astype(np.int64)

        # Reset model to start
        self.reset_states()

    def reset_states(self, batch_size=1):
        """Reset the model's recurrent (LSTM) hidden and cell states."""
        state_shape = (2, batch_size, 64)
        self._h = np.zeros(state_shape, dtype='float32')
        self._c = np.zeros(state_shape, dtype='float32')
        self._last_sr = 0
        self._last_batch_size = 0

    def predict(self, x, frame_size=480):
        """
        Get the VAD predictions for the input audio frame.

        Args:
            x (np.ndarray): The input audio, must be 16 khz and 16-bit PCM format.
                            If longer than the input frame, will be split into
                            chunks of length `frame_size` and the predictions for
                            each chunk returned. Must be a length that is an integer
                            multiple of the `frame_size` argument.
            frame_size (int): The frame size in samples. The recommended
                              default is 480 samples (30 ms @ 16khz),
                              but smaller and larger values
                              can be used (though performance may decrease).

        Returns:
            float: The average predicted score for the audio frame
        """
        scores = []
        for start in range(0, x.shape[0], frame_size):
            # Normalize 16-bit PCM to [-1, 1] floats for the model
            frame = (x[start:start + frame_size]/32767).astype(np.float32)
            ort_inputs = {'input': frame[None, ],
                          'h': self._h, 'c': self._c, 'sr': self.sample_rate}
            # Thread the recurrent state through successive calls
            out, self._h, self._c = self.model.run(None, ort_inputs)
            scores.append(out[0][0])
        return np.mean(scores)

    def __call__(self, x, frame_size=160*4):
        """Score the frame and push the result onto the rolling prediction buffer."""
        self.prediction_buffer.append(self.predict(x, frame_size))