add read me
This commit is contained in:
39
venv/lib/python3.12/site-packages/openwakeword/__init__.py
Normal file
39
venv/lib/python3.12/site-packages/openwakeword/__init__.py
Normal file
@@ -0,0 +1,39 @@
|
||||
import os
|
||||
from openwakeword.model import Model
|
||||
from openwakeword.vad import VAD
|
||||
from openwakeword.custom_verifier_model import train_custom_verifier
|
||||
|
||||
__all__ = ['Model', 'VAD', 'train_custom_verifier']
|
||||
|
||||
# Pre-trained model registry: maps each bundled wake word name to the on-disk
# location of its ONNX model file, resolved relative to this package.
_package_dir = os.path.dirname(os.path.abspath(__file__))

models = {
    wakeword: {"model_path": os.path.join(_package_dir, f"resources/models/{wakeword}_v0.1.onnx")}
    for wakeword in ("alexa", "hey_mycroft", "hey_jarvis", "timer", "weather")
}
|
||||
|
||||
# Mapping from a model's raw class outputs (string-keyed) to human-readable
# labels. Only the multi-class "timer" model has an entry here; its six
# classes correspond to different requested timer durations.
model_class_mappings = {
    "timer": {
        "1": "1_minute_timer",
        "2": "5_minute_timer",
        "3": "10_minute_timer",
        "4": "20_minute_timer",
        "5": "30_minute_timer",
        "6": "1_hour_timer"
    }
}
|
||||
|
||||
|
||||
def get_pretrained_model_paths():
    """Return the file paths of all bundled pre-trained wake word models.

    Returns:
        List[str]: The on-disk paths of the pre-trained ONNX model files,
        in the order they appear in the `models` registry.
    """
    # Iterate values directly instead of indexing the dict by its own keys
    return [model["model_path"] for model in models.values()]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,174 @@
|
||||
# Copyright 2022 David Scripka. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Imports
import collections
import os
import pickle

import numpy as np
import scipy
import scipy.io.wavfile  # explicit submodule import; bare `import scipy` does not guarantee scipy.io.wavfile is loaded
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from tqdm import tqdm

import openwakeword
|
||||
|
||||
|
||||
# Define functions to prepare data for speaker dependent verifier model
|
||||
def get_reference_clip_features(
        reference_clip: str,
        oww_model: openwakeword.Model,
        model_name: str,
        threshold: float = 0.5,
        N: int = 3,
        **kwargs
        ):
    """
    Processes input audio files (16-bit, 16-khz single-channel WAV files) and gets the openWakeWord
    audio features that produce a prediction from the specified model greater than the threshold value.

    Args:
        reference_clip (str): The target audio file to get features from (a WAV file path,
                              or raw audio data passed directly)
        oww_model (openwakeword.Model): The openWakeWord model object used to get predictions
        model_name (str): The name of the model to get predictions from (should correspond to
                          a python dictionary key in the oww_model.models attribute)
        threshold (float): The minimum score from the model required to capture the associated features
        N (int): How many times to run feature extraction for a given clip, adding some slight variation
                 in the starting position each time to ensure that the features are not identical
        kwargs: Any extra keyword arguments forwarded to `oww_model.predict`

    Returns:
        ndarray: A numpy array of shape N x M x L, where N is the number of examples, M is the number
                 of frames in the window, and L is the audio feature/embedding dimension.
    """
    # Create dictionary to store captured feature windows
    positive_data = collections.defaultdict(list)

    # 1280 samples = 80 ms of 16 khz audio per prediction frame (loop-invariant)
    step_size = 1280

    for _ in range(N):
        # Load clip (file path vs. raw audio data); sample rate is unused
        if isinstance(reference_clip, str):
            _, dat = scipy.io.wavfile.read(reference_clip)
        else:
            dat = reference_clip

        # Set random starting point to get small variations in features
        if N != 1:
            dat = dat[np.random.randint(0, 1280):]

        # Run predictions frame-by-frame, capturing features whenever the
        # model's score crosses the threshold
        for i in range(0, dat.shape[0] - step_size, step_size):
            predictions = oww_model.predict(dat[i:i + step_size], **kwargs)
            if predictions[model_name] >= threshold:
                features = oww_model.preprocessor.get_features(  # type: ignore[has-type]
                    oww_model.model_inputs[model_name]  # type: ignore[has-type]
                )
                positive_data[model_name].append(features)

    # No frame crossed the threshold: return an empty (0 x frames x 96) array
    # so downstream np.vstack calls still succeed
    if len(positive_data[model_name]) == 0:
        positive_data[model_name].append(
            np.empty((0, oww_model.model_inputs[model_name], 96)))  # type: ignore[has-type]

    return np.vstack(positive_data[model_name])
|
||||
|
||||
|
||||
def flatten_features(x):
    """Flatten every array in *x* to 1D, returning the results as a list."""
    flattened = []
    for arr in x:
        flattened.append(arr.flatten())
    return flattened
|
||||
|
||||
|
||||
def train_verifier_model(features: np.ndarray, labels: np.ndarray):
    """
    Train a logistic regression binary classifier model on the provided features and labels.

    Args:
        features (ndarray): A N x M numpy array, where N is the number of examples and M
                            is the number of features
        labels (ndarray): A 1D numpy array where each value corresponds to the label of the Nth
                          example in the `features` argument

    Returns:
        The fitted scikit-learn pipeline (flatten -> standardize -> logistic regression)
    """
    # NOTE: the regularization strength C matters a lot here and may depend on
    # dataset size (larger datasets seem to work better with larger C values)
    verifier = make_pipeline(
        FunctionTransformer(flatten_features),
        StandardScaler(),
        LogisticRegression(random_state=0, max_iter=2000, C=0.001),
    )
    verifier.fit(features, labels)
    return verifier
|
||||
|
||||
|
||||
def train_custom_verifier(
        positive_reference_clips: list,
        negative_reference_clips: list,
        output_path: str,
        model_name: str,
        **kwargs
        ):
    """
    Trains a voice-specific custom verifier model on examples of wake word/phrase speech and other speech
    from a single user.

    Args:
        positive_reference_clips (list): Paths of single-channel 16khz, 16-bit WAV files
                                         of the target wake word/phrase.
        negative_reference_clips (list): Paths of single-channel 16khz, 16-bit WAV files
                                         of miscellaneous speech not containing the target wake word/phrase.
        output_path (str): The location to save the trained verifier model (pickled scikit-learn pipeline)
        model_name (str): The name or path of the trained openWakeWord model that the verifier model will be
                          based on. If only a name, it must be one of the pre-trained models included in the
                          openWakeWord release.
        kwargs: Any other keyword arguments to pass to the openWakeWord model initialization

    Returns:
        None

    Raises:
        ValueError: If none of the positive reference clips produced features above
                    the model's detection threshold.
    """
    # Load target openWakeWord model (a path to a model file, or the name of a
    # bundled pre-trained model)
    if os.path.exists(model_name):
        oww = openwakeword.Model(
            wakeword_model_paths=[model_name],
            **kwargs
        )
        # Reduce the path to the bare model name (e.g. "dir/foo.onnx" -> "foo");
        # basename/splitext handles any path separator or extension length
        model_name = os.path.splitext(os.path.basename(model_name))[0]
    else:
        oww = openwakeword.Model(**kwargs)

    # Get features from positive reference clips (N=5 passes add small
    # starting-position variations to the captured feature windows)
    positive_features = np.vstack(
        [get_reference_clip_features(i, oww, model_name, N=5)
         for i in tqdm(positive_reference_clips, desc="Processing positive reference clips")]
    )
    if positive_features.shape[0] == 0:
        raise ValueError("No positive features were created! Make sure that"
                         " the positive reference clips contain the appropriate audio"
                         " for the desired model")

    # Get features from negative reference clips (threshold=0.0 captures every frame)
    negative_features = np.vstack(
        [get_reference_clip_features(i, oww, model_name, threshold=0.0, N=1)
         for i in tqdm(negative_reference_clips, desc="Processing negative reference clips")]
    )

    # Train logistic regression model on reference clip features
    print("Training and saving verifier model...")
    lr_model = train_verifier_model(
        np.vstack((positive_features, negative_features)),
        np.array([1]*positive_features.shape[0] + [0]*negative_features.shape[0])
    )

    # Save the model, closing the file handle explicitly
    with open(output_path, "wb") as f:
        pickle.dump(lr_model, f)
    print("Done!")
|
||||
712
venv/lib/python3.12/site-packages/openwakeword/data.py
Normal file
712
venv/lib/python3.12/site-packages/openwakeword/data.py
Normal file
@@ -0,0 +1,712 @@
|
||||
# Copyright 2022 David Scripka. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# imports
|
||||
from multiprocessing.pool import ThreadPool
|
||||
import os
|
||||
from functools import partial
|
||||
from pathlib import Path
|
||||
import random
|
||||
from tqdm import tqdm
|
||||
from typing import List, Tuple
|
||||
import numpy as np
|
||||
import torch
|
||||
from numpy.lib.format import open_memmap
|
||||
from speechbrain.dataio.dataio import read_audio
|
||||
from speechbrain.processing.signal_processing import reverberate
|
||||
import torchaudio
|
||||
import mutagen
|
||||
import acoustics
|
||||
|
||||
|
||||
# Load audio clips and structure into clips of the same length
|
||||
def stack_clips(audio_data, clip_size=16000*2):
    """
    Concatenates a list of 1D audio arrays and re-slices the result into rows
    of a uniform length, zero-padding the final row as needed.

    Args:
        audio_data (List[ndarray]): A list of 1D numpy arrays to combine and stack
        clip_size (int): The desired total length of the uniform clip size (in samples)

    Returns:
        ndarray: An N by `clip_size` array containing the combined audio data
    """
    # Join all clips into one long 1D array
    joined = np.hstack(audio_data)

    # Slice into fixed-size rows, padding the trailing partial row with zeros
    rows = []
    for offset in range(0, joined.shape[0], clip_size):
        segment = joined[offset:offset + clip_size]
        shortfall = clip_size - segment.shape[0]
        if shortfall > 0:
            segment = np.hstack((segment, np.zeros(shortfall)))
        rows.append(segment)

    return np.array(rows)
|
||||
|
||||
|
||||
def load_audio_clips(files, clip_size=32000):
    """
    Takes the specified audio files and shapes them into an array of N by `clip_size`,
    where N is determined by the length of the audio files and `clip_size` at run time.

    Clips longer than `clip_size` are truncated and extended into the N+1 row.
    Clips shorter than `clip_size` are combined with the previous or next clip
    (except for the last clip in `files`, which is ignored if it is too short.)

    Args:
        files (List[str]): A list of filepaths
        clip_size (int): The number of samples (of 16khz audio) for all of the rows in the array

    Returns:
        ndarray: A N by `clip_size` array with the audio data, converted to 16-bit PCM
    """

    # Load audio files; clips that speechbrain's read_audio rejects with a
    # ValueError are silently skipped
    audio_data = []
    for i in files:
        try:
            audio_data.append(read_audio(i))
        except ValueError:
            continue

    # Get shape of output array: total samples across all clips, floor-divided
    # by clip_size (any final remainder smaller than clip_size is dropped)
    N = sum([i.shape[0] for i in audio_data])//clip_size
    X = np.empty((N, clip_size))

    # Fill rows sequentially, carrying any leftover samples from one clip
    # forward onto the front of the next clip
    previous_row_remainder = None
    cnt = 0
    for row in audio_data:
        # np.hstack((None, row)) treats None as an empty contribution on the first pass
        row = np.hstack((previous_row_remainder, row))
        while row.shape[0] >= clip_size:
            X[cnt, :] = row[0:clip_size]
            row = row[clip_size:]
            cnt += 1

        previous_row_remainder = row if row.size > 0 else None

    # Convert to 16-bit PCM data
    # NOTE(review): assumes read_audio returned float samples in [-1, 1] — confirm
    X = (X*32767).astype(np.int16)

    return X
|
||||
|
||||
|
||||
# Dato I/O utils
|
||||
|
||||
|
||||
# Convert clips with sox
|
||||
def _convert_clip(input_file, output_file, backend="ffmpeg"):
|
||||
if backend == "sox":
|
||||
cmd = f"sox \"{input_file}\" -G -r 16000 -c 1 \"{output_file}\""
|
||||
elif backend == "ffmpeg":
|
||||
cmd = f"ffmpeg -y -i \"{input_file}\" -ar 16000 \"{output_file}\""
|
||||
os.system(cmd)
|
||||
return None
|
||||
|
||||
|
||||
def convert_clips(input_files, output_files, sr=16000, ncpu=1, backend="ffmpeg"):
    """
    Converts files in parallel with multithreading using Sox or ffmpeg.

    Intended to only convert input audio files into single-channel, 16 khz clips.

    Args:
        input_files (List[str]): A list of paths to input files
        output_files (List[str]): A list of paths to output files, corresponding 1:1 to the input files
        sr (int): The output sample rate of the converted clip
                  (NOTE(review): currently unused — the 16 khz rate is hard-coded in `_convert_clip`)
        ncpu (int): The number of worker threads to use for the conversion
        backend (str): The utility to use for conversion, "sox" or "ffmpeg"

    Returns:
        None
    """
    # Bind the chosen backend once so each job only needs (input, output)
    convert = partial(_convert_clip, backend=backend)

    # A context manager guarantees the pool's worker threads are cleaned up
    # (the previous version never closed the pool)
    with ThreadPool(processes=ncpu) as pool:
        pool.starmap(convert, zip(input_files, output_files))
|
||||
|
||||
|
||||
def filter_audio_paths(target_dirs, min_length_secs, max_length_secs, duration_method="size", glob_filter=None):
    """
    Gets the paths of wav files in flat target directories, automatically filtering
    out files below/above the specified length (in seconds). Assumes that all
    wav files are sampled at 16khz, are single channel, and have 16-bit PCM data.

    Uses `os.scandir` in Python for highly efficient file system exploration,
    and doesn't require loading the files into memory for length estimation.

    Args:
        target_dirs (List[str]): The target directories containing the audio files
        min_length_secs (float): The minimum length in seconds (otherwise the clip is skipped)
        max_length_secs (float): The maximum length in seconds (otherwise the clip is skipped)
        duration_method (str): Whether to use the file size ('size'), or header information ('header')
                               to estimate the duration of the audio file. 'size' is generally
                               much faster, but assumes that all files in the target directory
                               are the same type, sample rate, and bitrate. If None, durations are not calculated.
        glob_filter (str): A pathlib glob filter string to select specific files within the target directory

    Returns:
        tuple: A list of strings corresponding to the paths of the wav files that met the length criteria,
               and a list of their durations (in seconds)
    """

    file_paths = []
    durations = []
    for target_dir in target_dirs:
        # Per-directory accumulators (duration estimation works per directory,
        # since `estimate_clip_duration` assumes a uniform format within it)
        sizes = []
        dir_paths = []
        if glob_filter:
            dir_paths = [str(i) for i in Path(target_dir).glob(glob_filter)]
            file_paths.extend(dir_paths)
            sizes.extend([os.path.getsize(i) for i in dir_paths])
        else:
            # scandir yields sizes from the directory entry without extra stat calls
            for i in tqdm(os.scandir(target_dir)):
                dir_paths.append(i.path)
                file_paths.append(i.path)
                sizes.append(i.stat().st_size)

        if duration_method == "size":
            # Fast path: extrapolate durations from file sizes alone
            durations.extend(estimate_clip_duration(dir_paths, sizes))

        elif duration_method == "header":
            # Slower path: read each file's header for an exact duration
            durations.extend([get_clip_duration(i) for i in tqdm(dir_paths)])

    # Only filter by length when durations were actually computed
    # (duration_method=None returns all paths with an empty duration list)
    if durations != []:
        filtered = [(i, j) for i, j in zip(file_paths, durations) if j >= min_length_secs and j <= max_length_secs]
        return [i[0] for i in filtered], [i[1] for i in filtered]
    else:
        return file_paths, []
|
||||
|
||||
|
||||
def estimate_clip_duration(audio_files: list, sizes: list):
    """Estimates the duration of each audio file in a list.

    Assumes that all of the audio files have the same audio format,
    bit depth, and sample rate, since the bitrate and header-overhead
    correction factor are computed from the first file only.

    Args:
        audio_files (list): A list of audio file paths
        sizes (list): The size of each audio file in bytes

    Returns:
        list: A list of estimated durations (in seconds) for the audio files
    """
    # Calculate the correction factor from the first file: the difference
    # between the real size (in bits) and bitrate*duration is format overhead
    # (headers/metadata) assumed to be shared by every file in the list.
    # (A previous dead torchaudio.info() call whose result was immediately
    # overwritten has been removed.)
    details = mutagen.File(audio_files[0])
    correction = 8*os.path.getsize(audio_files[0]) - details.info.bitrate*details.info.length

    # Estimate duration for all remaining clips from file size only
    return [(size*8 - correction)/details.info.bitrate for size in sizes]
|
||||
|
||||
|
||||
def estimate_mp3_duration(fpath):
    """Estimates the duration of an MP3 file from metadata and file-size.
    Is only accurate for 16000 khz sample rate audio with a relatively
    constant bit-rate.

    Args:
        fpath (str): The input path to the MP3 file

    Returns:
        float: The duration of the MP3 file in seconds (0 if the file's
        metadata cannot be read, or if its sample rate/channel count is
        not one of the supported combinations)
    """

    # Bytes-to-seconds conversion factors; presumably derived empirically for
    # 16 khz MP3 data (stereo stores twice the data per second) — TODO confirm
    conversion_factors = {
        "16_khz_single_channel": 0.000333318208471784,
        "16_khz_stereo": 0.000333318208471784/2
    }

    duration_seconds = 0
    try:
        md = torchaudio.info(fpath)
    except RuntimeError:
        # Unreadable metadata: report a zero-length clip
        return duration_seconds

    nbytes = os.path.getsize(fpath)
    if md.num_channels == 1:
        if md.sample_rate == 16000:
            duration_seconds = nbytes*conversion_factors["16_khz_single_channel"]
    elif md.num_channels == 2:
        if md.sample_rate == 16000:
            duration_seconds = nbytes*conversion_factors["16_khz_stereo"]

    return duration_seconds
|
||||
|
||||
|
||||
def get_clip_duration(clip):
    """Return an audio clip's duration in seconds, read from its file header.

    Files whose metadata cannot be read are reported as zero-length.
    """
    try:
        info = torchaudio.info(clip)
    except RuntimeError:
        # Unreadable/corrupt header: treat as a zero-length clip
        return 0

    return info.num_frames / info.sample_rate
|
||||
|
||||
|
||||
def get_wav_duration_from_filesize(size, nbytes=2):
    """
    Estimates the duration (in seconds) of a WAV file from its size in bytes,
    assuming 16 khz single-channel audio with a standard 44-byte WAV header.

    Args:
        size (int): The file size in bytes
        nbytes (int): Bytes per sample (e.g., 2 for 16-bit PCM, 4 for 32-bit)

    Returns:
        float: The duration of the audio file in seconds
    """
    header_bytes = 44  # canonical RIFF/WAV header size
    data_bytes = size - header_bytes
    return data_bytes / nbytes / 16000
|
||||
|
||||
|
||||
# Data augmentation utility function
|
||||
def mix_clips_batch(
    foreground_clips: List[str],
    background_clips: List[str],
    combined_size: int,
    labels: List[int] = [],
    batch_size: int = 32,
    snr_low: float = 0,
    snr_high: float = 0,
    start_index: List[int] = [],
    foreground_durations: List[float] = [],
    foreground_truncate_strategy: str = "random",
    rirs: List[str] = [],
    rir_probability: float = 1,
    volume_augmentation: bool = True,
    generated_noise_augmentation: float = 0.0,
    shuffle: bool = True,
    return_sequence_labels: bool = False,
    return_background_clips: bool = False,
    return_background_clips_delay: Tuple[int, int] = (0, 0),
    seed: int = 0
):
    """
    Mixes foreground and background clips at a random SNR level in batches.

    References: https://pytorch.org/audio/main/tutorials/audio_data_augmentation_tutorial.html and
    https://speechbrain.readthedocs.io/en/latest/API/speechbrain.processing.speech_augmentation.html#speechbrain.processing.speech_augmentation.AddNoise

    Args:
        foreground_clips (List[str]): A list of paths to the foreground clips
        background_clips (List[str]): A list of paths to the background clips (randomly selected for each
                                      foreground clip)
        combined_size (int): The total length (in samples) of the combined clip. If needed, the background
                             clips are duplicated or truncated to reach this length.
        labels (List[int]): A list of integer labels corresponding 1:1 for the foreground clips. Will be updated
                            as needed with foreground clips to ensure that mixed clips retain the proper labels.
        batch_size (int): The batch size
        snr_low (float): The low SNR level of the mixing in db
        snr_high (float): The high SNR level of the mixing in db
        start_index (List[int]): The starting position (in samples) for the foreground clip to start in
                                 the background clip. If the foreground clip is longer than `combined_size`
                                 when starting at this point, the foreground clip will be truncated
                                 according to the `foreground_truncate_strategy` argument.
        foreground_durations (List[float]): The desired duration of each foreground clip (in seconds)
        foreground_truncate_strategy (str): The method used to truncate the foreground clip, if needed based on the
                                            `start_index`, `foreground_durations`, and `combined_size` arguments.
                                            See the options in the `truncate_clip` method.
        rirs (List[str]): A list of paths to room impulse response functions (RIR) to convolve with the
                          clips to simulate different recording environments. Applies a single random selection from the
                          list of RIR files to the entire batch. If empty (the default), nothing is done.
        rir_probability (float): The probability (between 0 and 1) that the batch will be convolved with a RIR file.
        volume_augmentation (bool): Whether to randomly apply volume augmentation to the clips in the batch.
                                    This simply scales the data of each clip such that the maximum value is between
                                    0.02 and 1.0 (the floor shouldn't be zero as beyond a certain point the audio data
                                    is no longer valid).
        generated_noise_augmentation (float): The probability of further mixing the mixed clip with generated random
                                              noise. Will be either "white", "brown", "blue", "pink", or "violet"
                                              noise, mixed at a random SNR between `snr_low` and `snr_high`.
        return_background_clips (bool): Whether to return the segment of the background clip that was mixed with each
                                        foreground clip in the batch.
        return_background_clips_delay (Tuple[int, int]): The lower and upper bound of a random delay (in samples)
                                                         to apply to the segment of each returned background clip mixed
                                                         with each foreground clip in the batch. This is primarily
                                                         intended to simulate the drift between input and output
                                                         channels in audio devices, which means that the mixed audio
                                                         is never exactly aligned with the two source clips.
        shuffle (bool): Whether to shuffle the foreground clips before mixing (default: True)
        return_sequence_labels (bool): Whether to return sequence labels (i.e., frame-level labels) for each clip
                                       based on the start/end positions of the foreground clip.
        seed (int): A random seed

    Returns:
        generator: Returns a generator that yields batches of mixed foreground/background audio, labels, and the
                   background segments used for each audio clip (or None if the
                   `return_background_clips` argument is False)
    """
    # Set random seed, if needed (note: seed=0 leaves both RNGs unseeded)
    if seed:
        np.random.seed(seed)
        random.seed(seed)

    # Check and Set start indices, if needed
    if not start_index:
        start_index = [0]*batch_size
    else:
        if min(start_index) < 0:
            raise ValueError("Error! At least one value of the `start_index` argument is <0. Check your inputs.")

    # Make dummy labels
    if not labels:
        labels = [0]*len(foreground_clips)

    # Apply a single shared permutation so clips, start positions, labels,
    # and durations stay aligned after shuffling
    if shuffle:
        p = np.random.permutation(len(foreground_clips))
        foreground_clips = np.array(foreground_clips)[p].tolist()
        start_index = np.array(start_index)[p].tolist()
        labels = np.array(labels)[p].tolist()
        if foreground_durations:
            foreground_durations = np.array(foreground_durations)[p].tolist()

    for i in range(0, len(foreground_clips), batch_size):
        # Load foreground clips/start indices and truncate as needed
        sr = 16000
        start_index_batch = start_index[i:i+batch_size]
        foreground_clips_batch = [read_audio(j) for j in foreground_clips[i:i+batch_size]]
        # Keep only the first channel of any multi-channel clip
        foreground_clips_batch = [j[0] if len(j.shape) > 1 else j for j in foreground_clips_batch]
        if foreground_durations:
            foreground_clips_batch = [truncate_clip(j, int(k*sr), foreground_truncate_strategy)
                                      for j, k in zip(foreground_clips_batch, foreground_durations[i:i+batch_size])]
        labels_batch = np.array(labels[i:i+batch_size])

        # Load background clips and pad/truncate as needed
        background_clips_batch = [read_audio(j) for j in random.sample(background_clips, batch_size)]
        background_clips_batch = [j[0] if len(j.shape) > 1 else j for j in background_clips_batch]
        background_clips_batch_delayed = []
        # One random inter-channel delay shared by the whole batch
        delay = np.random.randint(return_background_clips_delay[0], return_background_clips_delay[1] + 1)
        # NOTE(review): a background clip whose length equals exactly
        # combined_size + delay hits neither branch below, so nothing is
        # appended to background_clips_batch_delayed for it and the delayed
        # list can fall out of alignment with the batch — confirm upstream
        # clip lengths make this impossible
        for ndx, background_clip in enumerate(background_clips_batch):
            if background_clip.shape[0] < (combined_size + delay):
                # Tile short clips until they cover the target length
                repeated = background_clip.repeat(
                    np.ceil((combined_size + delay)/background_clip.shape[0]).astype(np.int32)
                )
                background_clips_batch[ndx] = repeated[0:combined_size]
                background_clips_batch_delayed.append(repeated[0+delay:combined_size + delay].clone())
            elif background_clip.shape[0] > (combined_size + delay):
                # Take a random window from long clips
                r = np.random.randint(0, max(1, background_clip.shape[0] - combined_size - delay))
                background_clips_batch[ndx] = background_clip[r:r + combined_size]
                background_clips_batch_delayed.append(background_clip[r+delay:r + combined_size + delay].clone())

        # Mix clips at snr levels
        snrs_db = np.random.uniform(snr_low, snr_high, batch_size)
        mixed_clips = []
        sequence_labels = []
        for fg, bg, snr, start in zip(foreground_clips_batch, background_clips_batch,
                                      snrs_db, start_index_batch):
            if bg.shape[0] != combined_size:
                raise ValueError(bg.shape)
            mixed_clip = mix_clip(fg, bg, snr, start)
            sequence_labels.append(get_frame_labels(combined_size, start, start+fg.shape[0]))

            # Optionally mix in generated colored noise at a random batch SNR
            if np.random.random() < generated_noise_augmentation:
                noise_color = ["white", "pink", "blue", "brown", "violet"]
                noise_clip = acoustics.generator.noise(combined_size, color=np.random.choice(noise_color))
                noise_clip = torch.from_numpy(noise_clip/noise_clip.max())
                mixed_clip = mix_clip(mixed_clip, noise_clip, np.random.choice(snrs_db), 0)

            mixed_clips.append(mixed_clip)

        mixed_clips_batch = torch.vstack(mixed_clips)
        sequence_labels_batch = torch.from_numpy(np.vstack(sequence_labels))

        # Apply reverberation to the batch (from a single RIR file)
        if rirs:
            if np.random.random() <= rir_probability:
                rir_waveform, sr = torchaudio.load(random.choice(rirs))
                if rir_waveform.shape[0] > 1:
                    # Multi-channel RIR: pick one channel at random
                    rir_waveform = rir_waveform[random.randint(0, rir_waveform.shape[0]-1), :]
                mixed_clips_batch = reverberate(mixed_clips_batch, rir_waveform, rescale_amp="avg")

        # Apply volume augmentation (rescale each clip to a random peak level)
        if volume_augmentation:
            volume_levels = np.random.uniform(0.02, 1.0, mixed_clips_batch.shape[0])
            mixed_clips_batch = (volume_levels/mixed_clips_batch.max(axis=1)[0])[..., None]*mixed_clips_batch
        else:
            # Normalize clips only if max value is outside of [-1, 1]
            abs_max, _ = torch.max(
                torch.abs(mixed_clips_batch), dim=1, keepdim=True
            )
            mixed_clips_batch = mixed_clips_batch / abs_max.clamp(min=1.0)

        # Convert to 16-bit PCM audio
        mixed_clips_batch = (mixed_clips_batch.numpy()*32767).astype(np.int16)

        # Remove any clips that are silent (happens rarely when mixing/reverberating);
        # `error_index` holds the indices of the clips that are KEPT
        error_index = np.where(mixed_clips_batch.max(axis=1) != 0)[0]
        mixed_clips_batch = mixed_clips_batch[error_index]
        labels_batch = labels_batch[error_index]
        sequence_labels_batch = sequence_labels_batch[error_index]

        if not return_background_clips:
            # Three-element tuple: (clips, labels-or-sequence-labels, None)
            yield mixed_clips_batch, labels_batch if not return_sequence_labels else sequence_labels_batch, None
        else:
            background_clips_batch_delayed = (torch.vstack(background_clips_batch_delayed).numpy()
                                              * 32767).astype(np.int16)[error_index]
            yield (mixed_clips_batch,
                   labels_batch if not return_sequence_labels else sequence_labels_batch,
                   background_clips_batch_delayed)
|
||||
|
||||
|
||||
def get_frame_labels(combined_size, start, end, buffer=1):
    """Build a frame-level binary label vector for a mixed clip.

    Marks two frames at the foreground clip's onset and two frames around its
    end as positive; every other frame is zero. Frame positions start at
    sample 12400 and advance in 1280-sample steps (the constants used by the
    feature pipeline in this module).

    Args:
        combined_size (int): Total length of the mixed clip in samples
        start (int): Sample index where the foreground clip begins
        end (int): Sample index where the foreground clip ends
        buffer (int): Unused; kept for backward compatibility

    Returns:
        ndarray: A 1D array of per-frame 0/1 labels
    """
    n_frames = np.ceil((combined_size - 12400)/1280).astype(int)
    frame_labels = np.zeros(n_frames)

    # Sample position associated with each frame
    boundaries = np.arange(12400, combined_size, 1280)

    # Frames nearest to the foreground clip's start/end samples
    onset = np.argmin(abs(boundaries - start))
    offset = np.argmin(abs(boundaries - end))

    frame_labels[onset:onset + 2] = 1
    frame_labels[offset - 1:offset + 1] = 1
    return frame_labels
|
||||
|
||||
|
||||
def mix_clip(fg, bg, snr, start):
    """Mix a foreground clip into a background clip at the given SNR (in dB).

    The foreground is scaled relative to the background's L2 energy and added
    in place beginning at sample `start` — note that `bg` is modified by this
    call. The result is halved before returning.
    """
    fg_energy = fg.norm(p=2)
    bg_energy = bg.norm(p=2)

    # Convert the dB SNR value to a linear amplitude ratio
    linear_snr = 10 ** (snr / 20)
    gain = linear_snr * bg_energy / fg_energy

    segment = slice(start, start + fg.shape[0])
    bg[segment] = bg[segment] + gain * fg
    return bg / 2
|
||||
|
||||
|
||||
def truncate_clip(x, max_size, method="truncate_start"):
    """
    Truncates an audio clip to `max_size` samples with the specified method.

    Clips that are already `max_size` samples or shorter are returned unchanged.

    Args:
        x (ndarray): An array of audio data
        max_size (int): The maximum size (in samples)
        method (str): Can be one of four options:
            - "truncate_start": Remove samples from the start of the clip
            - "truncate_end": Remove samples from the end of the clip
            - "truncate_both": Remove samples from both the start and end of the clip
            - "random": Keep a randomly positioned segment of `max_size` samples

    Returns:
        ndarray: The truncated audio data
    """
    if x.shape[0] <= max_size:
        return x

    if method == "truncate_start":
        x = x[x.shape[0] - max_size:]
    elif method == "truncate_end":
        x = x[0:max_size]
    elif method == "truncate_both":
        # Drop half the excess (rounded up) from the front, then keep exactly
        # max_size samples. (The previous version misplaced np.ceil outside the
        # division and, via x[n:-n] with n == 0, returned an EMPTY array when
        # the clip was exactly one sample too long.)
        n = int(np.ceil((x.shape[0] - max_size) / 2))
        x = x[n:n + max_size]
    elif method == "random":
        rn = np.random.randint(0, x.shape[0] - max_size)
        x = x[rn:rn + max_size]

    return x
|
||||
|
||||
|
||||
# Reverberation data augmentation function
|
||||
# Reverberation data augmentation function
def apply_reverb(x, rir_files):
    """
    Applies reverberation to the input audio clips.

    Args:
        x (nd.array): A numpy array of shape (batch, audio_samples) containing the audio clips
        rir_files (Union[str, list]): Either a path to an RIR (room impulse response) file or a list
                                      of RIR files. If a list, one file will be randomly chosen
                                      to apply to `x`

    Returns:
        nd.array: The reverberated audio clips

    Raises:
        ValueError: If `rir_files` is neither a string nor a list
    """
    if isinstance(rir_files, str):
        # Load the single provided RIR file. (The previous code indexed
        # `rir_files[0]`, which passed only the first *character* of the
        # path to torchaudio.load and always failed.)
        rir_waveform, sr = torchaudio.load(rir_files)
    elif isinstance(rir_files, list):
        rir_waveform, sr = torchaudio.load(random.choice(rir_files))
    else:
        # Previously fell through to an undefined-name error; fail clearly instead
        raise ValueError("`rir_files` must be a file path (str) or a list of file paths")

    # If the RIR file is multi-channel, randomly select a single channel
    # so a single impulse response is applied to the whole batch
    if rir_waveform.shape[0] > 1:
        rir_waveform = rir_waveform[random.randint(0, rir_waveform.shape[0] - 1), :]

    reverbed = reverberate(torch.from_numpy(x), rir_waveform, rescale_amp="avg")

    return reverbed.numpy()
|
||||
|
||||
|
||||
# Load batches of data from mmaped numpy arrays
|
||||
# Load batches of data from mmaped numpy arrays
class mmap_batch_generator:
    """
    A generator class designed to dynamically build batches from mmaped numpy arrays.

    The generator will return tuples of (data, labels) with a batch size determined
    by the `n_per_class` initialization argument. When a mmaped numpy array has been
    fully iterated over, it will restart at the zeroth index automatically.
    """
    def __init__(self,
                 data_files: dict,
                 label_files: dict = {},
                 batch_size: int = 128,
                 n_per_class: dict = {},
                 data_transform_funcs: dict = {},
                 label_transform_funcs: dict = {}
                 ):
        """
        Initialize the generator object

        Args:
            data_files (dict): A dictionary of labels (as keys) and on-disk numpy array paths (as values).
                               Keys should be integer strings representing class labels.
            label_files (dict): A dictionary where the keys are the class labels and the values are the per-example
                                labels. The values must be the same shape as the corresponding numpy data arrays
                                from the `data_files` argument.
            batch_size (int): The number of samples per batch
            n_per_class (dict): A dictionary with integer string labels (as keys) and number of examples per batch
                                (as values). If None (the default), batch sizes for each class will be
                                automatically calculated based on the input dataframe shapes and transformation
                                functions.
            data_transform_funcs (dict): A dictionary of transformation functions to apply to each batch of per class
                                         data loaded from the mmaped array. For example, with an array of shape
                                         (batch, timesteps, features), if the goal is to half the timesteps per example,
                                         (effectively doubling the size of the batch) this function could be passed:

                                         lambda x: np.vstack(
                                             (x[:, 0:timesteps//2, :], x[:, timesteps//2:, :]
                                         ))

                                         The user should incorporate the effect of any transform on the values of the
                                         `n_per_class` argument accordingly, in order to end up with the desired
                                         total batch size for each iteration of the generator.
            label_transform_funcs (dict): A dictionary of transformation functions to apply to each batch of labels.
                                          For example, strings can be mapped to integers or one-hot encoded,
                                          groups of classes can be merged together into one, etc.
        """
        # inputs
        self.data_files = data_files
        self.label_files = label_files
        self.n_per_class = n_per_class
        self.data_transform_funcs = data_transform_funcs
        self.label_transform_funcs = label_transform_funcs

        # Get array mmaps and store their shapes (but load files < 1 GB total size into memory)
        # NOTE(review): data arrays are opened with mmap_mode='r' (lazy), while
        # label arrays are loaded fully into memory — presumably labels are small.
        self.data = {label: np.load(fl, mmap_mode='r') for label, fl in data_files.items()}
        self.labels = {label: np.load(fl) for label, fl in label_files.items()}
        # Per-class read cursor into each mmaped array
        self.data_counter = {label: 0 for label in data_files.keys()}
        self.original_shapes = {label: self.data[label].shape for label in self.data.keys()}
        self.shapes = {label: self.data[label].shape for label in self.data.keys()}

        # # Update effective shape of mmap array based on user-provided transforms (currently broken)
        # for lbl, f in self.data_transform_funcs.items():
        #     dummy_data = np.random.random((1, self.original_shapes[lbl][1], self.original_shapes[lbl][2]))
        #     new_shape = f(dummy_data).shape
        #     self.shapes[lbl] = (new_shape[0]*self.original_shapes[lbl][0], new_shape[1], new_shape[2])

        # Calculate batch sizes, if the user didn't specify them.
        # Each class gets a share of `batch_size` proportional to its share of
        # the total number of examples, divided by the transform's expansion
        # factor (estimated by running the transform on 10 dummy rows).
        scale_factor = 1
        if not self.n_per_class:
            self.n_per_class = {}
            for lbl, shape in self.shapes.items():
                dummy_data = np.random.random((10, self.shapes[lbl][1], self.shapes[lbl][2]))
                if self.data_transform_funcs.get(lbl, None):
                    scale_factor = self.data_transform_funcs.get(lbl, None)(dummy_data).shape[0]/10

                # NOTE(review): `scale_factor` keeps the value from the last class
                # that had a transform — classes without transforms inherit it.
                # Verify this is intended rather than resetting to 1 per class.
                ratio = self.shapes[lbl][0]/sum([i[0] for i in self.shapes.values()])
                self.n_per_class[lbl] = max(1, int(int(batch_size*ratio)/scale_factor))

        # Get estimated batches per epoch, including the effect of any user-provided transforms
        batch_size = sum([val*scale_factor for val in self.n_per_class.values()])
        batches_per_epoch = sum([i[0] for i in self.shapes.values()])//batch_size
        self.batch_per_epoch = batches_per_epoch
        print("Batches/steps per epoch:", batches_per_epoch)

    def __iter__(self):
        # The object is its own iterator (infinite; it never raises StopIteration)
        return self

    def __next__(self):
        # Build batch
        while True:
            X, y = [], []
            for label, n in self.n_per_class.items():
                # Restart at zeroth index if an array reaches the end
                if self.data_counter[label] >= self.shapes[label][0]:
                    self.data_counter[label] = 0
                    # self.data[label] = np.load(self.data_files[label], mmap_mode='r')

                # Get data from mmaped file
                x = self.data[label][self.data_counter[label]:self.data_counter[label]+n]
                self.data_counter[label] += x.shape[0]

                # Transform data
                if self.data_transform_funcs and self.data_transform_funcs.get(label):
                    x = self.data_transform_funcs[label](x)

                # Make labels for data (following whatever the current shape of `x` is)
                # NOTE(review): this slice uses the counter AFTER it was advanced
                # by x.shape[0], so the labels appear offset by one batch relative
                # to the data just read — confirm against training results.
                if self.label_files.get(label, None):
                    y_batch = self.labels[label][self.data_counter[label]:self.data_counter[label]+n]
                else:
                    # No per-example labels: use the class label for every row
                    y_batch = [label]*x.shape[0]

                # Transform labels
                if self.label_transform_funcs and self.label_transform_funcs.get(label):
                    y_batch = self.label_transform_funcs[label](y_batch)

                # Add data to batch
                X.append(x)
                y.extend(y_batch)

            # The `while True` loop always returns on its first pass; it exists
            # only as a syntactic shell around the batch assembly
            return np.vstack(X), np.array(y)
|
||||
|
||||
|
||||
# Function to remove empty rows from the end of a mmap array
|
||||
# Function to remove empty rows from the end of a mmap array
def trim_mmap(mmap_path):
    """
    Trims blank rows from the end of a mmaped numpy array by creating a new mmap array
    without the blank rows. Note that a copy is created and disk usage will briefly
    double as the function runs.

    Args:
        mmap_path (str): The path to the mmap array file (.npy) to trim

    Returns:
        None
    """
    # Identify the last full row in the mmaped file by scanning backwards
    # past trailing all-zero rows
    mmap_file1 = np.load(mmap_path, mmap_mode='r')
    i = -1
    while np.all(mmap_file1[i, :, :] == 0):
        i -= 1

    # `i` is the (negative) index of the last non-zero row, so this is the
    # number of rows to keep
    N_new = mmap_file1.shape[0] + i + 1

    # Create new mmap_file and copy over data in batches.
    # Previously this used `mmap_path.strip(".npy")`, which strips any of the
    # characters '.', 'n', 'p', 'y' from BOTH ends of the path and corrupts
    # names like "n.npy"; splitext removes only the extension.
    output_file2 = os.path.splitext(mmap_path)[0] + "2.npy"
    mmap_file2 = open_memmap(output_file2, mode='w+', dtype=np.float32,
                             shape=(N_new, mmap_file1.shape[1], mmap_file1.shape[2]))

    for i in tqdm(range(0, mmap_file1.shape[0], 1024), total=mmap_file1.shape[0]//1024):
        if i + 1024 > N_new:
            # Final (partial) chunk ends at the trim point
            mmap_file2[i:N_new] = mmap_file1[i:N_new].copy()
            mmap_file2.flush()
        else:
            mmap_file2[i:i+1024] = mmap_file1[i:i+1024].copy()
            mmap_file2.flush()

    # Remove old mmaped file
    os.remove(mmap_path)

    # Rename new mmap file to match original
    os.rename(output_file2, mmap_path)
|
||||
100
venv/lib/python3.12/site-packages/openwakeword/metrics.py
Normal file
100
venv/lib/python3.12/site-packages/openwakeword/metrics.py
Normal file
@@ -0,0 +1,100 @@
|
||||
# Copyright 2022 David Scripka. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Imports
|
||||
import re
|
||||
from tqdm import tqdm
|
||||
import numpy as np
|
||||
from typing import List
|
||||
|
||||
|
||||
# Define metric utility functions specific to the wakeword detection use-case
|
||||
|
||||
def get_false_positives(scores: List, threshold: float, grouping_window: int = 50):
    """
    Counts the number of false-positives based on a list of scores and a specified threshold.

    Consecutive frames above the threshold are grouped: after each 0->1
    transition, up to `grouping_window` subsequent above-threshold frames are
    suppressed so that one activation burst counts as one false positive.

    Args:
        scores (List): A list of predicted scores, between 0 and 1
        threshold (float): The threshold to use to determine false-positive predictions
        grouping_window (int): The size (in number of frames) for grouping scores above
                               the threshold into a single false positive for counting

    Returns:
        int: The number of false positive predictions in the list of scores
    """
    bin_pred = np.array(scores) >= threshold
    bin_pred_string = ''.join(["1" if i else "0" for i in bin_pred])
    transitions = list(re.finditer("01", bin_pred_string))
    n = grouping_window
    for t in transitions:
        # t.end() is the frame right after the first activation of a group.
        # Guard the index: a group starting on the final frame previously
        # caused an IndexError here.
        if t.end() < len(bin_pred) and bin_pred[t.end()] != 0:
            # Window is clipped to the remaining frames; the original code
            # clipped against len(transitions) (the number of matches) instead
            # of the number of frames, making the suppression window wrong.
            window = min(len(bin_pred) - t.end(), n)
            bin_pred[t.end():t.end() + window] = [0]*window

    return sum(bin_pred)
|
||||
|
||||
|
||||
def generate_roc_curve_fprs(
        scores: list,
        n_points: int = 25,
        time_per_prediction: float = .08,
        **kwargs
        ):
    """
    Generates the false positive rate (fpr) per hour for the given predictions
    over a range of score thresholds. Assumes that all predictions should be less than the threshold,
    else the prediction is a false positive.

    Args:
        scores (List): A list of predicted scores, between 0 and 1
        n_points (int): The number of points to use when calculating false positive rates
        time_per_prediction (float): The time (in seconds) that each prediction represents
        kwargs (dict): Any other keyword arguments to pass to the `get_false_positives` function

    Returns:
        list: A list of false positive rates per hour at different score threshold levels
    """
    # Total audio duration represented by the predictions, in hours
    total_hours = len(scores) * time_per_prediction / 3600

    # False positives per hour at each threshold level
    thresholds = np.linspace(0.01, 0.99, num=n_points)
    return [
        get_false_positives(scores, threshold=t, **kwargs) / total_hours
        for t in tqdm(thresholds)
    ]
|
||||
|
||||
|
||||
def generate_roc_curve_tprs(
        scores: list,
        n_points: int = 25
        ):
    """
    Generates the true positive rate (true accept rate) for the given predictions
    over a range of score thresholds. Assumes that all predictions are supposed to be equal to 1.

    Args:
        scores (list): A list of scores for each prediction
        n_points (int): The number of points to use when calculating true positive rates

    Returns:
        list: A list of true positive rates at different score threshold levels
    """
    # The docstring documents a plain list, but `scores >= threshold` below
    # requires elementwise comparison; convert once so lists work as documented
    scores = np.asarray(scores)

    tprs = []
    for threshold in tqdm(np.linspace(0.01, 0.99, num=n_points)):
        tprs.append(sum(scores >= threshold)/len(scores))

    return tprs
|
||||
402
venv/lib/python3.12/site-packages/openwakeword/model.py
Normal file
402
venv/lib/python3.12/site-packages/openwakeword/model.py
Normal file
@@ -0,0 +1,402 @@
|
||||
# Copyright 2022 David Scripka. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Imports
|
||||
import numpy as np
|
||||
import onnxruntime as ort
|
||||
import openwakeword
|
||||
from openwakeword.utils import AudioFeatures
|
||||
|
||||
import wave
|
||||
import os
|
||||
import pickle
|
||||
from collections import deque, defaultdict
|
||||
from functools import partial
|
||||
import time
|
||||
from typing import List, Union, DefaultDict, Dict
|
||||
|
||||
|
||||
# Define main model class
|
||||
# Define main model class
class Model():
    """
    The main model class for openWakeWord. Creates a model object with the shared audio pre-processer
    and for arbitrarily many custom wake word/wake phrase models.
    """
    def __init__(
            self,
            wakeword_model_paths: List[str] = [],
            class_mapping_dicts: List[dict] = [],
            enable_speex_noise_suppression: bool = False,
            vad_threshold: float = 0,
            custom_verifier_models: dict = {},
            custom_verifier_threshold: float = 0.1,
            **kwargs
            ):
        """Initialize the openWakeWord model object.

        Args:
            wakeword_model_paths (List[str]): A list of paths of ONNX models to load into the openWakeWord model object.
                                              If not provided, will load all of the pre-trained models.
            class_mapping_dicts (List[dict]): A list of dictionaries with integer to string class mappings for
                                              each model in the `wakeword_model_paths` arguments
                                              (e.g., {"0": "class_1", "1": "class_2"})
            enable_speex_noise_suppression (bool): Whether to use the noise suppresion from the SpeexDSP
                                                   library to pre-process all incoming audio. May increase
                                                   model performance when reasonably stationary background noise
                                                   is present in the environment where openWakeWord will be used.
                                                   It is very lightweight, so enabling it doesn't significantly
                                                   impact efficiency.
            vad_threshold (float): Whether to use a voice activity detection model (VAD) from Silero
                                   (https://github.com/snakers4/silero-vad) to filter predictions.
                                   For every input audio frame, a VAD score is obtained and only those model predictions
                                   with VAD scores above the threshold will be returned. The default value (0),
                                   disables voice activity detection entirely.
            custom_verifier_models (dict): A dictionary of paths to custom verifier models, where
                                           the keys are the model names (corresponding to the openwakeword.models
                                           attribute) and the values are the filepaths of the
                                           custom verifier models.
            custom_verifier_threshold (float): The score threshold to use a custom verifier model. If the score
                                               from a model for a given frame is greater than this value, the
                                               associated custom verifier model will also predict on that frame, and
                                               the verifier score will be returned.
            kwargs (dict): Any other keyword arguments to pass to the preprocessor instance
        """

        # Initialize the ONNX models and store them
        # (single-threaded sessions: wakeword models are tiny, so extra threads add overhead)
        sessionOptions = ort.SessionOptions()
        sessionOptions.inter_op_num_threads = 1
        sessionOptions.intra_op_num_threads = 1

        # Get model paths for pre-trained models if user doesn't provide models to load
        if wakeword_model_paths == []:
            wakeword_model_paths = openwakeword.get_pretrained_model_paths()
            wakeword_model_names = list(openwakeword.models.keys())
        else:
            # Model name = filename without the 5-character ".onnx" extension
            wakeword_model_names = [os.path.basename(i[0:-5]) for i in wakeword_model_paths]

        # Create attributes to store models and metadata
        self.models = {}                 # model name -> ort.InferenceSession
        self.model_inputs = {}           # model name -> number of input feature frames
        self.model_outputs = {}          # model name -> number of output classes
        self.class_mapping = {}          # model name -> {int-string: class label}
        self.model_input_names = {}      # model name -> ONNX input tensor name
        self.custom_verifier_models = {}  # model name -> unpickled verifier classifier
        self.custom_verifier_threshold = custom_verifier_threshold
        for mdl_path, mdl_name in zip(wakeword_model_paths, wakeword_model_names):
            # Load openwakeword models
            self.models[mdl_name] = ort.InferenceSession(mdl_path, sess_options=sessionOptions,
                                                         providers=["CPUExecutionProvider"])
            self.model_inputs[mdl_name] = self.models[mdl_name].get_inputs()[0].shape[1]
            self.model_outputs[mdl_name] = self.models[mdl_name].get_outputs()[0].shape[1]
            # Class mapping priority: user-provided dict, then package defaults,
            # then an identity mapping over the output indices
            if class_mapping_dicts and class_mapping_dicts[wakeword_model_paths.index(mdl_path)].get(mdl_name, None):
                self.class_mapping[mdl_name] = class_mapping_dicts[wakeword_model_paths.index(mdl_path)]
            elif openwakeword.model_class_mappings.get(mdl_name, None):
                self.class_mapping[mdl_name] = openwakeword.model_class_mappings[mdl_name]
            else:
                self.class_mapping[mdl_name] = {str(i): str(i) for i in range(0, self.model_outputs[mdl_name])}
            self.model_input_names[mdl_name] = self.models[mdl_name].get_inputs()[0].name

            # Load custom verifier models
            # NOTE(review): pickle.load on a user-supplied path — only load
            # verifier files from trusted sources
            if isinstance(custom_verifier_models, dict):
                if custom_verifier_models.get(mdl_name, False):
                    self.custom_verifier_models[mdl_name] = pickle.load(open(custom_verifier_models[mdl_name], 'rb'))

        if len(self.custom_verifier_models.keys()) < len(custom_verifier_models.keys()):
            raise ValueError(
                "Custom verifier models were provided, but some were not matched with a base model!"
                " Make sure that the keys provided in the `custom_verifier_models` dictionary argument"
                " exactly match that of the `.models` attribute of an instantiated openWakeWord Model object"
                " that has the same base models but doesn't have custom verifier models."
            )

        # Create buffer to store frame predictions (last 30 frames per class label)
        self.prediction_buffer: DefaultDict[str, deque] = defaultdict(partial(deque, maxlen=30))

        # Initialize SpeexDSP noise canceller
        # (imported lazily so speexdsp_ns is only required when enabled)
        if enable_speex_noise_suppression:
            from speexdsp_ns import NoiseSuppression
            self.speex_ns = NoiseSuppression.create(160, 16000)
        else:
            self.speex_ns = None

        # Initialize Silero VAD (only when filtering by VAD score is requested)
        self.vad_threshold = vad_threshold
        if vad_threshold > 0:
            self.vad = openwakeword.VAD()

        # Create AudioFeatures object (shared melspectrogram/embedding pre-processor)
        self.preprocessor = AudioFeatures(**kwargs)

    def get_parent_model_from_label(self, label):
        """Gets the parent model associated with a given prediction label"""
        parent_model = ""
        for mdl in self.class_mapping.keys():
            # A label either appears among a model's mapped class names, or is
            # itself a model name (single-output models)
            if label in self.class_mapping[mdl].values():
                parent_model = mdl
            elif label in self.class_mapping.keys() and label == mdl:
                parent_model = mdl

        # Returns "" if no model matches the label
        return parent_model

    def reset(self):
        """Reset the prediction buffer"""
        self.prediction_buffer = defaultdict(partial(deque, maxlen=30))

    def predict(self, x: np.ndarray, patience: dict = {}, threshold: dict = {}, timing: bool = False):
        """Predict with all of the wakeword models on the input audio frames

        Args:
            x (Union[ndarray]): The input audio data to predict on with the models. Should be multiples of 80 ms
                                (1280 samples), with longer lengths reducing overall CPU usage
                                but decreasing detection latency.
            patience (dict): How many consecutive frames (of 1280 samples or 80 ms) above the threshold that must
                             be observed before the current frame will be returned as non-zero.
                             Must be provided as a dictionary where the keys are the
                             model names and the values are the number of frames. Can reduce false-positive
                             detections at the cost of a lower true-positive rate.
                             By default, this behavior is disabled.
            threshold (dict): The threshold values to use when the `patience` behavior is enabled.
                              Must be provided as a dictionary where the keys are the
                              model names and the values are the thresholds.
            timing (bool): Whether to return timing information of the models. Can be useful to debug and
                           assess how efficiently models are running on the current hardware.

        Returns:
            dict: A dictionary of scores between 0 and 1 for each model, where 0 indicates no
                  wake-word/wake-phrase detected. If the `timing` argument is true, returns a
                  tuple of dicts containing model predictions and timing information, respectively.
        """

        # Setup timing dict
        if timing:
            timing_dict: Dict[str, Dict] = {}
            timing_dict["models"] = {}
            feature_start = time.time()

        # Get audio features (optionally with Speex noise suppression);
        # the preprocessor accumulates features in its internal buffer
        if self.speex_ns:
            self.preprocessor(self._suppress_noise_with_speex(x))
        else:
            self.preprocessor(x)

        if timing:
            timing_dict["models"]["preprocessor"] = time.time() - feature_start

        # Get predictions from model(s)
        predictions = {}
        for mdl in self.models.keys():
            input_name = self.model_input_names[mdl]

            if timing:
                model_start = time.time()

            # Run model to get predictions
            if len(x) > 1280:
                # Input spans several 80 ms frames: run the model once per
                # frame position (newest last) and keep the max score, so a
                # wakeword anywhere in the input is not missed
                group_predictions = []
                for i in np.arange(len(x)//1280-1, -1, -1):
                    group_predictions.extend(
                        self.models[mdl].run(
                            None,
                            {input_name: self.preprocessor.get_features(
                                self.model_inputs[mdl],
                                start_ndx=-self.model_inputs[mdl] - i
                            )}
                        )
                    )
                prediction = np.array(group_predictions).max(axis=0)[None, ]
            else:
                prediction = self.models[mdl].run(
                    None,
                    {input_name: self.preprocessor.get_features(self.model_inputs[mdl])}
                )

            # Single-output models report under the model name; multi-class
            # models report one score per mapped class label
            if self.model_outputs[mdl] == 1:
                predictions[mdl] = prediction[0][0][0]
            else:
                for int_label, cls in self.class_mapping[mdl].items():
                    predictions[cls] = prediction[0][0][int(int_label)]

            # Update scores based on custom verifier model
            if self.custom_verifier_models != {}:
                for cls in predictions.keys():
                    if predictions[cls] >= self.custom_verifier_threshold:
                        parent_model = self.get_parent_model_from_label(cls)
                        if self.custom_verifier_models.get(parent_model, False):
                            # Replace the base score with the verifier's
                            # positive-class probability
                            verifier_prediction = self.custom_verifier_models[parent_model].predict_proba(
                                self.preprocessor.get_features(self.model_inputs[mdl])
                            )[0][-1]
                            predictions[cls] = verifier_prediction

            # Update prediction buffer, and zero predictions for first 5 frames during model initialization
            for cls in predictions.keys():
                if len(self.prediction_buffer[cls]) < 5:
                    predictions[cls] = 0.0
                self.prediction_buffer[cls].append(predictions[cls])

            # Get timing information
            if timing:
                timing_dict["models"][mdl] = time.time() - model_start

        # Update scores based on thresholds or patience arguments
        if patience != {}:
            if threshold == {}:
                raise ValueError("Error! When using the `patience` argument, threshold "
                                 "values must be provided via the `threshold` argument!")
            for mdl in predictions.keys():
                parent_model = self.get_parent_model_from_label(mdl)
                if parent_model in patience.keys():
                    # Zero the score unless the last `patience` frames were all
                    # at or above the threshold
                    scores = np.array(self.prediction_buffer[mdl])[-patience[parent_model]:]
                    if (scores >= threshold[parent_model]).sum() < patience[parent_model]:
                        predictions[mdl] = 0.0

        # (optionally) get voice activity detection scores and update model scores
        if self.vad_threshold > 0:
            if timing:
                vad_start = time.time()

            self.vad(x)

            if timing:
                timing_dict["models"]["vad"] = time.time() - vad_start

            # Get frames from last 0.4 to 0.56 seconds (3 frames) before the current
            # frame and get max VAD score
            vad_frames = list(self.vad.prediction_buffer)[-7:-4]
            vad_max_score = np.max(vad_frames) if len(vad_frames) > 0 else 0
            for mdl in predictions.keys():
                if vad_max_score < self.vad_threshold:
                    predictions[mdl] = 0.0

        if timing:
            return predictions, timing_dict
        else:
            return predictions

    def predict_clip(self, clip: Union[str, np.ndarray], padding: int = 1, chunk_size=1280, **kwargs):
        """Predict on a full audio clip, simulating streaming prediction.
        The input clip must be a 16-bit, 16 khz, single-channel WAV file.

        Args:
            clip (Union[str, np.ndarray]): The path to a 16-bit PCM, 16 khz, single-channel WAV file,
                                           or a 1D array containing the same type of data
            padding (int): How many seconds of silence to pad the start/end of the clip with
                            to make sure that short clips can be processed correctly (default: 1)
            chunk_size (int): The size (in samples) of each chunk of audio to pass to the model
            kwargs: Any keyword arguments to pass to the class `predict` method

        Returns:
            list: A list containing the frame-level prediction dictionaries for the audio clip
        """
        if isinstance(clip, str):
            # Load audio clip as 16-bit PCM data
            with wave.open(clip, mode='rb') as f:
                # Load WAV clip frames
                data = np.frombuffer(f.readframes(f.getnframes()), dtype=np.int16)
        elif isinstance(clip, np.ndarray):
            data = clip

        if padding:
            # Pad with `padding` seconds of silence (16000 samples/second)
            data = np.concatenate(
                (
                    np.zeros(16000*padding).astype(np.int16),
                    data,
                    np.zeros(16000*padding).astype(np.int16)
                )
            )

        # Iterate through clip, getting predictions
        predictions = []
        step_size = chunk_size
        for i in range(0, data.shape[0]-step_size, step_size):
            predictions.append(self.predict(data[i:i+step_size], **kwargs))

        return predictions

    def _get_positive_prediction_frames(
            self,
            file: str,
            threshold: float = 0.5,
            return_type: str = "features",
            **kwargs
            ):
        """
        Gets predictions for the input audio data, and returns the audio features (embeddings)
        or audio data for all of the frames with a score above the `threshold` argument.
        Can be a useful way to collect false-positive predictions.

        Args:
            file (str): The path to a 16-bit 16khz WAV audio file to process
            threshold (float): The minimum score required for a frame of audio features
                               to be returned.
            return_type (str): The type of data to return when a positive prediction is
                               detected. Can be either 'features' or 'audio' to return
                               audio embeddings or raw audio data, respectively.
            kwargs: Any keyword arguments to pass to the class `predict` method

        Returns:
            dict: A dictionary with filenames as keys and N x M arrays as values,
                  where N is the number of examples and M is the number
                  of audio features, depending on the model input shape.
        """
        # Load audio clip as 16-bit PCM data
        with wave.open(file, mode='rb') as f:
            # Load WAV clip frames
            data = np.frombuffer(f.readframes(f.getnframes()), dtype=np.int16)

        # Iterate through clip, getting predictions
        positive_data = defaultdict(list)
        step_size = 1280
        for i in range(0, data.shape[0]-step_size, step_size):
            predictions = self.predict(data[i:i+step_size], **kwargs)
            for lbl in predictions.keys():
                if predictions[lbl] >= threshold:
                    mdl = self.get_parent_model_from_label(lbl)
                    features = self.preprocessor.get_features(self.model_inputs[mdl])
                    if return_type == 'features':
                        positive_data[lbl].append(features)
                    if return_type == 'audio':
                        # Keep 3 seconds before and 1 second after the
                        # detection; discard windows clipped by clip edges
                        context = data[max(0, i - 16000*3):i + 16000]
                        if len(context) == 16000*4:
                            positive_data[lbl].append(context)

        positive_data_combined = {}
        for lbl in positive_data.keys():
            positive_data_combined[lbl] = np.vstack(positive_data[lbl])

        return positive_data_combined

    def _suppress_noise_with_speex(self, x: np.ndarray, frame_size: int = 160):
        """
        Runs the input audio through the SpeexDSP noise suppression algorithm.
        Note that this function updates the state of the existing Speex noise
        suppression object, and isn't intended to be called externally.

        Args:
            x (ndarray): The 16-bit, 16khz audio to process. Must always be an
                         integer multiple of `frame_size`.
            frame_size (int): The frame size to use for the Speex Noise suppressor.
                              Must match the frame size specified during the
                              initialization of the noise suppressor.

        Returns:
            ndarray: The input audio with noise suppression applied
        """
        cleaned = []
        for i in range(0, x.shape[0], frame_size):
            chunk = x[i:i+frame_size]
            # Speex operates on raw bytes, one fixed-size frame at a time
            cleaned.append(self.speex_ns.process(chunk.tobytes()))

        cleaned_bytestring = b''.join(cleaned)
        cleaned_array = np.frombuffer(cleaned_bytestring, np.int16)
        return cleaned_array
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
venv/lib/python3.12/site-packages/openwakeword/resources/models/silero_vad.onnx
Executable file
BIN
venv/lib/python3.12/site-packages/openwakeword/resources/models/silero_vad.onnx
Executable file
Binary file not shown.
Binary file not shown.
Binary file not shown.
407
venv/lib/python3.12/site-packages/openwakeword/utils.py
Normal file
407
venv/lib/python3.12/site-packages/openwakeword/utils.py
Normal file
@@ -0,0 +1,407 @@
|
||||
# Copyright 2022 David Scripka. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Imports
|
||||
import os
|
||||
import onnxruntime as ort
|
||||
import numpy as np
|
||||
import pathlib
|
||||
from collections import deque
|
||||
from multiprocessing.pool import ThreadPool
|
||||
from multiprocessing import Process, Queue
|
||||
import time
|
||||
import openwakeword
|
||||
from typing import Union, List, Callable, Deque
|
||||
|
||||
|
||||
# Base class for computing audio features using Google's speech_embedding
|
||||
# model (https://tfhub.dev/google/speech_embedding/1)
|
||||
class AudioFeatures():
|
||||
"""
|
||||
A class for creating audio features from audio data, including melspectograms and Google's
|
||||
`speech_embedding` features.
|
||||
"""
|
||||
def __init__(self,
             melspec_onnx_model_path: str = os.path.join(
                 pathlib.Path(__file__).parent.resolve(),
                 "resources", "models", "melspectrogram.onnx"
             ),
             embedding_onnx_model_path: str = os.path.join(
                 pathlib.Path(__file__).parent.resolve(),
                 "resources", "models", "embedding_model.onnx"
             ),
             sr: int = 16000,
             ncpu: int = 1
             ):
    """
    Initialize the AudioFeatures object.

    Loads the melspectrogram and speech-embedding ONNX models and allocates the
    rolling buffers used by the streaming feature pipeline.

    Args:
        melspec_onnx_model_path (str): The path to the ONNX model for computing melspectograms from audio data
        embedding_onnx_model_path (str): The path to the ONNX model for Google's `speech_embedding` model
        sr (int): The sample rate of the audio (default: 16000 khz)
        ncpu (int): The number of CPUs to use when computing melspectrograms and audio features (default: 1)
    """
    # Initialize the ONNX models
    sessionOptions = ort.SessionOptions()
    sessionOptions.inter_op_num_threads = ncpu
    sessionOptions.intra_op_num_threads = ncpu
    self.melspec_model = ort.InferenceSession(melspec_onnx_model_path, sess_options=sessionOptions,
                                              providers=["CUDAExecutionProvider", "CPUExecutionProvider"])
    self.embedding_model = ort.InferenceSession(embedding_onnx_model_path, sess_options=sessionOptions,
                                                providers=["CUDAExecutionProvider", "CPUExecutionProvider"])
    # The provider onnxruntime actually selected (CUDA when available, else CPU);
    # the batch methods branch on this string to pick GPU batching vs thread pools.
    self.onnx_execution_provider = self.melspec_model.get_providers()[0]

    # Create databuffers
    self.raw_data_buffer: Deque = deque(maxlen=sr*10)  # last 10 seconds of raw samples
    self.melspectrogram_buffer = np.ones((76, 32))  # n_frames x num_features
    self.melspectrogram_max_len = 10*97  # 97 is the number of frames in 1 second of 16 khz audio
    self.accumulated_samples = 0  # the samples added to the buffer since the audio preprocessor was last called
    self.feature_buffer = self._get_embeddings(np.zeros(160000).astype(np.int16))  # fill with blank data to start
    self.feature_buffer_max_len = 120  # ~10 seconds of feature buffer history
|
||||
|
||||
def _get_melspectrogram(self, x: Union[np.ndarray, List], melspec_transform: Callable = lambda x: x/10 + 2):
|
||||
"""
|
||||
Function to compute the mel-spectrogram of the provided audio samples.
|
||||
|
||||
Args:
|
||||
x (Union[np.ndarray, List]): The input audio data to compute the melspectrogram from
|
||||
melspec_transform (Callable): A function to transform the computed melspectrogram. Defaults to a transform
|
||||
that makes the ONNX melspectrogram model closer to the native Tensorflow
|
||||
implementation from Google (https://tfhub.dev/google/speech_embedding/1).
|
||||
|
||||
Return:
|
||||
np.ndarray: The computed melspectrogram of the input audio data
|
||||
"""
|
||||
# Get input data and adjust type/shape as needed
|
||||
x = np.array(x).astype(np.int16) if isinstance(x, list) else x
|
||||
if x.dtype != np.int16:
|
||||
raise ValueError("Input data must be 16-bit integers (i.e., 16-bit PCM audio)."
|
||||
f"You provided {x.dtype} data.")
|
||||
x = x[None, ] if len(x.shape) < 2 else x
|
||||
x = x.astype(np.float32) if x.dtype != np.float32 else x
|
||||
|
||||
# Get melspectrogram
|
||||
outputs = self.melspec_model.run(None, {'input': x})
|
||||
spec = np.squeeze(outputs[0])
|
||||
|
||||
# Arbitrary transform of melspectrogram
|
||||
spec = melspec_transform(spec)
|
||||
|
||||
return spec
|
||||
|
||||
def _get_embeddings_from_melspec(self, melspec):
|
||||
"""
|
||||
Computes the Google `speech_embedding` features from a melspectrogram input
|
||||
|
||||
Args:
|
||||
melspec (np.ndarray): The input melspectrogram
|
||||
|
||||
Returns:
|
||||
np.ndarray: The computed audio features/embeddings
|
||||
"""
|
||||
if melspec.shape[0] != 1:
|
||||
melspec = melspec[None, ]
|
||||
embedding = self.embedding_model.run(None, {'input_1': melspec})[0].squeeze()
|
||||
return embedding
|
||||
|
||||
def _get_embeddings(self, x: np.ndarray, window_size: int = 76, step_size: int = 8, **kwargs):
|
||||
"""Function to compute the embeddings of the provide audio samples."""
|
||||
spec = self._get_melspectrogram(x, **kwargs)
|
||||
windows = []
|
||||
for i in range(0, spec.shape[0], 8):
|
||||
window = spec[i:i+window_size]
|
||||
if window.shape[0] == window_size: # truncate short windows
|
||||
windows.append(window)
|
||||
|
||||
batch = np.expand_dims(np.array(windows), axis=-1).astype(np.float32)
|
||||
embedding = self.embedding_model.run(None, {'input_1': batch})[0].squeeze()
|
||||
return embedding
|
||||
|
||||
def get_embedding_shape(self, audio_length: float, sr: int = 16000):
|
||||
"""Function that determines the size of the output embedding array for a given audio clip length (in seconds)"""
|
||||
x = (np.random.uniform(-1, 1, int(audio_length*sr))*32767).astype(np.int16)
|
||||
return self._get_embeddings(x).shape
|
||||
|
||||
def _get_melspectrogram_batch(self, x, batch_size=128, ncpu=1):
    """
    Compute the melspectrogram of the input audio samples in batches.

    Note that the optimal performance will depend on the interaction between the device,
    batch size, and ncpu (if a CPU device is used). The user is encouraged
    to experiment with different values of these parameters to identify
    which combination is best for their data, as often differences of 1-4x are seen.

    Args:
        x (ndarray): A numpy array of 16 khz input audio data in shape (N, samples).
                     Assumes that all of the audio data is the same length (same number of samples).
        batch_size (int): The batch size to use when computing the melspectrogram
        ncpu (int): The number of CPUs to use when computing the melspectrogram. This argument has
                    no effect if the underlying model is executing on a GPU.

    Returns:
        ndarray: A numpy array of shape (N, frames, melbins) containing the melspectrogram of
                 all N input audio examples
    """
    # Prepare ThreadPool object, if needed for multithreading
    pool = None
    if "CPU" in self.onnx_execution_provider:
        pool = ThreadPool(processes=ncpu)

    # Make batches. One melspectrogram frame per 160 samples, minus 3 frames of context.
    n_frames = int(np.ceil(x.shape[1]/160-3))
    mel_bins = 32  # fixed by melspectrogram model
    melspecs = np.empty((x.shape[0], n_frames, mel_bins), dtype=np.float32)
    # max(batch_size, x.shape[0]) guarantees at least one loop iteration when N < batch_size
    for i in range(0, max(batch_size, x.shape[0]), batch_size):
        batch = x[i:i+batch_size]

        if "CUDA" in self.onnx_execution_provider:
            # GPU path: run the whole batch through the model at once
            result = self._get_melspectrogram(batch)

        elif pool:
            # CPU path: fan the clips out over the thread pool
            result = np.array(pool.map(self._get_melspectrogram,
                              batch, chunksize=batch.shape[0]//ncpu))
        # NOTE(review): if the active provider is neither CUDA nor CPU, `result`
        # is unbound here and the next line raises UnboundLocalError — confirm
        # whether other providers need handling.

        melspecs[i:i+batch_size, :, :] = result.squeeze()

    # Cleanup ThreadPool
    if pool:
        pool.close()

    return melspecs
|
||||
|
||||
def _get_embeddings_batch(self, x, batch_size=128, ncpu=1):
|
||||
"""
|
||||
Compute the embeddings of the input melspectrograms in batches.
|
||||
|
||||
Note that the optimal performance will depend in the interaction between the device,
|
||||
batch size, and ncpu (if a CPU device is used). The user is encouraged
|
||||
to experiment with different values of these parameters to identify
|
||||
which combination is best for their data, as often differences of 1-4x are seen.
|
||||
|
||||
Args:
|
||||
x (ndarray): A numpy array of melspectrograms of shape (N, frames, melbins).
|
||||
Assumes that all of the melspectrograms have the same shape.
|
||||
batch_size (int): The batch size to use when computing the embeddings
|
||||
ncpu (int): The number of CPUs to use when computing the embeddings. This argument has
|
||||
no effect if the underlying model is executing on a GPU.
|
||||
|
||||
Returns:
|
||||
ndarray: A numpy array of shape (N, frames, embedding_dim) containing the embeddings of
|
||||
all N input melspectrograms
|
||||
"""
|
||||
# Ensure input is the correct shape
|
||||
if x.shape[1] < 76:
|
||||
raise ValueError("Embedding model requires the input melspectrograms to have at least 76 frames")
|
||||
|
||||
# Prepare ThreadPool object, if needed for multithreading
|
||||
pool = None
|
||||
if "CPU" in self.onnx_execution_provider:
|
||||
pool = ThreadPool(processes=ncpu)
|
||||
|
||||
# Calculate array sizes and make batches
|
||||
n_frames = (x.shape[1] - 76)//8 + 1
|
||||
embedding_dim = 96 # fixed by embedding model
|
||||
embeddings = np.empty((x.shape[0], n_frames, embedding_dim), dtype=np.float32)
|
||||
|
||||
batch = []
|
||||
ndcs = []
|
||||
for ndx, melspec in enumerate(x):
|
||||
window_size = 76
|
||||
for i in range(0, melspec.shape[0], 8):
|
||||
window = melspec[i:i+window_size]
|
||||
if window.shape[0] == window_size: # ignore windows that are too short (truncates end of clip)
|
||||
batch.append(window)
|
||||
ndcs.append(ndx)
|
||||
|
||||
if len(batch) >= batch_size or ndx+1 == x.shape[0]:
|
||||
batch = np.array(batch).astype(np.float32)
|
||||
if "CUDA" in self.onnx_execution_provider:
|
||||
result = self.embedding_model.run(None, {'input_1': batch})[0].squeeze()
|
||||
|
||||
elif pool:
|
||||
result = np.array(pool.map(self._get_embeddings_from_melspec,
|
||||
batch, chunksize=batch.shape[0]//ncpu))
|
||||
|
||||
for j, ndx2 in zip(range(0, result.shape[0], n_frames), ndcs):
|
||||
embeddings[ndx2, :, :] = result[j:j+n_frames]
|
||||
|
||||
batch = []
|
||||
ndcs = []
|
||||
|
||||
# Cleanup ThreadPool
|
||||
if pool:
|
||||
pool.close()
|
||||
|
||||
return embeddings
|
||||
|
||||
def embed_clips(self, x, batch_size=128, ncpu=1):
|
||||
"""
|
||||
Compute the embeddings of the input audio clips in batches.
|
||||
|
||||
Note that the optimal performance will depend in the interaction between the device,
|
||||
batch size, and ncpu (if a CPU device is used). The user is encouraged
|
||||
to experiment with different values of these parameters to identify
|
||||
which combination is best for their data, as often differences of 1-4x are seen.
|
||||
|
||||
Args:
|
||||
x (ndarray): A numpy array of 16 khz input audio data in shape (N, samples).
|
||||
Assumes that all of the audio data is the same length (same number of samples).
|
||||
batch_size (int): The batch size to use when computing the embeddings
|
||||
ncpu (int): The number of CPUs to use when computing the melspectrogram. This argument has
|
||||
no effect if the underlying model is executing on a GPU.
|
||||
|
||||
Returns:
|
||||
ndarray: A numpy array of shape (N, frames, embedding_dim) containing the embeddings of
|
||||
all N input audio clips
|
||||
"""
|
||||
|
||||
# Compute melspectrograms
|
||||
melspecs = self._get_melspectrogram_batch(x, batch_size=batch_size, ncpu=ncpu)
|
||||
|
||||
# Compute embeddings from melspectrograms
|
||||
embeddings = self._get_embeddings_batch(melspecs[:, :, :, None], batch_size=batch_size, ncpu=ncpu)
|
||||
|
||||
return embeddings
|
||||
|
||||
def _streaming_melspectrogram(self, n_samples):
|
||||
"""Note! There seem to be some slight numerical issues depending on the underlying audio data
|
||||
such that the streaming method is not exactly the same as when the melspectrogram of the entire
|
||||
clip is calculated. It's unclear if this difference is significant and will impact model performance.
|
||||
In particular padding with 0 or very small values seems to demonstrate the differences well.
|
||||
"""
|
||||
self.melspectrogram_buffer = np.vstack(
|
||||
(self.melspectrogram_buffer, self._get_melspectrogram(list(self.raw_data_buffer)[-n_samples-160*3:]))
|
||||
)
|
||||
|
||||
if self.melspectrogram_buffer.shape[0] > self.melspectrogram_max_len:
|
||||
self.melspectrogram_buffer = self.melspectrogram_buffer[-self.melspectrogram_max_len:, :]
|
||||
|
||||
def _buffer_raw_data(self, x):
|
||||
"""
|
||||
Adds raw audio data to the input buffer
|
||||
"""
|
||||
if len(x) < 400:
|
||||
raise ValueError("The number of input frames must be at least 400 samples @ 16khz (25 ms)!")
|
||||
self.raw_data_buffer.extend(x.tolist() if isinstance(x, np.ndarray) else x)
|
||||
|
||||
def _streaming_features(self, x):
    # Stream a new frame of raw audio through the melspectrogram and embedding
    # models, updating `self.feature_buffer` in place.
    # if len(x) != 1280:
    #     raise ValueError("You must provide input samples in frames of 1280 samples @ 1600khz."
    #                      f"Received a frame of {len(x)} samples.")

    # Add raw audio data to buffer
    self._buffer_raw_data(x)
    self.accumulated_samples += len(x)

    # Only calculate melspectrogram every ~0.5 seconds to significantly increase efficiency
    if self.accumulated_samples >= 1280:
        self._streaming_melspectrogram(self.accumulated_samples)

        # Calculate new audio embeddings/features based on update melspectrograms:
        # one embedding per accumulated 1280-sample chunk, oldest chunk first.
        for i in np.arange(self.accumulated_samples//1280-1, -1, -1):
            # Each 1280-sample chunk contributes 8 melspectrogram frames; the slice
            # below selects the 76-frame window ending 8*i frames from the buffer end
            # (a slice end of 0 would be empty, hence the substitution with len()).
            ndx = -8*i
            ndx = ndx if ndx != 0 else len(self.melspectrogram_buffer)
            x = self.melspectrogram_buffer[-76 + ndx:ndx].astype(np.float32)[None, :, :, None]
            if x.shape[1] == 76:  # skip until enough melspectrogram history has accumulated
                self.feature_buffer = np.vstack((self.feature_buffer,
                                                self.embedding_model.run(None, {'input_1': x})[0].squeeze()))

        # Reset raw data buffer counter
        self.accumulated_samples = 0

    # Bound the feature history (~10 seconds; see feature_buffer_max_len)
    if self.feature_buffer.shape[0] > self.feature_buffer_max_len:
        self.feature_buffer = self.feature_buffer[-self.feature_buffer_max_len:, :]
|
||||
|
||||
def get_features(self, n_feature_frames: int = 16, start_ndx: int = -1):
|
||||
if start_ndx != -1:
|
||||
end_ndx = start_ndx + int(n_feature_frames) \
|
||||
if start_ndx + n_feature_frames != 0 else len(self.feature_buffer)
|
||||
return self.feature_buffer[start_ndx:end_ndx, :][None, ].astype(np.float32)
|
||||
else:
|
||||
return self.feature_buffer[int(-1*n_feature_frames):, :][None, ].astype(np.float32)
|
||||
|
||||
def __call__(self, x):
    # Convenience wrapper: stream a new frame of audio through the feature pipeline
    self._streaming_features(x)
|
||||
|
||||
|
||||
# Bulk prediction function
|
||||
def bulk_predict(
                 file_paths: List[str],
                 wakeword_model_paths: List[str],
                 prediction_function: str = 'predict_clip',
                 ncpu: int = 1,
                 **kwargs
                 ):
    """
    Bulk predict on the provided input files in parallel using multiprocessing using the specified model.

    Args:
        file_paths (List[str]): The list of input files to predict
        wakeword_model_paths (List[str]): The paths to the wakeword ONNX model files
        prediction_function (str): The name of the method used to predict on the input audio files
                                   (default is the `predict_clip` method)
        ncpu (int): How many processes to create (up to max of available CPUs)
        kwargs (dict): Any other keyword arguments to pass to the model initialization or
                       specified prediction function

    Returns:
        dict: A dictionary containing the predictions for each file, with the filepath as the key
    """

    # Create openWakeWord model objects, one per chunk of files.
    # Files are split into ncpu near-equal chunks; the remainder files are
    # appended one each to the first `remainder` chunks.
    n_batches = max(1, len(file_paths)//ncpu)
    remainder = len(file_paths) % ncpu
    chunks = [file_paths[i:i+n_batches] for i in range(0, max(1, len(file_paths)-remainder), n_batches)]
    for i in range(1, remainder+1):
        chunks[i-1].append(file_paths[-1*i])

    # Create jobs
    ps = []
    mdls = []
    q: Queue = Queue()
    for chunk in chunks:
        # Only forward kwargs that Model.__init__ actually accepts
        filtered_kwargs = {key: value for key, value in kwargs.items()
                           if key in openwakeword.Model.__init__.__code__.co_varnames}
        oww = openwakeword.Model(
            wakeword_model_paths=wakeword_model_paths,
            **filtered_kwargs
        )
        mdls.append(oww)

        def f(clips):
            # Worker body: predict each clip and put one result list on the queue.
            # NOTE(review): `mdls[-1]` is resolved at call time, after ALL models
            # have been appended, so every worker uses the last model instance.
            # All models are built from the same paths/kwargs so results are the
            # same, but the per-chunk model construction is wasted — confirm.
            results = []
            for clip in clips:
                func = getattr(mdls[-1], prediction_function)
                # Only forward kwargs that the prediction function accepts
                filtered_kwargs = {key: value for key, value in kwargs.items()
                                   if key in func.__code__.co_varnames}
                results.append({clip: func(clip, **filtered_kwargs)})
            q.put(results)

        ps.append(Process(target=f, args=(chunk,)))

    # Submit jobs
    for p in ps:
        p.start()

    # Collect results: each worker puts exactly one item on the queue, so polling
    # once per process drains everything.
    # NOTE(review): processes are never join()ed — confirm cleanup is acceptable.
    results = []
    for p in ps:
        while q.empty():
            time.sleep(0.01)
        results.extend(q.get())

    # Consolidate results and return
    return {list(i.keys())[0]: list(i.values())[0] for i in results}
|
||||
128
venv/lib/python3.12/site-packages/openwakeword/vad.py
Normal file
128
venv/lib/python3.12/site-packages/openwakeword/vad.py
Normal file
@@ -0,0 +1,128 @@
|
||||
# Copyright 2022 David Scripka. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
#######################
|
||||
# Silero VAD License
|
||||
#######################
|
||||
|
||||
# MIT License
|
||||
|
||||
# Copyright (c) 2020-present Silero Team
|
||||
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
########################################
|
||||
|
||||
# This file contains the implementation of a class for voice activity detection (VAD),
|
||||
# based on the pre-trained model from Silero (https://github.com/snakers4/silero-vad).
|
||||
# It can be used as with the openWakeWord library, or independently.
|
||||
|
||||
# Imports
|
||||
import onnxruntime as ort
|
||||
import numpy as np
|
||||
import os
|
||||
from collections import deque
|
||||
|
||||
|
||||
class VAD():
    """
    A model class for a voice activity detection (VAD) based on Silero's model:

    https://github.com/snakers4/silero-vad
    """
    def __init__(self,
                 model_path: str = os.path.join(
                     os.path.dirname(os.path.abspath(__file__)),
                     "resources",
                     "models",
                     "silero_vad.onnx"
                 )
                 ):
        """Initialize the VAD model object.

        Args:
            model_path (str): The path to the Silero VAD ONNX model.
        """

        # Initialize the ONNX model (single-threaded, CPU-only session)
        sessionOptions = ort.SessionOptions()
        sessionOptions.inter_op_num_threads = 1
        sessionOptions.intra_op_num_threads = 1
        self.model = ort.InferenceSession(model_path, sess_options=sessionOptions,
                                          providers=["CPUExecutionProvider"])

        # Create buffer
        self.prediction_buffer: deque = deque(maxlen=125)  # buffer length of 10 seconds

        # Set model parameters: the sample rate is passed to the model as a 0-d int64 array
        self.sample_rate = np.array(16000).astype(np.int64)

        # Reset model to start
        self.reset_states()

    def reset_states(self, batch_size=1):
        # Zero the recurrent (h, c) state tensors that are fed back into the
        # model on every predict() call
        self._h = np.zeros((2, batch_size, 64)).astype('float32')
        self._c = np.zeros((2, batch_size, 64)).astype('float32')
        # NOTE(review): _last_sr and _last_batch_size are reset here but never
        # read elsewhere in this file — confirm whether they are still needed
        self._last_sr = 0
        self._last_batch_size = 0

    def predict(self, x, frame_size=480):
        """
        Get the VAD predictions for the input audio frame.

        Args:
            x (np.ndarray): The input audio, must be 16 khz and 16-bit PCM format.
                            If longer than the input frame, will be split into
                            chunks of length `frame_size` and the predictions for
                            each chunk returned. Must be a length that is integer
                            multiples of the `frame_size` argument.
            frame_size (int): The frame size in samples. The recommended
                              default is 480 samples (30 ms @ 16khz),
                              but smaller and larger values
                              can be used (though performance may decrease).

        Returns
            float: The average predicted score for the audio frame
        """
        # Normalize the 16-bit PCM samples to [-1, 1] floats and split into chunks
        chunks = [(x[i:i+frame_size]/32767).astype(np.float32)
                  for i in range(0, x.shape[0], frame_size)]

        frame_predictions = []
        for chunk in chunks:
            # Carry the recurrent state (h, c) forward between chunks
            ort_inputs = {'input': chunk[None, ],
                          'h': self._h, 'c': self._c, 'sr': self.sample_rate}
            ort_outs = self.model.run(None, ort_inputs)
            out, self._h, self._c = ort_outs
            frame_predictions.append(out[0][0])

        return np.mean(frame_predictions)

    def __call__(self, x, frame_size=160*4):
        # Predict on the frame and append the score to the rolling buffer
        self.prediction_buffer.append(self.predict(x, frame_size))
|
||||
Reference in New Issue
Block a user