Add README

This commit is contained in:
2026-01-09 10:28:44 +11:00
commit edaf914b73
13417 changed files with 2952119 additions and 0 deletions

View File

@@ -0,0 +1,39 @@
import os
from openwakeword.model import Model
from openwakeword.vad import VAD
from openwakeword.custom_verifier_model import train_custom_verifier
__all__ = ['Model', 'VAD', 'train_custom_verifier']
# Base directory holding the pre-trained model files shipped with the package.
# Computed once instead of repeating dirname(abspath(__file__)) for every entry,
# and built with os.path.join throughout for cross-platform path separators.
_models_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources", "models")

# Pre-trained model names mapped to the on-disk paths of their ONNX files
models = {
    "alexa": {
        "model_path": os.path.join(_models_dir, "alexa_v0.1.onnx")
    },
    "hey_mycroft": {
        "model_path": os.path.join(_models_dir, "hey_mycroft_v0.1.onnx")
    },
    "hey_jarvis": {
        "model_path": os.path.join(_models_dir, "hey_jarvis_v0.1.onnx")
    },
    "timer": {
        "model_path": os.path.join(_models_dir, "timer_v0.1.onnx")
    },
    "weather": {
        "model_path": os.path.join(_models_dir, "weather_v0.1.onnx")
    }
}

# For multi-class models, maps the model's output class indices (as strings)
# to human-readable class names
model_class_mappings = {
    "timer": {
        "1": "1_minute_timer",
        "2": "5_minute_timer",
        "3": "10_minute_timer",
        "4": "20_minute_timer",
        "5": "30_minute_timer",
        "6": "1_hour_timer"
    }
}
def get_pretrained_model_paths():
    """Return the on-disk file paths of all bundled pre-trained wake word models."""
    return [entry["model_path"] for entry in models.values()]

View File

@@ -0,0 +1,174 @@
# Copyright 2022 David Scripka. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Imports
import os
from tqdm import tqdm
import collections
import openwakeword
import numpy as np
import scipy
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
# Define functions to prepare data for speaker dependent verifier model
def get_reference_clip_features(
    reference_clip: str,
    oww_model: openwakeword.Model,
    model_name: str,
    threshold: float = 0.5,
    N: int = 3,
    **kwargs
):
    """
    Processes input audio (a 16-bit, 16-khz single-channel WAV file path, or an array of
    raw samples) and gets the openWakeWord audio features that produce a prediction from
    the specified model greater than the threshold value.

    Args:
        reference_clip (str): The target audio file to get features from (a raw sample
                              array is also accepted)
        oww_model (openwakeword.Model): The openWakeWord model object used to get predictions
        model_name (str): The name of the model to get predictions from (should correspond to
                          a python dictionary key in the oww_model.models attribute)
        threshold (float): The minimum score from the model required to capture the associated features
        N (int): How many times to run feature extraction for a given clip, adding some slight variation
                 in the starting position each time to ensure that the features are not identical
        kwargs: Extra keyword arguments forwarded to `oww_model.predict`

    Returns:
        ndarray: A numpy array of shape N x M x L, where N is the number of examples, M is the number
                 of frames in the window, and L is the audio feature/embedding dimension.
    """
    # Explicit submodule import: the module-level `import scipy` does not reliably
    # expose `scipy.io.wavfile` on older SciPy versions.
    import scipy.io.wavfile

    # Collect the feature windows whose prediction scores cleared the threshold
    positive_data = collections.defaultdict(list)

    for _ in range(N):
        # Load the clip from disk, or use the provided raw samples directly
        # (isinstance instead of the previous `type(x) == str` comparison)
        if isinstance(reference_clip, str):
            sr, dat = scipy.io.wavfile.read(reference_clip)
        else:
            dat = reference_clip

        # Random starting offset (< one 1280-sample frame) so repeated passes
        # produce slightly different features
        if N != 1:
            dat = dat[np.random.randint(0, 1280):]

        # Stream the clip through the model in 1280-sample (80 ms) steps
        step_size = 1280
        for i in range(0, dat.shape[0]-step_size, step_size):
            predictions = oww_model.predict(dat[i:i+step_size], **kwargs)
            if predictions[model_name] >= threshold:
                features = oww_model.preprocessor.get_features(  # type: ignore[has-type]
                    oww_model.model_inputs[model_name]  # type: ignore[has-type]
                )
                positive_data[model_name].append(features)

    # If nothing cleared the threshold, return an empty (0, M, 96) array so
    # callers can still np.vstack the results
    if len(positive_data[model_name]) == 0:
        positive_data[model_name].append(
            np.empty((0, oww_model.model_inputs[model_name], 96)))  # type: ignore[has-type]

    return np.vstack(positive_data[model_name])
def flatten_features(x):
    """Flatten each array in `x` to 1D (used as a scikit-learn FunctionTransformer step)."""
    flattened = []
    for arr in x:
        flattened.append(arr.flatten())
    return flattened
def train_verifier_model(features: np.ndarray, labels: np.ndarray):
    """
    Train a logistic regression binary classifier model on the provided features and labels

    Args:
        features (ndarray): A N x M numpy array, where N is the number of examples and M
                            is the number of features
        labels (ndarray): A 1D numpy array where each value corresponds to the label of the Nth
                          example in the `features` argument

    Returns:
        The trained scikit-learn pipeline (flatten -> scale -> logistic regression)
    """
    # The C value matters a lot here depending on dataset size
    # (larger datasets seem to work better with larger C)
    classifier = LogisticRegression(C=0.001, max_iter=2000, random_state=0)
    verifier_pipeline = make_pipeline(
        FunctionTransformer(flatten_features),
        StandardScaler(),
        classifier,
    )
    verifier_pipeline.fit(features, labels)
    return verifier_pipeline
def train_custom_verifier(
    positive_reference_clips: str,
    negative_reference_clips: str,
    output_path: str,
    model_name: str,
    **kwargs
):
    """
    Trains a voice-specific custom verifier model on examples of wake word/phrase speech and other speech
    from a single user.

    Args:
        positive_reference_clips (str): The path to a directory containing single-channel 16khz, 16-bit WAV files
                                        of the target wake word/phrase.
        negative_reference_clips (str): The path to a directory containing single-channel 16khz, 16-bit WAV files
                                        of miscellaneous speech not containing the target wake word/phrase.
        output_path (str): The location to save the trained verifier model (as a pickled scikit-learn model)
        model_name (str): The name or path of the trained openWakeWord model that the verifier model will be
                          based on. If only a name, it must be one of the pre-trained models included in the
                          openWakeWord release.
        kwargs: Any other keyword arguments to pass to the openWakeWord model initialization

    Returns:
        None

    Raises:
        ValueError: If no features could be extracted from the positive reference clips
    """
    # Load target openWakeWord model
    if os.path.exists(model_name):
        oww = openwakeword.Model(
            wakeword_model_paths=[model_name],
            **kwargs
        )
        # Reduce a model file path to its base name without the extension
        # (splitext instead of the previous hard-coded [0:-5] strip, which
        # only worked for 5-character extensions like ".onnx")
        model_name = os.path.splitext(os.path.basename(model_name))[0]
    else:
        oww = openwakeword.Model(**kwargs)

    # Get features from positive reference clips
    positive_features = np.vstack(
        [get_reference_clip_features(i, oww, model_name, N=5)
         for i in tqdm(positive_reference_clips, desc="Processing positive reference clips")]
    )
    if positive_features.shape[0] == 0:
        # Previous message ("The positive features were created!") said the opposite of the problem
        raise ValueError("No positive features were created! Make sure that"
                         " the positive reference clips contain the appropriate audio"
                         " for the desired model")

    # Get features from negative reference clips (threshold=0.0 keeps every frame)
    negative_features = np.vstack(
        [get_reference_clip_features(i, oww, model_name, threshold=0.0, N=1)
         for i in tqdm(negative_reference_clips, desc="Processing negative reference clips")]
    )

    # Train logistic regression model on reference clip features
    print("Training and saving verifier model...")
    lr_model = train_verifier_model(
        np.vstack((positive_features, negative_features)),
        np.array([1]*positive_features.shape[0] + [0]*negative_features.shape[0])
    )

    # Save logistic regression model to the specified output location
    # (context manager ensures the file handle is closed)
    with open(output_path, "wb") as f:
        pickle.dump(lr_model, f)
    print("Done!")

View File

@@ -0,0 +1,712 @@
# Copyright 2022 David Scripka. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# imports
from multiprocessing.pool import ThreadPool
import os
from functools import partial
from pathlib import Path
import random
from tqdm import tqdm
from typing import List, Tuple
import numpy as np
import torch
from numpy.lib.format import open_memmap
from speechbrain.dataio.dataio import read_audio
from speechbrain.processing.signal_processing import reverberate
import torchaudio
import mutagen
import acoustics
# Load audio clips and structure into clips of the same length
def stack_clips(audio_data, clip_size=16000*2):
    """
    Takes an input list of 1D arrays (of different lengths), concatenates them together,
    and then extracts clips of a uniform size by dividing the combined array.

    Args:
        audio_data (List[ndarray]): A list of 1D numpy arrays to combine and stack
        clip_size (int): The desired total length of the uniform clip size (in samples)

    Returns:
        ndarray: A N by `clip_size` array of the input audio. The final chunk is
                 zero-padded to `clip_size`. (Note: no dtype conversion is performed;
                 the previous docstring's claim of 16-bit PCM output was incorrect.)
    """
    # Combine all clips into a single 1D array
    combined_data = np.hstack(audio_data)

    # Slice the combined audio into uniform chunks, zero-padding the last partial chunk
    new_examples = []
    for i in range(0, combined_data.shape[0], clip_size):
        chunk = combined_data[i:i+clip_size]
        if chunk.shape[0] != clip_size:
            chunk = np.hstack((chunk, np.zeros(clip_size - chunk.shape[0])))
        new_examples.append(chunk)

    return np.array(new_examples)
def load_audio_clips(files, clip_size=32000):
    """
    Takes the specified audio files and shapes them into an array of N by `clip_size`,
    where N is determined by the length of the audio files and `clip_size` at run time.
    Clips longer than `clip_size` are truncated and extended into the N+1 row.
    Clips shorter than `clip_size` are combined with the previous or next clip
    (except for the last clip in `files`, which is ignored if it is too short.)

    Args:
        files (List[str]): A list of filepaths
        clip_size (int): The number of samples (of 16khz audio) for all of the rows in the array

    Returns:
        ndarray: A N by `clip_size` array with the audio data, converted to 16-bit PCM
                 (assumes the loaded samples are floats in [-1, 1])
    """
    # Load audio files, skipping any that cannot be read
    audio_data = []
    for i in files:
        try:
            audio_data.append(read_audio(i))
        except ValueError:
            continue

    # Pre-allocate the output array; any trailing audio shorter than clip_size is dropped
    N = sum([i.shape[0] for i in audio_data])//clip_size
    X = np.empty((N, clip_size))

    # Fill rows, carrying any remainder of a clip over into the next row
    previous_row_remainder = None
    cnt = 0
    for row in audio_data:
        # Only prepend the remainder when one exists: np.hstack((None, row)) would
        # produce an object array containing None and break the float assignment below
        if previous_row_remainder is not None:
            row = np.hstack((previous_row_remainder, row))
        while row.shape[0] >= clip_size:
            X[cnt, :] = row[0:clip_size]
            row = row[clip_size:]
            cnt += 1
        previous_row_remainder = row if row.size > 0 else None

    # Convert to 16-bit PCM data
    X = (X*32767).astype(np.int16)
    return X
# Dato I/O utils
# Convert clips with sox
def _convert_clip(input_file, output_file, backend="ffmpeg"):
if backend == "sox":
cmd = f"sox \"{input_file}\" -G -r 16000 -c 1 \"{output_file}\""
elif backend == "ffmpeg":
cmd = f"ffmpeg -y -i \"{input_file}\" -ar 16000 \"{output_file}\""
os.system(cmd)
return None
def convert_clips(input_files, output_files, sr=16000, ncpu=1, backend="ffmpeg"):
    """
    Converts files in parallel with multithreading using Sox or ffmpeg.
    Intended to only convert input audio files into single-channel, 16 khz clips.

    Args:
        input_files (List[str]): A list of paths to input files
        output_files (List[str]): A list of paths to output files, corresponding 1:1 to the input files
        sr (int): The output sample rate of the converted clip
                  (NOTE: currently unused — the backend commands hard-code 16 khz;
                  kept for backward compatibility)
        ncpu (int): The number of CPUs to use for the conversion
        backend (str): The utility to use for conversion, "sox" or "ffmpeg"

    Returns:
        None
    """
    # Fix the backend argument for every conversion job
    f = partial(_convert_clip, backend=backend)

    # Use a context manager so the pool's worker threads are always released
    # (the previous implementation never closed the pool)
    with ThreadPool(processes=ncpu) as pool:
        pool.starmap(f, zip(input_files, output_files))
def filter_audio_paths(target_dirs, min_length_secs, max_length_secs, duration_method="size", glob_filter=None):
    """
    Gets the paths of wav files in flat target directories, automatically filtering
    out files below/above the specified length (in seconds). Assumes that all
    wav files are sampled at 16khz, are single channel, and have 16-bit PCM data.

    Uses `os.scandir` in Python for highly efficient file system exploration,
    and doesn't require loading the files into memory for length estimation.

    Args:
        target_dirs (List[str]): The target directories containing the audio files
        min_length_secs (float): The minimum length in seconds (otherwise the clip is skipped)
        max_length_secs (float): The maximum length in seconds (otherwise the clip is skipped)
        duration_method (str): Whether to use the file size ('size'), or header information ('header')
                               to estimate the duration of the audio file. 'size' is generally
                               much faster, but assumes that all files in the target directory
                               are the same type, sample rate, and bitrate. If None, durations are
                               not calculated (and no length filtering is performed).
        glob_filter (str): A pathlib glob filter string to select specific files within the target directory

    Returns:
        tuple: A list of strings corresponding to the paths of the wav files that met the length criteria,
               and a list of their durations (in seconds)
    """
    file_paths = []
    durations = []
    for target_dir in target_dirs:
        # Per-directory path/size lists: durations are estimated per directory so that
        # the 'size' method's single-file calibration applies to files of the same type
        sizes = []
        dir_paths = []
        if glob_filter:
            dir_paths = [str(i) for i in Path(target_dir).glob(glob_filter)]
            file_paths.extend(dir_paths)
            sizes.extend([os.path.getsize(i) for i in dir_paths])
        else:
            # os.scandir yields entries with cached stat info (no per-file open needed)
            for i in tqdm(os.scandir(target_dir)):
                dir_paths.append(i.path)
                file_paths.append(i.path)
                sizes.append(i.stat().st_size)
        if duration_method == "size":
            # Estimate durations from file sizes (calibrated on the first file of the directory)
            durations.extend(estimate_clip_duration(dir_paths, sizes))
        elif duration_method == "header":
            # Read each file's header for the exact frame count (slower but precise)
            durations.extend([get_clip_duration(i) for i in tqdm(dir_paths)])
    if durations != []:
        # Keep only files whose duration falls within [min_length_secs, max_length_secs]
        filtered = [(i, j) for i, j in zip(file_paths, durations) if j >= min_length_secs and j <= max_length_secs]
        return [i[0] for i in filtered], [i[1] for i in filtered]
    else:
        return file_paths, []
def estimate_clip_duration(audio_files: list, sizes: list):
    """Estimates the duration of each audio file in a list.

    Assumes that all of the audio files have the same audio format,
    bit depth, and sample rate.

    Args:
        audio_files (list): A list of audio file paths
        sizes (list): The size of each audio file in bytes

    Returns:
        list: A list of durations (in seconds) for the audio files
    """
    # Read metadata from the first file and compute a correction factor for
    # non-audio bytes (headers/tags); all remaining durations are then estimated
    # from file size alone. (A dead `torchaudio.info` call whose result was
    # immediately overwritten has been removed here.)
    details = mutagen.File(audio_files[0])
    correction = 8*os.path.getsize(audio_files[0]) - details.info.bitrate*details.info.length

    # duration ~= (audio bits) / bitrate
    durations = []
    for size in sizes:
        durations.append((size*8-correction)/details.info.bitrate)
    return durations
def estimate_mp3_duration(fpath):
    """Estimates the duration of an MP3 file from metadata and file-size.

    Is only accurate for 16000 khz sample rate audio with a relatively
    constant bit-rate.

    Args:
        fpath (str): The input path to the MP3 file

    Returns:
        float: The duration of the MP3 file in seconds (0 if the metadata cannot
               be read, or if the file is not 16 khz mono/stereo)
    """
    # Empirical seconds-per-byte factors for 16 khz MP3 audio
    conversion_factors = {
        "16_khz_single_channel": 0.000333318208471784,
        "16_khz_stereo": 0.000333318208471784/2
    }

    try:
        md = torchaudio.info(fpath)
    except RuntimeError:
        # Unreadable metadata: report zero duration
        return 0

    duration_seconds = 0
    nbytes = os.path.getsize(fpath)
    if md.sample_rate == 16000:
        if md.num_channels == 1:
            duration_seconds = nbytes*conversion_factors["16_khz_single_channel"]
        elif md.num_channels == 2:
            duration_seconds = nbytes*conversion_factors["16_khz_stereo"]
    return duration_seconds
def get_clip_duration(clip):
    """Gets the duration of an audio clip in seconds from file header information"""
    try:
        metadata = torchaudio.info(clip)
    except RuntimeError:
        # Metadata could not be read; treat the clip as zero-length
        return 0
    return metadata.num_frames/metadata.sample_rate
def get_wav_duration_from_filesize(size, nbytes=2):
    """
    Calculates the duration (in seconds) of a WAV file, assuming it contains 16 khz single-channel audio.
    The bit depth is user specified, and defaults to 2 for 16-bit PCM audio.

    Args:
        size (int): The file size in bytes
        nbytes (int): How many bytes for each data point in the audio (e.g., 16-bit is 2, 32-bit is 4, etc.)

    Returns:
        float: The duration of the audio file in seconds
    """
    header_bytes = 44   # standard RIFF/WAV header size
    sample_rate = 16000
    data_bytes = size - header_bytes
    return data_bytes / nbytes / sample_rate
# Data augmentation utility function
def mix_clips_batch(
    foreground_clips: List[str],
    background_clips: List[str],
    combined_size: int,
    labels: List[int] = [],
    batch_size: int = 32,
    snr_low: float = 0,
    snr_high: float = 0,
    start_index: List[int] = [],
    foreground_durations: List[float] = [],
    foreground_truncate_strategy: str = "random",
    rirs: List[str] = [],
    rir_probability: int = 1,
    volume_augmentation: bool = True,
    generated_noise_augmentation: float = 0.0,
    shuffle: bool = True,
    return_sequence_labels: bool = False,
    return_background_clips: bool = False,
    return_background_clips_delay: Tuple[int, int] = (0, 0),
    seed: int = 0
):
    """
    Mixes foreground and background clips at a random SNR level in batches.

    References: https://pytorch.org/audio/main/tutorials/audio_data_augmentation_tutorial.html and
    https://speechbrain.readthedocs.io/en/latest/API/speechbrain.processing.speech_augmentation.html#speechbrain.processing.speech_augmentation.AddNoise

    Args:
        foreground_clips (List[str]): A list of paths to the foreground clips
        background_clips (List[str]): A list of paths to the background clips (randomly selected for each
                                      foreground clip)
        combined_size (int): The total length (in samples) of the combined clip. If needed, the background
                             clips are duplicated or truncated to reach this length.
        labels (List[int]): A list of integer labels corresponding 1:1 for the foreground clips. Will be updated
                            as needed with foreground clips to ensure that mixed clips retain the proper labels.
        batch_size (int): The batch size
        snr_low (float): The low SNR level of the mixing in db
        snr_high (float): The high snr level of the mixing in db
        start_index (List[int]): The starting position (in samples) for the foreground clip to start in
                                 the background clip. If the foreground clip is longer than `combined_size`
                                 when starting at this point, the foreground clip will be truncated
                                 according to the `foreground_truncate_strategy` argument.
        foreground_durations (List[float]): The desired duration of each foreground clip (in seconds)
        foreground_truncate_strategy (str): The method used to truncate the foreground clip, if needed based on the
                                            `start_index`, `foreground_durations`, and `combined_size` arguments.
                                            See the options in the `truncate_clip` method.
        rirs (List[str]): A list of paths to room impulse response functions (RIR) to convolve with the
                          clips to simulate different recording environments. Applies a single random selection
                          from the list RIR file to the entire batch. If empty (the default), nothing is done.
        rir_probability (float): The probability (between 0 and 1) that the batch will be convolved with a RIR file.
        volume_augmentation (bool): Whether to randomly apply volume augmentation to the clips in the batch.
                                    This simply scales the data of each clip such that the maximum value is between
                                    0.02 and 1.0 (the floor shouldn't be zero as beyond a certain point the audio
                                    data is no longer valid).
        generated_noise_augmentation: The probability of further mixing the mixed clip with generated random noise.
                                      Will be either "white", "brown", "blue", "pink", or "violet" noise, mixed at a
                                      random SNR between `snr_low` and `snr_high`.
        return_background_clips (bool): Whether to return the segment of the background clip that was mixed with each
                                        foreground clip in the batch.
        return_background_clips_delay (Tuple(int)): The lower and upper bound of a random delay (in samples)
                                                    to apply to the segment of each returned background clip mixed
                                                    with each foreground clip in the batch. This is primarily
                                                    intended to simulate the drift between input and output channels
                                                    in audio devices, which means that the mixed audio is never
                                                    exactly aligned with the two source clips.
        shuffle (bool): Whether to shuffle the foreground clips before mixing (default: True)
        return_sequence_labels (bool): Whether to return sequence labels (i.e., frame-level labels) for each clip
                                       based on the start/end positions of the foreground clip.
        seed (int): A random seed

    Returns:
        generator: Returns a generator that yields batches of mixed foreground/background audio, labels, and the
                   background segments used for each audio clip (or None if the
                   `return_background_clips` argument is False)
    """
    # Set random seed, if needed (both numpy and stdlib `random` are used below)
    if seed:
        np.random.seed(seed)
        random.seed(seed)

    # Check and set start indices, if needed
    # NOTE(review): the default fills only `batch_size` entries, so with more than one
    # batch of foreground clips the later slices of `start_index` are empty — confirm
    # callers always pass a full-length `start_index` for multi-batch use.
    if not start_index:
        start_index = [0]*batch_size
    else:
        if min(start_index) < 0:
            raise ValueError("Error! At least one value of the `start_index` argument is <0. Check your inputs.")

    # Make dummy labels when none are provided
    if not labels:
        labels = [0]*len(foreground_clips)

    # Shuffle foreground clips together with their aligned metadata
    if shuffle:
        p = np.random.permutation(len(foreground_clips))
        foreground_clips = np.array(foreground_clips)[p].tolist()
        start_index = np.array(start_index)[p].tolist()
        labels = np.array(labels)[p].tolist()
        if foreground_durations:
            foreground_durations = np.array(foreground_durations)[p].tolist()

    for i in range(0, len(foreground_clips), batch_size):
        # Load foreground clips/start indices and truncate as needed
        sr = 16000  # assumed sample rate of all clips
        start_index_batch = start_index[i:i+batch_size]
        foreground_clips_batch = [read_audio(j) for j in foreground_clips[i:i+batch_size]]
        # Keep only the first channel of any multi-channel clip
        foreground_clips_batch = [j[0] if len(j.shape) > 1 else j for j in foreground_clips_batch]
        if foreground_durations:
            foreground_clips_batch = [truncate_clip(j, int(k*sr), foreground_truncate_strategy)
                                      for j, k in zip(foreground_clips_batch, foreground_durations[i:i+batch_size])]
        labels_batch = np.array(labels[i:i+batch_size])

        # Load random background clips and pad/truncate each to `combined_size`
        background_clips_batch = [read_audio(j) for j in random.sample(background_clips, batch_size)]
        background_clips_batch = [j[0] if len(j.shape) > 1 else j for j in background_clips_batch]
        background_clips_batch_delayed = []
        # Single random drift (in samples) applied to the whole batch's returned background segments
        delay = np.random.randint(return_background_clips_delay[0], return_background_clips_delay[1] + 1)
        for ndx, background_clip in enumerate(background_clips_batch):
            if background_clip.shape[0] < (combined_size + delay):
                # Tile short clips until they are long enough
                repeated = background_clip.repeat(
                    np.ceil((combined_size + delay)/background_clip.shape[0]).astype(np.int32)
                )
                background_clips_batch[ndx] = repeated[0:combined_size]
                background_clips_batch_delayed.append(repeated[0+delay:combined_size + delay].clone())
            elif background_clip.shape[0] > (combined_size + delay):
                # Take a random window from long clips
                r = np.random.randint(0, max(1, background_clip.shape[0] - combined_size - delay))
                background_clips_batch[ndx] = background_clip[r:r + combined_size]
                background_clips_batch_delayed.append(background_clip[r+delay:r + combined_size + delay].clone())
            # NOTE(review): a clip of exactly combined_size + delay samples matches neither
            # branch, so no delayed segment is appended for it — confirm this case cannot
            # occur (it would misalign `background_clips_batch_delayed`).

        # Mix clips at snr levels
        snrs_db = np.random.uniform(snr_low, snr_high, batch_size)
        mixed_clips = []
        sequence_labels = []
        for fg, bg, snr, start in zip(foreground_clips_batch, background_clips_batch,
                                      snrs_db, start_index_batch):
            if bg.shape[0] != combined_size:
                raise ValueError(bg.shape)
            # mix_clip mutates `bg` in place and returns the combined clip
            mixed_clip = mix_clip(fg, bg, snr, start)
            sequence_labels.append(get_frame_labels(combined_size, start, start+fg.shape[0]))
            # Optionally mix in generated colored noise at a random SNR from this batch
            if np.random.random() < generated_noise_augmentation:
                noise_color = ["white", "pink", "blue", "brown", "violet"]
                noise_clip = acoustics.generator.noise(combined_size, color=np.random.choice(noise_color))
                noise_clip = torch.from_numpy(noise_clip/noise_clip.max())
                mixed_clip = mix_clip(mixed_clip, noise_clip, np.random.choice(snrs_db), 0)
            mixed_clips.append(mixed_clip)

        mixed_clips_batch = torch.vstack(mixed_clips)
        sequence_labels_batch = torch.from_numpy(np.vstack(sequence_labels))

        # Apply reverberation to the batch (from a single RIR file)
        if rirs:
            if np.random.random() <= rir_probability:
                rir_waveform, sr = torchaudio.load(random.choice(rirs))
                if rir_waveform.shape[0] > 1:
                    # Pick one channel at random from a multi-channel RIR
                    rir_waveform = rir_waveform[random.randint(0, rir_waveform.shape[0]-1), :]
                mixed_clips_batch = reverberate(mixed_clips_batch, rir_waveform, rescale_amp="avg")

        # Apply volume augmentation (random per-clip peak between 0.02 and 1.0)
        if volume_augmentation:
            volume_levels = np.random.uniform(0.02, 1.0, mixed_clips_batch.shape[0])
            mixed_clips_batch = (volume_levels/mixed_clips_batch.max(axis=1)[0])[..., None]*mixed_clips_batch
        else:
            # Normalize clips only if max value is outside of [-1, 1]
            abs_max, _ = torch.max(
                torch.abs(mixed_clips_batch), dim=1, keepdim=True
            )
            mixed_clips_batch = mixed_clips_batch / abs_max.clamp(min=1.0)

        # Convert to 16-bit PCM audio
        mixed_clips_batch = (mixed_clips_batch.numpy()*32767).astype(np.int16)

        # Remove any clips that are silent (happens rarely when mixing/reverberating)
        error_index = np.where(mixed_clips_batch.max(axis=1) != 0)[0]
        mixed_clips_batch = mixed_clips_batch[error_index]
        labels_batch = labels_batch[error_index]
        sequence_labels_batch = sequence_labels_batch[error_index]

        if not return_background_clips:
            yield mixed_clips_batch, labels_batch if not return_sequence_labels else sequence_labels_batch, None
        else:
            background_clips_batch_delayed = (torch.vstack(background_clips_batch_delayed).numpy()
                                              * 32767).astype(np.int16)[error_index]
            yield (mixed_clips_batch,
                   labels_batch if not return_sequence_labels else sequence_labels_batch,
                   background_clips_batch_delayed)
def get_frame_labels(combined_size, start, end, buffer=1):
    """Build a per-frame binary label vector marking the start and end of a foreground clip.

    Frames are 1280 samples wide, with frame positions beginning at sample 12400.
    Two frames at the nearest start position and two frames around the nearest end
    position are set to 1.

    Args:
        combined_size (int): Total length of the mixed clip (in samples)
        start (int): Sample index where the foreground clip begins
        end (int): Sample index where the foreground clip ends
        buffer (int): Unused (kept for backward compatibility)

    Returns:
        ndarray: A 1D array of per-frame labels (0.0 or 1.0)
    """
    n_frames = np.ceil((combined_size-12400)/1280).astype(int)
    labels = np.zeros(n_frames)
    frame_positions = np.arange(12400, combined_size, 1280)
    nearest_start = np.argmin(abs(frame_positions - start))
    nearest_end = np.argmin(abs(frame_positions - end))
    labels[nearest_start:nearest_start+2] = 1
    labels[nearest_end-1:nearest_end+1] = 1
    return labels
def mix_clip(fg, bg, snr, start):
    """Mix a foreground clip into a background clip at the given SNR (in dB).

    The foreground is scaled relative to the background's L2 norm, added in place
    beginning at sample `start`, and the combined signal is halved to reduce clipping.

    Note: `bg` is modified in place.
    """
    snr_linear = 10 ** (snr / 20)
    fg_energy = fg.norm(p=2)
    bg_energy = bg.norm(p=2)
    gain = snr_linear * bg_energy / fg_energy
    segment = slice(start, start + fg.shape[0])
    bg[segment] = bg[segment] + gain * fg
    return bg / 2
def truncate_clip(x, max_size, method="truncate_start"):
    """
    Truncates an audio clip with the specified method

    Args:
        x (nd.array): An array of audio data
        max_size (int): The maximum size (in samples)
        method (str): Can be one of four options:
            - "truncate_start": Truncate the start of the clip
            - "truncate_end": Truncate the end of the clip
            - "truncate_both": Truncate both the start and end of the clip
            - "random": Randomly select a segment of the right size from the clip

    Returns:
        nd.array: The truncated audio data (returned unchanged if already <= max_size)
    """
    if x.shape[0] > max_size:
        if method == "truncate_start":
            x = x[x.shape[0] - max_size:]
        elif method == "truncate_end":
            x = x[0:max_size]
        elif method == "truncate_both":
            # Drop half the excess from the start and the remainder from the end.
            # (The previous implementation computed int(np.ceil(n)/2) and sliced
            # x[n:-n], which returned an EMPTY array whenever the excess was 1.)
            n = (x.shape[0] - max_size) // 2
            x = x[n:n + max_size]
        elif method == "random":
            rn = np.random.randint(0, x.shape[0] - max_size)
            x = x[rn:rn + max_size]
    return x
# Reverberation data augmentation function
def apply_reverb(x, rir_files):
    """
    Applies reverberation to the input audio clips

    Args:
        x (nd.array): A numpy array of shape (batch, audio_samples) containing the audio clips
        rir_files (Union[str, list]): Either a path to an RIR (room impulse response) file or a list
                                      of RIR files. If a list, one file will be randomly chosen
                                      to apply to `x`

    Returns:
        nd.array: The reverberated audio clips

    Raises:
        ValueError: If `rir_files` is neither a string nor a list
    """
    # Select and load the RIR file
    if isinstance(rir_files, str):
        # Previously this loaded `rir_files[0]` — the first *character* of the path
        rir_waveform, sr = torchaudio.load(rir_files)
    elif isinstance(rir_files, list):
        rir_waveform, sr = torchaudio.load(random.choice(rir_files))
    else:
        # Previously an unsupported type fell through to a NameError on `rir_waveform`
        raise ValueError("`rir_files` must be a file path (str) or a list of file paths")

    # If the RIR is multi-channel, pick one channel at random
    if rir_waveform.shape[0] > 1:
        rir_waveform = rir_waveform[random.randint(0, rir_waveform.shape[0]-1), :]

    # Apply reverberation to the batch (from the single selected RIR file)
    reverbed = reverberate(torch.from_numpy(x), rir_waveform, rescale_amp="avg")
    return reverbed.numpy()
# Load batches of data from mmaped numpy arrays
class mmap_batch_generator:
"""
A generator class designed to dynamically build batches from mmaped numpy arrays.
The generator will return tuples of (data, labels) with a batch size determined
by the `n_per_class` initialization argument. When a mmaped numpy array has been
fully interated over, it will restart at the zeroth index automatically.
"""
def __init__(self,
data_files: dict,
label_files: dict = {},
batch_size: int = 128,
n_per_class: dict = {},
data_transform_funcs: dict = {},
label_transform_funcs: dict = {}
):
"""
Initialize the generator object
Args:
data_files (dict): A dictionary of labels (as keys) and on-disk numpy array paths (as values).
Keys should be integer strings representing class labels.
label_files (dict): A dictionary where the keys are the class labels and the values are the per-example
labels. The values must be the same shape as the correponding numpy data arrays
from the `data_files` argument.
batch_size (int): The number of samples per batch
n_per_class (dict): A dictionary with integer string labels (as keys) and number of example per batch
(as values). If None (the default), batch sizes for each class will be
automatically calculated based on the the input dataframe shapes and transformation
functions.
data_transform_funcs (dict): A dictionary of transformation functions to apply to each batch of per class
data loaded from the mmaped array. For example, with an array of shape
(batch, timesteps, features), if the goal is to half the timesteps per example,
(effectively doubling the size of the batch) this function could be passed:
lambda x: np.vstack(
(x[:, 0:timesteps//2, :], x[:, timesteps//2:, :]
))
The user should incorporate the effect of any transform on the values of the
`n_per_class` argument accordingly, in order to end of with the desired
total batch size for each iteration of the generator.
label_transform_funcs (dict): A dictionary of transformation functions to apply to each batch of labels.
For example, strings can be mapped to integers or one-hot encoded,
groups of classes can be merged together into one, etc.
"""
# inputs
self.data_files = data_files
self.label_files = label_files
self.n_per_class = n_per_class
self.data_transform_funcs = data_transform_funcs
self.label_transform_funcs = label_transform_funcs
# Get array mmaps and store their shapes (but load files < 1 GB total size into memory)
self.data = {label: np.load(fl, mmap_mode='r') for label, fl in data_files.items()}
self.labels = {label: np.load(fl) for label, fl in label_files.items()}
self.data_counter = {label: 0 for label in data_files.keys()}
self.original_shapes = {label: self.data[label].shape for label in self.data.keys()}
self.shapes = {label: self.data[label].shape for label in self.data.keys()}
# # Update effective shape of mmap array based on user-provided transforms (currently broken)
# for lbl, f in self.data_transform_funcs.items():
# dummy_data = np.random.random((1, self.original_shapes[lbl][1], self.original_shapes[lbl][2]))
# new_shape = f(dummy_data).shape
# self.shapes[lbl] = (new_shape[0]*self.original_shapes[lbl][0], new_shape[1], new_shape[2])
# Calculate batch sizes, if the user didn't specify them
scale_factor = 1
if not self.n_per_class:
self.n_per_class = {}
for lbl, shape in self.shapes.items():
dummy_data = np.random.random((10, self.shapes[lbl][1], self.shapes[lbl][2]))
if self.data_transform_funcs.get(lbl, None):
scale_factor = self.data_transform_funcs.get(lbl, None)(dummy_data).shape[0]/10
ratio = self.shapes[lbl][0]/sum([i[0] for i in self.shapes.values()])
self.n_per_class[lbl] = max(1, int(int(batch_size*ratio)/scale_factor))
# Get estimated batches per epoch, including the effect of any user-provided transforms
batch_size = sum([val*scale_factor for val in self.n_per_class.values()])
batches_per_epoch = sum([i[0] for i in self.shapes.values()])//batch_size
self.batch_per_epoch = batches_per_epoch
print("Batches/steps per epoch:", batches_per_epoch)
def __iter__(self):
return self
def __next__(self):
    """Assemble and return the next training batch.

    Draws ``self.n_per_class[label]`` consecutive rows from each label's
    (possibly memory-mapped) data array, applies any user-provided data and
    label transforms, and returns the stacked batch. Each per-label cursor
    wraps back to zero when its array is exhausted, so iteration never stops.

    Returns:
        tuple: ``(X, y)`` where ``X`` is an ndarray of vertically stacked
        examples from all labels and ``y`` is an ndarray of the matching labels.
    """
    X, y = [], []
    for label, n in self.n_per_class.items():
        # Restart at the zeroth index if an array reaches the end
        if self.data_counter[label] >= self.shapes[label][0]:
            self.data_counter[label] = 0

        # Get data from the mmaped file, remembering the slice start so that
        # the labels below are taken from the *same* rows. (Advancing the
        # counter before slicing the labels would misalign data and labels
        # by one batch.)
        start = self.data_counter[label]
        x = self.data[label][start:start + n]
        n_rows = x.shape[0]  # may be < n at the end of the array
        self.data_counter[label] += n_rows

        # Transform data
        if self.data_transform_funcs and self.data_transform_funcs.get(label):
            x = self.data_transform_funcs[label](x)

        # Make labels for data (following whatever the current shape of `x` is)
        if self.label_files.get(label, None):
            # Labels aligned with the raw (pre-transform) rows just read
            y_batch = self.labels[label][start:start + n_rows]
        else:
            y_batch = [label] * x.shape[0]

        # Transform labels
        if self.label_transform_funcs and self.label_transform_funcs.get(label):
            y_batch = self.label_transform_funcs[label](y_batch)

        # Add data to batch
        X.append(x)
        y.extend(y_batch)

    return np.vstack(X), np.array(y)
# Function to remove empty rows from the end of a mmap array
def trim_mmap(mmap_path):
    """
    Trims blank rows from the end of a mmaped numpy array by creating a new mmap
    array without the blank rows.
    Note that a copy is created and disk usage will briefly double as the function runs.

    Args:
        mmap_path (str): The path to the mmap array file (a ``.npy`` file) to trim

    Returns:
        None
    """
    from numpy.lib.format import open_memmap  # local import; only needed here

    # Identify the last non-blank row in the mmaped file, guarding against an
    # all-zero array (which would otherwise scan past the first row and raise)
    mmap_file1 = np.load(mmap_path, mmap_mode='r')
    i = -1
    while -i <= mmap_file1.shape[0] and np.all(mmap_file1[i, :, :] == 0):
        i -= 1
    N_new = mmap_file1.shape[0] + i + 1

    # Build the temporary filename by replacing the ".npy" suffix.
    # (str.strip(".npy") removes *characters* from both ends and would corrupt
    # names such as "happy.npy" -> "happ".)
    if mmap_path.endswith(".npy"):
        output_file2 = mmap_path[:-len(".npy")] + "2.npy"
    else:
        output_file2 = mmap_path + "2.npy"

    # Create new mmap file and copy over data in batches
    mmap_file2 = open_memmap(output_file2, mode='w+', dtype=np.float32,
                             shape=(N_new, mmap_file1.shape[1], mmap_file1.shape[2]))
    for j in tqdm(range(0, N_new, 1024), total=int(np.ceil(N_new/1024))):
        mmap_file2[j:j+1024] = mmap_file1[j:min(j + 1024, N_new)].copy()
        mmap_file2.flush()

    # Release the read handle before deleting the file (required on Windows),
    # remove the old mmaped file, and rename the new file to match the original
    del mmap_file1
    os.remove(mmap_path)
    os.rename(output_file2, mmap_path)

View File

@@ -0,0 +1,100 @@
# Copyright 2022 David Scripka. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Imports
import re
from tqdm import tqdm
import numpy as np
from typing import List
# Define metric utility functions specific to the wakeword detection use-case
def get_false_positives(scores: List, threshold: float, grouping_window: int = 50):
    """
    Counts the number of false-positives based on a list of scores and a specified threshold.

    Args:
        scores (List): A list of predicted scores, between 0 and 1
        threshold (float): The threshold to use to determine false-positive predictions
        grouping_window (int): The size (in number of frames) for grouping scores above
                               the threshold into a single false positive for counting

    Returns:
        int: The number of false positive predictions in the list of scores
    """
    bin_pred = np.array(scores) >= threshold
    bin_pred_string = ''.join(["1" if i else "0" for i in bin_pred])

    # Each "01" substring marks the onset of a run of above-threshold scores.
    # For every onset, zero out the following `grouping_window` frames so that
    # an entire detection "event" is counted as a single false positive
    # (the onset frame itself is kept and counted by the final sum).
    transitions = list(re.finditer("01", bin_pred_string))
    n = grouping_window
    for t in transitions:
        # t.end() is the frame *after* the onset frame; guard against an onset
        # at the very end of the clip, which would index past the array
        if t.end() < len(bin_pred) and bin_pred[t.end()] != 0:
            # The window must be bounded by the length of the score array,
            # not by the number of transitions
            stop = min(t.end() + n, len(bin_pred))
            bin_pred[t.end():stop] = False
    return sum(bin_pred)
def generate_roc_curve_fprs(
        scores: list,
        n_points: int = 25,
        time_per_prediction: float = .08,
        **kwargs
        ):
    """
    Computes the false positive rate (fpr) per hour for the given predictions
    across a sweep of score thresholds. Assumes that all predictions should be
    below the threshold; any score at or above it is a false positive.

    Args:
        scores (List): A list of predicted scores, between 0 and 1
        n_points (int): The number of points to use when calculating false positive rates
        time_per_prediction (float): The time (in seconds) that each prediction represents
        kwargs (dict): Any other keyword arguments to pass to the `get_false_positives` function

    Returns:
        list: A list of false positive rates per hour at different score threshold levels
    """
    # Total duration represented by the scores, in hours
    total_hours = len(scores) * time_per_prediction / 3600

    # Sweep thresholds and convert false-positive counts into hourly rates
    thresholds = np.linspace(0.01, 0.99, num=n_points)
    return [
        get_false_positives(scores, threshold=t, **kwargs) / total_hours
        for t in tqdm(thresholds)
    ]
def generate_roc_curve_tprs(
        scores: list,
        n_points: int = 25
        ):
    """
    Generates the true positive rate (true accept rate) for the given predictions
    over a range of score thresholds. Assumes that all predictions are supposed to be equal to 1.

    Args:
        scores (list): A list of scores for each prediction
        n_points (int): The number of score thresholds to evaluate

    Returns:
        list: A list of true positive rates at different score threshold levels
    """
    # Convert to an ndarray so the vectorized comparison below also works when a
    # plain Python list is passed (as the `scores: list` annotation suggests)
    scores = np.asarray(scores)
    tprs = []
    for threshold in tqdm(np.linspace(0.01, 0.99, num=n_points)):
        tprs.append(sum(scores >= threshold)/len(scores))
    return tprs

View File

@@ -0,0 +1,402 @@
# Copyright 2022 David Scripka. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Imports
import numpy as np
import onnxruntime as ort
import openwakeword
from openwakeword.utils import AudioFeatures
import wave
import os
import pickle
from collections import deque, defaultdict
from functools import partial
import time
from typing import List, Union, DefaultDict, Dict
# Define main model class
class Model():
    """
    The main model class for openWakeWord. Creates a model object with the shared audio pre-processer
    and for arbitrarily many custom wake word/wake phrase models.
    """
    def __init__(
            self,
            wakeword_model_paths: List[str] = [],
            class_mapping_dicts: List[dict] = [],
            enable_speex_noise_suppression: bool = False,
            vad_threshold: float = 0,
            custom_verifier_models: dict = {},
            custom_verifier_threshold: float = 0.1,
            **kwargs
            ):
        """Initialize the openWakeWord model object.

        Args:
            wakeword_model_paths (List[str]): A list of paths of ONNX models to load into the openWakeWord model object.
                                              If not provided, will load all of the pre-trained models.
            class_mapping_dicts (List[dict]): A list of dictionaries with integer to string class mappings for
                                              each model in the `wakeword_model_paths` arguments
                                              (e.g., {"0": "class_1", "1": "class_2"})
            enable_speex_noise_suppression (bool): Whether to use the noise suppression from the SpeexDSP
                                                   library to pre-process all incoming audio. May increase
                                                   model performance when reasonably stationary background noise
                                                   is present in the environment where openWakeWord will be used.
                                                   It is very lightweight, so enabling it doesn't significantly
                                                   impact efficiency.
            vad_threshold (float): Whether to use a voice activity detection model (VAD) from Silero
                                   (https://github.com/snakers4/silero-vad) to filter predictions.
                                   For every input audio frame, a VAD score is obtained and only those model predictions
                                   with VAD scores above the threshold will be returned. The default value (0),
                                   disables voice activity detection entirely.
            custom_verifier_models (dict): A dictionary of paths to custom verifier models, where
                                           the keys are the model names (corresponding to the openwakeword.models
                                           attribute) and the values are the filepaths of the
                                           custom verifier models.
            custom_verifier_threshold (float): The score threshold to use a custom verifier model. If the score
                                               from a model for a given frame is greater than this value, the
                                               associated custom verifier model will also predict on that frame, and
                                               the verifier score will be returned.
            kwargs (dict): Any other keyword arguments to pass to the preprocessor instance
        """
        # Initialize the ONNX session options (single-threaded to keep CPU overhead low)
        sessionOptions = ort.SessionOptions()
        sessionOptions.inter_op_num_threads = 1
        sessionOptions.intra_op_num_threads = 1

        # Get model paths for pre-trained models if user doesn't provide models to load
        if wakeword_model_paths == []:
            wakeword_model_paths = openwakeword.get_pretrained_model_paths()
            wakeword_model_names = list(openwakeword.models.keys())
        else:
            # Model name is the filename without the trailing ".onnx" (5 characters)
            wakeword_model_names = [os.path.basename(i[0:-5]) for i in wakeword_model_paths]

        # Create attributes to store models and metadata
        self.models = {}              # model name -> ort.InferenceSession
        self.model_inputs = {}        # model name -> number of feature frames the model consumes
        self.model_outputs = {}       # model name -> number of output classes
        self.class_mapping = {}       # model name -> {class index (str): class label (str)}
        self.model_input_names = {}   # model name -> name of the ONNX graph's input tensor
        self.custom_verifier_models = {}
        self.custom_verifier_threshold = custom_verifier_threshold
        for mdl_path, mdl_name in zip(wakeword_model_paths, wakeword_model_names):
            # Load openwakeword models
            self.models[mdl_name] = ort.InferenceSession(mdl_path, sess_options=sessionOptions,
                                                         providers=["CPUExecutionProvider"])
            self.model_inputs[mdl_name] = self.models[mdl_name].get_inputs()[0].shape[1]
            self.model_outputs[mdl_name] = self.models[mdl_name].get_outputs()[0].shape[1]
            # Class-mapping priority: user-provided dict, then the package-level
            # defaults, then an identity mapping over the model's output indices
            if class_mapping_dicts and class_mapping_dicts[wakeword_model_paths.index(mdl_path)].get(mdl_name, None):
                self.class_mapping[mdl_name] = class_mapping_dicts[wakeword_model_paths.index(mdl_path)]
            elif openwakeword.model_class_mappings.get(mdl_name, None):
                self.class_mapping[mdl_name] = openwakeword.model_class_mappings[mdl_name]
            else:
                self.class_mapping[mdl_name] = {str(i): str(i) for i in range(0, self.model_outputs[mdl_name])}
            self.model_input_names[mdl_name] = self.models[mdl_name].get_inputs()[0].name

            # Load custom verifier models (pickled classifiers exposing predict_proba)
            if isinstance(custom_verifier_models, dict):
                if custom_verifier_models.get(mdl_name, False):
                    self.custom_verifier_models[mdl_name] = pickle.load(open(custom_verifier_models[mdl_name], 'rb'))

        # Every provided verifier model must have matched a loaded base model
        if len(self.custom_verifier_models.keys()) < len(custom_verifier_models.keys()):
            raise ValueError(
                "Custom verifier models were provided, but some were not matched with a base model!"
                " Make sure that the keys provided in the `custom_verifier_models` dictionary argument"
                " exactly match that of the `.models` attribute of an instantiated openWakeWord Model object"
                " that has the same base models but doesn't have custom verifier models."
            )

        # Create buffer to store frame predictions (per-class rolling window of the last 30 scores)
        self.prediction_buffer: DefaultDict[str, deque] = defaultdict(partial(deque, maxlen=30))

        # Initialize SpeexDSP noise canceller (160 samples per frame @ 16 kHz)
        if enable_speex_noise_suppression:
            from speexdsp_ns import NoiseSuppression
            self.speex_ns = NoiseSuppression.create(160, 16000)
        else:
            self.speex_ns = None

        # Initialize Silero VAD
        self.vad_threshold = vad_threshold
        if vad_threshold > 0:
            self.vad = openwakeword.VAD()

        # Create AudioFeatures object (shared melspectrogram/embedding pre-processor)
        self.preprocessor = AudioFeatures(**kwargs)

    def get_parent_model_from_label(self, label):
        """Gets the parent model associated with a given prediction label"""
        parent_model = ""
        for mdl in self.class_mapping.keys():
            # A label matches a model either through the model's class mapping
            # values or by being the model name itself
            if label in self.class_mapping[mdl].values():
                parent_model = mdl
            elif label in self.class_mapping.keys() and label == mdl:
                parent_model = mdl
        return parent_model

    def reset(self):
        """Reset the prediction buffer"""
        self.prediction_buffer = defaultdict(partial(deque, maxlen=30))

    def predict(self, x: np.ndarray, patience: dict = {}, threshold: dict = {}, timing: bool = False):
        """Predict with all of the wakeword models on the input audio frames

        Args:
            x (Union[ndarray]): The input audio data to predict on with the models. Should be multiples of 80 ms
                                (1280 samples), with longer lengths reducing overall CPU usage
                                but decreasing detection latency.
            patience (dict): How many consecutive frames (of 1280 samples or 80 ms) above the threshold that must
                             be observed before the current frame will be returned as non-zero.
                             Must be provided as an a dictionary where the keys are the
                             model names and the values are the number of frames. Can reduce false-positive
                             detections at the cost of a lower true-positive rate.
                             By default, this behavior is disabled.
            threshold (dict): The threshold values to use when the `patience` behavior is enabled.
                              Must be provided as an a dictionary where the keys are the
                              model names and the values are the thresholds.
            timing (bool): Whether to return timing information of the models. Can be useful to debug and
                           assess how efficiently models are running on the current hardware.

        Returns:
            dict: A dictionary of scores between 0 and 1 for each model, where 0 indicates no
                  wake-word/wake-phrase detected. If the `timing` argument is true, returns a
                  tuple of dicts containing model predictions and timing information, respectively.
        """
        # Setup timing dict
        if timing:
            timing_dict: Dict[str, Dict] = {}
            timing_dict["models"] = {}
            feature_start = time.time()

        # Get audio features (optionally with Speex noise suppression)
        if self.speex_ns:
            self.preprocessor(self._suppress_noise_with_speex(x))
        else:
            self.preprocessor(x)

        if timing:
            timing_dict["models"]["preprocessor"] = time.time() - feature_start

        # Get predictions from model(s)
        predictions = {}
        for mdl in self.models.keys():
            input_name = self.model_input_names[mdl]

            if timing:
                model_start = time.time()

            # Run model to get predictions
            if len(x) > 1280:
                # More than one 80 ms frame was provided: run the model once per
                # frame and keep the maximum score across the group
                group_predictions = []
                for i in np.arange(len(x)//1280-1, -1, -1):
                    group_predictions.extend(
                        self.models[mdl].run(
                            None,
                            {input_name: self.preprocessor.get_features(
                                 self.model_inputs[mdl],
                                 start_ndx=-self.model_inputs[mdl] - i
                            )}
                        )
                    )
                prediction = np.array(group_predictions).max(axis=0)[None, ]
            else:
                prediction = self.models[mdl].run(
                    None,
                    {input_name: self.preprocessor.get_features(self.model_inputs[mdl])}
                )

            # Single-output models report under the model name; multi-class
            # models report one score per mapped class label
            if self.model_outputs[mdl] == 1:
                predictions[mdl] = prediction[0][0][0]
            else:
                for int_label, cls in self.class_mapping[mdl].items():
                    predictions[cls] = prediction[0][0][int(int_label)]

            # Update scores based on custom verifier model
            if self.custom_verifier_models != {}:
                for cls in predictions.keys():
                    if predictions[cls] >= self.custom_verifier_threshold:
                        parent_model = self.get_parent_model_from_label(cls)
                        if self.custom_verifier_models.get(parent_model, False):
                            # NOTE(review): the feature width here uses the current loop
                            # variable `mdl` rather than `parent_model`; with multiple
                            # loaded models of different input sizes these can differ —
                            # confirm this is intended
                            verifier_prediction = self.custom_verifier_models[parent_model].predict_proba(
                                self.preprocessor.get_features(self.model_inputs[mdl])
                            )[0][-1]
                            predictions[cls] = verifier_prediction

            # Update prediction buffer, and zero predictions for first 5 frames during model initialization
            # NOTE(review): this iterates over *all* accumulated prediction keys on every
            # model's loop iteration, so with multiple models earlier classes are appended
            # to the buffer more than once per call — confirm this is intended
            for cls in predictions.keys():
                if len(self.prediction_buffer[cls]) < 5:
                    predictions[cls] = 0.0
                self.prediction_buffer[cls].append(predictions[cls])

            # Get timing information
            if timing:
                timing_dict["models"][mdl] = time.time() - model_start

        # Update scores based on thresholds or patience arguments
        if patience != {}:
            if threshold == {}:
                raise ValueError("Error! When using the `patience` argument, threshold "
                                 "values must be provided via the `threshold` argument!")
            for mdl in predictions.keys():
                parent_model = self.get_parent_model_from_label(mdl)
                if parent_model in patience.keys():
                    # Require `patience` consecutive buffered scores at/above the
                    # threshold before letting the current score through
                    scores = np.array(self.prediction_buffer[mdl])[-patience[parent_model]:]
                    if (scores >= threshold[parent_model]).sum() < patience[parent_model]:
                        predictions[mdl] = 0.0

        # (optionally) get voice activity detection scores and update model scores
        if self.vad_threshold > 0:
            if timing:
                vad_start = time.time()

            self.vad(x)

            if timing:
                timing_dict["models"]["vad"] = time.time() - vad_start

            # Get frames from last 0.4 to 0.56 seconds (3 frames) before the current
            # frame and get max VAD score
            vad_frames = list(self.vad.prediction_buffer)[-7:-4]
            vad_max_score = np.max(vad_frames) if len(vad_frames) > 0 else 0
            # Zero every wakeword score when no voice activity was detected
            for mdl in predictions.keys():
                if vad_max_score < self.vad_threshold:
                    predictions[mdl] = 0.0

        if timing:
            return predictions, timing_dict
        else:
            return predictions

    def predict_clip(self, clip: Union[str, np.ndarray], padding: int = 1, chunk_size=1280, **kwargs):
        """Predict on a full audio clip, simulating streaming prediction.
        The input clip must be a 16-bit, 16 khz, single-channel WAV file.

        Args:
            clip (Union[str, np.ndarray]): The path to a 16-bit PCM, 16 khz, single-channel WAV file,
                                           or an 1D array containing the same type of data
            padding (int): How many seconds of silence to pad the start/end of the clip with
                            to make sure that short clips can be processed correctly (default: 1)
            chunk_size (int): The size (in samples) of each chunk of audio to pass to the model
            kwargs: Any keyword arguments to pass to the class `predict` method

        Returns:
            list: A list containing the frame-level prediction dictionaries for the audio clip
        """
        if isinstance(clip, str):
            # Load audio clip as 16-bit PCM data
            with wave.open(clip, mode='rb') as f:
                # Load WAV clip frames
                data = np.frombuffer(f.readframes(f.getnframes()), dtype=np.int16)
        elif isinstance(clip, np.ndarray):
            data = clip
        # NOTE(review): `data` is unbound if `clip` is neither a str nor an ndarray,
        # which would raise a NameError below — consider validating the argument

        if padding:
            # Surround the clip with `padding` seconds of silence
            data = np.concatenate(
                (
                    np.zeros(16000*padding).astype(np.int16),
                    data,
                    np.zeros(16000*padding).astype(np.int16)
                )
            )

        # Iterate through clip, getting predictions
        predictions = []
        step_size = chunk_size
        for i in range(0, data.shape[0]-step_size, step_size):
            predictions.append(self.predict(data[i:i+step_size], **kwargs))

        return predictions

    def _get_positive_prediction_frames(
            self,
            file: str,
            threshold: float = 0.5,
            return_type: str = "features",
            **kwargs
            ):
        """
        Gets predictions for the input audio data, and returns the audio features (embeddings)
        or audio data for all of the frames with a score above the `threshold` argument.
        Can be a useful way to collect false-positive predictions.

        Args:
            file (str): The path to a 16-bit 16khz WAV audio file to process
            threshold (float): The minimum score required for a frame of audio features
                               to be returned.
            return_type (str): The type of data to return when a positive prediction is
                               detected. Can be either 'features' or 'audio' to return
                               audio embeddings or raw audio data, respectively.
            kwargs: Any keyword arguments to pass to the class `predict` method

        Returns:
            dict: A dictionary with filenames as keys and N x M arrays as values,
                  where N is the number of examples and M is the number
                  of audio features, depending on the model input shape.
        """
        # Load audio clip as 16-bit PCM data
        with wave.open(file, mode='rb') as f:
            # Load WAV clip frames
            data = np.frombuffer(f.readframes(f.getnframes()), dtype=np.int16)

        # Iterate through clip, getting predictions
        positive_data = defaultdict(list)
        step_size = 1280  # 80 ms per prediction frame @ 16 kHz
        for i in range(0, data.shape[0]-step_size, step_size):
            predictions = self.predict(data[i:i+step_size], **kwargs)
            for lbl in predictions.keys():
                if predictions[lbl] >= threshold:
                    mdl = self.get_parent_model_from_label(lbl)
                    features = self.preprocessor.get_features(self.model_inputs[mdl])
                    if return_type == 'features':
                        positive_data[lbl].append(features)
                    if return_type == 'audio':
                        # Keep 3 s of context before the detection and 1 s after;
                        # only complete 4 s windows are retained
                        context = data[max(0, i - 16000*3):i + 16000]
                        if len(context) == 16000*4:
                            positive_data[lbl].append(context)

        positive_data_combined = {}
        for lbl in positive_data.keys():
            positive_data_combined[lbl] = np.vstack(positive_data[lbl])

        return positive_data_combined

    def _suppress_noise_with_speex(self, x: np.ndarray, frame_size: int = 160):
        """
        Runs the input audio through the SpeexDSP noise suppression algorithm.
        Note that this function updates the state of the existing Speex noise
        suppression object, and isn't intended to be called externally.

        Args:
            x (ndarray): The 16-bit, 16khz audio to process. Must always be an
                         integer multiple of `frame_size`.
            frame_size (int): The frame size to use for the Speex Noise suppressor.
                              Must match the frame size specified during the
                              initialization of the noise suppressor.

        Returns:
            ndarray: The input audio with noise suppression applied
        """
        cleaned = []
        for i in range(0, x.shape[0], frame_size):
            chunk = x[i:i+frame_size]
            # Speex operates on raw 16-bit PCM bytes, one frame at a time
            cleaned.append(self.speex_ns.process(chunk.tobytes()))

        cleaned_bytestring = b''.join(cleaned)
        cleaned_array = np.frombuffer(cleaned_bytestring, np.int16)

        return cleaned_array

View File

@@ -0,0 +1,407 @@
# Copyright 2022 David Scripka. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Imports
import os
import onnxruntime as ort
import numpy as np
import pathlib
from collections import deque
from multiprocessing.pool import ThreadPool
from multiprocessing import Process, Queue
import time
import openwakeword
from typing import Union, List, Callable, Deque
# Base class for computing audio features using Google's speech_embedding
# model (https://tfhub.dev/google/speech_embedding/1)
class AudioFeatures():
"""
A class for creating audio features from audio data, including melspectograms and Google's
`speech_embedding` features.
"""
def __init__(self,
melspec_onnx_model_path: str = os.path.join(
pathlib.Path(__file__).parent.resolve(),
"resources", "models", "melspectrogram.onnx"
),
embedding_onnx_model_path: str = os.path.join(
pathlib.Path(__file__).parent.resolve(),
"resources", "models", "embedding_model.onnx"
),
sr: int = 16000,
ncpu: int = 1
):
"""
Initialize the AudioFeatures object.
Args:
melspec_onnx_model_path (str): The path to the ONNX model for computing melspectograms from audio data
embedding_onnx_model_path (str): The path to the ONNX model for Google's `speech_embedding` model
sr (int): The sample rate of the audio (default: 16000 khz)
ncpu (int): The number of CPUs to use when computing melspectrograms and audio features (default: 1)
"""
# Initialize the ONNX models
sessionOptions = ort.SessionOptions()
sessionOptions.inter_op_num_threads = ncpu
sessionOptions.intra_op_num_threads = ncpu
self.melspec_model = ort.InferenceSession(melspec_onnx_model_path, sess_options=sessionOptions,
providers=["CUDAExecutionProvider", "CPUExecutionProvider"])
self.embedding_model = ort.InferenceSession(embedding_onnx_model_path, sess_options=sessionOptions,
providers=["CUDAExecutionProvider", "CPUExecutionProvider"])
self.onnx_execution_provider = self.melspec_model.get_providers()[0]
# Create databuffers
self.raw_data_buffer: Deque = deque(maxlen=sr*10)
self.melspectrogram_buffer = np.ones((76, 32)) # n_frames x num_features
self.melspectrogram_max_len = 10*97 # 97 is the number of frames in 1 second of 16hz audio
self.accumulated_samples = 0 # the samples added to the buffer since the audio preprocessor was last called
self.feature_buffer = self._get_embeddings(np.zeros(160000).astype(np.int16)) # fill with blank data to start
self.feature_buffer_max_len = 120 # ~10 seconds of feature buffer history
def _get_melspectrogram(self, x: Union[np.ndarray, List], melspec_transform: Callable = lambda x: x/10 + 2):
"""
Function to compute the mel-spectrogram of the provided audio samples.
Args:
x (Union[np.ndarray, List]): The input audio data to compute the melspectrogram from
melspec_transform (Callable): A function to transform the computed melspectrogram. Defaults to a transform
that makes the ONNX melspectrogram model closer to the native Tensorflow
implementation from Google (https://tfhub.dev/google/speech_embedding/1).
Return:
np.ndarray: The computed melspectrogram of the input audio data
"""
# Get input data and adjust type/shape as needed
x = np.array(x).astype(np.int16) if isinstance(x, list) else x
if x.dtype != np.int16:
raise ValueError("Input data must be 16-bit integers (i.e., 16-bit PCM audio)."
f"You provided {x.dtype} data.")
x = x[None, ] if len(x.shape) < 2 else x
x = x.astype(np.float32) if x.dtype != np.float32 else x
# Get melspectrogram
outputs = self.melspec_model.run(None, {'input': x})
spec = np.squeeze(outputs[0])
# Arbitrary transform of melspectrogram
spec = melspec_transform(spec)
return spec
def _get_embeddings_from_melspec(self, melspec):
"""
Computes the Google `speech_embedding` features from a melspectrogram input
Args:
melspec (np.ndarray): The input melspectrogram
Returns:
np.ndarray: The computed audio features/embeddings
"""
if melspec.shape[0] != 1:
melspec = melspec[None, ]
embedding = self.embedding_model.run(None, {'input_1': melspec})[0].squeeze()
return embedding
def _get_embeddings(self, x: np.ndarray, window_size: int = 76, step_size: int = 8, **kwargs):
"""Function to compute the embeddings of the provide audio samples."""
spec = self._get_melspectrogram(x, **kwargs)
windows = []
for i in range(0, spec.shape[0], 8):
window = spec[i:i+window_size]
if window.shape[0] == window_size: # truncate short windows
windows.append(window)
batch = np.expand_dims(np.array(windows), axis=-1).astype(np.float32)
embedding = self.embedding_model.run(None, {'input_1': batch})[0].squeeze()
return embedding
def get_embedding_shape(self, audio_length: float, sr: int = 16000):
"""Function that determines the size of the output embedding array for a given audio clip length (in seconds)"""
x = (np.random.uniform(-1, 1, int(audio_length*sr))*32767).astype(np.int16)
return self._get_embeddings(x).shape
def _get_melspectrogram_batch(self, x, batch_size=128, ncpu=1):
"""
Compute the melspectrogram of the input audio samples in batches.
Note that the optimal performance will depend in the interaction between the device,
batch size, and ncpu (if a CPU device is used). The user is encouraged
to experiment with different values of these parameters to identify
which combination is best for their data, as often differences of 1-4x are seen.
Args:
x (ndarray): A numpy array of 16 khz input audio data in shape (N, samples).
Assumes that all of the audio data is the same length (same number of samples).
batch_size (int): The batch size to use when computing the melspectrogram
ncpu (int): The number of CPUs to use when computing the melspectrogram. This argument has
no effect if the underlying model is executing on a GPU.
Returns:
ndarray: A numpy array of shape (N, frames, melbins) containing the melspectrogram of
all N input audio examples
"""
# Prepare ThreadPool object, if needed for multithreading
pool = None
if "CPU" in self.onnx_execution_provider:
pool = ThreadPool(processes=ncpu)
# Make batches
n_frames = int(np.ceil(x.shape[1]/160-3))
mel_bins = 32 # fixed by melspectrogram model
melspecs = np.empty((x.shape[0], n_frames, mel_bins), dtype=np.float32)
for i in range(0, max(batch_size, x.shape[0]), batch_size):
batch = x[i:i+batch_size]
if "CUDA" in self.onnx_execution_provider:
result = self._get_melspectrogram(batch)
elif pool:
result = np.array(pool.map(self._get_melspectrogram,
batch, chunksize=batch.shape[0]//ncpu))
melspecs[i:i+batch_size, :, :] = result.squeeze()
# Cleanup ThreadPool
if pool:
pool.close()
return melspecs
def _get_embeddings_batch(self, x, batch_size=128, ncpu=1):
"""
Compute the embeddings of the input melspectrograms in batches.
Note that the optimal performance will depend in the interaction between the device,
batch size, and ncpu (if a CPU device is used). The user is encouraged
to experiment with different values of these parameters to identify
which combination is best for their data, as often differences of 1-4x are seen.
Args:
x (ndarray): A numpy array of melspectrograms of shape (N, frames, melbins).
Assumes that all of the melspectrograms have the same shape.
batch_size (int): The batch size to use when computing the embeddings
ncpu (int): The number of CPUs to use when computing the embeddings. This argument has
no effect if the underlying model is executing on a GPU.
Returns:
ndarray: A numpy array of shape (N, frames, embedding_dim) containing the embeddings of
all N input melspectrograms
"""
# Ensure input is the correct shape
if x.shape[1] < 76:
raise ValueError("Embedding model requires the input melspectrograms to have at least 76 frames")
# Prepare ThreadPool object, if needed for multithreading
pool = None
if "CPU" in self.onnx_execution_provider:
pool = ThreadPool(processes=ncpu)
# Calculate array sizes and make batches
n_frames = (x.shape[1] - 76)//8 + 1
embedding_dim = 96 # fixed by embedding model
embeddings = np.empty((x.shape[0], n_frames, embedding_dim), dtype=np.float32)
batch = []
ndcs = []
for ndx, melspec in enumerate(x):
window_size = 76
for i in range(0, melspec.shape[0], 8):
window = melspec[i:i+window_size]
if window.shape[0] == window_size: # ignore windows that are too short (truncates end of clip)
batch.append(window)
ndcs.append(ndx)
if len(batch) >= batch_size or ndx+1 == x.shape[0]:
batch = np.array(batch).astype(np.float32)
if "CUDA" in self.onnx_execution_provider:
result = self.embedding_model.run(None, {'input_1': batch})[0].squeeze()
elif pool:
result = np.array(pool.map(self._get_embeddings_from_melspec,
batch, chunksize=batch.shape[0]//ncpu))
for j, ndx2 in zip(range(0, result.shape[0], n_frames), ndcs):
embeddings[ndx2, :, :] = result[j:j+n_frames]
batch = []
ndcs = []
# Cleanup ThreadPool
if pool:
pool.close()
return embeddings
def embed_clips(self, x, batch_size=128, ncpu=1):
"""
Compute the embeddings of the input audio clips in batches.
Note that the optimal performance will depend in the interaction between the device,
batch size, and ncpu (if a CPU device is used). The user is encouraged
to experiment with different values of these parameters to identify
which combination is best for their data, as often differences of 1-4x are seen.
Args:
x (ndarray): A numpy array of 16 khz input audio data in shape (N, samples).
Assumes that all of the audio data is the same length (same number of samples).
batch_size (int): The batch size to use when computing the embeddings
ncpu (int): The number of CPUs to use when computing the melspectrogram. This argument has
no effect if the underlying model is executing on a GPU.
Returns:
ndarray: A numpy array of shape (N, frames, embedding_dim) containing the embeddings of
all N input audio clips
"""
# Compute melspectrograms
melspecs = self._get_melspectrogram_batch(x, batch_size=batch_size, ncpu=ncpu)
# Compute embeddings from melspectrograms
embeddings = self._get_embeddings_batch(melspecs[:, :, :, None], batch_size=batch_size, ncpu=ncpu)
return embeddings
def _streaming_melspectrogram(self, n_samples):
"""Note! There seem to be some slight numerical issues depending on the underlying audio data
such that the streaming method is not exactly the same as when the melspectrogram of the entire
clip is calculated. It's unclear if this difference is significant and will impact model performance.
In particular padding with 0 or very small values seems to demonstrate the differences well.
"""
self.melspectrogram_buffer = np.vstack(
(self.melspectrogram_buffer, self._get_melspectrogram(list(self.raw_data_buffer)[-n_samples-160*3:]))
)
if self.melspectrogram_buffer.shape[0] > self.melspectrogram_max_len:
self.melspectrogram_buffer = self.melspectrogram_buffer[-self.melspectrogram_max_len:, :]
def _buffer_raw_data(self, x):
"""
Adds raw audio data to the input buffer
"""
if len(x) < 400:
raise ValueError("The number of input frames must be at least 400 samples @ 16khz (25 ms)!")
self.raw_data_buffer.extend(x.tolist() if isinstance(x, np.ndarray) else x)
def _streaming_features(self, x):
    """Stream a frame of raw audio through the full feature pipeline.

    Buffers the incoming samples and, once at least 1280 samples
    (80 ms @ 16 khz) have accumulated, updates the streaming
    melspectrogram and appends a new embedding to `feature_buffer`
    for every complete 1280-sample step received.

    Args:
        x: A frame of 16 khz audio samples (list or 1-D ndarray).
    """
    # if len(x) != 1280:
    #     raise ValueError("You must provide input samples in frames of 1280 samples @ 1600khz."
    #                      f"Received a frame of {len(x)} samples.")

    # Add raw audio data to buffer
    self._buffer_raw_data(x)
    self.accumulated_samples += len(x)

    # Only calculate melspectrogram every ~0.5 seconds to significantly increase efficiency
    if self.accumulated_samples >= 1280:
        self._streaming_melspectrogram(self.accumulated_samples)

        # Calculate new audio embeddings/features based on update melspectrograms.
        # Each 1280-sample step appears to correspond to 8 melspectrogram frames;
        # iterate backwards over the accumulated steps so embeddings are appended
        # in chronological order.
        for i in np.arange(self.accumulated_samples//1280-1, -1, -1):
            # End index of the 76-frame window for this step; an index of 0
            # would produce an empty slice, so substitute the buffer length.
            ndx = -8*i
            ndx = ndx if ndx != 0 else len(self.melspectrogram_buffer)
            # Shape to (1, frames, mel_bins, 1) as the embedding model expects
            # -- presumably 76 frames x 32 mel bins; confirm against model input.
            x = self.melspectrogram_buffer[-76 + ndx:ndx].astype(np.float32)[None, :, :, None]
            # Only run the embedding model once a full 76-frame window exists
            if x.shape[1] == 76:
                self.feature_buffer = np.vstack((self.feature_buffer,
                                                self.embedding_model.run(None, {'input_1': x})[0].squeeze()))

        # Reset raw data buffer counter
        self.accumulated_samples = 0

    # Keep only the most recent `feature_buffer_max_len` embedding frames
    if self.feature_buffer.shape[0] > self.feature_buffer_max_len:
        self.feature_buffer = self.feature_buffer[-self.feature_buffer_max_len:, :]
def get_features(self, n_feature_frames: int = 16, start_ndx: int = -1):
    """Return a window of buffered audio features for model input.

    Args:
        n_feature_frames (int): The number of feature frames to return (default 16).
        start_ndx (int): Starting index into the feature buffer; the default of -1
            returns the most recent `n_feature_frames` frames.

    Returns:
        ndarray: A float32 array of shape (1, n_feature_frames, feature_dim).
    """
    if start_ndx == -1:
        # Default: take the most recent frames from the end of the buffer
        window = self.feature_buffer[int(-1*n_feature_frames):, :]
    else:
        # Explicit window; an end index of exactly 0 would produce an empty
        # slice, so substitute the buffer length in that case
        end_ndx = len(self.feature_buffer) if start_ndx + n_feature_frames == 0 \
            else start_ndx + int(n_feature_frames)
        window = self.feature_buffer[start_ndx:end_ndx, :]
    return window[None, ].astype(np.float32)
def __call__(self, x):
self._streaming_features(x)
# Bulk prediction function
def bulk_predict(
        file_paths: List[str],
        wakeword_model_paths: List[str],
        prediction_function: str = 'predict_clip',
        ncpu: int = 1,
        **kwargs
        ):
    """
    Bulk predict on the provided input files in parallel using multiprocessing
    with the specified model.

    Args:
        file_paths (List[str]): The list of input files to predict
        wakeword_model_paths (List[str]): The paths to the wakeword ONNX model files
        prediction_function (str): The name of the Model method used to predict on
                                   the input audio files (default is `predict_clip`)
        ncpu (int): How many processes to create (up to max of available CPUs)
        kwargs (dict): Any other keyword arguments to pass to the model
                       initialization or the specified prediction function

    Returns:
        dict: A dictionary containing the predictions for each file, with the
              filepath as the key
    """
    # Distribute the files round-robin across (at most) ncpu chunks. This avoids
    # the IndexError/duplicated-files bugs of remainder-based slicing when
    # len(file_paths) is not a multiple of ncpu (e.g. 3 files with ncpu=4).
    n_chunks = max(1, min(int(ncpu), len(file_paths)))
    chunks = [file_paths[i::n_chunks] for i in range(n_chunks)]

    # Keyword arguments accepted by the Model constructor (computed once; it is
    # the same for every worker)
    init_kwargs = {key: value for key, value in kwargs.items()
                   if key in openwakeword.Model.__init__.__code__.co_varnames}

    q: Queue = Queue()

    def f(clips):
        # Build the model inside the worker so each process owns its own model
        # instance. The previous implementation late-bound `mdls[-1]` in the
        # closure, so every worker referenced the *last* model created.
        oww = openwakeword.Model(
            wakeword_model_paths=wakeword_model_paths,
            **init_kwargs
        )
        func = getattr(oww, prediction_function)
        pred_kwargs = {key: value for key, value in kwargs.items()
                       if key in func.__code__.co_varnames}
        q.put([{clip: func(clip, **pred_kwargs)} for clip in clips])

    ps = [Process(target=f, args=(chunk,)) for chunk in chunks]

    # Submit jobs
    for p in ps:
        p.start()

    # Collect results (each worker puts exactly one list on the queue)
    results = []
    for p in ps:
        while q.empty():
            time.sleep(0.01)
        results.extend(q.get())

    # Consolidate results and return
    return {list(i.keys())[0]: list(i.values())[0] for i in results}

View File

@@ -0,0 +1,128 @@
# Copyright 2022 David Scripka. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#######################
# Silero VAD License
#######################
# MIT License
# Copyright (c) 2020-present Silero Team
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
########################################
# This file contains the implementation of a class for voice activity detection (VAD),
# based on the pre-trained model from Silero (https://github.com/snakers4/silero-vad).
# It can be used as with the openWakeWord library, or independently.
# Imports
import onnxruntime as ort
import numpy as np
import os
from collections import deque
class VAD():
    """
    A voice activity detection (VAD) model class based on Silero's
    pre-trained model: https://github.com/snakers4/silero-vad
    """
    def __init__(self,
                 model_path: str = os.path.join(
                     os.path.dirname(os.path.abspath(__file__)),
                     "resources",
                     "models",
                     "silero_vad.onnx"
                 )
                 ):
        """Initialize the VAD model object.

        Args:
            model_path (str): The path to the Silero VAD ONNX model.
        """
        # Single-threaded CPU inference keeps the VAD lightweight
        opts = ort.SessionOptions()
        opts.inter_op_num_threads = 1
        opts.intra_op_num_threads = 1
        self.model = ort.InferenceSession(model_path, sess_options=opts,
                                          providers=["CPUExecutionProvider"])

        # Rolling buffer of per-call scores; 125 entries is a buffer length
        # of 10 seconds at the default frame size
        self.prediction_buffer: deque = deque(maxlen=125)

        # Silero expects the sample rate as an int64 scalar input
        self.sample_rate = np.array(16000).astype(np.int64)

        # Reset model to start
        self.reset_states()

    def reset_states(self, batch_size=1):
        """Reset the model's recurrent (LSTM) hidden and cell states."""
        state_shape = (2, batch_size, 64)
        self._h = np.zeros(state_shape, dtype='float32')
        self._c = np.zeros(state_shape, dtype='float32')
        self._last_sr = 0
        self._last_batch_size = 0

    def predict(self, x, frame_size=480):
        """
        Get the VAD predictions for the input audio frame.

        Args:
            x (np.ndarray): The input audio, must be 16 khz and 16-bit PCM format.
                            If longer than the input frame, will be split into
                            chunks of length `frame_size` and the predictions for
                            each chunk returned. Must be a length that is an integer
                            multiple of the `frame_size` argument.
            frame_size (int): The frame size in samples. The recommended
                              default is 480 samples (30 ms @ 16khz),
                              but smaller and larger values
                              can be used (though performance may decrease).

        Returns:
            float: The average predicted score for the audio frame
        """
        scores = []
        for start in range(0, x.shape[0], frame_size):
            # Normalize 16-bit PCM to [-1, 1] floats for the model
            frame = (x[start:start + frame_size]/32767).astype(np.float32)
            ort_inputs = {'input': frame[None, ],
                          'h': self._h, 'c': self._c, 'sr': self.sample_rate}
            # Thread the recurrent state through successive calls
            out, self._h, self._c = self.model.run(None, ort_inputs)
            scores.append(out[0][0])
        return np.mean(scores)

    def __call__(self, x, frame_size=160*4):
        """Score the frame and push the result onto the rolling prediction buffer."""
        self.prediction_buffer.append(self.predict(x, frame_size))