# Copyright 2022 David Scripka. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Imports
import os
import onnxruntime as ort
import numpy as np
import pathlib
from collections import deque
from multiprocessing.pool import ThreadPool
from multiprocessing import Process, Queue
import time
import openwakeword
from typing import Union, List, Callable, Deque

# Base class for computing audio features using Google's speech_embedding
# model (https://tfhub.dev/google/speech_embedding/1)
class AudioFeatures():
"""
    A class for creating audio features from audio data, including melspectrograms and Google's
`speech_embedding` features.
"""
def __init__(self,
melspec_onnx_model_path: str = os.path.join(
pathlib.Path(__file__).parent.resolve(),
"resources", "models", "melspectrogram.onnx"
),
embedding_onnx_model_path: str = os.path.join(
pathlib.Path(__file__).parent.resolve(),
"resources", "models", "embedding_model.onnx"
),
sr: int = 16000,
ncpu: int = 1
):
"""
Initialize the AudioFeatures object.
Args:
            melspec_onnx_model_path (str): The path to the ONNX model for computing melspectrograms from audio data
            embedding_onnx_model_path (str): The path to the ONNX model for Google's `speech_embedding` model
            sr (int): The sample rate of the audio (default: 16000 Hz)
ncpu (int): The number of CPUs to use when computing melspectrograms and audio features (default: 1)
"""
# Initialize the ONNX models
sessionOptions = ort.SessionOptions()
sessionOptions.inter_op_num_threads = ncpu
sessionOptions.intra_op_num_threads = ncpu
self.melspec_model = ort.InferenceSession(melspec_onnx_model_path, sess_options=sessionOptions,
providers=["CUDAExecutionProvider", "CPUExecutionProvider"])
self.embedding_model = ort.InferenceSession(embedding_onnx_model_path, sess_options=sessionOptions,
providers=["CUDAExecutionProvider", "CPUExecutionProvider"])
self.onnx_execution_provider = self.melspec_model.get_providers()[0]
        # Create data buffers
self.raw_data_buffer: Deque = deque(maxlen=sr*10)
self.melspectrogram_buffer = np.ones((76, 32)) # n_frames x num_features
        self.melspectrogram_max_len = 10*97  # 97 is the number of frames in 1 second of 16 kHz audio
self.accumulated_samples = 0 # the samples added to the buffer since the audio preprocessor was last called
self.feature_buffer = self._get_embeddings(np.zeros(160000).astype(np.int16)) # fill with blank data to start
self.feature_buffer_max_len = 120 # ~10 seconds of feature buffer history
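
    # A minimal usage sketch (illustrative only, kept as a comment so that importing
    # this module has no side effects; assumes the bundled default model paths resolve):
    #
    #   features = AudioFeatures(ncpu=2)
    #   features.onnx_execution_provider  # e.g. "CPUExecutionProvider"
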
def _get_melspectrogram(self, x: Union[np.ndarray, List], melspec_transform: Callable = lambda x: x/10 + 2):
"""
Function to compute the mel-spectrogram of the provided audio samples.
Args:
x (Union[np.ndarray, List]): The input audio data to compute the melspectrogram from
melspec_transform (Callable): A function to transform the computed melspectrogram. Defaults to a transform
that makes the ONNX melspectrogram model closer to the native Tensorflow
implementation from Google (https://tfhub.dev/google/speech_embedding/1).
        Returns:
np.ndarray: The computed melspectrogram of the input audio data
"""
# Get input data and adjust type/shape as needed
x = np.array(x).astype(np.int16) if isinstance(x, list) else x
if x.dtype != np.int16:
            raise ValueError("Input data must be 16-bit integers (i.e., 16-bit PCM audio). "
                             f"You provided {x.dtype} data.")
x = x[None, ] if len(x.shape) < 2 else x
x = x.astype(np.float32) if x.dtype != np.float32 else x
# Get melspectrogram
outputs = self.melspec_model.run(None, {'input': x})
spec = np.squeeze(outputs[0])
# Arbitrary transform of melspectrogram
spec = melspec_transform(spec)
return spec
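
    # Illustrative sketch of the expected output: one second of 16 kHz audio yields
    # 97 frames of 32 mel bins, given the model's 160-sample hop.
    #
    #   clip = (np.random.uniform(-1, 1, 16000) * 32767).astype(np.int16)
    #   AudioFeatures()._get_melspectrogram(clip).shape  # -> (97, 32)
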
def _get_embeddings_from_melspec(self, melspec):
"""
Computes the Google `speech_embedding` features from a melspectrogram input
Args:
melspec (np.ndarray): The input melspectrogram
Returns:
np.ndarray: The computed audio features/embeddings
"""
if melspec.shape[0] != 1:
melspec = melspec[None, ]
embedding = self.embedding_model.run(None, {'input_1': melspec})[0].squeeze()
return embedding
def _get_embeddings(self, x: np.ndarray, window_size: int = 76, step_size: int = 8, **kwargs):
"""Function to compute the embeddings of the provide audio samples."""
spec = self._get_melspectrogram(x, **kwargs)
windows = []
        for i in range(0, spec.shape[0], step_size):
            window = spec[i:i+window_size]
            if window.shape[0] == window_size:  # skip incomplete windows at the end of the clip
                windows.append(window)
batch = np.expand_dims(np.array(windows), axis=-1).astype(np.float32)
embedding = self.embedding_model.run(None, {'input_1': batch})[0].squeeze()
return embedding
def get_embedding_shape(self, audio_length: float, sr: int = 16000):
"""Function that determines the size of the output embedding array for a given audio clip length (in seconds)"""
x = (np.random.uniform(-1, 1, int(audio_length*sr))*32767).astype(np.int16)
return self._get_embeddings(x).shape
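
    # Illustrative sketch: query the output shape without real audio. With the default
    # 76-frame window and step of 8, a 4 second clip should yield 41 windows of
    # 96-dimensional embeddings.
    #
    #   AudioFeatures().get_embedding_shape(4.0)  # -> (41, 96)
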
def _get_melspectrogram_batch(self, x, batch_size=128, ncpu=1):
"""
Compute the melspectrogram of the input audio samples in batches.
        Note that the optimal performance will depend on the interaction between the device,
batch size, and ncpu (if a CPU device is used). The user is encouraged
to experiment with different values of these parameters to identify
which combination is best for their data, as often differences of 1-4x are seen.
Args:
            x (ndarray): A numpy array of 16 kHz input audio data in shape (N, samples).
Assumes that all of the audio data is the same length (same number of samples).
batch_size (int): The batch size to use when computing the melspectrogram
ncpu (int): The number of CPUs to use when computing the melspectrogram. This argument has
no effect if the underlying model is executing on a GPU.
Returns:
ndarray: A numpy array of shape (N, frames, melbins) containing the melspectrogram of
all N input audio examples
"""
# Prepare ThreadPool object, if needed for multithreading
pool = None
if "CPU" in self.onnx_execution_provider:
pool = ThreadPool(processes=ncpu)
# Make batches
n_frames = int(np.ceil(x.shape[1]/160-3))
mel_bins = 32 # fixed by melspectrogram model
melspecs = np.empty((x.shape[0], n_frames, mel_bins), dtype=np.float32)
for i in range(0, max(batch_size, x.shape[0]), batch_size):
batch = x[i:i+batch_size]
if "CUDA" in self.onnx_execution_provider:
result = self._get_melspectrogram(batch)
elif pool:
result = np.array(pool.map(self._get_melspectrogram,
batch, chunksize=batch.shape[0]//ncpu))
melspecs[i:i+batch_size, :, :] = result.squeeze()
# Cleanup ThreadPool
if pool:
pool.close()
return melspecs
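
    # Illustrative sketch for this private helper: batch-compute melspectrograms for
    # N equal-length clips; one-second clips give shape (N, 97, 32).
    #
    #   clips = np.zeros((8, 16000), dtype=np.int16)  # 8 one-second clips
    #   AudioFeatures()._get_melspectrogram_batch(clips, batch_size=4).shape  # -> (8, 97, 32)
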
def _get_embeddings_batch(self, x, batch_size=128, ncpu=1):
"""
Compute the embeddings of the input melspectrograms in batches.
        Note that the optimal performance will depend on the interaction between the device,
batch size, and ncpu (if a CPU device is used). The user is encouraged
to experiment with different values of these parameters to identify
which combination is best for their data, as often differences of 1-4x are seen.
Args:
x (ndarray): A numpy array of melspectrograms of shape (N, frames, melbins).
Assumes that all of the melspectrograms have the same shape.
batch_size (int): The batch size to use when computing the embeddings
ncpu (int): The number of CPUs to use when computing the embeddings. This argument has
no effect if the underlying model is executing on a GPU.
Returns:
ndarray: A numpy array of shape (N, frames, embedding_dim) containing the embeddings of
all N input melspectrograms
"""
# Ensure input is the correct shape
if x.shape[1] < 76:
raise ValueError("Embedding model requires the input melspectrograms to have at least 76 frames")
# Prepare ThreadPool object, if needed for multithreading
pool = None
if "CPU" in self.onnx_execution_provider:
pool = ThreadPool(processes=ncpu)
# Calculate array sizes and make batches
n_frames = (x.shape[1] - 76)//8 + 1
embedding_dim = 96 # fixed by embedding model
embeddings = np.empty((x.shape[0], n_frames, embedding_dim), dtype=np.float32)
batch = []
ndcs = []
        for ndx, melspec in enumerate(x):
            window_size = 76
            for i in range(0, melspec.shape[0], 8):
                window = melspec[i:i+window_size]
                if window.shape[0] == window_size:  # ignore windows that are too short (truncates end of clip)
                    batch.append(window)
            # Track one index per clip; each clip contributes exactly n_frames windows
            ndcs.append(ndx)
            if len(batch) >= batch_size or ndx+1 == x.shape[0]:
                batch = np.array(batch).astype(np.float32)
                if "CUDA" in self.onnx_execution_provider:
                    result = self.embedding_model.run(None, {'input_1': batch})[0].squeeze()
                elif pool:
                    result = np.array(pool.map(self._get_embeddings_from_melspec,
                                               batch, chunksize=batch.shape[0]//ncpu))
                for j, ndx2 in zip(range(0, result.shape[0], n_frames), ndcs):
                    embeddings[ndx2, :, :] = result[j:j+n_frames]
                batch = []
                ndcs = []
# Cleanup ThreadPool
if pool:
pool.close()
return embeddings
def embed_clips(self, x, batch_size=128, ncpu=1):
"""
Compute the embeddings of the input audio clips in batches.
        Note that the optimal performance will depend on the interaction between the device,
batch size, and ncpu (if a CPU device is used). The user is encouraged
to experiment with different values of these parameters to identify
which combination is best for their data, as often differences of 1-4x are seen.
Args:
            x (ndarray): A numpy array of 16 kHz input audio data in shape (N, samples).
Assumes that all of the audio data is the same length (same number of samples).
batch_size (int): The batch size to use when computing the embeddings
            ncpu (int): The number of CPUs to use when computing the melspectrograms and embeddings.
                        This argument has no effect if the underlying model is executing on a GPU.
Returns:
ndarray: A numpy array of shape (N, frames, embedding_dim) containing the embeddings of
all N input audio clips
"""
# Compute melspectrograms
melspecs = self._get_melspectrogram_batch(x, batch_size=batch_size, ncpu=ncpu)
# Compute embeddings from melspectrograms
embeddings = self._get_embeddings_batch(melspecs[:, :, :, None], batch_size=batch_size, ncpu=ncpu)
return embeddings
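
    # A minimal usage sketch (illustrative): embed a batch of equal-length clips in
    # one call. Two-second clips produce 16 embedding frames each.
    #
    #   clips = np.zeros((4, 2*16000), dtype=np.int16)
    #   AudioFeatures().embed_clips(clips, batch_size=2).shape  # -> (4, 16, 96)
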
def _streaming_melspectrogram(self, n_samples):
"""Note! There seem to be some slight numerical issues depending on the underlying audio data
such that the streaming method is not exactly the same as when the melspectrogram of the entire
clip is calculated. It's unclear if this difference is significant and will impact model performance.
In particular padding with 0 or very small values seems to demonstrate the differences well.
"""
self.melspectrogram_buffer = np.vstack(
(self.melspectrogram_buffer, self._get_melspectrogram(list(self.raw_data_buffer)[-n_samples-160*3:]))
)
if self.melspectrogram_buffer.shape[0] > self.melspectrogram_max_len:
self.melspectrogram_buffer = self.melspectrogram_buffer[-self.melspectrogram_max_len:, :]
def _buffer_raw_data(self, x):
"""
Adds raw audio data to the input buffer
"""
if len(x) < 400:
            raise ValueError("The number of input frames must be at least 400 samples @ 16 kHz (25 ms)!")
self.raw_data_buffer.extend(x.tolist() if isinstance(x, np.ndarray) else x)
def _streaming_features(self, x):
        # if len(x) != 1280:
        #     raise ValueError("You must provide input samples in frames of 1280 samples @ 16 kHz. "
        #                      f"Received a frame of {len(x)} samples.")
# Add raw audio data to buffer
self._buffer_raw_data(x)
self.accumulated_samples += len(x)
        # Only calculate the melspectrogram once at least 1280 samples (80 ms @ 16 kHz)
        # have accumulated, which significantly increases efficiency
if self.accumulated_samples >= 1280:
self._streaming_melspectrogram(self.accumulated_samples)
            # Calculate new audio embeddings/features based on the updated melspectrogram buffer
for i in np.arange(self.accumulated_samples//1280-1, -1, -1):
ndx = -8*i
ndx = ndx if ndx != 0 else len(self.melspectrogram_buffer)
x = self.melspectrogram_buffer[-76 + ndx:ndx].astype(np.float32)[None, :, :, None]
if x.shape[1] == 76:
self.feature_buffer = np.vstack((self.feature_buffer,
self.embedding_model.run(None, {'input_1': x})[0].squeeze()))
# Reset raw data buffer counter
self.accumulated_samples = 0
if self.feature_buffer.shape[0] > self.feature_buffer_max_len:
self.feature_buffer = self.feature_buffer[-self.feature_buffer_max_len:, :]
def get_features(self, n_feature_frames: int = 16, start_ndx: int = -1):
if start_ndx != -1:
end_ndx = start_ndx + int(n_feature_frames) \
if start_ndx + n_feature_frames != 0 else len(self.feature_buffer)
return self.feature_buffer[start_ndx:end_ndx, :][None, ].astype(np.float32)
else:
return self.feature_buffer[int(-1*n_feature_frames):, :][None, ].astype(np.float32)
def __call__(self, x):
self._streaming_features(x)
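
    # Streaming usage sketch (illustrative): feed 80 ms frames (1280 samples @ 16 kHz)
    # and read the most recent feature window after each update.
    #
    #   features = AudioFeatures()
    #   for frame in np.zeros((100, 1280), dtype=np.int16):
    #       features(frame)                     # update the internal buffers
    #       window = features.get_features(16)  # -> shape (1, 16, 96)
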
# Bulk prediction function
def bulk_predict(
file_paths: List[str],
wakeword_model_paths: List[str],
prediction_function: str = 'predict_clip',
ncpu: int = 1,
**kwargs
):
"""
    Bulk predict on the provided input files in parallel with multiprocessing, using the specified model.
Args:
        file_paths (List[str]): The list of input files to predict on
        wakeword_model_paths (List[str]): The paths to the wakeword ONNX model files
prediction_function (str): The name of the method used to predict on the input audio files
(default is the `predict_clip` method)
        ncpu (int): How many processes to create (up to the maximum number of available CPUs)
kwargs (dict): Any other keyword arguments to pass to the model initialization or
specified prediction function
Returns:
dict: A dictionary containing the predictions for each file, with the filepath as the key
"""
    # Partition the input files into chunks, one per process
n_batches = max(1, len(file_paths)//ncpu)
remainder = len(file_paths) % ncpu
chunks = [file_paths[i:i+n_batches] for i in range(0, max(1, len(file_paths)-remainder), n_batches)]
for i in range(1, remainder+1):
chunks[i-1].append(file_paths[-1*i])
# Create jobs
ps = []
mdls = []
q: Queue = Queue()
for chunk in chunks:
filtered_kwargs = {key: value for key, value in kwargs.items()
if key in openwakeword.Model.__init__.__code__.co_varnames}
oww = openwakeword.Model(
wakeword_model_paths=wakeword_model_paths,
**filtered_kwargs
)
mdls.append(oww)
def f(clips):
results = []
for clip in clips:
func = getattr(mdls[-1], prediction_function)
filtered_kwargs = {key: value for key, value in kwargs.items()
if key in func.__code__.co_varnames}
results.append({clip: func(clip, **filtered_kwargs)})
q.put(results)
ps.append(Process(target=f, args=(chunk,)))
# Submit jobs
for p in ps:
p.start()
    # Collect results
results = []
for p in ps:
while q.empty():
time.sleep(0.01)
results.extend(q.get())
# Consolidate results and return
return {list(i.keys())[0]: list(i.values())[0] for i in results}
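
# A hedged usage sketch for `bulk_predict`; the .wav paths and the model path below
# are placeholders, not files shipped with this package. The `__main__` guard matters
# because child processes are spawned above.
#
#   if __name__ == "__main__":
#       predictions = bulk_predict(
#           file_paths=["clip1.wav", "clip2.wav"],
#           wakeword_model_paths=["my_model.onnx"],
#           ncpu=2,
#       )
#       # -> {"clip1.wav": [...], "clip2.wav": [...]}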