add read me

commit edaf914b73 (2026-01-09 10:28:44 +11:00)
13417 changed files with 2952119 additions and 0 deletions


@@ -0,0 +1,14 @@
from faster_whisper.audio import decode_audio
from faster_whisper.transcribe import BatchedInferencePipeline, WhisperModel
from faster_whisper.utils import available_models, download_model, format_timestamp
from faster_whisper.version import __version__
__all__ = [
"available_models",
"decode_audio",
"WhisperModel",
"BatchedInferencePipeline",
"download_model",
"format_timestamp",
"__version__",
]
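For reference, a minimal usage sketch of this public API. The model size, audio path, and decoding options below are placeholders, and WhisperModel.transcribe is defined in faster_whisper.transcribe (its diff is not reproduced here):

from faster_whisper import WhisperModel, format_timestamp

# "tiny" and "audio.mp3" are illustrative; any size from available_models()
# and any local audio file will do.
model = WhisperModel("tiny", device="cpu", compute_type="int8")
segments, info = model.transcribe("audio.mp3", beam_size=5)

print("Detected language:", info.language)
for segment in segments:
    start = format_timestamp(segment.start)
    end = format_timestamp(segment.end)
    print(f"[{start} -> {end}] {segment.text}")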


@@ -0,0 +1,123 @@
"""We use the PyAV library to decode the audio: https://github.com/PyAV-Org/PyAV
The advantage of PyAV is that it bundles the FFmpeg libraries, so there are no additional
system dependencies: FFmpeg does not need to be installed on the system.
However, the API is quite low-level, so we need to manipulate audio frames directly.
"""
import gc
import io
import itertools
from typing import BinaryIO, Union
import av
import numpy as np
def decode_audio(
input_file: Union[str, BinaryIO],
sampling_rate: int = 16000,
split_stereo: bool = False,
):
"""Decodes the audio.
Args:
input_file: Path to the input file or a file-like object.
sampling_rate: Resample the audio to this sample rate.
split_stereo: Return separate left and right channels.
Returns:
A float32 Numpy array.
If `split_stereo` is enabled, the function returns a 2-tuple with the
separated left and right channels.
"""
resampler = av.audio.resampler.AudioResampler(
format="s16",
layout="mono" if not split_stereo else "stereo",
rate=sampling_rate,
)
raw_buffer = io.BytesIO()
dtype = None
with av.open(input_file, mode="r", metadata_errors="ignore") as container:
frames = container.decode(audio=0)
frames = _ignore_invalid_frames(frames)
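        # Group decoded frames into blocks of roughly 500k samples before resampling.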
frames = _group_frames(frames, 500000)
frames = _resample_frames(frames, resampler)
for frame in frames:
array = frame.to_ndarray()
dtype = array.dtype
raw_buffer.write(array)
# It appears that some objects related to the resampler are not freed
# unless the garbage collector is manually run.
# https://github.com/SYSTRAN/faster-whisper/issues/390
    # Note that this slows down loading the audio a little.
    # If that is a concern, use ffmpeg directly as done here:
# https://github.com/openai/whisper/blob/25639fc/whisper/audio.py#L25-L62
del resampler
gc.collect()
audio = np.frombuffer(raw_buffer.getbuffer(), dtype=dtype)
# Convert s16 back to f32.
audio = audio.astype(np.float32) / 32768.0
if split_stereo:
left_channel = audio[0::2]
right_channel = audio[1::2]
return left_channel, right_channel
return audio
def _ignore_invalid_frames(frames):
iterator = iter(frames)
while True:
try:
yield next(iterator)
except StopIteration:
break
except av.error.InvalidDataError:
continue
def _group_frames(frames, num_samples=None):
fifo = av.audio.fifo.AudioFifo()
for frame in frames:
frame.pts = None # Ignore timestamp check.
fifo.write(frame)
if num_samples is not None and fifo.samples >= num_samples:
yield fifo.read()
if fifo.samples > 0:
yield fifo.read()
def _resample_frames(frames, resampler):
# Add None to flush the resampler.
for frame in itertools.chain(frames, [None]):
yield from resampler.resample(frame)
def pad_or_trim(array, length: int = 3000, *, axis: int = -1):
"""
    Pad or trim the Mel features array to `length` frames (3000 by default), as expected by the encoder.
"""
if array.shape[axis] > length:
array = array.take(indices=range(length), axis=axis)
if array.shape[axis] < length:
pad_widths = [(0, 0)] * array.ndim
pad_widths[axis] = (0, length - array.shape[axis])
array = np.pad(array, pad_widths)
return array
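A short sketch of how decode_audio and pad_or_trim are meant to be used; "speech.wav" is a placeholder path, and note that pad_or_trim operates on Mel feature arrays rather than raw samples:

import numpy as np

from faster_whisper.audio import decode_audio, pad_or_trim

# Mono float32 samples resampled to 16 kHz.
audio = decode_audio("speech.wav", sampling_rate=16000)

# With split_stereo=True the two channels are returned separately.
left, right = decode_audio("speech.wav", sampling_rate=16000, split_stereo=True)

# pad_or_trim expects an (n_mels, frames) feature array and makes the
# time axis exactly 3000 frames long.
features = np.zeros((80, 1234), dtype=np.float32)
features = pad_or_trim(features)
print(features.shape)  # (80, 3000)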


@@ -0,0 +1,230 @@
import numpy as np
class FeatureExtractor:
def __init__(
self,
feature_size=80,
sampling_rate=16000,
hop_length=160,
chunk_length=30,
n_fft=400,
):
self.n_fft = n_fft
self.hop_length = hop_length
self.chunk_length = chunk_length
self.n_samples = chunk_length * sampling_rate
self.nb_max_frames = self.n_samples // hop_length
self.time_per_frame = hop_length / sampling_rate
self.sampling_rate = sampling_rate
self.mel_filters = self.get_mel_filters(
sampling_rate, n_fft, n_mels=feature_size
).astype("float32")
@staticmethod
def get_mel_filters(sr, n_fft, n_mels=128):
# Initialize the weights
n_mels = int(n_mels)
# Center freqs of each FFT bin
fftfreqs = np.fft.rfftfreq(n=n_fft, d=1.0 / sr)
# 'Center freqs' of mel bands - uniformly spaced between limits
min_mel = 0.0
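        # 45.245640471924965 is the Slaney-scale mel value of 8000 Hz,
        # i.e. the Nyquist frequency for 16 kHz audio.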
max_mel = 45.245640471924965
mels = np.linspace(min_mel, max_mel, n_mels + 2)
# Fill in the linear scale
f_min = 0.0
f_sp = 200.0 / 3
freqs = f_min + f_sp * mels
# And now the nonlinear scale
min_log_hz = 1000.0 # beginning of log region (Hz)
min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
logstep = np.log(6.4) / 27.0 # step size for log region
# If we have vector data, vectorize
log_t = mels >= min_log_mel
freqs[log_t] = min_log_hz * np.exp(logstep * (mels[log_t] - min_log_mel))
fdiff = np.diff(freqs)
ramps = freqs.reshape(-1, 1) - fftfreqs.reshape(1, -1)
lower = -ramps[:-2] / np.expand_dims(fdiff[:-1], axis=1)
upper = ramps[2:] / np.expand_dims(fdiff[1:], axis=1)
# Intersect them with each other and zero, vectorized across all i
weights = np.maximum(np.zeros_like(lower), np.minimum(lower, upper))
# Slaney-style mel is scaled to be approx constant energy per channel
enorm = 2.0 / (freqs[2 : n_mels + 2] - freqs[:n_mels])
weights *= np.expand_dims(enorm, axis=1)
return weights
@staticmethod
def stft(
input_array: np.ndarray,
n_fft: int,
hop_length: int = None,
win_length: int = None,
window: np.ndarray = None,
center: bool = True,
mode: str = "reflect",
normalized: bool = False,
onesided: bool = None,
return_complex: bool = None,
):
# Default initialization for hop_length and win_length
hop_length = hop_length if hop_length is not None else n_fft // 4
win_length = win_length if win_length is not None else n_fft
input_is_complex = np.iscomplexobj(input_array)
# Determine if the output should be complex
return_complex = (
return_complex
if return_complex is not None
else (input_is_complex or (window is not None and np.iscomplexobj(window)))
)
if not return_complex and return_complex is None:
raise ValueError(
"stft requires the return_complex parameter for real inputs."
)
# Input checks
if not np.issubdtype(input_array.dtype, np.floating) and not input_is_complex:
raise ValueError(
"stft: expected an array of floating point or complex values,"
f" got {input_array.dtype}"
)
if input_array.ndim > 2 or input_array.ndim < 1:
raise ValueError(
f"stft: expected a 1D or 2D array, but got {input_array.ndim}D array"
)
# Handle 1D input
if input_array.ndim == 1:
input_array = np.expand_dims(input_array, axis=0)
input_array_1d = True
else:
input_array_1d = False
# Center padding if required
if center:
pad_amount = n_fft // 2
input_array = np.pad(
input_array, ((0, 0), (pad_amount, pad_amount)), mode=mode
)
batch, length = input_array.shape
# Additional input checks
if n_fft <= 0 or n_fft > length:
raise ValueError(
f"stft: expected 0 < n_fft <= {length}, but got n_fft={n_fft}"
)
if hop_length <= 0:
raise ValueError(
f"stft: expected hop_length > 0, but got hop_length={hop_length}"
)
if win_length <= 0 or win_length > n_fft:
raise ValueError(
f"stft: expected 0 < win_length <= n_fft, but got win_length={win_length}"
)
if window is not None:
if window.ndim != 1 or window.shape[0] != win_length:
raise ValueError(
f"stft: expected a 1D window array of size equal to win_length={win_length}, "
f"but got window with size {window.shape}"
)
# Handle padding of the window if necessary
if win_length < n_fft:
left = (n_fft - win_length) // 2
window_ = np.zeros(n_fft, dtype=window.dtype)
window_[left : left + win_length] = window
else:
window_ = window
# Calculate the number of frames
n_frames = 1 + (length - n_fft) // hop_length
# Time to columns
input_array = np.lib.stride_tricks.as_strided(
input_array,
(batch, n_frames, n_fft),
(
input_array.strides[0],
hop_length * input_array.strides[1],
input_array.strides[1],
),
)
if window_ is not None:
input_array = input_array * window_
# FFT and transpose
complex_fft = input_is_complex
onesided = onesided if onesided is not None else not complex_fft
if normalized:
norm = "ortho"
else:
norm = None
if complex_fft:
if onesided:
raise ValueError(
"Cannot have onesided output if window or input is complex"
)
output = np.fft.fft(input_array, n=n_fft, axis=-1, norm=norm)
else:
output = np.fft.rfft(input_array, n=n_fft, axis=-1, norm=norm)
output = output.transpose((0, 2, 1))
if input_array_1d:
output = output.squeeze(0)
return output if return_complex else np.real(output)
def __call__(self, waveform: np.ndarray, padding=160, chunk_length=None):
"""
Compute the log-Mel spectrogram of the provided audio.
"""
if chunk_length is not None:
self.n_samples = chunk_length * self.sampling_rate
self.nb_max_frames = self.n_samples // self.hop_length
        if waveform.dtype != np.float32:
waveform = waveform.astype(np.float32)
if padding:
waveform = np.pad(waveform, (0, padding))
window = np.hanning(self.n_fft + 1)[:-1].astype("float32")
stft = self.stft(
waveform,
self.n_fft,
self.hop_length,
window=window,
return_complex=True,
).astype("complex64")
magnitudes = np.abs(stft[..., :-1]) ** 2
mel_spec = self.mel_filters @ magnitudes
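        # Clamp the dynamic range to 8 (in log10 units) below the peak and rescale
        # to roughly [-1, 1], matching Whisper's log-Mel normalization.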
log_spec = np.log10(np.clip(mel_spec, a_min=1e-10, a_max=None))
log_spec = np.maximum(log_spec, log_spec.max() - 8.0)
log_spec = (log_spec + 4.0) / 4.0
return log_spec
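A small sketch of computing log-Mel features with this class, assuming the module is importable as faster_whisper.feature_extractor; the silent waveform is a stand-in for real audio:

import numpy as np

# Assumed import path for the FeatureExtractor defined above.
from faster_whisper.feature_extractor import FeatureExtractor

extractor = FeatureExtractor(feature_size=80, sampling_rate=16000)

# 30 seconds of silence as a stand-in for real audio.
waveform = np.zeros(30 * 16000, dtype=np.float32)
log_mel = extractor(waveform)

# 80 Mel bins, ~100 frames per second (hop_length=160 at 16 kHz).
print(log_mel.shape)  # (80, 3001)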


@@ -0,0 +1,320 @@
import string
from functools import cached_property
from typing import List, Optional, Tuple
import tokenizers
class Tokenizer:
"""Simple wrapper around a tokenizers.Tokenizer."""
def __init__(
self,
tokenizer: tokenizers.Tokenizer,
multilingual: bool,
task: Optional[str] = None,
language: Optional[str] = None,
):
self.tokenizer = tokenizer
if multilingual:
if task not in _TASKS:
raise ValueError(
"'%s' is not a valid task (accepted tasks: %s)"
% (task, ", ".join(_TASKS))
)
if language not in _LANGUAGE_CODES:
raise ValueError(
"'%s' is not a valid language code (accepted language codes: %s)"
% (language, ", ".join(_LANGUAGE_CODES))
)
self.task = self.tokenizer.token_to_id("<|%s|>" % task)
self.language = self.tokenizer.token_to_id("<|%s|>" % language)
self.language_code = language
else:
self.task = None
self.language = None
self.language_code = "en"
@cached_property
def transcribe(self) -> int:
return self.tokenizer.token_to_id("<|transcribe|>")
@cached_property
def translate(self) -> int:
return self.tokenizer.token_to_id("<|translate|>")
@cached_property
def sot(self) -> int:
return self.tokenizer.token_to_id("<|startoftranscript|>")
@cached_property
def sot_lm(self) -> int:
return self.tokenizer.token_to_id("<|startoflm|>")
@cached_property
def sot_prev(self) -> int:
return self.tokenizer.token_to_id("<|startofprev|>")
@cached_property
def eot(self) -> int:
return self.tokenizer.token_to_id("<|endoftext|>")
@cached_property
def no_timestamps(self) -> int:
return self.tokenizer.token_to_id("<|notimestamps|>")
@cached_property
def no_speech(self) -> int:
return self.tokenizer.token_to_id("<|nospeech|>") or self.tokenizer.token_to_id(
"<|nocaptions|>"
)
@property
def timestamp_begin(self) -> int:
return self.no_timestamps + 1
@property
def sot_sequence(self) -> List[int]:
sequence = [self.sot]
if self.language is not None:
sequence.append(self.language)
if self.task is not None:
sequence.append(self.task)
return sequence
def encode(self, text: str) -> List[int]:
return self.tokenizer.encode(text, add_special_tokens=False).ids
def decode(self, tokens: List[int]) -> str:
text_tokens = [token for token in tokens if token < self.eot]
return self.tokenizer.decode(text_tokens)
def decode_with_timestamps(self, tokens: List[int]) -> str:
outputs = [[]]
for token in tokens:
if token >= self.timestamp_begin:
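                # Timestamp tokens are spaced 20 ms apart, hence the 0.02 factor.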
timestamp = f"<|{(token - self.timestamp_begin) * 0.02:.2f}|>"
outputs.append(timestamp)
outputs.append([])
else:
outputs[-1].append(token)
return "".join(
[s if isinstance(s, str) else self.tokenizer.decode(s) for s in outputs]
)
@cached_property
def non_speech_tokens(self) -> Tuple[int]:
"""
Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech
annotations, to prevent sampling texts that are not actually spoken in the audio, e.g.
- ♪♪♪
- ( SPEAKING FOREIGN LANGUAGE )
- [DAVID] Hey there,
        keeping basic punctuation like commas, periods, question marks, exclamation points, etc.
"""
symbols = list('"#()*+/:;<=>@[\\]^_`{|}~「」『』')
symbols += (
"<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split()
)
# symbols that may be a single token or multiple tokens depending on the tokenizer.
# In case they're multiple tokens, suppress the first token, which is safe because:
# These are between U+2640 and U+267F miscellaneous symbols that are okay to suppress
# in generations, and in the 3-byte UTF-8 representation they share the first two bytes.
miscellaneous = set("♩♪♫♬♭♮♯")
assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous)
# allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word
result = {self.encode(" -")[0], self.encode(" '")[0]}
for symbol in symbols + list(miscellaneous):
for tokens in [
self.encode(symbol),
self.encode(" " + symbol),
]:
if len(tokens) == 1 or symbol in miscellaneous:
result.add(tokens[0])
return tuple(sorted(result))
def split_to_word_tokens(
self, tokens: List[int]
) -> Tuple[List[str], List[List[int]]]:
if self.language_code in {"zh", "ja", "th", "lo", "my", "yue"}:
# These languages don't typically use spaces, so it is difficult to split words
# without morpheme analysis. Here, we instead split words at any
            # position where the tokens are decoded as valid Unicode code points.
return self.split_tokens_on_unicode(tokens)
return self.split_tokens_on_spaces(tokens)
def split_tokens_on_unicode(
self, tokens: List[int]
) -> Tuple[List[str], List[List[int]]]:
decoded_full = self.decode_with_timestamps(tokens)
replacement_char = "\ufffd"
words = []
word_tokens = []
current_tokens = []
unicode_offset = 0
for token in tokens:
current_tokens.append(token)
decoded = self.decode_with_timestamps(current_tokens)
try:
replacement_char_index = decoded.index(replacement_char)
replacement_char_index += unicode_offset
except ValueError:
replacement_char_index = None
if replacement_char_index is None or (
replacement_char_index < len(decoded_full)
and decoded_full[replacement_char_index] == replacement_char
):
words.append(decoded)
word_tokens.append(current_tokens)
current_tokens = []
unicode_offset += len(decoded)
return words, word_tokens
def split_tokens_on_spaces(
self, tokens: List[int]
) -> Tuple[List[str], List[List[int]]]:
subwords, subword_tokens_list = self.split_tokens_on_unicode(tokens)
words = []
word_tokens = []
for subword, subword_tokens in zip(subwords, subword_tokens_list):
special = subword_tokens[0] >= self.eot
with_space = subword.startswith(" ")
punctuation = subword.strip() in string.punctuation
if special or with_space or punctuation or len(words) == 0:
words.append(subword)
word_tokens.append(subword_tokens)
else:
words[-1] = words[-1] + subword
word_tokens[-1].extend(subword_tokens)
return words, word_tokens
_TASKS = (
"transcribe",
"translate",
)
_LANGUAGE_CODES = (
"af",
"am",
"ar",
"as",
"az",
"ba",
"be",
"bg",
"bn",
"bo",
"br",
"bs",
"ca",
"cs",
"cy",
"da",
"de",
"el",
"en",
"es",
"et",
"eu",
"fa",
"fi",
"fo",
"fr",
"gl",
"gu",
"ha",
"haw",
"he",
"hi",
"hr",
"ht",
"hu",
"hy",
"id",
"is",
"it",
"ja",
"jw",
"ka",
"kk",
"km",
"kn",
"ko",
"la",
"lb",
"ln",
"lo",
"lt",
"lv",
"mg",
"mi",
"mk",
"ml",
"mn",
"mr",
"ms",
"mt",
"my",
"ne",
"nl",
"nn",
"no",
"oc",
"pa",
"pl",
"ps",
"pt",
"ro",
"ru",
"sa",
"sd",
"si",
"sk",
"sl",
"sn",
"so",
"sq",
"sr",
"su",
"sv",
"sw",
"ta",
"te",
"tg",
"th",
"tk",
"tl",
"tr",
"tt",
"uk",
"ur",
"uz",
"vi",
"yi",
"yo",
"zh",
"yue",
)
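A usage sketch for the wrapper above, assuming the module is importable as faster_whisper.tokenizer. It loads tokenizer.json from a downloaded model snapshot via download_model (defined in the utils module below); "tiny" is a placeholder model size:

import os

import tokenizers

from faster_whisper.tokenizer import Tokenizer
from faster_whisper.utils import download_model

# The downloaded snapshot includes tokenizer.json (see allow_patterns in utils).
model_dir = download_model("tiny")
hf_tokenizer = tokenizers.Tokenizer.from_file(os.path.join(model_dir, "tokenizer.json"))

tokenizer = Tokenizer(hf_tokenizer, multilingual=True, task="transcribe", language="en")

ids = tokenizer.encode(" Hello world")
print(ids)                     # token ids without special tokens
print(tokenizer.decode(ids))   # decoded text
print(tokenizer.sot_sequence)  # ids of <|startoftranscript|>, <|en|>, <|transcribe|>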

File diff suppressed because it is too large.


@@ -0,0 +1,152 @@
import logging
import os
import re
from typing import List, Optional, Union
import huggingface_hub
from tqdm.auto import tqdm
_MODELS = {
"tiny.en": "Systran/faster-whisper-tiny.en",
"tiny": "Systran/faster-whisper-tiny",
"base.en": "Systran/faster-whisper-base.en",
"base": "Systran/faster-whisper-base",
"small.en": "Systran/faster-whisper-small.en",
"small": "Systran/faster-whisper-small",
"medium.en": "Systran/faster-whisper-medium.en",
"medium": "Systran/faster-whisper-medium",
"large-v1": "Systran/faster-whisper-large-v1",
"large-v2": "Systran/faster-whisper-large-v2",
"large-v3": "Systran/faster-whisper-large-v3",
"large": "Systran/faster-whisper-large-v3",
"distil-large-v2": "Systran/faster-distil-whisper-large-v2",
"distil-medium.en": "Systran/faster-distil-whisper-medium.en",
"distil-small.en": "Systran/faster-distil-whisper-small.en",
"distil-large-v3": "Systran/faster-distil-whisper-large-v3",
"distil-large-v3.5": "distil-whisper/distil-large-v3.5-ct2",
"large-v3-turbo": "mobiuslabsgmbh/faster-whisper-large-v3-turbo",
"turbo": "mobiuslabsgmbh/faster-whisper-large-v3-turbo",
}
def available_models() -> List[str]:
"""Returns the names of available models."""
return list(_MODELS.keys())
def get_assets_path():
"""Returns the path to the assets directory."""
return os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets")
def get_logger():
"""Returns the module logger."""
return logging.getLogger("faster_whisper")
def download_model(
size_or_id: str,
output_dir: Optional[str] = None,
local_files_only: bool = False,
cache_dir: Optional[str] = None,
revision: Optional[str] = None,
use_auth_token: Optional[Union[str, bool]] = None,
):
"""Downloads a CTranslate2 Whisper model from the Hugging Face Hub.
Args:
size_or_id: Size of the model to download from https://huggingface.co/Systran
(tiny, tiny.en, base, base.en, small, small.en, distil-small.en, medium, medium.en,
distil-medium.en, large-v1, large-v2, large-v3, large, distil-large-v2,
distil-large-v3), or a CTranslate2-converted model ID from the Hugging Face Hub
(e.g. Systran/faster-whisper-large-v3).
output_dir: Directory where the model should be saved. If not set, the model is saved in
the cache directory.
local_files_only: If True, avoid downloading the file and return the path to the local
cached file if it exists.
cache_dir: Path to the folder where cached files are stored.
revision: An optional Git revision id which can be a branch name, a tag, or a
commit hash.
        use_auth_token: Hugging Face authentication token, or True to use the token
            stored in the Hugging Face config folder.
Returns:
The path to the downloaded model.
Raises:
ValueError: if the model size is invalid.
"""
if re.match(r".*/.*", size_or_id):
repo_id = size_or_id
else:
repo_id = _MODELS.get(size_or_id)
if repo_id is None:
raise ValueError(
"Invalid model size '%s', expected one of: %s"
% (size_or_id, ", ".join(_MODELS.keys()))
)
allow_patterns = [
"config.json",
"preprocessor_config.json",
"model.bin",
"tokenizer.json",
"vocabulary.*",
]
kwargs = {
"local_files_only": local_files_only,
"allow_patterns": allow_patterns,
"tqdm_class": disabled_tqdm,
"revision": revision,
}
if output_dir is not None:
kwargs["local_dir"] = output_dir
kwargs["local_dir_use_symlinks"] = False
if cache_dir is not None:
kwargs["cache_dir"] = cache_dir
if use_auth_token is not None:
kwargs["token"] = use_auth_token
return huggingface_hub.snapshot_download(repo_id, **kwargs)
def format_timestamp(
seconds: float,
always_include_hours: bool = False,
decimal_marker: str = ".",
) -> str:
assert seconds >= 0, "non-negative timestamp expected"
milliseconds = round(seconds * 1000.0)
hours = milliseconds // 3_600_000
milliseconds -= hours * 3_600_000
minutes = milliseconds // 60_000
milliseconds -= minutes * 60_000
seconds = milliseconds // 1_000
milliseconds -= seconds * 1_000
hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
return (
f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
)
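# For example: format_timestamp(3661.5) -> "01:01:01.500" and
# format_timestamp(5.25, always_include_hours=True) -> "00:00:05.250".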
class disabled_tqdm(tqdm):
def __init__(self, *args, **kwargs):
kwargs["disable"] = True
super().__init__(*args, **kwargs)
def get_end(segments: List[dict]) -> Optional[float]:
return next(
(w["end"] for s in reversed(segments) for w in reversed(s["words"])),
segments[-1]["end"] if segments else None,
)
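A short sketch using the helpers above; the output directory is a placeholder and the download requires network access or a populated cache:

from faster_whisper.utils import available_models, download_model, get_end

print(available_models())  # ["tiny.en", "tiny", "base.en", ...]

# Download (or reuse from the cache) the CTranslate2 weights for one size;
# "./models/tiny" is a placeholder directory.
model_path = download_model("tiny", output_dir="./models/tiny")
print(model_path)

# get_end returns the end time of the last word across the segment dicts.
segments = [{"end": 4.2, "words": [{"end": 3.9}, {"end": 4.2}]}]
print(get_end(segments))  # 4.2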


@@ -0,0 +1,351 @@
import bisect
import functools
import os
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
import numpy as np
from faster_whisper.utils import get_assets_path
# The code below is adapted from https://github.com/snakers4/silero-vad.
@dataclass
class VadOptions:
"""VAD options.
Attributes:
threshold: Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
probabilities ABOVE this value are considered as SPEECH. It is better to tune this
parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
neg_threshold: Silence threshold for determining the end of speech. If a probability is lower
than neg_threshold, it is always considered silence. Values higher than neg_threshold
are only considered speech if the previous sample was classified as speech; otherwise,
they are treated as silence. This parameter helps refine the detection of speech
transitions, ensuring smoother segment boundaries.
        min_speech_duration_ms: Final speech chunks shorter than min_speech_duration_ms are discarded.
max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer
than max_speech_duration_s will be split at the timestamp of the last silence that
lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
split aggressively just before max_speech_duration_s.
        min_silence_duration_ms: At the end of each speech chunk, wait for min_silence_duration_ms
            before splitting it off.
        speech_pad_ms: Final speech chunks are padded by speech_pad_ms on each side.
"""
threshold: float = 0.5
    neg_threshold: Optional[float] = None
min_speech_duration_ms: int = 0
max_speech_duration_s: float = float("inf")
min_silence_duration_ms: int = 2000
speech_pad_ms: int = 400
def get_speech_timestamps(
audio: np.ndarray,
vad_options: Optional[VadOptions] = None,
sampling_rate: int = 16000,
**kwargs,
) -> List[dict]:
"""This method is used for splitting long audios into speech chunks using silero VAD.
Args:
audio: One dimensional float array.
vad_options: Options for VAD processing.
        sampling_rate: Sampling rate of the audio.
kwargs: VAD options passed as keyword arguments for backward compatibility.
Returns:
List of dicts containing begin and end samples of each speech chunk.
"""
if vad_options is None:
vad_options = VadOptions(**kwargs)
threshold = vad_options.threshold
neg_threshold = vad_options.neg_threshold
min_speech_duration_ms = vad_options.min_speech_duration_ms
max_speech_duration_s = vad_options.max_speech_duration_s
min_silence_duration_ms = vad_options.min_silence_duration_ms
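    # 512 samples at 16 kHz correspond to one 32 ms Silero VAD probability window.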
window_size_samples = 512
speech_pad_ms = vad_options.speech_pad_ms
min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
speech_pad_samples = sampling_rate * speech_pad_ms / 1000
max_speech_samples = (
sampling_rate * max_speech_duration_s
- window_size_samples
- 2 * speech_pad_samples
)
min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
min_silence_samples_at_max_speech = sampling_rate * 98 / 1000
audio_length_samples = len(audio)
model = get_vad_model()
padded_audio = np.pad(
audio, (0, window_size_samples - audio.shape[0] % window_size_samples)
)
speech_probs = model(padded_audio)
triggered = False
speeches = []
current_speech = {}
if neg_threshold is None:
neg_threshold = max(threshold - 0.15, 0.01)
# to save potential segment end (and tolerate some silence)
temp_end = 0
# to save potential segment limits in case of maximum segment size reached
prev_end = next_start = 0
for i, speech_prob in enumerate(speech_probs):
if (speech_prob >= threshold) and temp_end:
temp_end = 0
if next_start < prev_end:
next_start = window_size_samples * i
if (speech_prob >= threshold) and not triggered:
triggered = True
current_speech["start"] = window_size_samples * i
continue
if (
triggered
and (window_size_samples * i) - current_speech["start"] > max_speech_samples
):
if prev_end:
current_speech["end"] = prev_end
speeches.append(current_speech)
current_speech = {}
# previously reached silence (< neg_thres) and is still not speech (< thres)
if next_start < prev_end:
triggered = False
else:
current_speech["start"] = next_start
prev_end = next_start = temp_end = 0
else:
current_speech["end"] = window_size_samples * i
speeches.append(current_speech)
current_speech = {}
prev_end = next_start = temp_end = 0
triggered = False
continue
if (speech_prob < neg_threshold) and triggered:
if not temp_end:
temp_end = window_size_samples * i
# condition to avoid cutting in very short silence
if (window_size_samples * i) - temp_end > min_silence_samples_at_max_speech:
prev_end = temp_end
if (window_size_samples * i) - temp_end < min_silence_samples:
continue
else:
current_speech["end"] = temp_end
if (
current_speech["end"] - current_speech["start"]
) > min_speech_samples:
speeches.append(current_speech)
current_speech = {}
prev_end = next_start = temp_end = 0
triggered = False
continue
if (
current_speech
and (audio_length_samples - current_speech["start"]) > min_speech_samples
):
current_speech["end"] = audio_length_samples
speeches.append(current_speech)
for i, speech in enumerate(speeches):
if i == 0:
speech["start"] = int(max(0, speech["start"] - speech_pad_samples))
if i != len(speeches) - 1:
silence_duration = speeches[i + 1]["start"] - speech["end"]
if silence_duration < 2 * speech_pad_samples:
speech["end"] += int(silence_duration // 2)
speeches[i + 1]["start"] = int(
max(0, speeches[i + 1]["start"] - silence_duration // 2)
)
else:
speech["end"] = int(
min(audio_length_samples, speech["end"] + speech_pad_samples)
)
speeches[i + 1]["start"] = int(
max(0, speeches[i + 1]["start"] - speech_pad_samples)
)
else:
speech["end"] = int(
min(audio_length_samples, speech["end"] + speech_pad_samples)
)
return speeches
def collect_chunks(
audio: np.ndarray,
chunks: List[dict],
sampling_rate: int = 16000,
max_duration: float = float("inf"),
) -> Tuple[List[np.ndarray], List[Dict[str, float]]]:
"""This function merges the chunks of audio into chunks of max_duration (s) length."""
if not chunks:
chunk_metadata = {
"offset": 0,
"duration": 0,
"segments": [],
}
return [np.array([], dtype=np.float32)], [chunk_metadata]
audio_chunks = []
chunks_metadata = []
current_segments = []
current_duration = 0
total_duration = 0
current_audio = np.array([], dtype=np.float32)
for chunk in chunks:
if (
current_duration + chunk["end"] - chunk["start"]
> max_duration * sampling_rate
):
audio_chunks.append(current_audio)
chunk_metadata = {
"offset": total_duration / sampling_rate,
"duration": current_duration / sampling_rate,
"segments": current_segments,
}
total_duration += current_duration
chunks_metadata.append(chunk_metadata)
current_segments = []
current_audio = audio[chunk["start"] : chunk["end"]]
current_duration = chunk["end"] - chunk["start"]
else:
current_segments.append(chunk)
current_audio = np.concatenate(
(current_audio, audio[chunk["start"] : chunk["end"]])
)
current_duration += chunk["end"] - chunk["start"]
audio_chunks.append(current_audio)
chunk_metadata = {
"offset": total_duration / sampling_rate,
"duration": current_duration / sampling_rate,
"segments": current_segments,
}
chunks_metadata.append(chunk_metadata)
return audio_chunks, chunks_metadata
class SpeechTimestampsMap:
"""Helper class to restore original speech timestamps."""
def __init__(self, chunks: List[dict], sampling_rate: int, time_precision: int = 2):
self.sampling_rate = sampling_rate
self.time_precision = time_precision
self.chunk_end_sample = []
self.total_silence_before = []
previous_end = 0
silent_samples = 0
for chunk in chunks:
silent_samples += chunk["start"] - previous_end
previous_end = chunk["end"]
self.chunk_end_sample.append(chunk["end"] - silent_samples)
self.total_silence_before.append(silent_samples / sampling_rate)
def get_original_time(
self,
time: float,
chunk_index: Optional[int] = None,
is_end: bool = False,
) -> float:
if chunk_index is None:
chunk_index = self.get_chunk_index(time, is_end)
total_silence_before = self.total_silence_before[chunk_index]
return round(total_silence_before + time, self.time_precision)
def get_chunk_index(self, time: float, is_end: bool = False) -> int:
sample = int(time * self.sampling_rate)
if sample in self.chunk_end_sample and is_end:
return self.chunk_end_sample.index(sample)
return min(
bisect.bisect(self.chunk_end_sample, sample),
len(self.chunk_end_sample) - 1,
)
@functools.lru_cache
def get_vad_model():
"""Returns the VAD model instance."""
path = os.path.join(get_assets_path(), "silero_vad_v6.onnx")
return SileroVADModel(path)
class SileroVADModel:
def __init__(self, path):
try:
import onnxruntime
except ImportError as e:
raise RuntimeError(
"Applying the VAD filter requires the onnxruntime package"
) from e
opts = onnxruntime.SessionOptions()
opts.inter_op_num_threads = 1
opts.intra_op_num_threads = 1
opts.enable_cpu_mem_arena = False
opts.log_severity_level = 4
self.session = onnxruntime.InferenceSession(
path,
providers=["CPUExecutionProvider"],
sess_options=opts,
)
def __call__(
self, audio: np.ndarray, num_samples: int = 512, context_size_samples: int = 64
):
assert audio.ndim == 1, "Input should be a 1D array"
assert (
audio.shape[0] % num_samples == 0
), "Input size should be a multiple of num_samples"
h = np.zeros((1, 1, 128), dtype="float32")
c = np.zeros((1, 1, 128), dtype="float32")
context = np.zeros(
(1, context_size_samples),
dtype="float32",
)
batched_audio = audio.reshape(-1, num_samples)
context = batched_audio[..., -context_size_samples:]
        # Roll first (np.roll copies, so batched_audio is left untouched),
        # then zero the context of the first frame, which has no left context.
        context = np.roll(context, 1, 0)
        context[0] = 0
batched_audio = np.concatenate([context, batched_audio], 1)
batched_audio = batched_audio.reshape(-1, num_samples + context_size_samples)
encoder_batch_size = 10000
num_segments = batched_audio.shape[0]
outputs = []
for i in range(0, num_segments, encoder_batch_size):
output, h, c = self.session.run(
None,
{"input": batched_audio[i : i + encoder_batch_size], "h": h, "c": c},
)
outputs.append(output)
out = np.concatenate(outputs, axis=0)
return out
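A sketch of running the VAD on a decoded waveform, assuming the module is importable as faster_whisper.vad; "speech.wav" is a placeholder path and the onnxruntime package must be installed:

from faster_whisper.audio import decode_audio
from faster_whisper.vad import VadOptions, collect_chunks, get_speech_timestamps

# The VAD model expects 16 kHz mono audio.
audio = decode_audio("speech.wav", sampling_rate=16000)

options = VadOptions(min_silence_duration_ms=500, speech_pad_ms=200)
speech_chunks = get_speech_timestamps(audio, options)
print(speech_chunks)  # e.g. [{"start": 1056, "end": 46048}, ...] in sample indices

# Merge the detected speech into audio chunks of roughly 30 s each.
audio_chunks, chunks_metadata = collect_chunks(audio, speech_chunks, max_duration=30.0)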


@@ -0,0 +1,3 @@
"""Version information."""
__version__ = "1.2.1"