add read me
This commit is contained in:
14
venv/lib/python3.12/site-packages/faster_whisper/__init__.py
Normal file
14
venv/lib/python3.12/site-packages/faster_whisper/__init__.py
Normal file
@@ -0,0 +1,14 @@
|
||||
from faster_whisper.audio import decode_audio
|
||||
from faster_whisper.transcribe import BatchedInferencePipeline, WhisperModel
|
||||
from faster_whisper.utils import available_models, download_model, format_timestamp
|
||||
from faster_whisper.version import __version__
|
||||
|
||||
# Public API of the faster_whisper package; names re-exported above.
__all__ = [
    "available_models",
    "decode_audio",
    "WhisperModel",
    "BatchedInferencePipeline",
    "download_model",
    "format_timestamp",
    "__version__",
]
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
123
venv/lib/python3.12/site-packages/faster_whisper/audio.py
Normal file
123
venv/lib/python3.12/site-packages/faster_whisper/audio.py
Normal file
@@ -0,0 +1,123 @@
|
||||
"""We use the PyAV library to decode the audio: https://github.com/PyAV-Org/PyAV
|
||||
|
||||
The advantage of PyAV is that it bundles the FFmpeg libraries so there is no additional
|
||||
system dependencies. FFmpeg does not need to be installed on the system.
|
||||
|
||||
However, the API is quite low-level so we need to manipulate audio frames directly.
|
||||
"""
|
||||
|
||||
import gc
|
||||
import io
|
||||
import itertools
|
||||
|
||||
from typing import BinaryIO, Union
|
||||
|
||||
import av
|
||||
import numpy as np
|
||||
|
||||
|
||||
def decode_audio(
    input_file: Union[str, BinaryIO],
    sampling_rate: int = 16000,
    split_stereo: bool = False,
):
    """Decodes the audio.

    Args:
      input_file: Path to the input file or a file-like object.
      sampling_rate: Resample the audio to this sample rate.
      split_stereo: Return separate left and right channels.

    Returns:
      A float32 Numpy array.

      If `split_stereo` is enabled, the function returns a 2-tuple with the
      separated left and right channels.
    """
    # Resample to s16 (16-bit signed PCM); samples are converted back to
    # float32 at the end of this function.
    resampler = av.audio.resampler.AudioResampler(
        format="s16",
        layout="mono" if not split_stereo else "stereo",
        rate=sampling_rate,
    )

    raw_buffer = io.BytesIO()
    dtype = None

    with av.open(input_file, mode="r", metadata_errors="ignore") as container:
        # Decode only the first audio stream, dropping invalid frames and
        # grouping the rest into large blocks before resampling.
        frames = container.decode(audio=0)
        frames = _ignore_invalid_frames(frames)
        frames = _group_frames(frames, 500000)
        frames = _resample_frames(frames, resampler)

        for frame in frames:
            array = frame.to_ndarray()
            dtype = array.dtype
            raw_buffer.write(array)

    # It appears that some objects related to the resampler are not freed
    # unless the garbage collector is manually run.
    # https://github.com/SYSTRAN/faster-whisper/issues/390
    # note that this slows down loading the audio a little bit
    # if that is a concern, please use ffmpeg directly as in here:
    # https://github.com/openai/whisper/blob/25639fc/whisper/audio.py#L25-L62
    del resampler
    gc.collect()

    audio = np.frombuffer(raw_buffer.getbuffer(), dtype=dtype)

    # Convert s16 back to f32.
    audio = audio.astype(np.float32) / 32768.0

    if split_stereo:
        # Stereo PCM is interleaved: even indices = left, odd = right.
        left_channel = audio[0::2]
        right_channel = audio[1::2]
        return left_channel, right_channel

    return audio
||||
def _ignore_invalid_frames(frames):
|
||||
iterator = iter(frames)
|
||||
|
||||
while True:
|
||||
try:
|
||||
yield next(iterator)
|
||||
except StopIteration:
|
||||
break
|
||||
except av.error.InvalidDataError:
|
||||
continue
|
||||
|
||||
|
||||
def _group_frames(frames, num_samples=None):
    # Concatenate decoded frames into larger blocks via an audio FIFO so the
    # downstream resampler operates on fewer, bigger frames.
    fifo = av.audio.fifo.AudioFifo()

    for frame in frames:
        frame.pts = None  # Ignore timestamp check.
        fifo.write(frame)

        # Flush a grouped frame once enough samples are buffered.
        if num_samples is not None and fifo.samples >= num_samples:
            yield fifo.read()

    # Flush whatever remains at end of stream.
    if fifo.samples > 0:
        yield fifo.read()
||||
def _resample_frames(frames, resampler):
    # Add None to flush the resampler.
    for frame in itertools.chain(frames, [None]):
        # One input frame may yield zero or more resampled frames.
        yield from resampler.resample(frame)
||||
def pad_or_trim(array, length: int = 3000, *, axis: int = -1):
    """
    Trim `array` to `length` along `axis`, or zero-pad it up to `length`.

    The default of 3000 frames matches what the Whisper encoder expects.
    """
    current = array.shape[axis]

    if current > length:
        # Keep only the first `length` entries along the target axis.
        array = array.take(indices=range(length), axis=axis)
    elif current < length:
        # Zero-pad at the end of the target axis only.
        pad_widths = [(0, 0)] * array.ndim
        pad_widths[axis] = (0, length - current)
        array = np.pad(array, pad_widths)

    return array
||||
@@ -0,0 +1,230 @@
|
||||
import numpy as np
|
||||
|
||||
|
||||
class FeatureExtractor:
    """Computes log-Mel spectrogram features from 16 kHz waveforms.

    The filter bank and STFT mirror the Whisper reference implementation so
    the produced features match what the encoder expects.
    """

    def __init__(
        self,
        feature_size=80,
        sampling_rate=16000,
        hop_length=160,
        chunk_length=30,
        n_fft=400,
    ):
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.chunk_length = chunk_length
        # Number of raw audio samples in one chunk (30 s at 16 kHz = 480000).
        self.n_samples = chunk_length * sampling_rate
        # Number of spectrogram frames produced per chunk.
        self.nb_max_frames = self.n_samples // hop_length
        self.time_per_frame = hop_length / sampling_rate
        self.sampling_rate = sampling_rate
        self.mel_filters = self.get_mel_filters(
            sampling_rate, n_fft, n_mels=feature_size
        ).astype("float32")

    @staticmethod
    def get_mel_filters(sr, n_fft, n_mels=128):
        """Build a Slaney-style Mel filter bank.

        Returns an array of shape (n_mels, n_fft // 2 + 1) mapping power
        spectrogram bins to Mel bands.
        """
        # Initialize the weights
        n_mels = int(n_mels)

        # Center freqs of each FFT bin
        fftfreqs = np.fft.rfftfreq(n=n_fft, d=1.0 / sr)

        # 'Center freqs' of mel bands - uniformly spaced between limits
        min_mel = 0.0
        # Mel value of the highest frequency covered by the filter bank.
        max_mel = 45.245640471924965

        mels = np.linspace(min_mel, max_mel, n_mels + 2)

        # Fill in the linear scale
        f_min = 0.0
        f_sp = 200.0 / 3
        freqs = f_min + f_sp * mels

        # And now the nonlinear scale
        min_log_hz = 1000.0  # beginning of log region (Hz)
        min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
        logstep = np.log(6.4) / 27.0  # step size for log region

        # If we have vector data, vectorize
        log_t = mels >= min_log_mel
        freqs[log_t] = min_log_hz * np.exp(logstep * (mels[log_t] - min_log_mel))

        fdiff = np.diff(freqs)
        ramps = freqs.reshape(-1, 1) - fftfreqs.reshape(1, -1)

        # Rising and falling edges of each triangular filter.
        lower = -ramps[:-2] / np.expand_dims(fdiff[:-1], axis=1)
        upper = ramps[2:] / np.expand_dims(fdiff[1:], axis=1)

        # Intersect them with each other and zero, vectorized across all i
        weights = np.maximum(np.zeros_like(lower), np.minimum(lower, upper))

        # Slaney-style mel is scaled to be approx constant energy per channel
        enorm = 2.0 / (freqs[2 : n_mels + 2] - freqs[:n_mels])
        weights *= np.expand_dims(enorm, axis=1)

        return weights

    @staticmethod
    def stft(
        input_array: np.ndarray,
        n_fft: int,
        hop_length: int = None,
        win_length: int = None,
        window: np.ndarray = None,
        center: bool = True,
        mode: str = "reflect",
        normalized: bool = False,
        onesided: bool = None,
        return_complex: bool = None,
    ):
        """NumPy short-time Fourier transform modeled after ``torch.stft``.

        Accepts a 1D or 2D (batch, samples) array and returns an array of
        shape (..., freq_bins, n_frames).
        """
        # Default initialization for hop_length and win_length
        hop_length = hop_length if hop_length is not None else n_fft // 4
        win_length = win_length if win_length is not None else n_fft
        input_is_complex = np.iscomplexobj(input_array)

        # Determine if the output should be complex
        return_complex = (
            return_complex
            if return_complex is not None
            else (input_is_complex or (window is not None and np.iscomplexobj(window)))
        )

        # NOTE(review): this branch is unreachable — return_complex was just
        # replaced by a boolean above, so it can no longer be None here.
        if not return_complex and return_complex is None:
            raise ValueError(
                "stft requires the return_complex parameter for real inputs."
            )

        # Input checks
        if not np.issubdtype(input_array.dtype, np.floating) and not input_is_complex:
            raise ValueError(
                "stft: expected an array of floating point or complex values,"
                f" got {input_array.dtype}"
            )

        if input_array.ndim > 2 or input_array.ndim < 1:
            raise ValueError(
                f"stft: expected a 1D or 2D array, but got {input_array.ndim}D array"
            )

        # Handle 1D input
        if input_array.ndim == 1:
            input_array = np.expand_dims(input_array, axis=0)
            input_array_1d = True
        else:
            input_array_1d = False

        # Center padding if required
        if center:
            pad_amount = n_fft // 2
            input_array = np.pad(
                input_array, ((0, 0), (pad_amount, pad_amount)), mode=mode
            )

        batch, length = input_array.shape

        # Additional input checks
        if n_fft <= 0 or n_fft > length:
            raise ValueError(
                f"stft: expected 0 < n_fft <= {length}, but got n_fft={n_fft}"
            )

        if hop_length <= 0:
            raise ValueError(
                f"stft: expected hop_length > 0, but got hop_length={hop_length}"
            )

        if win_length <= 0 or win_length > n_fft:
            raise ValueError(
                f"stft: expected 0 < win_length <= n_fft, but got win_length={win_length}"
            )

        if window is not None:
            if window.ndim != 1 or window.shape[0] != win_length:
                raise ValueError(
                    f"stft: expected a 1D window array of size equal to win_length={win_length}, "
                    f"but got window with size {window.shape}"
                )

            # Handle padding of the window if necessary
            if win_length < n_fft:
                left = (n_fft - win_length) // 2
                window_ = np.zeros(n_fft, dtype=window.dtype)
                window_[left : left + win_length] = window
            else:
                window_ = window

        # Calculate the number of frames
        n_frames = 1 + (length - n_fft) // hop_length

        # Time to columns: build overlapping frames as a zero-copy strided view.
        # NOTE(review): `window_` is only assigned inside the `window is not
        # None` branch above, so calling stft without a window would raise
        # NameError below — confirm all callers pass a window.
        input_array = np.lib.stride_tricks.as_strided(
            input_array,
            (batch, n_frames, n_fft),
            (
                input_array.strides[0],
                hop_length * input_array.strides[1],
                input_array.strides[1],
            ),
        )

        if window_ is not None:
            input_array = input_array * window_

        # FFT and transpose
        complex_fft = input_is_complex
        onesided = onesided if onesided is not None else not complex_fft

        if normalized:
            norm = "ortho"
        else:
            norm = None

        if complex_fft:
            if onesided:
                raise ValueError(
                    "Cannot have onesided output if window or input is complex"
                )
            output = np.fft.fft(input_array, n=n_fft, axis=-1, norm=norm)
        else:
            output = np.fft.rfft(input_array, n=n_fft, axis=-1, norm=norm)

        # Put frames last: (batch, freq_bins, n_frames), like torch.stft.
        output = output.transpose((0, 2, 1))

        if input_array_1d:
            output = output.squeeze(0)

        return output if return_complex else np.real(output)

    def __call__(self, waveform: np.ndarray, padding=160, chunk_length=None):
        """
        Compute the log-Mel spectrogram of the provided audio.
        """

        if chunk_length is not None:
            # NOTE(review): this mutates shared extractor state, affecting
            # subsequent calls on the same instance — confirm intended.
            self.n_samples = chunk_length * self.sampling_rate
            self.nb_max_frames = self.n_samples // self.hop_length

        # NOTE(review): `dtype is not np.float32` compares a dtype instance
        # against the scalar type object, so this is always True and the cast
        # always runs (harmless, but probably meant `!= np.float32`).
        if waveform.dtype is not np.float32:
            waveform = waveform.astype(np.float32)

        if padding:
            waveform = np.pad(waveform, (0, padding))

        # Periodic Hann window of length n_fft, as used by Whisper.
        window = np.hanning(self.n_fft + 1)[:-1].astype("float32")

        stft = self.stft(
            waveform,
            self.n_fft,
            self.hop_length,
            window=window,
            return_complex=True,
        ).astype("complex64")
        # Power spectrogram; drop the last frame to match Whisper's framing.
        magnitudes = np.abs(stft[..., :-1]) ** 2

        mel_spec = self.mel_filters @ magnitudes

        # Log compression, clamp dynamic range to 8 dB decades below the peak,
        # then rescale into roughly [-1, 1].
        log_spec = np.log10(np.clip(mel_spec, a_min=1e-10, a_max=None))
        log_spec = np.maximum(log_spec, log_spec.max() - 8.0)
        log_spec = (log_spec + 4.0) / 4.0

        return log_spec
||||
320
venv/lib/python3.12/site-packages/faster_whisper/tokenizer.py
Normal file
320
venv/lib/python3.12/site-packages/faster_whisper/tokenizer.py
Normal file
@@ -0,0 +1,320 @@
|
||||
import string
|
||||
|
||||
from functools import cached_property
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
import tokenizers
|
||||
|
||||
|
||||
class Tokenizer:
    """Simple wrapper around a tokenizers.Tokenizer.

    Exposes the ids of Whisper's special tokens and helpers to encode/decode
    text and split decoded tokens into words.
    """

    def __init__(
        self,
        tokenizer: tokenizers.Tokenizer,
        multilingual: bool,
        task: Optional[str] = None,
        language: Optional[str] = None,
    ):
        self.tokenizer = tokenizer

        if multilingual:
            if task not in _TASKS:
                raise ValueError(
                    "'%s' is not a valid task (accepted tasks: %s)"
                    % (task, ", ".join(_TASKS))
                )

            if language not in _LANGUAGE_CODES:
                raise ValueError(
                    "'%s' is not a valid language code (accepted language codes: %s)"
                    % (language, ", ".join(_LANGUAGE_CODES))
                )

            # Resolve the task/language special tokens to their vocabulary ids.
            self.task = self.tokenizer.token_to_id("<|%s|>" % task)
            self.language = self.tokenizer.token_to_id("<|%s|>" % language)
            self.language_code = language
        else:
            # English-only models have no task/language special tokens.
            self.task = None
            self.language = None
            self.language_code = "en"

    @cached_property
    def transcribe(self) -> int:
        return self.tokenizer.token_to_id("<|transcribe|>")

    @cached_property
    def translate(self) -> int:
        return self.tokenizer.token_to_id("<|translate|>")

    @cached_property
    def sot(self) -> int:
        return self.tokenizer.token_to_id("<|startoftranscript|>")

    @cached_property
    def sot_lm(self) -> int:
        return self.tokenizer.token_to_id("<|startoflm|>")

    @cached_property
    def sot_prev(self) -> int:
        return self.tokenizer.token_to_id("<|startofprev|>")

    @cached_property
    def eot(self) -> int:
        return self.tokenizer.token_to_id("<|endoftext|>")

    @cached_property
    def no_timestamps(self) -> int:
        return self.tokenizer.token_to_id("<|notimestamps|>")

    @cached_property
    def no_speech(self) -> int:
        # Older vocabularies use "<|nocaptions|>" instead of "<|nospeech|>".
        return self.tokenizer.token_to_id("<|nospeech|>") or self.tokenizer.token_to_id(
            "<|nocaptions|>"
        )

    @property
    def timestamp_begin(self) -> int:
        # Timestamp tokens are laid out right after "<|notimestamps|>".
        return self.no_timestamps + 1

    @property
    def sot_sequence(self) -> List[int]:
        # Prompt prefix: <|startoftranscript|> [<|language|>] [<|task|>].
        sequence = [self.sot]

        if self.language is not None:
            sequence.append(self.language)

        if self.task is not None:
            sequence.append(self.task)

        return sequence

    def encode(self, text: str) -> List[int]:
        return self.tokenizer.encode(text, add_special_tokens=False).ids

    def decode(self, tokens: List[int]) -> str:
        # Drop special tokens (ids >= eot) before decoding.
        text_tokens = [token for token in tokens if token < self.eot]
        return self.tokenizer.decode(text_tokens)

    def decode_with_timestamps(self, tokens: List[int]) -> str:
        """Decode tokens, rendering timestamp tokens as "<|t.tt|>" markers."""
        outputs = [[]]

        for token in tokens:
            if token >= self.timestamp_begin:
                # Each timestamp token encodes a multiple of 0.02 seconds.
                timestamp = f"<|{(token - self.timestamp_begin) * 0.02:.2f}|>"
                outputs.append(timestamp)
                outputs.append([])
            else:
                outputs[-1].append(token)

        return "".join(
            [s if isinstance(s, str) else self.tokenizer.decode(s) for s in outputs]
        )

    @cached_property
    def non_speech_tokens(self) -> Tuple[int]:
        """
        Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech
        annotations, to prevent sampling texts that are not actually spoken in the audio, e.g.

        - ♪♪♪
        - ( SPEAKING FOREIGN LANGUAGE )
        - [DAVID] Hey there,

        keeping basic punctuations like commas, periods, question marks, exclamation points, etc.
        """
        symbols = list('"#()*+/:;<=>@[\\]^_`{|}~「」『』')
        symbols += (
            "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split()
        )

        # symbols that may be a single token or multiple tokens depending on the tokenizer.
        # In case they're multiple tokens, suppress the first token, which is safe because:
        # These are between U+2640 and U+267F miscellaneous symbols that are okay to suppress
        # in generations, and in the 3-byte UTF-8 representation they share the first two bytes.
        miscellaneous = set("♩♪♫♬♭♮♯")
        assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous)

        # allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word
        result = {self.encode(" -")[0], self.encode(" '")[0]}
        for symbol in symbols + list(miscellaneous):
            for tokens in [
                self.encode(symbol),
                self.encode(" " + symbol),
            ]:
                if len(tokens) == 1 or symbol in miscellaneous:
                    result.add(tokens[0])

        return tuple(sorted(result))

    def split_to_word_tokens(
        self, tokens: List[int]
    ) -> Tuple[List[str], List[List[int]]]:
        """Split decoded tokens into words with their corresponding token ids."""
        if self.language_code in {"zh", "ja", "th", "lo", "my", "yue"}:
            # These languages don't typically use spaces, so it is difficult to split words
            # without morpheme analysis. Here, we instead split words at any
            # position where the tokens are decoded as valid unicode points
            return self.split_tokens_on_unicode(tokens)

        return self.split_tokens_on_spaces(tokens)

    def split_tokens_on_unicode(
        self, tokens: List[int]
    ) -> Tuple[List[str], List[List[int]]]:
        """Group tokens wherever they decode to complete unicode characters."""
        decoded_full = self.decode_with_timestamps(tokens)
        # U+FFFD marks an incomplete multi-byte sequence after a partial decode.
        replacement_char = "\ufffd"

        words = []
        word_tokens = []
        current_tokens = []
        unicode_offset = 0

        for token in tokens:
            current_tokens.append(token)
            decoded = self.decode_with_timestamps(current_tokens)

            try:
                replacement_char_index = decoded.index(replacement_char)
                replacement_char_index += unicode_offset
            except ValueError:
                replacement_char_index = None

            # Emit a word only when the partial decode is complete, or when
            # the replacement char is genuinely present at that position in
            # the full decoded text (i.e. not an artifact of partial decoding).
            if replacement_char_index is None or (
                replacement_char_index < len(decoded_full)
                and decoded_full[replacement_char_index] == replacement_char
            ):
                words.append(decoded)
                word_tokens.append(current_tokens)
                current_tokens = []
                unicode_offset += len(decoded)

        return words, word_tokens

    def split_tokens_on_spaces(
        self, tokens: List[int]
    ) -> Tuple[List[str], List[List[int]]]:
        """Merge unicode-split subwords into space-delimited words."""
        subwords, subword_tokens_list = self.split_tokens_on_unicode(tokens)
        words = []
        word_tokens = []

        for subword, subword_tokens in zip(subwords, subword_tokens_list):
            special = subword_tokens[0] >= self.eot
            with_space = subword.startswith(" ")
            punctuation = subword.strip() in string.punctuation
            if special or with_space or punctuation or len(words) == 0:
                words.append(subword)
                word_tokens.append(subword_tokens)
            else:
                # Continuation subword: glue it onto the previous word.
                words[-1] = words[-1] + subword
                word_tokens[-1].extend(subword_tokens)

        return words, word_tokens
||||
# Tasks accepted by multilingual Whisper models.
_TASKS = (
    "transcribe",
    "translate",
)

# Language codes Whisper was trained on, as used in the "<|xx|>" special tokens.
_LANGUAGE_CODES = (
    "af", "am", "ar", "as", "az", "ba", "be", "bg", "bn", "bo",
    "br", "bs", "ca", "cs", "cy", "da", "de", "el", "en", "es",
    "et", "eu", "fa", "fi", "fo", "fr", "gl", "gu", "ha", "haw",
    "he", "hi", "hr", "ht", "hu", "hy", "id", "is", "it", "ja",
    "jw", "ka", "kk", "km", "kn", "ko", "la", "lb", "ln", "lo",
    "lt", "lv", "mg", "mi", "mk", "ml", "mn", "mr", "ms", "mt",
    "my", "ne", "nl", "nn", "no", "oc", "pa", "pl", "ps", "pt",
    "ro", "ru", "sa", "sd", "si", "sk", "sl", "sn", "so", "sq",
    "sr", "su", "sv", "sw", "ta", "te", "tg", "th", "tk", "tl",
    "tr", "tt", "uk", "ur", "uz", "vi", "yi", "yo", "zh", "yue",
)
1941
venv/lib/python3.12/site-packages/faster_whisper/transcribe.py
Normal file
1941
venv/lib/python3.12/site-packages/faster_whisper/transcribe.py
Normal file
File diff suppressed because it is too large
Load Diff
152
venv/lib/python3.12/site-packages/faster_whisper/utils.py
Normal file
152
venv/lib/python3.12/site-packages/faster_whisper/utils.py
Normal file
@@ -0,0 +1,152 @@
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import huggingface_hub
|
||||
|
||||
from tqdm.auto import tqdm
|
||||
|
||||
# Mapping from user-facing model size names to CTranslate2-converted
# repositories on the Hugging Face Hub.
_MODELS = {
    "tiny.en": "Systran/faster-whisper-tiny.en",
    "tiny": "Systran/faster-whisper-tiny",
    "base.en": "Systran/faster-whisper-base.en",
    "base": "Systran/faster-whisper-base",
    "small.en": "Systran/faster-whisper-small.en",
    "small": "Systran/faster-whisper-small",
    "medium.en": "Systran/faster-whisper-medium.en",
    "medium": "Systran/faster-whisper-medium",
    "large-v1": "Systran/faster-whisper-large-v1",
    "large-v2": "Systran/faster-whisper-large-v2",
    "large-v3": "Systran/faster-whisper-large-v3",
    # "large" is an alias for the latest large checkpoint.
    "large": "Systran/faster-whisper-large-v3",
    "distil-large-v2": "Systran/faster-distil-whisper-large-v2",
    "distil-medium.en": "Systran/faster-distil-whisper-medium.en",
    "distil-small.en": "Systran/faster-distil-whisper-small.en",
    "distil-large-v3": "Systran/faster-distil-whisper-large-v3",
    "distil-large-v3.5": "distil-whisper/distil-large-v3.5-ct2",
    "large-v3-turbo": "mobiuslabsgmbh/faster-whisper-large-v3-turbo",
    # "turbo" is an alias for large-v3-turbo.
    "turbo": "mobiuslabsgmbh/faster-whisper-large-v3-turbo",
}
||||
def available_models() -> List[str]:
    """Return the names of the model sizes that can be downloaded."""
    return [model_name for model_name in _MODELS]
||||
def get_assets_path():
    """Return the absolute path of this package's "assets" directory."""
    package_dir = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(package_dir, "assets")
||||
def get_logger():
    """Return the logger shared by the whole package."""
    logger_name = "faster_whisper"
    return logging.getLogger(logger_name)
||||
def download_model(
    size_or_id: str,
    output_dir: Optional[str] = None,
    local_files_only: bool = False,
    cache_dir: Optional[str] = None,
    revision: Optional[str] = None,
    use_auth_token: Optional[Union[str, bool]] = None,
):
    """Downloads a CTranslate2 Whisper model from the Hugging Face Hub.

    Args:
      size_or_id: Size of the model to download from https://huggingface.co/Systran
        (tiny, tiny.en, base, base.en, small, small.en, distil-small.en, medium, medium.en,
        distil-medium.en, large-v1, large-v2, large-v3, large, distil-large-v2,
        distil-large-v3, distil-large-v3.5, large-v3-turbo, turbo), or a
        CTranslate2-converted model ID from the Hugging Face Hub
        (e.g. Systran/faster-whisper-large-v3).
      output_dir: Directory where the model should be saved. If not set, the model is saved in
        the cache directory.
      local_files_only: If True, avoid downloading the file and return the path to the local
        cached file if it exists.
      cache_dir: Path to the folder where cached files are stored.
      revision: An optional Git revision id which can be a branch name, a tag, or a
        commit hash.
      use_auth_token: HuggingFace authentication token or True to use the
        token stored by the HuggingFace config folder.

    Returns:
      The path to the downloaded model.

    Raises:
      ValueError: if the model size is invalid.
    """
    # A "/" means the caller passed a full Hub repo ID rather than a size name.
    if re.match(r".*/.*", size_or_id):
        repo_id = size_or_id
    else:
        repo_id = _MODELS.get(size_or_id)
        if repo_id is None:
            raise ValueError(
                "Invalid model size '%s', expected one of: %s"
                % (size_or_id, ", ".join(_MODELS.keys()))
            )

    # Only fetch the files CTranslate2 needs; skip any other repo content.
    allow_patterns = [
        "config.json",
        "preprocessor_config.json",
        "model.bin",
        "tokenizer.json",
        "vocabulary.*",
    ]

    kwargs = {
        "local_files_only": local_files_only,
        "allow_patterns": allow_patterns,
        # Silence huggingface_hub's progress bars.
        "tqdm_class": disabled_tqdm,
        "revision": revision,
    }

    if output_dir is not None:
        kwargs["local_dir"] = output_dir
        # NOTE(review): `local_dir_use_symlinks` is deprecated in recent
        # huggingface_hub releases — confirm the pinned version still accepts it.
        kwargs["local_dir_use_symlinks"] = False

    if cache_dir is not None:
        kwargs["cache_dir"] = cache_dir

    if use_auth_token is not None:
        kwargs["token"] = use_auth_token

    return huggingface_hub.snapshot_download(repo_id, **kwargs)
||||
def format_timestamp(
    seconds: float,
    always_include_hours: bool = False,
    decimal_marker: str = ".",
) -> str:
    """Format a duration in seconds as ``[HH:]MM:SS.mmm``.

    Args:
      seconds: Non-negative timestamp in seconds.
      always_include_hours: Always emit the leading ``HH:`` field, even when
        the timestamp is below one hour.
      decimal_marker: Separator between seconds and milliseconds (e.g. ","
        for SRT subtitles).

    Returns:
      The formatted timestamp string.

    Raises:
      ValueError: if `seconds` is negative.
    """
    # Validate with a real exception instead of `assert`, which is stripped
    # when Python runs with the -O flag.
    if seconds < 0:
        raise ValueError("non-negative timestamp expected")

    milliseconds = round(seconds * 1000.0)

    # divmod peels off each unit while keeping the remainder exact.
    hours, milliseconds = divmod(milliseconds, 3_600_000)
    minutes, milliseconds = divmod(milliseconds, 60_000)
    seconds, milliseconds = divmod(milliseconds, 1_000)

    hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
    return (
        f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
    )
||||
class disabled_tqdm(tqdm):
    # Progress-bar class handed to huggingface_hub to suppress download bars.
    def __init__(self, *args, **kwargs):
        # Force-disable the bar regardless of what the caller requested.
        kwargs["disable"] = True
        super().__init__(*args, **kwargs)
||||
def get_end(segments: List[dict]) -> Optional[float]:
    """Return the end time of the last word in *segments*.

    Falls back to the last segment's end when no segment has words, and to
    None when there are no segments at all.
    """
    for segment in reversed(segments):
        for word in reversed(segment["words"]):
            return word["end"]

    if segments:
        return segments[-1]["end"]
    return None
351
venv/lib/python3.12/site-packages/faster_whisper/vad.py
Normal file
351
venv/lib/python3.12/site-packages/faster_whisper/vad.py
Normal file
@@ -0,0 +1,351 @@
|
||||
import bisect
|
||||
import functools
|
||||
import os
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
|
||||
from faster_whisper.utils import get_assets_path
|
||||
|
||||
|
||||
# The code below is adapted from https://github.com/snakers4/silero-vad.
@dataclass
class VadOptions:
    """VAD options.

    Attributes:
      threshold: Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
        probabilities ABOVE this value are considered as SPEECH. It is better to tune this
        parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
      neg_threshold: Silence threshold for determining the end of speech. If a probability is lower
        than neg_threshold, it is always considered silence. Values higher than neg_threshold
        are only considered speech if the previous sample was classified as speech; otherwise,
        they are treated as silence. This parameter helps refine the detection of speech
        transitions, ensuring smoother segment boundaries.
      min_speech_duration_ms: Final speech chunks shorter min_speech_duration_ms are thrown out.
      max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer
        than max_speech_duration_s will be split at the timestamp of the last silence that
        lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
        split aggressively just before max_speech_duration_s.
      min_silence_duration_ms: In the end of each speech chunk wait for min_silence_duration_ms
        before separating it
      speech_pad_ms: Final speech chunks are padded by speech_pad_ms each side
    """

    threshold: float = 0.5
    # Optional[...] matches the None default; when unset it is derived from
    # `threshold` inside get_speech_timestamps.
    neg_threshold: Optional[float] = None
    min_speech_duration_ms: int = 0
    max_speech_duration_s: float = float("inf")
    min_silence_duration_ms: int = 2000
    speech_pad_ms: int = 400
||||
def get_speech_timestamps(
    audio: np.ndarray,
    vad_options: Optional[VadOptions] = None,
    sampling_rate: int = 16000,
    **kwargs,
) -> List[dict]:
    """This method is used for splitting long audios into speech chunks using silero VAD.

    Args:
      audio: One dimensional float array.
      vad_options: Options for VAD processing.
      sampling_rate: Sampling rate of the audio.
      kwargs: VAD options passed as keyword arguments for backward compatibility.

    Returns:
      List of dicts containing begin and end samples of each speech chunk.
    """
    if vad_options is None:
        vad_options = VadOptions(**kwargs)

    threshold = vad_options.threshold
    neg_threshold = vad_options.neg_threshold
    min_speech_duration_ms = vad_options.min_speech_duration_ms
    max_speech_duration_s = vad_options.max_speech_duration_s
    min_silence_duration_ms = vad_options.min_silence_duration_ms
    # Silero VAD scores the audio in fixed 512-sample windows.
    window_size_samples = 512
    speech_pad_ms = vad_options.speech_pad_ms
    # Convert all millisecond/second options into sample counts.
    min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
    speech_pad_samples = sampling_rate * speech_pad_ms / 1000
    max_speech_samples = (
        sampling_rate * max_speech_duration_s
        - window_size_samples
        - 2 * speech_pad_samples
    )
    min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
    min_silence_samples_at_max_speech = sampling_rate * 98 / 1000

    audio_length_samples = len(audio)

    model = get_vad_model()

    # Pad so the audio length is a whole number of VAD windows.
    padded_audio = np.pad(
        audio, (0, window_size_samples - audio.shape[0] % window_size_samples)
    )
    speech_probs = model(padded_audio)

    # State machine: `triggered` is True while inside a speech run.
    triggered = False
    speeches = []
    current_speech = {}
    if neg_threshold is None:
        neg_threshold = max(threshold - 0.15, 0.01)

    # to save potential segment end (and tolerate some silence)
    temp_end = 0
    # to save potential segment limits in case of maximum segment size reached
    prev_end = next_start = 0

    for i, speech_prob in enumerate(speech_probs):
        if (speech_prob >= threshold) and temp_end:
            # Speech resumed before the silence was long enough: cancel the
            # tentative end and remember a restart point.
            temp_end = 0
            if next_start < prev_end:
                next_start = window_size_samples * i

        if (speech_prob >= threshold) and not triggered:
            triggered = True
            current_speech["start"] = window_size_samples * i
            continue

        if (
            triggered
            and (window_size_samples * i) - current_speech["start"] > max_speech_samples
        ):
            # Current run exceeded max_speech_duration_s: force a split.
            if prev_end:
                # Split at the last sufficiently long silence.
                current_speech["end"] = prev_end
                speeches.append(current_speech)
                current_speech = {}
                # previously reached silence (< neg_thres) and is still not speech (< thres)
                if next_start < prev_end:
                    triggered = False
                else:
                    current_speech["start"] = next_start
                prev_end = next_start = temp_end = 0
            else:
                # No silence found: split aggressively at the current window.
                current_speech["end"] = window_size_samples * i
                speeches.append(current_speech)
                current_speech = {}
                prev_end = next_start = temp_end = 0
                triggered = False
                continue

        if (speech_prob < neg_threshold) and triggered:
            if not temp_end:
                temp_end = window_size_samples * i
            # condition to avoid cutting in very short silence
            if (window_size_samples * i) - temp_end > min_silence_samples_at_max_speech:
                prev_end = temp_end
            if (window_size_samples * i) - temp_end < min_silence_samples:
                continue
            else:
                # Silence lasted long enough: close the current speech run.
                current_speech["end"] = temp_end
                if (
                    current_speech["end"] - current_speech["start"]
                ) > min_speech_samples:
                    speeches.append(current_speech)
                current_speech = {}
                prev_end = next_start = temp_end = 0
                triggered = False
                continue

    # Close a run that was still open at end of audio.
    if (
        current_speech
        and (audio_length_samples - current_speech["start"]) > min_speech_samples
    ):
        current_speech["end"] = audio_length_samples
        speeches.append(current_speech)

    # Pad each detected chunk and share the gap when neighbours are too close
    # for full padding on both sides.
    for i, speech in enumerate(speeches):
        if i == 0:
            speech["start"] = int(max(0, speech["start"] - speech_pad_samples))
        if i != len(speeches) - 1:
            silence_duration = speeches[i + 1]["start"] - speech["end"]
            if silence_duration < 2 * speech_pad_samples:
                # Not enough room: split the silence in half between chunks.
                speech["end"] += int(silence_duration // 2)
                speeches[i + 1]["start"] = int(
                    max(0, speeches[i + 1]["start"] - silence_duration // 2)
                )
            else:
                speech["end"] = int(
                    min(audio_length_samples, speech["end"] + speech_pad_samples)
                )
                speeches[i + 1]["start"] = int(
                    max(0, speeches[i + 1]["start"] - speech_pad_samples)
                )
        else:
            speech["end"] = int(
                min(audio_length_samples, speech["end"] + speech_pad_samples)
            )

    return speeches
||||
def collect_chunks(
    audio: np.ndarray,
    chunks: List[dict],
    sampling_rate: int = 16000,
    max_duration: float = float("inf"),
) -> Tuple[List[np.ndarray], List[Dict[str, float]]]:
    """Merge speech chunks into audio segments of at most ``max_duration`` seconds.

    Args:
      audio: Full waveform as a 1D array of samples.
      chunks: Speech timestamps, each a dict with "start" and "end" sample indices.
      sampling_rate: Samples per second; used to convert sample counts to seconds.
      max_duration: Maximum length (in seconds) of each merged audio segment.

    Returns:
      A tuple ``(audio_chunks, chunks_metadata)`` where ``audio_chunks`` is a list
      of merged waveforms and ``chunks_metadata`` holds, for each merged waveform,
      its "offset" (seconds of merged speech preceding it), its "duration"
      (seconds), and the original "segments" it contains.
    """
    if not chunks:
        # No speech detected: return one empty chunk so callers always get
        # at least one (audio, metadata) pair.
        chunk_metadata = {
            "offset": 0,
            "duration": 0,
            "segments": [],
        }
        return [np.array([], dtype=np.float32)], [chunk_metadata]

    audio_chunks = []
    chunks_metadata = []

    current_segments = []
    current_duration = 0
    total_duration = 0
    current_audio = np.array([], dtype=np.float32)

    for chunk in chunks:
        if (
            current_duration + chunk["end"] - chunk["start"]
            > max_duration * sampling_rate
        ):
            # Adding this chunk would exceed max_duration: flush the current
            # merged segment and start a new one seeded with this chunk.
            audio_chunks.append(current_audio)
            chunk_metadata = {
                "offset": total_duration / sampling_rate,
                "duration": current_duration / sampling_rate,
                "segments": current_segments,
            }
            total_duration += current_duration
            chunks_metadata.append(chunk_metadata)

            # Bug fix: the chunk that opens the new merged segment must be
            # recorded in that segment's metadata; previously it was dropped
            # from "segments" even though its samples were included in the
            # merged audio.
            current_segments = [chunk]
            current_audio = audio[chunk["start"] : chunk["end"]]
            current_duration = chunk["end"] - chunk["start"]
        else:
            current_segments.append(chunk)
            current_audio = np.concatenate(
                (current_audio, audio[chunk["start"] : chunk["end"]])
            )
            current_duration += chunk["end"] - chunk["start"]

    # Flush the final (possibly partial) merged segment.
    audio_chunks.append(current_audio)
    chunk_metadata = {
        "offset": total_duration / sampling_rate,
        "duration": current_duration / sampling_rate,
        "segments": current_segments,
    }
    chunks_metadata.append(chunk_metadata)
    return audio_chunks, chunks_metadata
|
||||
|
||||
|
||||
class SpeechTimestampsMap:
    """Maps timestamps measured on VAD-trimmed audio back to the original audio."""

    def __init__(self, chunks: List[dict], sampling_rate: int, time_precision: int = 2):
        self.sampling_rate = sampling_rate
        self.time_precision = time_precision
        self.chunk_end_sample = []
        self.total_silence_before = []

        cumulative_silence = 0
        last_chunk_end = 0
        for chunk in chunks:
            # Silence removed so far = sum of gaps between consecutive chunks.
            cumulative_silence += chunk["start"] - last_chunk_end
            last_chunk_end = chunk["end"]

            # Where this chunk ends on the trimmed (silence-removed) timeline,
            # and how much silence precedes it on the original timeline.
            self.chunk_end_sample.append(chunk["end"] - cumulative_silence)
            self.total_silence_before.append(cumulative_silence / sampling_rate)

    def get_original_time(
        self,
        time: float,
        chunk_index: Optional[int] = None,
        is_end: bool = False,
    ) -> float:
        """Convert a time on the trimmed audio to a time on the original audio."""
        index = (
            self.get_chunk_index(time, is_end) if chunk_index is None else chunk_index
        )
        return round(self.total_silence_before[index] + time, self.time_precision)

    def get_chunk_index(self, time: float, is_end: bool = False) -> int:
        """Return the index of the chunk containing the given trimmed-audio time."""
        sample = int(time * self.sampling_rate)

        # A segment end landing exactly on a chunk boundary belongs to that
        # chunk rather than to the following one.
        if is_end and sample in self.chunk_end_sample:
            return self.chunk_end_sample.index(sample)

        return min(
            bisect.bisect(self.chunk_end_sample, sample),
            len(self.chunk_end_sample) - 1,
        )
|
||||
|
||||
|
||||
@functools.lru_cache
def get_vad_model():
    """Load the bundled Silero VAD ONNX model (cached after the first call)."""
    model_path = os.path.join(get_assets_path(), "silero_vad_v6.onnx")
    return SileroVADModel(model_path)
|
||||
|
||||
|
||||
class SileroVADModel:
    """Thin ONNX Runtime wrapper around the Silero VAD network."""

    def __init__(self, path):
        """Create a CPU-only inference session for the ONNX model at ``path``.

        Raises:
          RuntimeError: If the optional onnxruntime dependency is not installed.
        """
        try:
            import onnxruntime
        except ImportError as e:
            raise RuntimeError(
                "Applying the VAD filter requires the onnxruntime package"
            ) from e

        opts = onnxruntime.SessionOptions()
        # Single-threaded execution on CPU; no memory arena.
        opts.inter_op_num_threads = 1
        opts.intra_op_num_threads = 1
        opts.enable_cpu_mem_arena = False
        opts.log_severity_level = 4  # suppress all but the most severe ORT logs

        self.session = onnxruntime.InferenceSession(
            path,
            providers=["CPUExecutionProvider"],
            sess_options=opts,
        )

    def __call__(
        self, audio: np.ndarray, num_samples: int = 512, context_size_samples: int = 64
    ):
        """Run the VAD model over ``audio``, one window of ``num_samples`` at a time.

        Args:
          audio: 1D waveform whose length is a multiple of ``num_samples``.
          num_samples: Window size (in samples) scored by the model per step.
          context_size_samples: Samples from the previous window prepended to
            each window as left context.

        Returns:
          The model outputs for all windows, concatenated along axis 0.
        """
        assert audio.ndim == 1, "Input should be a 1D array"
        assert (
            audio.shape[0] % num_samples == 0
        ), "Input size should be a multiple of num_samples"

        # Recurrent state threaded through the session runs; zero-initialized.
        h = np.zeros((1, 1, 128), dtype="float32")
        c = np.zeros((1, 1, 128), dtype="float32")
        context = np.zeros(
            (1, context_size_samples),
            dtype="float32",
        )

        # One row per window.
        batched_audio = audio.reshape(-1, num_samples)
        # Build per-row left context: after the roll below, row i carries the
        # tail of row i-1, and the first row gets zeros (no preceding audio).
        context = batched_audio[..., -context_size_samples:]
        # NOTE(review): `context` is a view into `batched_audio` (and, since
        # reshaping a contiguous 1D array returns a view, into the caller's
        # `audio`), so this assignment zeroes the last window's tail in place
        # before it is fed to the model — confirm this side effect is intended.
        context[-1] = 0
        context = np.roll(context, 1, 0)  # shift rows down by one (roll copies)
        batched_audio = np.concatenate([context, batched_audio], 1)

        batched_audio = batched_audio.reshape(-1, num_samples + context_size_samples)

        # Feed windows in large sub-batches, carrying h/c state between runs.
        encoder_batch_size = 10000
        num_segments = batched_audio.shape[0]
        outputs = []
        for i in range(0, num_segments, encoder_batch_size):
            output, h, c = self.session.run(
                None,
                {"input": batched_audio[i : i + encoder_batch_size], "h": h, "c": c},
            )
            outputs.append(output)

        out = np.concatenate(outputs, axis=0)

        return out
|
||||
@@ -0,0 +1,3 @@
|
||||
"""Version information."""
|
||||
|
||||
__version__ = "1.2.1"
|
||||
Reference in New Issue
Block a user