
# Fix voice extractor (#483): use 48000 Hz everywhere, balance processing
# against VRAM, warm up the read_static_voice() cache, simplify replace_audio
# to prevent an FFmpeg 7 infinite loop, fix a potential exception in
# conditional download, add more DeOldify models, rename eye-glasses to
# glasses. Prepared for release 2.5.1.
# Co-authored-by: Harisreedhar <46858047+harisreedhar@users.noreply.github.com>
from typing import Optional, Any, List
|
|
from functools import lru_cache
|
|
import numpy
|
|
import scipy
|
|
|
|
from facefusion.filesystem import is_audio
|
|
from facefusion.ffmpeg import read_audio_buffer
|
|
from facefusion.typing import Fps, Audio, AudioFrame, Spectrogram, MelFilterBank
|
|
from facefusion.voice_extractor import batch_extract_voice
|
|
|
|
|
|
@lru_cache(maxsize = 128)
def read_static_audio(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
	# Memoized wrapper around read_audio(): repeated lookups for the same
	# (audio_path, fps) pair reuse the already extracted frames.
	return read_audio(audio_path, fps)
|
|
|
|
|
|
def read_audio(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
	# Decode the file to raw 48 kHz stereo PCM and convert it into a list of
	# mel-spectrogram frames spaced for the given fps; None for non-audio paths.
	sample_rate = 48000
	channel_total = 2

	if not is_audio(audio_path):
		return None
	pcm_buffer = read_audio_buffer(audio_path, sample_rate, channel_total)
	samples = numpy.frombuffer(pcm_buffer, dtype = numpy.int16).reshape(-1, 2)
	spectrogram = create_spectrogram(prepare_audio(samples))
	return extract_audio_frames(spectrogram, fps)
|
|
|
|
|
|
@lru_cache(maxsize = 128)
def read_static_voice(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
	# Memoized wrapper around read_voice(): repeated lookups for the same
	# (audio_path, fps) pair reuse the already extracted voice frames.
	return read_voice(audio_path, fps)
|
|
|
|
|
|
def read_voice(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
	# Like read_audio(), but isolates the voice track before building the
	# spectrogram; chunk_size / step_size balance throughput against VRAM.
	sample_rate = 48000
	channel_total = 2
	chunk_size = 1024 * 240
	step_size = 1024 * 180

	if not is_audio(audio_path):
		return None
	pcm_buffer = read_audio_buffer(audio_path, sample_rate, channel_total)
	samples = numpy.frombuffer(pcm_buffer, dtype = numpy.int16).reshape(-1, 2)
	voice = batch_extract_voice(samples, chunk_size, step_size)
	spectrogram = create_spectrogram(prepare_voice(voice))
	return extract_audio_frames(spectrogram, fps)
|
|
|
|
|
|
def get_audio_frame(audio_path : str, fps : Fps, frame_number : int = 0) -> Optional[AudioFrame]:
	# Fetch a single cached audio frame; None when the path is not audio
	# or frame_number falls outside the decoded range.
	if is_audio(audio_path):
		audio_frames = read_static_audio(audio_path, fps)
		if 0 <= frame_number < len(audio_frames):
			return audio_frames[frame_number]
	return None
|
|
|
|
|
|
def get_voice_frame(audio_path : str, fps : Fps, frame_number : int = 0) -> Optional[AudioFrame]:
	# Fetch a single cached voice frame; None when the path is not audio
	# or frame_number falls outside the decoded range.
	if is_audio(audio_path):
		voice_frames = read_static_voice(audio_path, fps)
		if 0 <= frame_number < len(voice_frames):
			return voice_frames[frame_number]
	return None
|
|
|
|
|
|
def create_empty_audio_frame() -> AudioFrame:
	# A silent frame with the same (mel_filter_total, step_size) layout as
	# the frames produced by extract_audio_frames().
	mel_filter_total = 80
	step_size = 16
	return numpy.zeros((mel_filter_total, step_size), dtype = numpy.int16)
|
|
|
|
|
|
def prepare_audio(audio : numpy.ndarray[Any, Any]) -> Audio:
	# Downmix multi-channel samples to mono, peak-normalize to [-1, 1] and
	# apply a 0.97 pre-emphasis filter.
	if audio.ndim > 1:
		audio = numpy.mean(audio, axis = 1)
	# guard against silent input: dividing by a zero peak previously produced
	# NaN samples and a numpy runtime warning
	peak = numpy.max(numpy.abs(audio), axis = 0)
	if peak > 0:
		audio = audio / peak
	audio = scipy.signal.lfilter([ 1.0, -0.97 ], [ 1.0 ], audio)
	return audio
|
|
|
|
|
|
def prepare_voice(audio : numpy.ndarray[Any, Any]) -> Audio:
	# Resample 48 kHz PCM down to the 16 kHz expected by the spectrogram
	# pipeline, then run the standard audio preparation on the result.
	source_rate = 48000
	target_rate = 16000

	target_length = int(len(audio) * target_rate / source_rate)
	audio = scipy.signal.resample(audio, target_length)
	return prepare_audio(audio)
|
|
|
|
|
|
def convert_hertz_to_mel(hertz : float) -> float:
	# HTK-style mel scale: mel = 2595 * log10(1 + f / 700).
	scaled_frequency = 1 + hertz / 700
	return 2595 * numpy.log10(scaled_frequency)
|
|
|
|
|
|
def convert_mel_to_hertz(mel : numpy.ndarray[Any, Any]) -> numpy.ndarray[Any, Any]:
	# Inverse of convert_hertz_to_mel(): f = 700 * (10 ** (mel / 2595) - 1).
	ratio = 10 ** (mel / 2595)
	return 700 * (ratio - 1)
|
|
|
|
|
|
def create_mel_filter_bank() -> MelFilterBank:
	# Triangular mel filter bank: 80 filters over a 401-bin half spectrum,
	# spanning 55 Hz to 7.6 kHz at a 16 kHz sample rate.
	mel_filter_total = 80
	mel_bin_total = 800
	sample_rate = 16000
	min_frequency = 55.0
	max_frequency = 7600.0
	mel_filter_bank = numpy.zeros((mel_filter_total, mel_bin_total // 2 + 1))
	mel_frequency_range = numpy.linspace(convert_hertz_to_mel(min_frequency), convert_hertz_to_mel(max_frequency), mel_filter_total + 2)
	indices = numpy.floor((mel_bin_total + 1) * convert_mel_to_hertz(mel_frequency_range) / sample_rate).astype(numpy.int16)

	# consecutive index pairs delimit each triangular filter
	for filter_index, (start, end) in enumerate(zip(indices[:-2], indices[1:-1])):
		mel_filter_bank[filter_index, start:end] = scipy.signal.windows.triang(end - start)
	return mel_filter_bank
|
|
|
|
|
|
def create_spectrogram(audio : Audio) -> Spectrogram:
	# STFT with an 800-sample window and 600-sample overlap (hop of 200),
	# projected onto the mel filter bank to yield a mel spectrogram.
	mel_bin_total = 800
	mel_bin_overlap = 600

	_, _, stft_values = scipy.signal.stft(audio, nperseg = mel_bin_total, nfft = mel_bin_total, noverlap = mel_bin_overlap)
	magnitude = numpy.abs(stft_values)
	return numpy.dot(create_mel_filter_bank(), magnitude)
|
|
|
|
|
|
def extract_audio_frames(spectrogram : Spectrogram, fps : Fps) -> List[AudioFrame]:
	# Slice the spectrogram into fixed-width frames whose end columns are
	# spaced by mel_filter_total / fps, one frame per video frame position.
	mel_filter_total = 80
	step_size = 16
	audio_frames = []
	# int32 instead of int16: an int16 index overflows once the spectrogram
	# exceeds 32767 columns (roughly seven minutes of audio), yielding
	# negative indices and corrupt frames
	indices = numpy.arange(0, spectrogram.shape[1], mel_filter_total / fps).astype(numpy.int32)
	indices = indices[indices >= step_size]

	for index in indices:
		start = max(0, index - step_size)
		audio_frames.append(spectrogram[:, start:index])
	return audio_frames
|