* Fix voice extractor (#483)

* changes

* changes

* Minor cleanup

* Use 48000 everywhere

* Fix test

* Balance between processing and VRAM

* Warmup the read_static_voice() cache

* Warmup the read_static_voice() cache

* Simplify replace_audio to prevent FFmpeg 7 infinite loop

* Fix potential exception in conditional download

* Add more deoldify models

* Rename eye-glasses to glasses, Prepare release 2.5.1

---------

Co-authored-by: Harisreedhar <46858047+harisreedhar@users.noreply.github.com>
Henry Ruhs 2024-04-13 11:27:55 +02:00 committed by GitHub
parent f77c463531
commit 092dfbb796
13 changed files with 44 additions and 21 deletions

View File

@@ -70,7 +70,7 @@ face mask:
  --face-mask-types FACE_MASK_TYPES [FACE_MASK_TYPES ...] mix and match different face mask types (choices: box, occlusion, region)
  --face-mask-blur [0.0-1.0] specify the degree of blur applied to the box mask
  --face-mask-padding FACE_MASK_PADDING [FACE_MASK_PADDING ...] apply top, right, bottom and left padding to the box mask
- --face-mask-regions FACE_MASK_REGIONS [FACE_MASK_REGIONS ...] choose the facial features used for the region mask (choices: skin, left-eyebrow, right-eyebrow, left-eye, right-eye, eye-glasses, nose, mouth, upper-lip, lower-lip)
+ --face-mask-regions FACE_MASK_REGIONS [FACE_MASK_REGIONS ...] choose the facial features used for the region mask (choices: skin, left-eyebrow, right-eyebrow, left-eye, right-eye, glasses, nose, mouth, upper-lip, lower-lip)
 frame extraction:
  --trim-frame-start TRIM_FRAME_START specify the start frame of the target video
@@ -94,7 +94,7 @@ frame processors:
  --face-enhancer-model {codeformer,gfpgan_1.2,gfpgan_1.3,gfpgan_1.4,gpen_bfr_256,gpen_bfr_512,gpen_bfr_1024,gpen_bfr_2048,restoreformer_plus_plus} choose the model responsible for enhancing the face
  --face-enhancer-blend [0-100] blend the enhanced into the previous face
  --face-swapper-model {blendswap_256,inswapper_128,inswapper_128_fp16,simswap_256,simswap_512_unofficial,uniface_256} choose the model responsible for swapping the face
- --frame-colorizer-model {ddcolor,ddcolor_artistic,deoldify_artistic} choose the model responsible for colorizing the frame
+ --frame-colorizer-model {ddcolor,ddcolor_artistic,deoldify,deoldify_artistic,deoldify_stable} choose the model responsible for colorizing the frame
  --frame-colorizer-blend [0-100] blend the colorized into the previous frame
  --frame-enhancer-model {lsdir_x4,nomos8k_sc_x4,real_esrgan_x2,real_esrgan_x2_fp16,real_esrgan_x4,real_esrgan_x4_fp16,real_hatgan_x4,span_kendata_x4} choose the model responsible for enhancing the frame
  --frame-enhancer-blend [0-100] blend the enhanced into the previous frame

View File

@@ -15,7 +15,7 @@ def read_static_audio(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]
 def read_audio(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
-    sample_rate = 16000
+    sample_rate = 48000
     channel_total = 2
     if is_audio(audio_path):
@@ -34,16 +34,16 @@ def read_static_voice(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]
 def read_voice(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
-    sample_rate = 16000
+    sample_rate = 48000
     channel_total = 2
-    chunk_size = 1024 ** 3
-    step_size = chunk_size // 4
+    chunk_size = 1024 * 240
+    step_size = 1024 * 180
     if is_audio(audio_path):
         audio_buffer = read_audio_buffer(audio_path, sample_rate, channel_total)
         audio = numpy.frombuffer(audio_buffer, dtype = numpy.int16).reshape(-1, 2)
         audio = batch_extract_voice(audio, chunk_size, step_size)
-        audio = prepare_audio(audio)
+        audio = prepare_voice(audio)
         spectrogram = create_spectrogram(audio)
         audio_frames = extract_audio_frames(spectrogram, fps)
         return audio_frames
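
Note on the new sizes: 1024 ** 3 samples was effectively "the whole file in one pass", while 1024 * 240 samples is roughly five seconds of 48 kHz audio, with a 25% overlap between windows (step 1024 * 180) — the "Balance between processing and VRAM" commit above. batch_extract_voice() itself is not part of this diff; the following is only a minimal sketch of the overlap-and-blend pattern those values imply, with separate_voice() as an invented stand-in for the voice-separation model:

```python
import numpy

def separate_voice(chunk : numpy.ndarray) -> numpy.ndarray:
    # Invented placeholder for the per-chunk voice-separation inference.
    return chunk.astype(numpy.float32)

def batch_extract_voice_sketch(audio : numpy.ndarray, chunk_size : int, step_size : int) -> numpy.ndarray:
    temp_audio = numpy.zeros_like(audio, dtype = numpy.float32)
    temp_weight = numpy.zeros((audio.shape[0], 1), dtype = numpy.float32)

    # Smaller chunks cap peak VRAM; the chunk_size - step_size overlap lets
    # neighbouring windows be averaged instead of leaving audible seams.
    for start in range(0, audio.shape[0], step_size):
        end = min(start + chunk_size, audio.shape[0])
        temp_audio[start:end] += separate_voice(audio[start:end])
        temp_weight[start:end] += 1
    return temp_audio / numpy.maximum(temp_weight, 1)
```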
@@ -81,6 +81,15 @@ def prepare_audio(audio : numpy.ndarray[Any, Any]) -> Audio:
     return audio
+def prepare_voice(audio : numpy.ndarray[Any, Any]) -> Audio:
+    sample_rate = 48000
+    resample_rate = 16000
+    audio = scipy.signal.resample(audio, int(len(audio) * resample_rate / sample_rate))
+    audio = prepare_audio(audio)
+    return audio
 def convert_hertz_to_mel(hertz : float) -> float:
     return 2595 * numpy.log10(1 + hertz / 700)
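
The new prepare_voice() brings the 48 kHz voice track back down to the 16 kHz the spectrogram pipeline expects. A self-contained illustration of that resampling step (the stereo test tone is invented for demonstration; scipy.signal.resample is FFT-based and resamples along the first axis by default):

```python
import numpy
import scipy.signal

sample_rate = 48000
resample_rate = 16000

# one second of 48 kHz stereo audio: a 440 Hz test tone
time = numpy.linspace(0, 1, sample_rate, endpoint = False)
audio = numpy.stack([ numpy.sin(2 * numpy.pi * 440 * time) ] * 2, axis = -1)

# same call as prepare_voice(): 48000 samples become 16000
audio = scipy.signal.resample(audio, int(len(audio) * resample_rate / sample_rate))
assert audio.shape == (16000, 2)
```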

View File

@@ -17,7 +17,7 @@ face_detector_set : Dict[FaceDetectorModel, List[str]] =\
 }
 face_selector_modes : List[FaceSelectorMode] = [ 'many', 'one', 'reference' ]
 face_mask_types : List[FaceMaskType] = [ 'box', 'occlusion', 'region' ]
-face_mask_regions : List[FaceMaskRegion] = [ 'skin', 'left-eyebrow', 'right-eyebrow', 'left-eye', 'right-eye', 'eye-glasses', 'nose', 'mouth', 'upper-lip', 'lower-lip' ]
+face_mask_regions : List[FaceMaskRegion] = [ 'skin', 'left-eyebrow', 'right-eyebrow', 'left-eye', 'right-eye', 'glasses', 'nose', 'mouth', 'upper-lip', 'lower-lip' ]
 temp_frame_formats : List[TempFrameFormat] = [ 'bmp', 'jpg', 'png' ]
 output_video_encoders : List[OutputVideoEncoder] = [ 'libx264', 'libx265', 'libvpx-vp9', 'h264_nvenc', 'hevc_nvenc', 'h264_amf', 'hevc_amf' ]
 output_video_presets : List[OutputVideoPreset] = [ 'ultrafast', 'superfast', 'veryfast', 'faster', 'fast', 'medium', 'slow', 'slower', 'veryslow' ]

View File

@@ -4,7 +4,6 @@ import platform
 import ssl
 import urllib.request
 from typing import List
-from concurrent.futures import ThreadPoolExecutor
 from functools import lru_cache
 from tqdm import tqdm
@@ -17,9 +16,6 @@ if platform.system().lower() == 'darwin':
 def conditional_download(download_directory_path : str, urls : List[str]) -> None:
-    with ThreadPoolExecutor() as executor:
-        for url in urls:
-            executor.submit(get_download_size, url)
     for url in urls:
         download_file_path = os.path.join(download_directory_path, os.path.basename(url))
         initial_size = os.path.getsize(download_file_path) if is_file(download_file_path) else 0
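
The removed block eagerly submitted get_download_size() for every URL to a ThreadPoolExecutor, which per the commit message could surface an exception; the sequential loop that remains sizes each file on disk before downloading. The actual transfer code is outside this hunk; a generic sketch of the resume pattern that initial_size enables (hypothetical helper, assumes the server honors Range requests):

```python
import os
import urllib.request

def download_with_resume(url : str, download_file_path : str) -> None:
    # Append to whatever portion is already on disk instead of restarting.
    initial_size = os.path.getsize(download_file_path) if os.path.isfile(download_file_path) else 0
    request = urllib.request.Request(url, headers = { 'Range': 'bytes=' + str(initial_size) + '-' })

    with urllib.request.urlopen(request, timeout = 10) as response, open(download_file_path, 'ab') as download_file:
        while True:
            chunk = response.read(1024 * 64)
            if not chunk:
                break
            download_file.write(chunk)
```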

View File

@@ -37,7 +37,7 @@ FACE_MASK_REGIONS : Dict[FaceMaskRegion, int] =\
     'right-eyebrow': 3,
     'left-eye': 4,
     'right-eye': 5,
-    'eye-glasses': 6,
+    'glasses': 6,
     'nose': 10,
     'mouth': 11,
     'upper-lip': 12,

View File

@@ -120,7 +120,7 @@ def restore_audio(target_path : str, output_path : str, output_video_fps : Fps)
 def replace_audio(target_path : str, audio_path : str, output_path : str) -> bool:
     temp_output_path = get_temp_output_video_path(target_path)
-    commands = [ '-hwaccel', 'auto', '-i', temp_output_path, '-i', audio_path, '-c:v', 'copy', '-af', 'apad', '-map', '0:v:0', '-map', '1:a:0', '-shortest', '-y', output_path ]
+    commands = [ '-hwaccel', 'auto', '-i', temp_output_path, '-i', audio_path, '-af', 'apad', '-shortest', '-y', output_path ]
     return run_ffmpeg(commands)
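
The simplified command drops the explicit '-c:v copy' and '-map' arguments; per the commit message, it was that combination, together with apad and -shortest, that could drive FFmpeg 7 into an infinite loop. run_ffmpeg() itself is not shown in this hunk; a hypothetical minimal wrapper and the new invocation (file names are placeholders):

```python
import subprocess
from typing import List

def run_ffmpeg(args : List[str]) -> bool:
    # Hypothetical wrapper; the real run_ffmpeg() lives elsewhere in ffmpeg.py.
    commands = [ 'ffmpeg', '-hide_banner', '-loglevel', 'error' ]
    commands.extend(args)
    return subprocess.run(commands).returncode == 0

# apad pads the audio stream, -shortest then stops at the end of the video
run_ffmpeg([ '-hwaccel', 'auto', '-i', 'temp.mp4', '-i', 'voice.wav', '-af', 'apad', '-shortest', '-y', 'output.mp4' ])
```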

View File

@@ -2,7 +2,7 @@ METADATA =\
 {
     'name': 'FaceFusion',
     'description': 'Next generation face swapper and enhancer',
-    'version': '2.5.0',
+    'version': '2.5.1',
     'license': 'MIT',
     'author': 'Henry Ruhs',
     'url': 'https://facefusion.io'

View File

@@ -6,7 +6,7 @@ from facefusion.processors.frame.typings import FaceDebuggerItem, FaceEnhancerMo
 face_debugger_items : List[FaceDebuggerItem] = [ 'bounding-box', 'face-landmark-5', 'face-landmark-5/68', 'face-landmark-68', 'face-landmark-68/5', 'face-mask', 'face-detector-score', 'face-landmarker-score', 'age', 'gender' ]
 face_enhancer_models : List[FaceEnhancerModel] = [ 'codeformer', 'gfpgan_1.2', 'gfpgan_1.3', 'gfpgan_1.4', 'gpen_bfr_256', 'gpen_bfr_512', 'gpen_bfr_1024', 'gpen_bfr_2048', 'restoreformer_plus_plus' ]
 face_swapper_models : List[FaceSwapperModel] = [ 'blendswap_256', 'inswapper_128', 'inswapper_128_fp16', 'simswap_256', 'simswap_512_unofficial', 'uniface_256' ]
-frame_colorizer_models : List[FrameColorizerModel] = [ 'ddcolor', 'ddcolor_artistic', 'deoldify_artistic' ]
+frame_colorizer_models : List[FrameColorizerModel] = [ 'ddcolor', 'ddcolor_artistic', 'deoldify', 'deoldify_artistic', 'deoldify_stable' ]
 frame_enhancer_models : List[FrameEnhancerModel] = [ 'lsdir_x4', 'nomos8k_sc_x4', 'real_esrgan_x2', 'real_esrgan_x2_fp16', 'real_esrgan_x4', 'real_esrgan_x4_fp16', 'real_hatgan_x4', 'span_kendata_x4' ]
 lip_syncer_models : List[LipSyncerModel] = [ 'wav2lip_gan' ]

View File

@@ -42,12 +42,26 @@ MODELS : ModelSet =\
         'path': resolve_relative_path('../.assets/models/ddcolor_artistic.onnx'),
         'size': (512, 512)
     },
+    'deoldify':
+    {
+        'type': 'deoldify',
+        'url': 'https://github.com/facefusion/facefusion-assets/releases/download/models/deoldify.onnx',
+        'path': resolve_relative_path('../.assets/models/deoldify.onnx'),
+        'size': (256, 256)
+    },
     'deoldify_artistic':
     {
         'type': 'deoldify',
         'url': 'https://github.com/facefusion/facefusion-assets/releases/download/models/deoldify_artistic.onnx',
         'path': resolve_relative_path('../.assets/models/deoldify_artistic.onnx'),
-        'size': (512, 512)
+        'size': (256, 256)
     },
+    'deoldify_stable':
+    {
+        'type': 'deoldify',
+        'url': 'https://github.com/facefusion/facefusion-assets/releases/download/models/deoldify_stable.onnx',
+        'path': resolve_relative_path('../.assets/models/deoldify_stable.onnx'),
+        'size': (256, 256)
+    }
 }
 OPTIONS : Optional[OptionsWithModel] = None
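
All three deoldify variants now declare a 256 × 256 input, and the pre-existing deoldify_artistic entry is corrected from (512, 512) in the same hunk. A sketch of how such an entry is typically consumed, reusing conditional_download() and resolve_relative_path() from elsewhere in this commit (the import paths and the pre_check() framing are assumptions, and MODELS refers to the dict above):

```python
from facefusion.download import conditional_download
from facefusion.filesystem import resolve_relative_path

def pre_check() -> bool:
    # Hypothetical: fetch the configured colorizer model once, resumably.
    model = MODELS['deoldify_stable']
    download_directory_path = resolve_relative_path('../.assets/models')
    conditional_download(download_directory_path, [ model['url'] ])
    return True
```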

View File

@@ -253,4 +253,8 @@ def process_image(source_paths : List[str], target_path : str, output_path : str
 def process_video(source_paths : List[str], temp_frame_paths : List[str]) -> None:
     source_audio_paths = filter_audio_paths(facefusion.globals.source_paths)
     temp_video_fps = restrict_video_fps(facefusion.globals.target_path, facefusion.globals.output_video_fps)
+    for source_audio_path in source_audio_paths:
+        read_static_voice(source_audio_path, temp_video_fps)
     frame_processors.multi_process_frames(source_paths, temp_frame_paths, process_frames)
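
read_static_voice() is presumably the lru_cache-backed twin of read_voice(), mirroring read_static_audio() in the audio module; warming it before multi_process_frames() means the expensive voice extraction runs once up front instead of stalling the first frame workers. A toy sketch of the pattern (all names and the print are illustrative only):

```python
from functools import lru_cache

@lru_cache(maxsize = None)
def read_static_voice(audio_path : str, fps : float) -> list:
    # Stand-in for the real work: voice separation plus spectrogram frames.
    print('extracting voice from', audio_path)
    return [ audio_path, fps ]

# Warm-up: pay the cost once in the coordinating thread ...
for source_audio_path in [ 'voice.wav' ]:
    read_static_voice(source_audio_path, 25.0)

# ... so later calls from frame workers are instant cache hits.
read_static_voice('voice.wav', 25.0)
```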

View File

@@ -5,7 +5,7 @@ from facefusion.typing import Face, FaceSet, AudioFrame, VisionFrame
 FaceDebuggerItem = Literal['bounding-box', 'face-landmark-5', 'face-landmark-5/68', 'face-landmark-68', 'face-landmark-68/5', 'face-mask', 'face-detector-score', 'face-landmarker-score', 'age', 'gender']
 FaceEnhancerModel = Literal['codeformer', 'gfpgan_1.2', 'gfpgan_1.3', 'gfpgan_1.4', 'gpen_bfr_256', 'gpen_bfr_512', 'gpen_bfr_1024', 'gpen_bfr_2048', 'restoreformer_plus_plus']
 FaceSwapperModel = Literal['blendswap_256', 'inswapper_128', 'inswapper_128_fp16', 'simswap_256', 'simswap_512_unofficial', 'uniface_256']
-FrameColorizerModel = Literal['ddcolor', 'ddcolor_artistic', 'deoldify_artistic']
+FrameColorizerModel = Literal['ddcolor', 'ddcolor_artistic', 'deoldify', 'deoldify_artistic', 'deoldify_stable']
 FrameEnhancerModel = Literal['lsdir_x4', 'nomos8k_sc_x4', 'real_esrgan_x2', 'real_esrgan_x2_fp16', 'real_esrgan_x4', 'real_esrgan_x4_fp16', 'real_hatgan_x4', 'span_kendata_x4']
 LipSyncerModel = Literal['wav2lip_gan']

View File

@@ -75,7 +75,7 @@ FaceDetectorModel = Literal['many', 'retinaface', 'scrfd', 'yoloface', 'yunet']
 FaceDetectorTweak = Literal['low-luminance', 'high-luminance']
 FaceRecognizerModel = Literal['arcface_blendswap', 'arcface_inswapper', 'arcface_simswap', 'arcface_uniface']
 FaceMaskType = Literal['box', 'occlusion', 'region']
-FaceMaskRegion = Literal['skin', 'left-eyebrow', 'right-eyebrow', 'left-eye', 'right-eye', 'eye-glasses', 'nose', 'mouth', 'upper-lip', 'lower-lip']
+FaceMaskRegion = Literal['skin', 'left-eyebrow', 'right-eyebrow', 'left-eye', 'right-eye', 'glasses', 'nose', 'mouth', 'upper-lip', 'lower-lip']
 TempFrameFormat = Literal['jpg', 'png', 'bmp']
 OutputVideoEncoder = Literal['libx264', 'libx265', 'libvpx-vp9', 'h264_nvenc', 'hevc_nvenc', 'h264_amf', 'hevc_amf']
 OutputVideoPreset = Literal['ultrafast', 'superfast', 'veryfast', 'faster', 'fast', 'medium', 'slow', 'slower', 'veryslow']

View File

@@ -21,6 +21,6 @@ def test_get_audio_frame() -> None:
 def test_read_static_audio() -> None:
-    assert len(read_static_audio('.assets/examples/source.mp3', 25)) == 91
-    assert len(read_static_audio('.assets/examples/source.wav', 25)) == 91
+    assert len(read_static_audio('.assets/examples/source.mp3', 25)) == 280
+    assert len(read_static_audio('.assets/examples/source.wav', 25)) == 280
     assert read_static_audio('invalid', 25) is None
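
The tripled expectations line up with the sample-rate change: read_audio() now decodes at 48 kHz instead of 16 kHz, and with the spectrogram hop fixed in samples this yields roughly three times as many audio frames (91 × 3 = 273; the remaining difference to 280 presumably comes from padding and hop rounding in the pipeline).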