From 092dfbb796c94651c05ff0382f1358f93217b84b Mon Sep 17 00:00:00 2001
From: Henry Ruhs
Date: Sat, 13 Apr 2024 11:27:55 +0200
Subject: [PATCH] Next (#487)

* Fix voice extractor (#483)

* changes

* changes

* Minor cleanup

* Use 48000 everywhere

* Fix test

* Balance between processing and VRAM

* Warmup the read_static_voice() cache

* Warmup the read_static_voice() cache

* Simplify replace_audio to prevent FFmpeg 7 infinite loop

* Fix potential exception in conditional download

* Add more deoldify models

* Rename eye-glasses to glasses, Prepare release 2.5.1

---------

Co-authored-by: Harisreedhar <46858047+harisreedhar@users.noreply.github.com>
---
 README.md                                  |  4 ++--
 facefusion/audio.py                        | 19 ++++++++++++++-----
 facefusion/choices.py                      |  2 +-
 facefusion/download.py                     |  4 ----
 facefusion/face_masker.py                  |  2 +-
 facefusion/ffmpeg.py                       |  2 +-
 facefusion/metadata.py                     |  2 +-
 facefusion/processors/frame/choices.py     |  2 +-
 .../frame/modules/frame_colorizer.py       | 16 +++++++++++++++-
 .../processors/frame/modules/lip_syncer.py |  4 ++++
 facefusion/processors/frame/typings.py     |  2 +-
 facefusion/typing.py                       |  2 +-
 tests/test_audio.py                        |  4 ++--
 13 files changed, 44 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index 8687565e..86e8aa29 100644
--- a/README.md
+++ b/README.md
@@ -70,7 +70,7 @@ face mask:
 --face-mask-types FACE_MASK_TYPES [FACE_MASK_TYPES ...] mix and match different face mask types (choices: box, occlusion, region)
 --face-mask-blur [0.0-1.0] specify the degree of blur applied the box mask
 --face-mask-padding FACE_MASK_PADDING [FACE_MASK_PADDING ...] apply top, right, bottom and left padding to the box mask
---face-mask-regions FACE_MASK_REGIONS [FACE_MASK_REGIONS ...] choose the facial features used for the region mask (choices: skin, left-eyebrow, right-eyebrow, left-eye, right-eye, eye-glasses, nose, mouth, upper-lip, lower-lip)
+--face-mask-regions FACE_MASK_REGIONS [FACE_MASK_REGIONS ...] choose the facial features used for the region mask (choices: skin, left-eyebrow, right-eyebrow, left-eye, right-eye, glasses, nose, mouth, upper-lip, lower-lip)
 
 frame extraction:
 --trim-frame-start TRIM_FRAME_START specify the the start frame of the target video
@@ -94,7 +94,7 @@ frame processors:
 --face-enhancer-model {codeformer,gfpgan_1.2,gfpgan_1.3,gfpgan_1.4,gpen_bfr_256,gpen_bfr_512,gpen_bfr_1024,gpen_bfr_2048,restoreformer_plus_plus} choose the model responsible for enhancing the face
 --face-enhancer-blend [0-100] blend the enhanced into the previous face
 --face-swapper-model {blendswap_256,inswapper_128,inswapper_128_fp16,simswap_256,simswap_512_unofficial,uniface_256} choose the model responsible for swapping the face
---frame-colorizer-model {ddcolor,ddcolor_artistic,deoldify_artistic} choose the model responsible for colorizing the frame
+--frame-colorizer-model {ddcolor,ddcolor_artistic,deoldify,deoldify_artistic,deoldify_stable} choose the model responsible for colorizing the frame
 --frame-colorizer-blend [0-100] blend the colorized into the previous frame
 --frame-enhancer-model {lsdir_x4,nomos8k_sc_x4,real_esrgan_x2,real_esrgan_x2_fp16,real_esrgan_x4,real_esrgan_x4_fp16,real_hatgan_x4,span_kendata_x4} choose the model responsible for enhancing the frame
 --frame-enhancer-blend [0-100] blend the enhanced into the previous frame
diff --git a/facefusion/audio.py b/facefusion/audio.py
index fc1d782e..de800502 100644
--- a/facefusion/audio.py
+++ b/facefusion/audio.py
@@ -15,7 +15,7 @@ def read_static_audio(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]
 
 
 def read_audio(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
-    sample_rate = 16000
+    sample_rate = 48000
     channel_total = 2
 
     if is_audio(audio_path):
@@ -34,16 +34,16 @@ def read_static_voice(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]
 
 
 def read_voice(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
-    sample_rate = 16000
+    sample_rate = 48000
     channel_total = 2
-    chunk_size = 1024 ** 3
-    step_size = chunk_size // 4
+    chunk_size = 1024 * 240
+    step_size = 1024 * 180
 
     if is_audio(audio_path):
         audio_buffer = read_audio_buffer(audio_path, sample_rate, channel_total)
         audio = numpy.frombuffer(audio_buffer, dtype = numpy.int16).reshape(-1, 2)
         audio = batch_extract_voice(audio, chunk_size, step_size)
-        audio = prepare_audio(audio)
+        audio = prepare_voice(audio)
         spectrogram = create_spectrogram(audio)
         audio_frames = extract_audio_frames(spectrogram, fps)
         return audio_frames
@@ -81,6 +81,15 @@ def prepare_audio(audio : numpy.ndarray[Any, Any]) -> Audio:
     return audio
 
 
+def prepare_voice(audio : numpy.ndarray[Any, Any]) -> Audio:
+    sample_rate = 48000
+    resample_rate = 16000
+
+    audio = scipy.signal.resample(audio, int(len(audio) * resample_rate / sample_rate))
+    audio = prepare_audio(audio)
+    return audio
+
+
 def convert_hertz_to_mel(hertz : float) -> float:
     return 2595 * numpy.log10(1 + hertz / 700)
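
Note on the audio changes above: read_audio() and read_voice() now decode at 48000 Hz so the voice extractor sees full-band audio, and the new prepare_voice() resamples the result back to 16000 Hz before the spectrogram is built. A minimal standalone sketch of that resampling step, assuming only numpy and scipy; the silent stereo buffer stands in for the buffer read_voice() decodes:

    import numpy
    import scipy.signal

    sample_rate = 48000
    resample_rate = 16000

    # one second of silent stereo audio, shaped like the array read_voice() builds
    audio = numpy.zeros((sample_rate, 2), dtype = numpy.int16)

    # scipy.signal.resample() takes a target sample count, so scale the length by the rate ratio
    audio = scipy.signal.resample(audio, int(len(audio) * resample_rate / sample_rate))
    print(audio.shape) # (16000, 2)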
diff --git a/facefusion/choices.py b/facefusion/choices.py
index 49d47add..e5587b85 100755
--- a/facefusion/choices.py
+++ b/facefusion/choices.py
@@ -17,7 +17,7 @@ face_detector_set : Dict[FaceDetectorModel, List[str]] =\
 }
 face_selector_modes : List[FaceSelectorMode] = [ 'many', 'one', 'reference' ]
 face_mask_types : List[FaceMaskType] = [ 'box', 'occlusion', 'region' ]
-face_mask_regions : List[FaceMaskRegion] = [ 'skin', 'left-eyebrow', 'right-eyebrow', 'left-eye', 'right-eye', 'eye-glasses', 'nose', 'mouth', 'upper-lip', 'lower-lip' ]
+face_mask_regions : List[FaceMaskRegion] = [ 'skin', 'left-eyebrow', 'right-eyebrow', 'left-eye', 'right-eye', 'glasses', 'nose', 'mouth', 'upper-lip', 'lower-lip' ]
 temp_frame_formats : List[TempFrameFormat] = [ 'bmp', 'jpg', 'png' ]
 output_video_encoders : List[OutputVideoEncoder] = [ 'libx264', 'libx265', 'libvpx-vp9', 'h264_nvenc', 'hevc_nvenc', 'h264_amf', 'hevc_amf' ]
 output_video_presets : List[OutputVideoPreset] = [ 'ultrafast', 'superfast', 'veryfast', 'faster', 'fast', 'medium', 'slow', 'slower', 'veryslow' ]
diff --git a/facefusion/download.py b/facefusion/download.py
index d5f12cfe..d24b4340 100644
--- a/facefusion/download.py
+++ b/facefusion/download.py
@@ -4,7 +4,6 @@ import platform
 import ssl
 import urllib.request
 from typing import List
-from concurrent.futures import ThreadPoolExecutor
 from functools import lru_cache
 
 from tqdm import tqdm
@@ -17,9 +16,6 @@ if platform.system().lower() == 'darwin':
 
 
 def conditional_download(download_directory_path : str, urls : List[str]) -> None:
-    with ThreadPoolExecutor() as executor:
-        for url in urls:
-            executor.submit(get_download_size, url)
     for url in urls:
         download_file_path = os.path.join(download_directory_path, os.path.basename(url))
         initial_size = os.path.getsize(download_file_path) if is_file(download_file_path) else 0
diff --git a/facefusion/face_masker.py b/facefusion/face_masker.py
index 647e44f5..583ce708 100755
--- a/facefusion/face_masker.py
+++ b/facefusion/face_masker.py
@@ -37,7 +37,7 @@ FACE_MASK_REGIONS : Dict[FaceMaskRegion, int] =\
     'right-eyebrow': 3,
     'left-eye': 4,
     'right-eye': 5,
-    'eye-glasses': 6,
+    'glasses': 6,
     'nose': 10,
     'mouth': 11,
     'upper-lip': 12,
diff --git a/facefusion/ffmpeg.py b/facefusion/ffmpeg.py
index 6413e45c..869a34c3 100644
--- a/facefusion/ffmpeg.py
+++ b/facefusion/ffmpeg.py
@@ -120,7 +120,7 @@ def restore_audio(target_path : str, output_path : str, output_video_fps : Fps)
 
 
 def replace_audio(target_path : str, audio_path : str, output_path : str) -> bool:
     temp_output_path = get_temp_output_video_path(target_path)
-    commands = [ '-hwaccel', 'auto', '-i', temp_output_path, '-i', audio_path, '-c:v', 'copy', '-af', 'apad', '-map', '0:v:0', '-map', '1:a:0', '-shortest', '-y', output_path ]
+    commands = [ '-hwaccel', 'auto', '-i', temp_output_path, '-i', audio_path, '-af', 'apad', '-shortest', '-y', output_path ]
     return run_ffmpeg(commands)
 
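Note on the replace_audio() change above: with the '-c:v', 'copy' and '-map' pairs dropped, the commit message credits the shorter command with avoiding an infinite loop under FFmpeg 7. Spelled out as a sketch with illustrative file names, and assuming run_ffmpeg() merely prepends the ffmpeg binary and its global flags, the invocation boils down to:

    import subprocess

    # mirrors the simplified commands list built by replace_audio()
    commands = [ 'ffmpeg', '-hwaccel', 'auto', '-i', 'temp.mp4', '-i', 'voice.wav', '-af', 'apad', '-shortest', '-y', 'output.mp4' ]
    subprocess.run(commands)

The trade-off is that the video stream gets re-encoded rather than stream-copied, since no codec is pinned anymore.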
diff --git a/facefusion/metadata.py b/facefusion/metadata.py
index 5b8e3aa7..e84f8eae 100644
--- a/facefusion/metadata.py
+++ b/facefusion/metadata.py
@@ -2,7 +2,7 @@ METADATA =\
 {
     'name': 'FaceFusion',
     'description': 'Next generation face swapper and enhancer',
-    'version': '2.5.0',
+    'version': '2.5.1',
     'license': 'MIT',
     'author': 'Henry Ruhs',
     'url': 'https://facefusion.io'
diff --git a/facefusion/processors/frame/choices.py b/facefusion/processors/frame/choices.py
index d6337ba2..176db878 100755
--- a/facefusion/processors/frame/choices.py
+++ b/facefusion/processors/frame/choices.py
@@ -6,7 +6,7 @@ from facefusion.processors.frame.typings import FaceDebuggerItem, FaceEnhancerMo
 
 face_debugger_items : List[FaceDebuggerItem] = [ 'bounding-box', 'face-landmark-5', 'face-landmark-5/68', 'face-landmark-68', 'face-landmark-68/5', 'face-mask', 'face-detector-score', 'face-landmarker-score', 'age', 'gender' ]
 face_enhancer_models : List[FaceEnhancerModel] = [ 'codeformer', 'gfpgan_1.2', 'gfpgan_1.3', 'gfpgan_1.4', 'gpen_bfr_256', 'gpen_bfr_512', 'gpen_bfr_1024', 'gpen_bfr_2048', 'restoreformer_plus_plus' ]
 face_swapper_models : List[FaceSwapperModel] = [ 'blendswap_256', 'inswapper_128', 'inswapper_128_fp16', 'simswap_256', 'simswap_512_unofficial', 'uniface_256' ]
-frame_colorizer_models : List[FrameColorizerModel] = [ 'ddcolor', 'ddcolor_artistic', 'deoldify_artistic' ]
+frame_colorizer_models : List[FrameColorizerModel] = [ 'ddcolor', 'ddcolor_artistic', 'deoldify', 'deoldify_artistic', 'deoldify_stable' ]
 frame_enhancer_models : List[FrameEnhancerModel] = [ 'lsdir_x4', 'nomos8k_sc_x4', 'real_esrgan_x2', 'real_esrgan_x2_fp16', 'real_esrgan_x4', 'real_esrgan_x4_fp16', 'real_hatgan_x4', 'span_kendata_x4' ]
 lip_syncer_models : List[LipSyncerModel] = [ 'wav2lip_gan' ]
diff --git a/facefusion/processors/frame/modules/frame_colorizer.py b/facefusion/processors/frame/modules/frame_colorizer.py
index 4b36c4a4..11a43a38 100644
--- a/facefusion/processors/frame/modules/frame_colorizer.py
+++ b/facefusion/processors/frame/modules/frame_colorizer.py
@@ -42,12 +42,26 @@ MODELS : ModelSet =\
         'path': resolve_relative_path('../.assets/models/ddcolor_artistic.onnx'),
         'size': (512, 512)
     },
+    'deoldify':
+    {
+        'type': 'deoldify',
+        'url': 'https://github.com/facefusion/facefusion-assets/releases/download/models/deoldify.onnx',
+        'path': resolve_relative_path('../.assets/models/deoldify.onnx'),
+        'size': (256, 256)
+    },
     'deoldify_artistic':
     {
         'type': 'deoldify',
         'url': 'https://github.com/facefusion/facefusion-assets/releases/download/models/deoldify_artistic.onnx',
         'path': resolve_relative_path('../.assets/models/deoldify_artistic.onnx'),
-        'size': (512, 512)
+        'size': (256, 256)
+    },
+    'deoldify_stable':
+    {
+        'type': 'deoldify',
+        'url': 'https://github.com/facefusion/facefusion-assets/releases/download/models/deoldify_stable.onnx',
+        'path': resolve_relative_path('../.assets/models/deoldify_stable.onnx'),
+        'size': (256, 256)
     }
 }
 OPTIONS : Optional[OptionsWithModel] = None
diff --git a/facefusion/processors/frame/modules/lip_syncer.py b/facefusion/processors/frame/modules/lip_syncer.py
index 901b3e4b..b665e6b7 100755
--- a/facefusion/processors/frame/modules/lip_syncer.py
+++ b/facefusion/processors/frame/modules/lip_syncer.py
@@ -253,4 +253,8 @@ def process_image(source_paths : List[str], target_path : str, output_path : str
 
 
 def process_video(source_paths : List[str], temp_frame_paths : List[str]) -> None:
+    source_audio_paths = filter_audio_paths(facefusion.globals.source_paths)
+    temp_video_fps = restrict_video_fps(facefusion.globals.target_path, facefusion.globals.output_video_fps)
+    for source_audio_path in source_audio_paths:
+        read_static_voice(source_audio_path, temp_video_fps)
     frame_processors.multi_process_frames(source_paths, temp_frame_paths, process_frames)
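
Note on the process_video() change above: calling read_static_voice() for each source audio path before multi_process_frames() starts warms its cache once, up front, instead of letting the worker threads race to fill it; this is the "Warmup the read_static_voice() cache" commit. The pattern in isolation, as a sketch built on functools.lru_cache with a hypothetical stand-in loader (expensive_read() and warmup() are not part of the patch):

    from functools import lru_cache
    from typing import List

    @lru_cache(maxsize = None)
    def expensive_read(path : str) -> bytes:
        # stands in for read_static_voice(); the body runs once per distinct path
        with open(path, 'rb') as file:
            return file.read()

    def warmup(paths : List[str]) -> None:
        # prime the cache so later concurrent callers get hits instead of recomputing
        for path in paths:
            expensive_read(path)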
diff --git a/facefusion/processors/frame/typings.py b/facefusion/processors/frame/typings.py
index 62391700..05729c5d 100644
--- a/facefusion/processors/frame/typings.py
+++ b/facefusion/processors/frame/typings.py
@@ -5,7 +5,7 @@ from facefusion.typing import Face, FaceSet, AudioFrame, VisionFrame
 
 FaceDebuggerItem = Literal['bounding-box', 'face-landmark-5', 'face-landmark-5/68', 'face-landmark-68', 'face-landmark-68/5', 'face-mask', 'face-detector-score', 'face-landmarker-score', 'age', 'gender']
 FaceEnhancerModel = Literal['codeformer', 'gfpgan_1.2', 'gfpgan_1.3', 'gfpgan_1.4', 'gpen_bfr_256', 'gpen_bfr_512', 'gpen_bfr_1024', 'gpen_bfr_2048', 'restoreformer_plus_plus']
 FaceSwapperModel = Literal['blendswap_256', 'inswapper_128', 'inswapper_128_fp16', 'simswap_256', 'simswap_512_unofficial', 'uniface_256']
-FrameColorizerModel = Literal['ddcolor', 'ddcolor_artistic', 'deoldify_artistic']
+FrameColorizerModel = Literal['ddcolor', 'ddcolor_artistic', 'deoldify', 'deoldify_artistic', 'deoldify_stable']
 FrameEnhancerModel = Literal['lsdir_x4', 'nomos8k_sc_x4', 'real_esrgan_x2', 'real_esrgan_x2_fp16', 'real_esrgan_x4', 'real_esrgan_x4_fp16', 'real_hatgan_x4', 'span_kendata_x4']
 LipSyncerModel = Literal['wav2lip_gan']
diff --git a/facefusion/typing.py b/facefusion/typing.py
index 7972c057..b22eb2e0 100755
--- a/facefusion/typing.py
+++ b/facefusion/typing.py
@@ -75,7 +75,7 @@ FaceDetectorModel = Literal['many', 'retinaface', 'scrfd', 'yoloface', 'yunet']
 FaceDetectorTweak = Literal['low-luminance', 'high-luminance']
 FaceRecognizerModel = Literal['arcface_blendswap', 'arcface_inswapper', 'arcface_simswap', 'arcface_uniface']
 FaceMaskType = Literal['box', 'occlusion', 'region']
-FaceMaskRegion = Literal['skin', 'left-eyebrow', 'right-eyebrow', 'left-eye', 'right-eye', 'eye-glasses', 'nose', 'mouth', 'upper-lip', 'lower-lip']
+FaceMaskRegion = Literal['skin', 'left-eyebrow', 'right-eyebrow', 'left-eye', 'right-eye', 'glasses', 'nose', 'mouth', 'upper-lip', 'lower-lip']
 TempFrameFormat = Literal['jpg', 'png', 'bmp']
 OutputVideoEncoder = Literal['libx264', 'libx265', 'libvpx-vp9', 'h264_nvenc', 'hevc_nvenc', 'h264_amf', 'hevc_amf']
 OutputVideoPreset = Literal['ultrafast', 'superfast', 'veryfast', 'faster', 'fast', 'medium', 'slow', 'slower', 'veryslow']
diff --git a/tests/test_audio.py b/tests/test_audio.py
index 08f8d471..765acfb8 100644
--- a/tests/test_audio.py
+++ b/tests/test_audio.py
@@ -21,6 +21,6 @@ def test_get_audio_frame() -> None:
 
 
 def test_read_static_audio() -> None:
-    assert len(read_static_audio('.assets/examples/source.mp3', 25)) == 91
-    assert len(read_static_audio('.assets/examples/source.wav', 25)) == 91
+    assert len(read_static_audio('.assets/examples/source.mp3', 25)) == 280
+    assert len(read_static_audio('.assets/examples/source.wav', 25)) == 280
     assert read_static_audio('invalid', 25) is None
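
A closing note on the "Balance between processing and VRAM" commit: chunk_size drops from 1024 ** 3 samples, which is far longer than any real track and therefore effectively unchunked, to 1024 * 240 samples with a step of 1024 * 180, so consecutive chunks overlap by 25 percent; at 48000 Hz that is roughly 5.12 second windows advancing in 3.84 second steps. batch_extract_voice() itself is outside this diff, but the windowing those two parameters imply looks like the following sketch; how the overlapping results are merged back together is omitted here and would be an assumption:

    import numpy

    def iterate_chunks(audio : numpy.ndarray, chunk_size : int, step_size : int):
        # yield overlapping windows along the first axis; with chunk_size = 1024 * 240
        # and step_size = 1024 * 180 each window shares 25 percent with its successor
        for start in range(0, len(audio), step_size):
            yield audio[start:start + chunk_size]

Smaller chunks bound the voice extractor's peak VRAM, while the overlap preserves context across chunk borders. The updated expectations in tests/test_audio.py (280 audio frames instead of 91) track the move from 16000 Hz to 48000 Hz in read_audio().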