From 092dfbb796c94651c05ff0382f1358f93217b84b Mon Sep 17 00:00:00 2001
From: Henry Ruhs
Date: Sat, 13 Apr 2024 11:27:55 +0200
Subject: [PATCH] Next (#487)

* Fix voice extractor (#483)

* changes

* changes

* Minor cleanup

* Use 48000 everywhere

* Fix test

* Balance between processing and VRAM

* Warmup the read_static_voice() cache

* Warmup the read_static_voice() cache

* Simplify replace_audio to prevent FFmpeg 7 infinite loop

* Fix potential exception in conditional download

* Add more deoldify models

* Rename eye-glasses to glasses, Prepare release 2.5.1

---------

Co-authored-by: Harisreedhar <46858047+harisreedhar@users.noreply.github.com>
---
 README.md                                  |  4 ++--
 facefusion/audio.py                        | 19 ++++++++++++++-----
 facefusion/choices.py                      |  2 +-
 facefusion/download.py                     |  4 ----
 facefusion/face_masker.py                  |  2 +-
 facefusion/ffmpeg.py                       |  2 +-
 facefusion/metadata.py                     |  2 +-
 facefusion/processors/frame/choices.py     |  2 +-
 .../frame/modules/frame_colorizer.py       | 16 +++++++++++++++-
 .../processors/frame/modules/lip_syncer.py |  4 ++++
 facefusion/processors/frame/typings.py     |  2 +-
 facefusion/typing.py                       |  2 +-
 tests/test_audio.py                        |  4 ++--
 13 files changed, 44 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index 8687565e..86e8aa29 100644
--- a/README.md
+++ b/README.md
@@ -70,7 +70,7 @@ face mask:
 --face-mask-types FACE_MASK_TYPES [FACE_MASK_TYPES ...] mix and match different face mask types (choices: box, occlusion, region)
 --face-mask-blur [0.0-1.0] specify the degree of blur applied the box mask
 --face-mask-padding FACE_MASK_PADDING [FACE_MASK_PADDING ...] apply top, right, bottom and left padding to the box mask
---face-mask-regions FACE_MASK_REGIONS [FACE_MASK_REGIONS ...] choose the facial features used for the region mask (choices: skin, left-eyebrow, right-eyebrow, left-eye, right-eye, eye-glasses, nose, mouth, upper-lip, lower-lip)
+--face-mask-regions FACE_MASK_REGIONS [FACE_MASK_REGIONS ...] choose the facial features used for the region mask (choices: skin, left-eyebrow, right-eyebrow, left-eye, right-eye, glasses, nose, mouth, upper-lip, lower-lip)
 
 frame extraction:
 --trim-frame-start TRIM_FRAME_START specify the the start frame of the target video
@@ -94,7 +94,7 @@ frame processors:
 --face-enhancer-model {codeformer,gfpgan_1.2,gfpgan_1.3,gfpgan_1.4,gpen_bfr_256,gpen_bfr_512,gpen_bfr_1024,gpen_bfr_2048,restoreformer_plus_plus} choose the model responsible for enhancing the face
 --face-enhancer-blend [0-100] blend the enhanced into the previous face
 --face-swapper-model {blendswap_256,inswapper_128,inswapper_128_fp16,simswap_256,simswap_512_unofficial,uniface_256} choose the model responsible for swapping the face
---frame-colorizer-model {ddcolor,ddcolor_artistic,deoldify_artistic} choose the model responsible for colorizing the frame
+--frame-colorizer-model {ddcolor,ddcolor_artistic,deoldify,deoldify_artistic,deoldify_stable} choose the model responsible for colorizing the frame
 --frame-colorizer-blend [0-100] blend the colorized into the previous frame
 --frame-enhancer-model {lsdir_x4,nomos8k_sc_x4,real_esrgan_x2,real_esrgan_x2_fp16,real_esrgan_x4,real_esrgan_x4_fp16,real_hatgan_x4,span_kendata_x4} choose the model responsible for enhancing the frame
 --frame-enhancer-blend [0-100] blend the enhanced into the previous frame
diff --git a/facefusion/audio.py b/facefusion/audio.py
index fc1d782e..de800502 100644
--- a/facefusion/audio.py
+++ b/facefusion/audio.py
@@ -15,7 +15,7 @@ def read_static_audio(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]
 
 
 def read_audio(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
-    sample_rate = 16000
+    sample_rate = 48000
     channel_total = 2
 
     if is_audio(audio_path):
@@ -34,16 +34,16 @@ def read_static_voice(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]
 
 
 def read_voice(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
-    sample_rate = 16000
+    sample_rate = 48000
     channel_total = 2
-    chunk_size = 1024 ** 3
-    step_size = chunk_size // 4
+    chunk_size = 1024 * 240
+    step_size = 1024 * 180
 
     if is_audio(audio_path):
         audio_buffer = read_audio_buffer(audio_path, sample_rate, channel_total)
         audio = numpy.frombuffer(audio_buffer, dtype = numpy.int16).reshape(-1, 2)
         audio = batch_extract_voice(audio, chunk_size, step_size)
-        audio = prepare_audio(audio)
+        audio = prepare_voice(audio)
         spectrogram = create_spectrogram(audio)
         audio_frames = extract_audio_frames(spectrogram, fps)
         return audio_frames
@@ -81,6 +81,15 @@ def prepare_audio(audio : numpy.ndarray[Any, Any]) -> Audio:
     return audio
 
 
+def prepare_voice(audio : numpy.ndarray[Any, Any]) -> Audio:
+    sample_rate = 48000
+    resample_rate = 16000
+
+    audio = scipy.signal.resample(audio, int(len(audio) * resample_rate / sample_rate))
+    audio = prepare_audio(audio)
+    return audio
+
+
 def convert_hertz_to_mel(hertz : float) -> float:
     return 2595 * numpy.log10(1 + hertz / 700)
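
Note on the audio changes above: read_audio() and read_voice() now decode at 48000 Hz so the voice extractor sees full-band audio, and the new prepare_voice() resamples the result back to 16000 Hz before the spectrogram is built. A minimal standalone sketch of that resampling step, assuming only numpy and scipy; the silent stereo buffer stands in for the buffer read_voice() decodes:

    import numpy
    import scipy.signal

    sample_rate = 48000
    resample_rate = 16000

    # one second of silent stereo audio, shaped like the array read_voice() builds
    audio = numpy.zeros((sample_rate, 2), dtype = numpy.int16)

    # scipy.signal.resample() takes a target sample count, so scale the length by the rate ratio
    audio = scipy.signal.resample(audio, int(len(audio) * resample_rate / sample_rate))
    print(audio.shape) # (16000, 2)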
diff --git a/facefusion/choices.py b/facefusion/choices.py
index 49d47add..e5587b85 100755
--- a/facefusion/choices.py
+++ b/facefusion/choices.py
@@ -17,7 +17,7 @@ face_detector_set : Dict[FaceDetectorModel, List[str]] =\
 }
 face_selector_modes : List[FaceSelectorMode] = [ 'many', 'one', 'reference' ]
 face_mask_types : List[FaceMaskType] = [ 'box', 'occlusion', 'region' ]
-face_mask_regions : List[FaceMaskRegion] = [ 'skin', 'left-eyebrow', 'right-eyebrow', 'left-eye', 'right-eye', 'eye-glasses', 'nose', 'mouth', 'upper-lip', 'lower-lip' ]
+face_mask_regions : List[FaceMaskRegion] = [ 'skin', 'left-eyebrow', 'right-eyebrow', 'left-eye', 'right-eye', 'glasses', 'nose', 'mouth', 'upper-lip', 'lower-lip' ]
 temp_frame_formats : List[TempFrameFormat] = [ 'bmp', 'jpg', 'png' ]
 output_video_encoders : List[OutputVideoEncoder] = [ 'libx264', 'libx265', 'libvpx-vp9', 'h264_nvenc', 'hevc_nvenc', 'h264_amf', 'hevc_amf' ]
 output_video_presets : List[OutputVideoPreset] = [ 'ultrafast', 'superfast', 'veryfast', 'faster', 'fast', 'medium', 'slow', 'slower', 'veryslow' ]
diff --git a/facefusion/download.py b/facefusion/download.py
index d5f12cfe..d24b4340 100644
--- a/facefusion/download.py
+++ b/facefusion/download.py
@@ -4,7 +4,6 @@ import platform
 import ssl
 import urllib.request
 from typing import List
-from concurrent.futures import ThreadPoolExecutor
 from functools import lru_cache
 
 from tqdm import tqdm
@@ -17,9 +16,6 @@ if platform.system().lower() == 'darwin':
 
 
 def conditional_download(download_directory_path : str, urls : List[str]) -> None:
-    with ThreadPoolExecutor() as executor:
-        for url in urls:
-            executor.submit(get_download_size, url)
     for url in urls:
         download_file_path = os.path.join(download_directory_path, os.path.basename(url))
         initial_size = os.path.getsize(download_file_path) if is_file(download_file_path) else 0
diff --git a/facefusion/face_masker.py b/facefusion/face_masker.py
index 647e44f5..583ce708 100755
--- a/facefusion/face_masker.py
+++ b/facefusion/face_masker.py
@@ -37,7 +37,7 @@ FACE_MASK_REGIONS : Dict[FaceMaskRegion, int] =\
     'right-eyebrow': 3,
     'left-eye': 4,
     'right-eye': 5,
-    'eye-glasses': 6,
+    'glasses': 6,
     'nose': 10,
     'mouth': 11,
     'upper-lip': 12,
diff --git a/facefusion/ffmpeg.py b/facefusion/ffmpeg.py
index 6413e45c..869a34c3 100644
--- a/facefusion/ffmpeg.py
+++ b/facefusion/ffmpeg.py
@@ -120,7 +120,7 @@ def restore_audio(target_path : str, output_path : str, output_video_fps : Fps)
 
 
 def replace_audio(target_path : str, audio_path : str, output_path : str) -> bool:
     temp_output_path = get_temp_output_video_path(target_path)
-    commands = [ '-hwaccel', 'auto', '-i', temp_output_path, '-i', audio_path, '-c:v', 'copy', '-af', 'apad', '-map', '0:v:0', '-map', '1:a:0', '-shortest', '-y', output_path ]
+    commands = [ '-hwaccel', 'auto', '-i', temp_output_path, '-i', audio_path, '-af', 'apad', '-shortest', '-y', output_path ]
     return run_ffmpeg(commands)
 
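Note on the replace_audio() change above: with the '-c:v', 'copy' and '-map' pairs dropped, the commit message credits the shorter command with avoiding an infinite loop under FFmpeg 7. Spelled out as a sketch with illustrative file names, and assuming run_ffmpeg() merely prepends the ffmpeg binary and its global flags, the invocation boils down to:

    import subprocess

    # mirrors the simplified commands list built by replace_audio()
    commands = [ 'ffmpeg', '-hwaccel', 'auto', '-i', 'temp.mp4', '-i', 'voice.wav', '-af', 'apad', '-shortest', '-y', 'output.mp4' ]
    subprocess.run(commands)

The trade-off is that the video stream gets re-encoded rather than stream-copied, since no codec is pinned anymore.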
diff --git a/facefusion/metadata.py b/facefusion/metadata.py
index 5b8e3aa7..e84f8eae 100644
--- a/facefusion/metadata.py
+++ b/facefusion/metadata.py
@@ -2,7 +2,7 @@ METADATA =\
 {
     'name': 'FaceFusion',
     'description': 'Next generation face swapper and enhancer',
-    'version': '2.5.0',
+    'version': '2.5.1',
     'license': 'MIT',
     'author': 'Henry Ruhs',
     'url': 'https://facefusion.io'
diff --git a/facefusion/processors/frame/choices.py b/facefusion/processors/frame/choices.py
index d6337ba2..176db878 100755
--- a/facefusion/processors/frame/choices.py
+++ b/facefusion/processors/frame/choices.py
@@ -6,7 +6,7 @@ from facefusion.processors.frame.typings import FaceDebuggerItem, FaceEnhancerMo
 
 face_debugger_items : List[FaceDebuggerItem] = [ 'bounding-box', 'face-landmark-5', 'face-landmark-5/68', 'face-landmark-68', 'face-landmark-68/5', 'face-mask', 'face-detector-score', 'face-landmarker-score', 'age', 'gender' ]
 face_enhancer_models : List[FaceEnhancerModel] = [ 'codeformer', 'gfpgan_1.2', 'gfpgan_1.3', 'gfpgan_1.4', 'gpen_bfr_256', 'gpen_bfr_512', 'gpen_bfr_1024', 'gpen_bfr_2048', 'restoreformer_plus_plus' ]
 face_swapper_models : List[FaceSwapperModel] = [ 'blendswap_256', 'inswapper_128', 'inswapper_128_fp16', 'simswap_256', 'simswap_512_unofficial', 'uniface_256' ]
-frame_colorizer_models : List[FrameColorizerModel] = [ 'ddcolor', 'ddcolor_artistic', 'deoldify_artistic' ]
+frame_colorizer_models : List[FrameColorizerModel] = [ 'ddcolor', 'ddcolor_artistic', 'deoldify', 'deoldify_artistic', 'deoldify_stable' ]
 frame_enhancer_models : List[FrameEnhancerModel] = [ 'lsdir_x4', 'nomos8k_sc_x4', 'real_esrgan_x2', 'real_esrgan_x2_fp16', 'real_esrgan_x4', 'real_esrgan_x4_fp16', 'real_hatgan_x4', 'span_kendata_x4' ]
 lip_syncer_models : List[LipSyncerModel] = [ 'wav2lip_gan' ]
diff --git a/facefusion/processors/frame/modules/frame_colorizer.py b/facefusion/processors/frame/modules/frame_colorizer.py
index 4b36c4a4..11a43a38 100644
--- a/facefusion/processors/frame/modules/frame_colorizer.py
+++ b/facefusion/processors/frame/modules/frame_colorizer.py
@@ -42,12 +42,26 @@ MODELS : ModelSet =\
         'path': resolve_relative_path('../.assets/models/ddcolor_artistic.onnx'),
         'size': (512, 512)
     },
+    'deoldify':
+    {
+        'type': 'deoldify',
+        'url': 'https://github.com/facefusion/facefusion-assets/releases/download/models/deoldify.onnx',
+        'path': resolve_relative_path('../.assets/models/deoldify.onnx'),
+        'size': (256, 256)
+    },
     'deoldify_artistic':
     {
         'type': 'deoldify',
         'url': 'https://github.com/facefusion/facefusion-assets/releases/download/models/deoldify_artistic.onnx',
         'path': resolve_relative_path('../.assets/models/deoldify_artistic.onnx'),
-        'size': (512, 512)
+        'size': (256, 256)
+    },
+    'deoldify_stable':
+    {
+        'type': 'deoldify',
+        'url': 'https://github.com/facefusion/facefusion-assets/releases/download/models/deoldify_stable.onnx',
+        'path': resolve_relative_path('../.assets/models/deoldify_stable.onnx'),
+        'size': (256, 256)
     }
 }
 OPTIONS : Optional[OptionsWithModel] = None
diff --git a/facefusion/processors/frame/modules/lip_syncer.py b/facefusion/processors/frame/modules/lip_syncer.py
index 901b3e4b..b665e6b7 100755
--- a/facefusion/processors/frame/modules/lip_syncer.py
+++ b/facefusion/processors/frame/modules/lip_syncer.py
@@ -253,4 +253,8 @@ def process_image(source_paths : List[str], target_path : str, output_path : str
 
 
 def process_video(source_paths : List[str], temp_frame_paths : List[str]) -> None:
+    source_audio_paths = filter_audio_paths(facefusion.globals.source_paths)
+    temp_video_fps = restrict_video_fps(facefusion.globals.target_path, facefusion.globals.output_video_fps)
+    for source_audio_path in source_audio_paths:
+        read_static_voice(source_audio_path, temp_video_fps)
     frame_processors.multi_process_frames(source_paths, temp_frame_paths, process_frames)
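
Note on the process_video() change above: calling read_static_voice() for each source audio path before multi_process_frames() starts warms its cache once, up front, instead of letting the worker threads race to fill it; this is the "Warmup the read_static_voice() cache" commit. The pattern in isolation, as a sketch built on functools.lru_cache with a hypothetical stand-in loader (expensive_read() and warmup() are not part of the patch):

    from functools import lru_cache
    from typing import List

    @lru_cache(maxsize = None)
    def expensive_read(path : str) -> bytes:
        # stands in for read_static_voice(); the body runs once per distinct path
        with open(path, 'rb') as file:
            return file.read()

    def warmup(paths : List[str]) -> None:
        # prime the cache so later concurrent callers get hits instead of recomputing
        for path in paths:
            expensive_read(path)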
diff --git a/facefusion/processors/frame/typings.py b/facefusion/processors/frame/typings.py
index 62391700..05729c5d 100644
--- a/facefusion/processors/frame/typings.py
+++ b/facefusion/processors/frame/typings.py
@@ -5,7 +5,7 @@ from facefusion.typing import Face, FaceSet, AudioFrame, VisionFrame
 
 FaceDebuggerItem = Literal['bounding-box', 'face-landmark-5', 'face-landmark-5/68', 'face-landmark-68', 'face-landmark-68/5', 'face-mask', 'face-detector-score', 'face-landmarker-score', 'age', 'gender']
 FaceEnhancerModel = Literal['codeformer', 'gfpgan_1.2', 'gfpgan_1.3', 'gfpgan_1.4', 'gpen_bfr_256', 'gpen_bfr_512', 'gpen_bfr_1024', 'gpen_bfr_2048', 'restoreformer_plus_plus']
 FaceSwapperModel = Literal['blendswap_256', 'inswapper_128', 'inswapper_128_fp16', 'simswap_256', 'simswap_512_unofficial', 'uniface_256']
-FrameColorizerModel = Literal['ddcolor', 'ddcolor_artistic', 'deoldify_artistic']
+FrameColorizerModel = Literal['ddcolor', 'ddcolor_artistic', 'deoldify', 'deoldify_artistic', 'deoldify_stable']
 FrameEnhancerModel = Literal['lsdir_x4', 'nomos8k_sc_x4', 'real_esrgan_x2', 'real_esrgan_x2_fp16', 'real_esrgan_x4', 'real_esrgan_x4_fp16', 'real_hatgan_x4', 'span_kendata_x4']
 LipSyncerModel = Literal['wav2lip_gan']
diff --git a/facefusion/typing.py b/facefusion/typing.py
index 7972c057..b22eb2e0 100755
--- a/facefusion/typing.py
+++ b/facefusion/typing.py
@@ -75,7 +75,7 @@ FaceDetectorModel = Literal['many', 'retinaface', 'scrfd', 'yoloface', 'yunet']
 FaceDetectorTweak = Literal['low-luminance', 'high-luminance']
 FaceRecognizerModel = Literal['arcface_blendswap', 'arcface_inswapper', 'arcface_simswap', 'arcface_uniface']
 FaceMaskType = Literal['box', 'occlusion', 'region']
-FaceMaskRegion = Literal['skin', 'left-eyebrow', 'right-eyebrow', 'left-eye', 'right-eye', 'eye-glasses', 'nose', 'mouth', 'upper-lip', 'lower-lip']
+FaceMaskRegion = Literal['skin', 'left-eyebrow', 'right-eyebrow', 'left-eye', 'right-eye', 'glasses', 'nose', 'mouth', 'upper-lip', 'lower-lip']
 TempFrameFormat = Literal['jpg', 'png', 'bmp']
 OutputVideoEncoder = Literal['libx264', 'libx265', 'libvpx-vp9', 'h264_nvenc', 'hevc_nvenc', 'h264_amf', 'hevc_amf']
 OutputVideoPreset = Literal['ultrafast', 'superfast', 'veryfast', 'faster', 'fast', 'medium', 'slow', 'slower', 'veryslow']
diff --git a/tests/test_audio.py b/tests/test_audio.py
index 08f8d471..765acfb8 100644
--- a/tests/test_audio.py
+++ b/tests/test_audio.py
@@ -21,6 +21,6 @@ def test_get_audio_frame() -> None:
 
 
 def test_read_static_audio() -> None:
-    assert len(read_static_audio('.assets/examples/source.mp3', 25)) == 91
-    assert len(read_static_audio('.assets/examples/source.wav', 25)) == 91
+    assert len(read_static_audio('.assets/examples/source.mp3', 25)) == 280
+    assert len(read_static_audio('.assets/examples/source.wav', 25)) == 280
     assert read_static_audio('invalid', 25) is None
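
A closing note on the "Balance between processing and VRAM" commit: chunk_size drops from 1024 ** 3 samples, which is far longer than any real track and therefore effectively unchunked, to 1024 * 240 samples with a step of 1024 * 180, so consecutive chunks overlap by 25 percent; at 48000 Hz that is roughly 5.12 second windows advancing in 3.84 second steps. batch_extract_voice() itself is outside this diff, but the windowing those two parameters imply looks like the following sketch; how the overlapping results are merged back together is omitted here and would be an assumption:

    import numpy

    def iterate_chunks(audio : numpy.ndarray, chunk_size : int, step_size : int):
        # yield overlapping windows along the first axis; with chunk_size = 1024 * 240
        # and step_size = 1024 * 180 each window shares 25 percent with its successor
        for start in range(0, len(audio), step_size):
            yield audio[start:start + chunk_size]

Smaller chunks bound the voice extractor's peak VRAM, while the overlap preserves context across chunk borders. The updated expectations in tests/test_audio.py (280 audio frames instead of 91) track the move from 16000 Hz to 48000 Hz in read_audio().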