* Fix voice extractor (#483)

* changes

* changes

* Minor cleanup

* Use 48000 everywhere

* Fix test

* Balance between processing and VRAM

* Warmup the read_static_voice() cache

* Warmup the read_static_voice() cache

* Simplify replace_audio to prevent FFmpeg 7 infinite loop

* Fix potential exception in conditional download

* Add more deoldify models

* Rename eye-glasses to glasses, Prepare release 2.5.1

---------

Co-authored-by: Harisreedhar <46858047+harisreedhar@users.noreply.github.com>
Henry Ruhs 2024-04-13 11:27:55 +02:00 committed by GitHub
parent f77c463531
commit 092dfbb796
13 changed files with 44 additions and 21 deletions

View File

@@ -70,7 +70,7 @@ face mask:
  --face-mask-types FACE_MASK_TYPES [FACE_MASK_TYPES ...] mix and match different face mask types (choices: box, occlusion, region)
  --face-mask-blur [0.0-1.0] specify the degree of blur applied to the box mask
  --face-mask-padding FACE_MASK_PADDING [FACE_MASK_PADDING ...] apply top, right, bottom and left padding to the box mask
- --face-mask-regions FACE_MASK_REGIONS [FACE_MASK_REGIONS ...] choose the facial features used for the region mask (choices: skin, left-eyebrow, right-eyebrow, left-eye, right-eye, eye-glasses, nose, mouth, upper-lip, lower-lip)
+ --face-mask-regions FACE_MASK_REGIONS [FACE_MASK_REGIONS ...] choose the facial features used for the region mask (choices: skin, left-eyebrow, right-eyebrow, left-eye, right-eye, glasses, nose, mouth, upper-lip, lower-lip)
 frame extraction:
  --trim-frame-start TRIM_FRAME_START specify the start frame of the target video
@@ -94,7 +94,7 @@ frame processors:
  --face-enhancer-model {codeformer,gfpgan_1.2,gfpgan_1.3,gfpgan_1.4,gpen_bfr_256,gpen_bfr_512,gpen_bfr_1024,gpen_bfr_2048,restoreformer_plus_plus} choose the model responsible for enhancing the face
  --face-enhancer-blend [0-100] blend the enhanced into the previous face
  --face-swapper-model {blendswap_256,inswapper_128,inswapper_128_fp16,simswap_256,simswap_512_unofficial,uniface_256} choose the model responsible for swapping the face
- --frame-colorizer-model {ddcolor,ddcolor_artistic,deoldify_artistic} choose the model responsible for colorizing the frame
+ --frame-colorizer-model {ddcolor,ddcolor_artistic,deoldify,deoldify_artistic,deoldify_stable} choose the model responsible for colorizing the frame
  --frame-colorizer-blend [0-100] blend the colorized into the previous frame
  --frame-enhancer-model {lsdir_x4,nomos8k_sc_x4,real_esrgan_x2,real_esrgan_x2_fp16,real_esrgan_x4,real_esrgan_x4_fp16,real_hatgan_x4,span_kendata_x4} choose the model responsible for enhancing the frame
  --frame-enhancer-blend [0-100] blend the enhanced into the previous frame

View File

@@ -15,7 +15,7 @@ def read_static_audio(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]
 def read_audio(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
-    sample_rate = 16000
+    sample_rate = 48000
     channel_total = 2
     if is_audio(audio_path):
@@ -34,16 +34,16 @@ def read_static_voice(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]
 def read_voice(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
-    sample_rate = 16000
+    sample_rate = 48000
     channel_total = 2
-    chunk_size = 1024 ** 3
-    step_size = chunk_size // 4
+    chunk_size = 1024 * 240
+    step_size = 1024 * 180
     if is_audio(audio_path):
         audio_buffer = read_audio_buffer(audio_path, sample_rate, channel_total)
         audio = numpy.frombuffer(audio_buffer, dtype = numpy.int16).reshape(-1, 2)
         audio = batch_extract_voice(audio, chunk_size, step_size)
-        audio = prepare_audio(audio)
+        audio = prepare_voice(audio)
         spectrogram = create_spectrogram(audio)
         audio_frames = extract_audio_frames(spectrogram, fps)
         return audio_frames
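
Note on the new sizes: 1024 ** 3 samples was effectively "the whole file in one pass", while 1024 * 240 samples is roughly five seconds of 48 kHz audio, with a 25% overlap between windows (step 1024 * 180) — the "Balance between processing and VRAM" commit above. batch_extract_voice() itself is not part of this diff; the following is only a minimal sketch of the overlap-and-blend pattern those values imply, with separate_voice() as an invented stand-in for the voice-separation model:

```python
import numpy

def separate_voice(chunk : numpy.ndarray) -> numpy.ndarray:
    # Invented placeholder for the per-chunk voice-separation inference.
    return chunk.astype(numpy.float32)

def batch_extract_voice_sketch(audio : numpy.ndarray, chunk_size : int, step_size : int) -> numpy.ndarray:
    temp_audio = numpy.zeros_like(audio, dtype = numpy.float32)
    temp_weight = numpy.zeros((audio.shape[0], 1), dtype = numpy.float32)

    # Smaller chunks cap peak VRAM; the chunk_size - step_size overlap lets
    # neighbouring windows be averaged instead of leaving audible seams.
    for start in range(0, audio.shape[0], step_size):
        end = min(start + chunk_size, audio.shape[0])
        temp_audio[start:end] += separate_voice(audio[start:end])
        temp_weight[start:end] += 1
    return temp_audio / numpy.maximum(temp_weight, 1)
```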
@@ -81,6 +81,15 @@ def prepare_audio(audio : numpy.ndarray[Any, Any]) -> Audio:
     return audio
+def prepare_voice(audio : numpy.ndarray[Any, Any]) -> Audio:
+    sample_rate = 48000
+    resample_rate = 16000
+    audio = scipy.signal.resample(audio, int(len(audio) * resample_rate / sample_rate))
+    audio = prepare_audio(audio)
+    return audio
 def convert_hertz_to_mel(hertz : float) -> float:
     return 2595 * numpy.log10(1 + hertz / 700)
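
The new prepare_voice() brings the 48 kHz voice track back down to the 16 kHz the spectrogram pipeline expects. A self-contained illustration of that resampling step (the stereo test tone is invented for demonstration; scipy.signal.resample is FFT-based and resamples along the first axis by default):

```python
import numpy
import scipy.signal

sample_rate = 48000
resample_rate = 16000

# one second of 48 kHz stereo audio: a 440 Hz test tone
time = numpy.linspace(0, 1, sample_rate, endpoint = False)
audio = numpy.stack([ numpy.sin(2 * numpy.pi * 440 * time) ] * 2, axis = -1)

# same call as prepare_voice(): 48000 samples become 16000
audio = scipy.signal.resample(audio, int(len(audio) * resample_rate / sample_rate))
assert audio.shape == (16000, 2)
```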

View File

@@ -17,7 +17,7 @@ face_detector_set : Dict[FaceDetectorModel, List[str]] =\
 }
 face_selector_modes : List[FaceSelectorMode] = [ 'many', 'one', 'reference' ]
 face_mask_types : List[FaceMaskType] = [ 'box', 'occlusion', 'region' ]
-face_mask_regions : List[FaceMaskRegion] = [ 'skin', 'left-eyebrow', 'right-eyebrow', 'left-eye', 'right-eye', 'eye-glasses', 'nose', 'mouth', 'upper-lip', 'lower-lip' ]
+face_mask_regions : List[FaceMaskRegion] = [ 'skin', 'left-eyebrow', 'right-eyebrow', 'left-eye', 'right-eye', 'glasses', 'nose', 'mouth', 'upper-lip', 'lower-lip' ]
 temp_frame_formats : List[TempFrameFormat] = [ 'bmp', 'jpg', 'png' ]
 output_video_encoders : List[OutputVideoEncoder] = [ 'libx264', 'libx265', 'libvpx-vp9', 'h264_nvenc', 'hevc_nvenc', 'h264_amf', 'hevc_amf' ]
 output_video_presets : List[OutputVideoPreset] = [ 'ultrafast', 'superfast', 'veryfast', 'faster', 'fast', 'medium', 'slow', 'slower', 'veryslow' ]

View File

@@ -4,7 +4,6 @@ import platform
 import ssl
 import urllib.request
 from typing import List
-from concurrent.futures import ThreadPoolExecutor
 from functools import lru_cache
 from tqdm import tqdm
@@ -17,9 +16,6 @@ if platform.system().lower() == 'darwin':
 def conditional_download(download_directory_path : str, urls : List[str]) -> None:
-    with ThreadPoolExecutor() as executor:
-        for url in urls:
-            executor.submit(get_download_size, url)
     for url in urls:
         download_file_path = os.path.join(download_directory_path, os.path.basename(url))
         initial_size = os.path.getsize(download_file_path) if is_file(download_file_path) else 0
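
The removed block eagerly submitted get_download_size() for every URL to a ThreadPoolExecutor, which per the commit message could surface an exception; the sequential loop that remains sizes each file on disk before downloading. The actual transfer code is outside this hunk; a generic sketch of the resume pattern that initial_size enables (hypothetical helper, assumes the server honors Range requests):

```python
import os
import urllib.request

def download_with_resume(url : str, download_file_path : str) -> None:
    # Append to whatever portion is already on disk instead of restarting.
    initial_size = os.path.getsize(download_file_path) if os.path.isfile(download_file_path) else 0
    request = urllib.request.Request(url, headers = { 'Range': 'bytes=' + str(initial_size) + '-' })

    with urllib.request.urlopen(request, timeout = 10) as response, open(download_file_path, 'ab') as download_file:
        while True:
            chunk = response.read(1024 * 64)
            if not chunk:
                break
            download_file.write(chunk)
```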

View File

@@ -37,7 +37,7 @@ FACE_MASK_REGIONS : Dict[FaceMaskRegion, int] =\
     'right-eyebrow': 3,
     'left-eye': 4,
     'right-eye': 5,
-    'eye-glasses': 6,
+    'glasses': 6,
     'nose': 10,
     'mouth': 11,
     'upper-lip': 12,

View File

@@ -120,7 +120,7 @@ def restore_audio(target_path : str, output_path : str, output_video_fps : Fps)
 def replace_audio(target_path : str, audio_path : str, output_path : str) -> bool:
     temp_output_path = get_temp_output_video_path(target_path)
-    commands = [ '-hwaccel', 'auto', '-i', temp_output_path, '-i', audio_path, '-c:v', 'copy', '-af', 'apad', '-map', '0:v:0', '-map', '1:a:0', '-shortest', '-y', output_path ]
+    commands = [ '-hwaccel', 'auto', '-i', temp_output_path, '-i', audio_path, '-af', 'apad', '-shortest', '-y', output_path ]
     return run_ffmpeg(commands)
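
The simplified command drops the explicit '-c:v copy' and '-map' arguments; per the commit message, it was that combination, together with apad and -shortest, that could drive FFmpeg 7 into an infinite loop. run_ffmpeg() itself is not shown in this hunk; a hypothetical minimal wrapper and the new invocation (file names are placeholders):

```python
import subprocess
from typing import List

def run_ffmpeg(args : List[str]) -> bool:
    # Hypothetical wrapper; the real run_ffmpeg() lives elsewhere in ffmpeg.py.
    commands = [ 'ffmpeg', '-hide_banner', '-loglevel', 'error' ]
    commands.extend(args)
    return subprocess.run(commands).returncode == 0

# apad pads the audio stream, -shortest then stops at the end of the video
run_ffmpeg([ '-hwaccel', 'auto', '-i', 'temp.mp4', '-i', 'voice.wav', '-af', 'apad', '-shortest', '-y', 'output.mp4' ])
```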

View File

@@ -2,7 +2,7 @@ METADATA =\
 {
     'name': 'FaceFusion',
     'description': 'Next generation face swapper and enhancer',
-    'version': '2.5.0',
+    'version': '2.5.1',
     'license': 'MIT',
     'author': 'Henry Ruhs',
     'url': 'https://facefusion.io'

View File

@@ -6,7 +6,7 @@ from facefusion.processors.frame.typings import FaceDebuggerItem, FaceEnhancerMo
 face_debugger_items : List[FaceDebuggerItem] = [ 'bounding-box', 'face-landmark-5', 'face-landmark-5/68', 'face-landmark-68', 'face-landmark-68/5', 'face-mask', 'face-detector-score', 'face-landmarker-score', 'age', 'gender' ]
 face_enhancer_models : List[FaceEnhancerModel] = [ 'codeformer', 'gfpgan_1.2', 'gfpgan_1.3', 'gfpgan_1.4', 'gpen_bfr_256', 'gpen_bfr_512', 'gpen_bfr_1024', 'gpen_bfr_2048', 'restoreformer_plus_plus' ]
 face_swapper_models : List[FaceSwapperModel] = [ 'blendswap_256', 'inswapper_128', 'inswapper_128_fp16', 'simswap_256', 'simswap_512_unofficial', 'uniface_256' ]
-frame_colorizer_models : List[FrameColorizerModel] = [ 'ddcolor', 'ddcolor_artistic', 'deoldify_artistic' ]
+frame_colorizer_models : List[FrameColorizerModel] = [ 'ddcolor', 'ddcolor_artistic', 'deoldify', 'deoldify_artistic', 'deoldify_stable' ]
 frame_enhancer_models : List[FrameEnhancerModel] = [ 'lsdir_x4', 'nomos8k_sc_x4', 'real_esrgan_x2', 'real_esrgan_x2_fp16', 'real_esrgan_x4', 'real_esrgan_x4_fp16', 'real_hatgan_x4', 'span_kendata_x4' ]
 lip_syncer_models : List[LipSyncerModel] = [ 'wav2lip_gan' ]

View File

@@ -42,12 +42,26 @@ MODELS : ModelSet =\
         'path': resolve_relative_path('../.assets/models/ddcolor_artistic.onnx'),
         'size': (512, 512)
     },
+    'deoldify':
+    {
+        'type': 'deoldify',
+        'url': 'https://github.com/facefusion/facefusion-assets/releases/download/models/deoldify.onnx',
+        'path': resolve_relative_path('../.assets/models/deoldify.onnx'),
+        'size': (256, 256)
+    },
     'deoldify_artistic':
     {
         'type': 'deoldify',
         'url': 'https://github.com/facefusion/facefusion-assets/releases/download/models/deoldify_artistic.onnx',
         'path': resolve_relative_path('../.assets/models/deoldify_artistic.onnx'),
-        'size': (512, 512)
+        'size': (256, 256)
     },
+    'deoldify_stable':
+    {
+        'type': 'deoldify',
+        'url': 'https://github.com/facefusion/facefusion-assets/releases/download/models/deoldify_stable.onnx',
+        'path': resolve_relative_path('../.assets/models/deoldify_stable.onnx'),
+        'size': (256, 256)
+    }
 }
 OPTIONS : Optional[OptionsWithModel] = None
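
All three deoldify variants now declare a 256 × 256 input, and the pre-existing deoldify_artistic entry is corrected from (512, 512) in the same hunk. A sketch of how such an entry is typically consumed, reusing conditional_download() and resolve_relative_path() from elsewhere in this commit (the import paths and the pre_check() framing are assumptions, and MODELS refers to the dict above):

```python
from facefusion.download import conditional_download
from facefusion.filesystem import resolve_relative_path

def pre_check() -> bool:
    # Hypothetical: fetch the configured colorizer model once, resumably.
    model = MODELS['deoldify_stable']
    download_directory_path = resolve_relative_path('../.assets/models')
    conditional_download(download_directory_path, [ model['url'] ])
    return True
```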

View File

@@ -253,4 +253,8 @@ def process_image(source_paths : List[str], target_path : str, output_path : str
 def process_video(source_paths : List[str], temp_frame_paths : List[str]) -> None:
     source_audio_paths = filter_audio_paths(facefusion.globals.source_paths)
     temp_video_fps = restrict_video_fps(facefusion.globals.target_path, facefusion.globals.output_video_fps)
+    for source_audio_path in source_audio_paths:
+        read_static_voice(source_audio_path, temp_video_fps)
     frame_processors.multi_process_frames(source_paths, temp_frame_paths, process_frames)
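
read_static_voice() is presumably the lru_cache-backed twin of read_voice(), mirroring read_static_audio() in the audio module; warming it before multi_process_frames() means the expensive voice extraction runs once up front instead of stalling the first frame workers. A toy sketch of the pattern (all names and the print are illustrative only):

```python
from functools import lru_cache

@lru_cache(maxsize = None)
def read_static_voice(audio_path : str, fps : float) -> list:
    # Stand-in for the real work: voice separation plus spectrogram frames.
    print('extracting voice from', audio_path)
    return [ audio_path, fps ]

# Warm-up: pay the cost once in the coordinating thread ...
for source_audio_path in [ 'voice.wav' ]:
    read_static_voice(source_audio_path, 25.0)

# ... so later calls from frame workers are instant cache hits.
read_static_voice('voice.wav', 25.0)
```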

View File

@@ -5,7 +5,7 @@ from facefusion.typing import Face, FaceSet, AudioFrame, VisionFrame
 FaceDebuggerItem = Literal['bounding-box', 'face-landmark-5', 'face-landmark-5/68', 'face-landmark-68', 'face-landmark-68/5', 'face-mask', 'face-detector-score', 'face-landmarker-score', 'age', 'gender']
 FaceEnhancerModel = Literal['codeformer', 'gfpgan_1.2', 'gfpgan_1.3', 'gfpgan_1.4', 'gpen_bfr_256', 'gpen_bfr_512', 'gpen_bfr_1024', 'gpen_bfr_2048', 'restoreformer_plus_plus']
 FaceSwapperModel = Literal['blendswap_256', 'inswapper_128', 'inswapper_128_fp16', 'simswap_256', 'simswap_512_unofficial', 'uniface_256']
-FrameColorizerModel = Literal['ddcolor', 'ddcolor_artistic', 'deoldify_artistic']
+FrameColorizerModel = Literal['ddcolor', 'ddcolor_artistic', 'deoldify', 'deoldify_artistic', 'deoldify_stable']
 FrameEnhancerModel = Literal['lsdir_x4', 'nomos8k_sc_x4', 'real_esrgan_x2', 'real_esrgan_x2_fp16', 'real_esrgan_x4', 'real_esrgan_x4_fp16', 'real_hatgan_x4', 'span_kendata_x4']
 LipSyncerModel = Literal['wav2lip_gan']

View File

@@ -75,7 +75,7 @@ FaceDetectorModel = Literal['many', 'retinaface', 'scrfd', 'yoloface', 'yunet']
 FaceDetectorTweak = Literal['low-luminance', 'high-luminance']
 FaceRecognizerModel = Literal['arcface_blendswap', 'arcface_inswapper', 'arcface_simswap', 'arcface_uniface']
 FaceMaskType = Literal['box', 'occlusion', 'region']
-FaceMaskRegion = Literal['skin', 'left-eyebrow', 'right-eyebrow', 'left-eye', 'right-eye', 'eye-glasses', 'nose', 'mouth', 'upper-lip', 'lower-lip']
+FaceMaskRegion = Literal['skin', 'left-eyebrow', 'right-eyebrow', 'left-eye', 'right-eye', 'glasses', 'nose', 'mouth', 'upper-lip', 'lower-lip']
 TempFrameFormat = Literal['jpg', 'png', 'bmp']
 OutputVideoEncoder = Literal['libx264', 'libx265', 'libvpx-vp9', 'h264_nvenc', 'hevc_nvenc', 'h264_amf', 'hevc_amf']
 OutputVideoPreset = Literal['ultrafast', 'superfast', 'veryfast', 'faster', 'fast', 'medium', 'slow', 'slower', 'veryslow']

View File

@@ -21,6 +21,6 @@ def test_get_audio_frame() -> None:
 def test_read_static_audio() -> None:
-    assert len(read_static_audio('.assets/examples/source.mp3', 25)) == 91
-    assert len(read_static_audio('.assets/examples/source.wav', 25)) == 91
+    assert len(read_static_audio('.assets/examples/source.mp3', 25)) == 280
+    assert len(read_static_audio('.assets/examples/source.wav', 25)) == 280
     assert read_static_audio('invalid', 25) is None
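
The tripled expectations line up with the sample-rate change: read_audio() now decodes at 48 kHz instead of 16 kHz, and with the spectrogram hop fixed in samples this yields roughly three times as many audio frames (91 × 3 = 273; the remaining difference to 280 presumably comes from padding and hop rounding in the pipeline).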