diff --git a/facefusion/core.py b/facefusion/core.py
index a5a27ca6..097c9670 100755
--- a/facefusion/core.py
+++ b/facefusion/core.py
@@ -13,7 +13,7 @@ from argparse import ArgumentParser, HelpFormatter
 
 import facefusion.choices
 import facefusion.globals
-from facefusion import metadata, predictor, wording
+from facefusion import face_analyser, predictor, metadata, wording
 from facefusion.predictor import predict_image, predict_video
 from facefusion.processors.frame.core import get_frame_processors_modules, load_frame_processor_module
 from facefusion.utilities import is_image, is_video, detect_fps, compress_image, merge_video, extract_frames, get_temp_frame_paths, restore_audio, create_temp, move_temp, clear_temp, list_module_names, encode_execution_providers, decode_execution_providers, normalize_output_path, update_status
@@ -47,22 +47,22 @@ def cli() -> None:
 	group_face_recognition.add_argument('--face-analyser-age', help = wording.get('face_analyser_age_help'), dest = 'face_analyser_age', choices = facefusion.choices.face_analyser_ages)
 	group_face_recognition.add_argument('--face-analyser-gender', help = wording.get('face_analyser_gender_help'), dest = 'face_analyser_gender', choices = facefusion.choices.face_analyser_genders)
 	group_face_recognition.add_argument('--reference-face-position', help = wording.get('reference_face_position_help'), dest = 'reference_face_position', type = int, default = 0)
-	group_face_recognition.add_argument('--reference-face-distance', help = wording.get('reference_face_distance_help'), dest = 'reference_face_distance', type = float, default = 1.5)
+	group_face_recognition.add_argument('--reference-face-distance', help = wording.get('reference_face_distance_help'), dest = 'reference_face_distance', type = float, default = 0.6)
 	group_face_recognition.add_argument('--reference-frame-number', help = wording.get('reference_frame_number_help'), dest = 'reference_frame_number', type = int, default = 0)
 	# frame extraction
-	group_processing = program.add_argument_group('frame extraction')
-	group_processing.add_argument('--trim-frame-start', help = wording.get('trim_frame_start_help'), dest = 'trim_frame_start', type = int)
-	group_processing.add_argument('--trim-frame-end', help = wording.get('trim_frame_end_help'), dest = 'trim_frame_end', type = int)
-	group_processing.add_argument('--temp-frame-format', help = wording.get('temp_frame_format_help'), dest = 'temp_frame_format', default = 'jpg', choices = facefusion.choices.temp_frame_formats)
-	group_processing.add_argument('--temp-frame-quality', help = wording.get('temp_frame_quality_help'), dest = 'temp_frame_quality', type = int, default = 100, choices = range(101), metavar = '[0-100]')
-	group_processing.add_argument('--keep-temp', help = wording.get('keep_temp_help'), dest = 'keep_temp', action = 'store_true')
+	group_frame_extraction = program.add_argument_group('frame extraction')
+	group_frame_extraction.add_argument('--trim-frame-start', help = wording.get('trim_frame_start_help'), dest = 'trim_frame_start', type = int)
+	group_frame_extraction.add_argument('--trim-frame-end', help = wording.get('trim_frame_end_help'), dest = 'trim_frame_end', type = int)
+	group_frame_extraction.add_argument('--temp-frame-format', help = wording.get('temp_frame_format_help'), dest = 'temp_frame_format', default = 'jpg', choices = facefusion.choices.temp_frame_formats)
+	group_frame_extraction.add_argument('--temp-frame-quality', help = wording.get('temp_frame_quality_help'), dest = 'temp_frame_quality', type = int, default = 100, choices = range(101), metavar = '[0-100]')
+	group_frame_extraction.add_argument('--keep-temp', help = wording.get('keep_temp_help'), dest = 'keep_temp', action = 'store_true')
 	# output creation
-	group_output = program.add_argument_group('output creation')
-	group_output.add_argument('--output-image-quality', help=wording.get('output_image_quality_help'), dest = 'output_image_quality', type = int, default = 80, choices = range(101), metavar = '[0-100]')
-	group_output.add_argument('--output-video-encoder', help = wording.get('output_video_encoder_help'), dest = 'output_video_encoder', default = 'libx264', choices = facefusion.choices.output_video_encoders)
-	group_output.add_argument('--output-video-quality', help = wording.get('output_video_quality_help'), dest = 'output_video_quality', type = int, default = 80, choices = range(101), metavar = '[0-100]')
-	group_output.add_argument('--keep-fps', help = wording.get('keep_fps_help'), dest = 'keep_fps', action = 'store_true')
-	group_output.add_argument('--skip-audio', help = wording.get('skip_audio_help'), dest = 'skip_audio', action = 'store_true')
+	group_output_creation = program.add_argument_group('output creation')
+	group_output_creation.add_argument('--output-image-quality', help=wording.get('output_image_quality_help'), dest = 'output_image_quality', type = int, default = 80, choices = range(101), metavar = '[0-100]')
+	group_output_creation.add_argument('--output-video-encoder', help = wording.get('output_video_encoder_help'), dest = 'output_video_encoder', default = 'libx264', choices = facefusion.choices.output_video_encoders)
+	group_output_creation.add_argument('--output-video-quality', help = wording.get('output_video_quality_help'), dest = 'output_video_quality', type = int, default = 80, choices = range(101), metavar = '[0-100]')
+	group_output_creation.add_argument('--keep-fps', help = wording.get('keep_fps_help'), dest = 'keep_fps', action = 'store_true')
+	group_output_creation.add_argument('--skip-audio', help = wording.get('skip_audio_help'), dest = 'skip_audio', action = 'store_true')
 	# frame processors
 	available_frame_processors = list_module_names('facefusion/processors/frame/modules')
 	program = ArgumentParser(parents = [ program ], formatter_class = program.formatter_class, add_help = True)
@@ -124,7 +124,7 @@ def apply_args(program : ArgumentParser) -> None:
 def run(program : ArgumentParser) -> None:
 	apply_args(program)
 	limit_resources()
-	if not pre_check() or not predictor.pre_check():
+	if not pre_check() or not predictor.pre_check() or not face_analyser.pre_check():
 		return
 	for frame_processor_module in get_frame_processors_modules(facefusion.globals.frame_processors):
 		if not frame_processor_module.pre_check():
diff --git a/facefusion/face_analyser.py b/facefusion/face_analyser.py
index 976b718e..89671582 100644
--- a/facefusion/face_analyser.py
+++ b/facefusion/face_analyser.py
@@ -1,14 +1,37 @@
-from typing import Any, Optional, List
+from typing import Any, Optional, List, Dict, Tuple
 import threading
-import insightface
+import cv2
 import numpy
+import onnxruntime
 
 import facefusion.globals
 from facefusion.face_cache import get_faces_cache, set_faces_cache
-from facefusion.typing import Frame, Face, FaceAnalyserDirection, FaceAnalyserAge, FaceAnalyserGender
+from facefusion.face_helper import warp_face
+from facefusion.typing import Frame, Face, FaceAnalyserDirection, FaceAnalyserAge, FaceAnalyserGender, ModelValue, Kps, Embedding
+from facefusion.utilities import resolve_relative_path, conditional_download
+from facefusion.vision import resize_frame_dimension
 
 FACE_ANALYSER = None
+THREAD_SEMAPHORE : threading.Semaphore = threading.Semaphore()
 THREAD_LOCK : threading.Lock = threading.Lock()
+MODELS : Dict[str, ModelValue] =\
+{
+	'face_recognition_arcface':
+	{
+		'url': 'https://huggingface.co/bluefoxcreation/insightface-retinaface-arcface-model/resolve/main/w600k_r50.onnx',
+		'path': resolve_relative_path('../.assets/models/w600k_r50.onnx')
+	},
+	'face_detection_yunet':
+	{
+		'url': 'https://github.com/opencv/opencv_zoo/raw/main/models/face_detection_yunet/face_detection_yunet_2023mar.onnx',
+		'path': resolve_relative_path('../.assets/models/face_detection_yunet_2023mar.onnx')
+	},
+	'gender_age':
+	{
+		'url': 'https://huggingface.co/facefusion/buffalo_l/resolve/main/genderage.onnx',
+		'path': resolve_relative_path('../.assets/models/genderage.onnx')
+	}
+}
 
 
 def get_face_analyser() -> Any:
@@ -16,8 +39,12 @@
 
 	with THREAD_LOCK:
 		if FACE_ANALYSER is None:
-			FACE_ANALYSER = insightface.app.FaceAnalysis(name = 'buffalo_l', providers = facefusion.globals.execution_providers)
-			FACE_ANALYSER.prepare(ctx_id = 0)
+			FACE_ANALYSER =\
+			{
+				'face_detector': cv2.FaceDetectorYN.create(MODELS.get('face_detection_yunet').get('path'), None, (0, 0)),
+				'face_recognition': onnxruntime.InferenceSession(MODELS.get('face_recognition_arcface').get('path'), providers = facefusion.globals.execution_providers),
+				'gender_age': onnxruntime.InferenceSession(MODELS.get('gender_age').get('path'), providers = facefusion.globals.execution_providers),
+			}
 	return FACE_ANALYSER
 
 
@@ -27,6 +54,80 @@ def clear_face_analyser() -> Any:
 	FACE_ANALYSER = None
 
 
+def pre_check() -> bool:
+	if not facefusion.globals.skip_download:
+		download_directory_path = resolve_relative_path('../.assets/models')
+		model_urls = [ MODELS.get('face_recognition_arcface').get('url'), MODELS.get('face_detection_yunet').get('url'), MODELS.get('gender_age').get('url') ]
+		conditional_download(download_directory_path, model_urls)
+	return True
+
+
+def extract_faces(frame : Frame) -> List[Face]:
+	face_detector = get_face_analyser().get('face_detector')
+	faces: List[Face] = []
+	temp_frame = resize_frame_dimension(frame, 640, 640)
+	temp_frame_height, temp_frame_width, _ = temp_frame.shape
+	frame_height, frame_width, _ = frame.shape
+	ratio_height = frame_height / temp_frame_height
+	ratio_width = frame_width / temp_frame_width
+	face_detector.setScoreThreshold(0.5)
+	face_detector.setTopK(100)
+	face_detector.setInputSize((temp_frame_width, temp_frame_height))
+	with THREAD_SEMAPHORE:
+		_, detections = face_detector.detect(temp_frame)
+	if detections.any():
+		for detection in detections:
+			bbox =\
+			[
+				detection[0:4][0] * ratio_width,
+				detection[0:4][1] * ratio_height,
+				(detection[0:4][0] + detection[0:4][2]) * ratio_width,
+				(detection[0:4][1] + detection[0:4][3]) * ratio_height
+			]
+			kps = (detection[4:14].reshape((5, 2)) * [[ ratio_width, ratio_height ]]).astype(int)
+			score = detection[14]
+			embedding = calc_embedding(frame, kps)
+			normed_embedding = embedding / numpy.linalg.norm(embedding)
+			gender, age = detect_gender_age(frame, kps)
+			faces.append(Face(
+				bbox = bbox,
+				kps = kps,
+				score = score,
+				embedding = embedding,
+				normed_embedding = normed_embedding,
+				gender = gender,
+				age = age
+			))
+	return faces
+
+
+def calc_embedding(temp_frame : Frame, kps : Kps) -> Embedding:
+	face_recognition = get_face_analyser().get('face_recognition')
+	crop_frame, matrix = warp_face(temp_frame, kps, 'arcface', (112, 112))
+	crop_frame = crop_frame.astype(numpy.float32) / 127.5 - 1
+	crop_frame = crop_frame[:, :, ::-1].transpose(2, 0, 1)
+	crop_frame = numpy.expand_dims(crop_frame, axis = 0)
+	embedding = face_recognition.run(None,
+	{
+		face_recognition.get_inputs()[0].name: crop_frame
+	})[0]
+	embedding = embedding.ravel()
+	return embedding
+
+
+def detect_gender_age(frame : Frame, kps : Kps) -> Tuple[int, int]:
+	gender_age = get_face_analyser().get('gender_age')
+	crop_frame, affine_matrix = warp_face(frame, kps, 'arcface', (96, 96))
+	crop_frame = numpy.expand_dims(crop_frame, axis = 0).transpose(0, 3, 1, 2).astype(numpy.float32)
+	prediction = gender_age.run(None,
+	{
+		gender_age.get_inputs()[0].name: crop_frame
+	})[0][0]
+	gender = int(numpy.argmax(prediction[:2]))
+	age = int(numpy.round(prediction[2] * 100))
+	return gender, age
+
+
 def get_one_face(frame : Frame, position : int = 0) -> Optional[Face]:
 	many_faces = get_many_faces(frame)
 	if many_faces:
@@ -43,7 +144,7 @@ def get_many_faces(frame : Frame) -> List[Face]:
 		if faces_cache:
 			faces = faces_cache
 		else:
-			faces = get_face_analyser().get(frame)
+			faces = extract_faces(frame)
 			set_faces_cache(frame, faces)
 		if facefusion.globals.face_analyser_direction:
 			faces = sort_by_direction(faces, facefusion.globals.face_analyser_direction)
@@ -62,7 +163,7 @@
 	if many_faces:
 		for face in many_faces:
 			if hasattr(face, 'normed_embedding') and hasattr(reference_face, 'normed_embedding'):
-				current_face_distance = numpy.sum(numpy.square(face.normed_embedding - reference_face.normed_embedding))
+				current_face_distance = 1 - numpy.dot(face.normed_embedding, reference_face.normed_embedding)
 				if current_face_distance < face_distance:
 					similar_faces.append(face)
 	return similar_faces
diff --git a/facefusion/processors/frame/modules/face_swapper.py b/facefusion/processors/frame/modules/face_swapper.py
index f5cf25c2..79128b90 100644
--- a/facefusion/processors/frame/modules/face_swapper.py
+++ b/facefusion/processors/frame/modules/face_swapper.py
@@ -154,7 +154,7 @@ def swap_face(source_face : Face, target_face : Face, temp_frame : Frame) -> Fra
 		if frame_processor_input.name == 'source':
 			frame_processor_inputs[frame_processor_input.name] = source_face
 		if frame_processor_input.name == 'target':
-			frame_processor_inputs[frame_processor_input.name] = crop_frame
+			frame_processor_inputs[frame_processor_input.name] = crop_frame # type: ignore[assignment]
 	crop_frame = frame_processor.run(None, frame_processor_inputs)[0][0]
 	crop_frame = normalize_crop_frame(crop_frame)
 	temp_frame = paste_back(temp_frame, crop_frame, affine_matrix)
diff --git a/facefusion/processors/frame/modules/frame_enhancer.py b/facefusion/processors/frame/modules/frame_enhancer.py
index a529971c..730e8979 100644
--- a/facefusion/processors/frame/modules/frame_enhancer.py
+++ b/facefusion/processors/frame/modules/frame_enhancer.py
@@ -137,7 +137,8 @@ def enhance_frame(temp_frame : Frame) -> Frame:
 
 def blend_frame(temp_frame : Frame, paste_frame : Frame) -> Frame:
 	frame_enhancer_blend = 1 - (frame_processors_globals.frame_enhancer_blend / 100)
-	temp_frame = cv2.resize(temp_frame, (paste_frame.shape[1], paste_frame.shape[0]))
+	paste_frame_height, paste_frame_width = paste_frame.shape[0:2]
+	temp_frame = cv2.resize(temp_frame, (paste_frame_width, paste_frame_height))
 	temp_frame = cv2.addWeighted(temp_frame, frame_enhancer_blend, paste_frame, 1 - frame_enhancer_blend, 0)
 	return temp_frame
 
diff --git a/facefusion/typing.py b/facefusion/typing.py
index 255f6fe1..447f1679 100644
--- a/facefusion/typing.py
+++ b/facefusion/typing.py
@@ -1,9 +1,11 @@
+from collections import namedtuple
 from typing import Any, Literal, Callable, List, TypedDict, Dict
-from insightface.app.common import Face
 import numpy
 
+Bbox = numpy.ndarray[Any, Any]
 Kps = numpy.ndarray[Any, Any]
-Face = Face
+Embedding = numpy.ndarray[Any, Any]
+Face = namedtuple('Face', [ 'bbox', 'kps', 'score', 'embedding', 'normed_embedding', 'gender', 'age' ])
 Frame = numpy.ndarray[Any, Any]
 Matrix = numpy.ndarray[Any, Any]
 
diff --git a/facefusion/uis/components/face_selector.py b/facefusion/uis/components/face_selector.py
index 52fea49f..35bffbf4 100644
--- a/facefusion/uis/components/face_selector.py
+++ b/facefusion/uis/components/face_selector.py
@@ -5,7 +5,7 @@ import gradio
 import facefusion.choices
 import facefusion.globals
 from facefusion import wording
-from facefusion.vision import get_video_frame, normalize_frame_color, read_static_image
+from facefusion.vision import get_video_frame, read_static_image, normalize_frame_color
 from facefusion.face_analyser import get_many_faces
 from facefusion.face_reference import clear_face_reference
 from facefusion.typing import Frame, FaceRecognition
@@ -47,9 +47,9 @@ def render() -> None:
 	REFERENCE_FACE_DISTANCE_SLIDER = gradio.Slider(
 		label = wording.get('reference_face_distance_slider_label'),
 		value = facefusion.globals.reference_face_distance,
-		step = 0.05,
+		step = 0.025,
 		minimum = 0,
-		maximum = 3,
+		maximum = 1.5,
 		visible = 'reference' in facefusion.globals.face_recognition
 	)
 	register_ui_component('face_recognition_dropdown', FACE_RECOGNITION_DROPDOWN)
@@ -134,3 +134,4 @@ def extract_gallery_frames(reference_frame : Frame) -> List[Frame]:
 		crop_frame = normalize_frame_color(crop_frame)
 		crop_frames.append(crop_frame)
 	return crop_frames
+
diff --git a/requirements.txt b/requirements.txt
index 96320667..c3b0dc91 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,5 @@
 basicsr==1.4.2
 gradio==3.47.1
-insightface==0.7.3
 numpy==1.24.3
 onnx==1.14.1
 onnxruntime==1.16.0