Replace retinaface with yunet (#168)

* Remove insightface dependency

* Fix urllib

* Some fixes

* Analyse based on matches

* Analyse based on rate

* Fix CI

* Migrate to Yunet

* Something is off here

* We indeed need semaphore for yunet

* Normalize the normed_embedding

* Fix download of models

* Fix download of models

* Fix download of models

* Add score and improve affine_matrix

* Temp fix for bbox out of frame

* Temp fix for bbox out of frame

* ROCm and OpenVINO mapping for torch backends

* Normalize bbox

* Implement gender age

* Cosmetics on cli args

* Prevent face jumping

* Fix the paste back speed

* Fix import

* Introduce detection size
Henry Ruhs 2023-10-22 12:33:31 +02:00 committed by GitHub
parent 738d69a10b
commit 228febd73b
7 changed files with 134 additions and 30 deletions
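For context, a minimal sketch of the OpenCV YuNet API this change builds on. The model path and input image are placeholders, and the size handling is simplified; the actual code in face_analyser.py bounds the frame to 640x640 and rescales the detections back to the original resolution.

import cv2

# placeholder paths; the real model is downloaded to .assets/models by pre_check()
frame = cv2.imread('target.jpg')
frame_height, frame_width, _ = frame.shape

face_detector = cv2.FaceDetectorYN.create('face_detection_yunet_2023mar.onnx', None, (0, 0))
face_detector.setScoreThreshold(0.5)
face_detector.setTopK(100)
face_detector.setInputSize((frame_width, frame_height))

_, detections = face_detector.detect(frame)
# each detection row: x, y, width, height, five (x, y) landmarks, confidence score
if detections is not None:
    for detection in detections:
        bbox = detection[0:4]
        kps = detection[4:14].reshape((5, 2))
        score = detection[14]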


@@ -13,7 +13,7 @@ from argparse import ArgumentParser, HelpFormatter
import facefusion.choices
import facefusion.globals
from facefusion import metadata, predictor, wording
from facefusion import face_analyser, predictor, metadata, wording
from facefusion.predictor import predict_image, predict_video
from facefusion.processors.frame.core import get_frame_processors_modules, load_frame_processor_module
from facefusion.utilities import is_image, is_video, detect_fps, compress_image, merge_video, extract_frames, get_temp_frame_paths, restore_audio, create_temp, move_temp, clear_temp, list_module_names, encode_execution_providers, decode_execution_providers, normalize_output_path, update_status
@@ -47,22 +47,22 @@ def cli() -> None:
group_face_recognition.add_argument('--face-analyser-age', help = wording.get('face_analyser_age_help'), dest = 'face_analyser_age', choices = facefusion.choices.face_analyser_ages)
group_face_recognition.add_argument('--face-analyser-gender', help = wording.get('face_analyser_gender_help'), dest = 'face_analyser_gender', choices = facefusion.choices.face_analyser_genders)
group_face_recognition.add_argument('--reference-face-position', help = wording.get('reference_face_position_help'), dest = 'reference_face_position', type = int, default = 0)
group_face_recognition.add_argument('--reference-face-distance', help = wording.get('reference_face_distance_help'), dest = 'reference_face_distance', type = float, default = 1.5)
group_face_recognition.add_argument('--reference-face-distance', help = wording.get('reference_face_distance_help'), dest = 'reference_face_distance', type = float, default = 0.6)
group_face_recognition.add_argument('--reference-frame-number', help = wording.get('reference_frame_number_help'), dest = 'reference_frame_number', type = int, default = 0)
# frame extraction
group_processing = program.add_argument_group('frame extraction')
group_processing.add_argument('--trim-frame-start', help = wording.get('trim_frame_start_help'), dest = 'trim_frame_start', type = int)
group_processing.add_argument('--trim-frame-end', help = wording.get('trim_frame_end_help'), dest = 'trim_frame_end', type = int)
group_processing.add_argument('--temp-frame-format', help = wording.get('temp_frame_format_help'), dest = 'temp_frame_format', default = 'jpg', choices = facefusion.choices.temp_frame_formats)
group_processing.add_argument('--temp-frame-quality', help = wording.get('temp_frame_quality_help'), dest = 'temp_frame_quality', type = int, default = 100, choices = range(101), metavar = '[0-100]')
group_processing.add_argument('--keep-temp', help = wording.get('keep_temp_help'), dest = 'keep_temp', action = 'store_true')
group_frame_extraction = program.add_argument_group('frame extraction')
group_frame_extraction.add_argument('--trim-frame-start', help = wording.get('trim_frame_start_help'), dest = 'trim_frame_start', type = int)
group_frame_extraction.add_argument('--trim-frame-end', help = wording.get('trim_frame_end_help'), dest = 'trim_frame_end', type = int)
group_frame_extraction.add_argument('--temp-frame-format', help = wording.get('temp_frame_format_help'), dest = 'temp_frame_format', default = 'jpg', choices = facefusion.choices.temp_frame_formats)
group_frame_extraction.add_argument('--temp-frame-quality', help = wording.get('temp_frame_quality_help'), dest = 'temp_frame_quality', type = int, default = 100, choices = range(101), metavar = '[0-100]')
group_frame_extraction.add_argument('--keep-temp', help = wording.get('keep_temp_help'), dest = 'keep_temp', action = 'store_true')
# output creation
group_output = program.add_argument_group('output creation')
group_output.add_argument('--output-image-quality', help=wording.get('output_image_quality_help'), dest = 'output_image_quality', type = int, default = 80, choices = range(101), metavar = '[0-100]')
group_output.add_argument('--output-video-encoder', help = wording.get('output_video_encoder_help'), dest = 'output_video_encoder', default = 'libx264', choices = facefusion.choices.output_video_encoders)
group_output.add_argument('--output-video-quality', help = wording.get('output_video_quality_help'), dest = 'output_video_quality', type = int, default = 80, choices = range(101), metavar = '[0-100]')
group_output.add_argument('--keep-fps', help = wording.get('keep_fps_help'), dest = 'keep_fps', action = 'store_true')
group_output.add_argument('--skip-audio', help = wording.get('skip_audio_help'), dest = 'skip_audio', action = 'store_true')
group_output_creation = program.add_argument_group('output creation')
group_output_creation.add_argument('--output-image-quality', help=wording.get('output_image_quality_help'), dest = 'output_image_quality', type = int, default = 80, choices = range(101), metavar = '[0-100]')
group_output_creation.add_argument('--output-video-encoder', help = wording.get('output_video_encoder_help'), dest = 'output_video_encoder', default = 'libx264', choices = facefusion.choices.output_video_encoders)
group_output_creation.add_argument('--output-video-quality', help = wording.get('output_video_quality_help'), dest = 'output_video_quality', type = int, default = 80, choices = range(101), metavar = '[0-100]')
group_output_creation.add_argument('--keep-fps', help = wording.get('keep_fps_help'), dest = 'keep_fps', action = 'store_true')
group_output_creation.add_argument('--skip-audio', help = wording.get('skip_audio_help'), dest = 'skip_audio', action = 'store_true')
# frame processors
available_frame_processors = list_module_names('facefusion/processors/frame/modules')
program = ArgumentParser(parents = [ program ], formatter_class = program.formatter_class, add_help = True)
@@ -124,7 +124,7 @@ def apply_args(program : ArgumentParser) -> None:
def run(program : ArgumentParser) -> None:
apply_args(program)
limit_resources()
if not pre_check() or not predictor.pre_check():
if not pre_check() or not predictor.pre_check() or not face_analyser.pre_check():
return
for frame_processor_module in get_frame_processors_modules(facefusion.globals.frame_processors):
if not frame_processor_module.pre_check():


@@ -1,14 +1,37 @@
from typing import Any, Optional, List
from typing import Any, Optional, List, Dict, Tuple
import threading
import insightface
import cv2
import numpy
import onnxruntime
import facefusion.globals
from facefusion.face_cache import get_faces_cache, set_faces_cache
from facefusion.typing import Frame, Face, FaceAnalyserDirection, FaceAnalyserAge, FaceAnalyserGender
from facefusion.face_helper import warp_face
from facefusion.typing import Frame, Face, FaceAnalyserDirection, FaceAnalyserAge, FaceAnalyserGender, ModelValue, Kps, Embedding
from facefusion.utilities import resolve_relative_path, conditional_download
from facefusion.vision import resize_frame_dimension
FACE_ANALYSER = None
THREAD_SEMAPHORE : threading.Semaphore = threading.Semaphore()
THREAD_LOCK : threading.Lock = threading.Lock()
MODELS : Dict[str, ModelValue] =\
{
'face_recognition_arcface':
{
'url': 'https://huggingface.co/bluefoxcreation/insightface-retinaface-arcface-model/resolve/main/w600k_r50.onnx',
'path': resolve_relative_path('../.assets/models/w600k_r50.onnx')
},
'face_detection_yunet':
{
'url': 'https://github.com/opencv/opencv_zoo/raw/main/models/face_detection_yunet/face_detection_yunet_2023mar.onnx',
'path': resolve_relative_path('../.assets/models/face_detection_yunet_2023mar.onnx')
},
'gender_age':
{
'url': 'https://huggingface.co/facefusion/buffalo_l/resolve/main/genderage.onnx',
'path': resolve_relative_path('../.assets/models/genderage.onnx')
}
}
def get_face_analyser() -> Any:
@@ -16,8 +39,12 @@ def get_face_analyser() -> Any:
with THREAD_LOCK:
if FACE_ANALYSER is None:
FACE_ANALYSER = insightface.app.FaceAnalysis(name = 'buffalo_l', providers = facefusion.globals.execution_providers)
FACE_ANALYSER.prepare(ctx_id = 0)
FACE_ANALYSER =\
{
'face_detector': cv2.FaceDetectorYN.create(MODELS.get('face_detection_yunet').get('path'), None, (0, 0)),
'face_recognition': onnxruntime.InferenceSession(MODELS.get('face_recognition_arcface').get('path'), providers = facefusion.globals.execution_providers),
'gender_age': onnxruntime.InferenceSession(MODELS.get('gender_age').get('path'), providers = facefusion.globals.execution_providers),
}
return FACE_ANALYSER
@@ -27,6 +54,80 @@ def clear_face_analyser() -> Any:
FACE_ANALYSER = None
def pre_check() -> bool:
if not facefusion.globals.skip_download:
download_directory_path = resolve_relative_path('../.assets/models')
model_urls = [ MODELS.get('face_recognition_arcface').get('url'), MODELS.get('face_detection_yunet').get('url'), MODELS.get('gender_age').get('url') ]
conditional_download(download_directory_path, model_urls)
return True
def extract_faces(frame : Frame) -> List[Face]:
face_detector = get_face_analyser().get('face_detector')
faces: List[Face] = []
temp_frame = resize_frame_dimension(frame, 640, 640)
temp_frame_height, temp_frame_width, _ = temp_frame.shape
frame_height, frame_width, _ = frame.shape
ratio_height = frame_height / temp_frame_height
ratio_width = frame_width / temp_frame_width
face_detector.setScoreThreshold(0.5)
face_detector.setTopK(100)
face_detector.setInputSize((temp_frame_width, temp_frame_height))
with THREAD_SEMAPHORE:
_, detections = face_detector.detect(temp_frame)
if detections.any():
for detection in detections:
bbox =\
[
detection[0:4][0] * ratio_width,
detection[0:4][1] * ratio_height,
(detection[0:4][0] + detection[0:4][2]) * ratio_width,
(detection[0:4][1] + detection[0:4][3]) * ratio_height
]
kps = (detection[4:14].reshape((5, 2)) * [[ ratio_width, ratio_height ]]).astype(int)
score = detection[14]
embedding = calc_embedding(frame, kps)
normed_embedding = embedding / numpy.linalg.norm(embedding)
gender, age = detect_gender_age(frame, kps)
faces.append(Face(
bbox = bbox,
kps = kps,
score = score,
embedding = embedding,
normed_embedding = normed_embedding,
gender = gender,
age = age
))
return faces
def calc_embedding(temp_frame : Frame, kps : Kps) -> Embedding:
face_recognition = get_face_analyser().get('face_recognition')
crop_frame, matrix = warp_face(temp_frame, kps, 'arcface', (112, 112))
crop_frame = crop_frame.astype(numpy.float32) / 127.5 - 1
crop_frame = crop_frame[:, :, ::-1].transpose(2, 0, 1)
crop_frame = numpy.expand_dims(crop_frame, axis = 0)
embedding = face_recognition.run(None,
{
face_recognition.get_inputs()[0].name: crop_frame
})[0]
embedding = embedding.ravel()
return embedding
def detect_gender_age(frame : Frame, kps : Kps) -> Tuple[int, int]:
gender_age = get_face_analyser().get('gender_age')
crop_frame, affine_matrix = warp_face(frame, kps, 'arcface', (96, 96))
crop_frame = numpy.expand_dims(crop_frame, axis = 0).transpose(0, 3, 1, 2).astype(numpy.float32)
prediction = gender_age.run(None,
{
gender_age.get_inputs()[0].name: crop_frame
})[0][0]
gender = int(numpy.argmax(prediction[:2]))
age = int(numpy.round(prediction[2] * 100))
return gender, age
def get_one_face(frame : Frame, position : int = 0) -> Optional[Face]:
many_faces = get_many_faces(frame)
if many_faces:
@@ -43,7 +144,7 @@ def get_many_faces(frame : Frame) -> List[Face]:
if faces_cache:
faces = faces_cache
else:
faces = get_face_analyser().get(frame)
faces = extract_faces(frame)
set_faces_cache(frame, faces)
if facefusion.globals.face_analyser_direction:
faces = sort_by_direction(faces, facefusion.globals.face_analyser_direction)
@@ -62,7 +163,7 @@ def find_similar_faces(frame : Frame, reference_face : Face, face_distance : flo
if many_faces:
for face in many_faces:
if hasattr(face, 'normed_embedding') and hasattr(reference_face, 'normed_embedding'):
current_face_distance = numpy.sum(numpy.square(face.normed_embedding - reference_face.normed_embedding))
current_face_distance = 1 - numpy.dot(face.normed_embedding, reference_face.normed_embedding)
if current_face_distance < face_distance:
similar_faces.append(face)
return similar_faces
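A quick sketch of what the new matching metric means in practice. For unit-length embeddings, the previous squared L2 distance equals twice the cosine distance used here, which is why the default --reference-face-distance drops from 1.5 to 0.6 (slightly stricter than a straight conversion). The 512-dimensional embeddings below are random placeholders.

import numpy

def normalize(embedding):
    return embedding / numpy.linalg.norm(embedding)

face_embedding = normalize(numpy.random.rand(512).astype(numpy.float32))
reference_embedding = normalize(numpy.random.rand(512).astype(numpy.float32))

old_distance = numpy.sum(numpy.square(face_embedding - reference_embedding))  # previous metric, default threshold 1.5
new_distance = 1 - numpy.dot(face_embedding, reference_embedding)             # cosine distance, default threshold 0.6

assert numpy.isclose(old_distance, 2 * new_distance)  # holds for normalized embeddings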


@@ -154,7 +154,7 @@ def swap_face(source_face : Face, target_face : Face, temp_frame : Frame) -> Fra
if frame_processor_input.name == 'source':
frame_processor_inputs[frame_processor_input.name] = source_face
if frame_processor_input.name == 'target':
frame_processor_inputs[frame_processor_input.name] = crop_frame
frame_processor_inputs[frame_processor_input.name] = crop_frame # type: ignore[assignment]
crop_frame = frame_processor.run(None, frame_processor_inputs)[0][0]
crop_frame = normalize_crop_frame(crop_frame)
temp_frame = paste_back(temp_frame, crop_frame, affine_matrix)


@@ -137,7 +137,8 @@ def enhance_frame(temp_frame : Frame) -> Frame:
def blend_frame(temp_frame : Frame, paste_frame : Frame) -> Frame:
frame_enhancer_blend = 1 - (frame_processors_globals.frame_enhancer_blend / 100)
temp_frame = cv2.resize(temp_frame, (paste_frame.shape[1], paste_frame.shape[0]))
paste_frame_height, paste_frame_width = paste_frame.shape[0:2]
temp_frame = cv2.resize(temp_frame, (paste_frame_width, paste_frame_height))
temp_frame = cv2.addWeighted(temp_frame, frame_enhancer_blend, paste_frame, 1 - frame_enhancer_blend, 0)
return temp_frame
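The resize change above only names the paste frame dimensions explicitly; the blend itself stays a weighted average. A short worked example of the weights, assuming temp_frame is the untouched frame, paste_frame the enhanced output and the blend setting is 80:

frame_enhancer_blend = 1 - (80 / 100)             # 0.2
temp_frame_weight = frame_enhancer_blend          # 0.2 -> original frame
paste_frame_weight = 1 - frame_enhancer_blend     # 0.8 -> enhanced frame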


@@ -1,9 +1,11 @@
from collections import namedtuple
from typing import Any, Literal, Callable, List, TypedDict, Dict
from insightface.app.common import Face
import numpy
Bbox = numpy.ndarray[Any, Any]
Kps = numpy.ndarray[Any, Any]
Face = Face
Embedding = numpy.ndarray[Any, Any]
Face = namedtuple('Face', [ 'bbox', 'kps', 'score', 'embedding', 'normed_embedding', 'gender', 'age' ])
Frame = numpy.ndarray[Any, Any]
Matrix = numpy.ndarray[Any, Any]


@@ -5,7 +5,7 @@ import gradio
import facefusion.choices
import facefusion.globals
from facefusion import wording
from facefusion.vision import get_video_frame, normalize_frame_color, read_static_image
from facefusion.vision import get_video_frame, read_static_image, normalize_frame_color
from facefusion.face_analyser import get_many_faces
from facefusion.face_reference import clear_face_reference
from facefusion.typing import Frame, FaceRecognition
@@ -47,9 +47,9 @@ def render() -> None:
REFERENCE_FACE_DISTANCE_SLIDER = gradio.Slider(
label = wording.get('reference_face_distance_slider_label'),
value = facefusion.globals.reference_face_distance,
step = 0.05,
step = 0.025,
minimum = 0,
maximum = 3,
maximum = 1.5,
visible = 'reference' in facefusion.globals.face_recognition
)
register_ui_component('face_recognition_dropdown', FACE_RECOGNITION_DROPDOWN)
@@ -134,3 +134,4 @@ def extract_gallery_frames(reference_frame : Frame) -> List[Frame]:
crop_frame = normalize_frame_color(crop_frame)
crop_frames.append(crop_frame)
return crop_frames


@@ -1,6 +1,5 @@
basicsr==1.4.2
gradio==3.47.1
insightface==0.7.3
numpy==1.24.3
onnx==1.14.1
onnxruntime==1.16.0