import uuid
import tqdm
import numpy as np
import cv2
import sys
import os
import math
from talkingface.data.few_shot_dataset import get_image
import torch
import mediapipe as mp
from talkingface.utils import crop_mouth, main_keypoints_index, smooth_array
import json
from mini_live.obj.wrap_utils import index_wrap, index_edge_wrap

mp_face_mesh = mp.solutions.face_mesh
mp_face_detection = mp.solutions.face_detection


def detect_face(frame):
    # Reject frames with multiple faces, strongly turned faces (nose not between the eyes),
    # face boxes partially outside the frame, or faces that are too small.
    with mp_face_detection.FaceDetection(
            model_selection=1, min_detection_confidence=0.5) as face_detection:
        results = face_detection.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        if not results.detections or len(results.detections) > 1:
            return -1, None
        rect = results.detections[0].location_data.relative_bounding_box
        out_rect = [rect.xmin, rect.xmin + rect.width, rect.ymin, rect.ymin + rect.height]
        nose_ = mp_face_detection.get_key_point(
            results.detections[0], mp_face_detection.FaceKeyPoint.NOSE_TIP)
        l_eye_ = mp_face_detection.get_key_point(
            results.detections[0], mp_face_detection.FaceKeyPoint.LEFT_EYE)
        r_eye_ = mp_face_detection.get_key_point(
            results.detections[0], mp_face_detection.FaceKeyPoint.RIGHT_EYE)
        # print(nose_, l_eye_, r_eye_)
        if nose_.x > l_eye_.x or nose_.x < r_eye_.x:
            return -2, out_rect
        h, w = frame.shape[:2]
        # print(frame.shape)
        # The bounding box is in relative coordinates, so compare against 1, not the pixel size.
        if rect.xmin < 0 or rect.ymin < 0 or rect.xmin + rect.width > 1 or rect.ymin + rect.height > 1:
            return -3, out_rect
        if rect.width * w < 100 or rect.height * h < 100:
            return -4, out_rect
        return 1, out_rect


def calc_face_interact(face0, face1):
    # Ratio of each face box to the union box of the two; the smaller ratio measures
    # how much the face region moved between consecutive frames.
    x_min = min(face0[0], face1[0])
    x_max = max(face0[1], face1[1])
    y_min = min(face0[2], face1[2])
    y_max = max(face0[3], face1[3])
    tmp0 = ((face0[1] - face0[0]) * (face0[3] - face0[2])) / ((x_max - x_min) * (y_max - y_min))
    tmp1 = ((face1[1] - face1[0]) * (face1[3] - face1[2])) / ((x_max - x_min) * (y_max - y_min))
    return min(tmp0, tmp1)


def detect_face_mesh(frame):
    with mp_face_mesh.FaceMesh(
            static_image_mode=True,
            max_num_faces=1,
            refine_landmarks=True,
            min_detection_confidence=0.5) as face_mesh:
        results = face_mesh.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        pts_3d = np.zeros([478, 3])
        if not results.multi_face_landmarks:
            print("****** WARNING! No face detected! ******")
        else:
            image_height, image_width = frame.shape[:2]
            for face_landmarks in results.multi_face_landmarks:
                for index_, i in enumerate(face_landmarks.landmark):
                    x_px = min(math.floor(i.x * image_width), image_width - 1)
                    y_px = min(math.floor(i.y * image_height), image_height - 1)
                    z_px = min(math.floor(i.z * image_width), image_width - 1)
                    pts_3d[index_] = np.array([x_px, y_px, z_px])
        return pts_3d


def ExtractFromVideo(video_path):
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        return 0
    dir_path = os.path.dirname(video_path)
    vid_width = cap.get(cv2.CAP_PROP_FRAME_WIDTH)    # frame width
    vid_height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)  # frame height
    totalFrames = cap.get(cv2.CAP_PROP_FRAME_COUNT)  # total frame count
    totalFrames = int(totalFrames)
    pts_3d = np.zeros([totalFrames, 478, 3])
    frame_index = 0
    face_rect_list = []
    ms_list = []
    model_name = os.path.basename(video_path)[:-4]
    # os.makedirs("../preparation/{}/image".format(model_name))
    for frame_index in tqdm.tqdm(range(totalFrames)):
        ret, frame = cap.read()  # read the video frame by frame
        ms = cap.get(cv2.CAP_PROP_POS_MSEC)
        ms_list.append(ms)
        # stop when the end of the video is reached
        if ret is False:
            break
        # cv2.imwrite("../preparation/{}/image/{:0>6d}.png".format(model_name, frame_index), frame)
        tag_, rect = detect_face(frame)
        if frame_index == 0 and tag_ != 1:
            print("Face detection failed on the first frame. Make sure there is exactly one face, "
                  "it is roughly frontal (nose between the eyes), the face box is fully inside the "
                  "frame, and the face is not too small.")
            pts_3d = -1
            break
        elif tag_ == -1:
            # Detection occasionally fails; reuse the previous frame's rect for this frame.
            rect = face_rect_list[-1]
        elif tag_ != 1:
            print("Face detection abnormal on frame {}. Make sure there is exactly one face, it is "
                  "roughly frontal (nose between the eyes), the face box is fully inside the frame, "
                  "and the face is not too small. tag: {}".format(frame_index, tag_))
            # exit()
        if len(face_rect_list) > 0:
            face_area_inter = calc_face_interact(face_rect_list[-1], rect)
            # print(frame_index, face_area_inter)
            if face_area_inter < 0.6:
                print("Face region changed too much between frames; please re-check the video. "
                      "overlap ratio: {}, frame_num: {}".format(face_area_inter, frame_index))
                pts_3d = -2
                break

        face_rect_list.append(rect)

        x_min = rect[0] * vid_width
        y_min = rect[2] * vid_height
        x_max = rect[1] * vid_width
        y_max = rect[3] * vid_height
        seq_w, seq_h = x_max - x_min, y_max - y_min
        x_mid, y_mid = (x_min + x_max) / 2, (y_min + y_max) / 2
        # x_min = int(max(0, x_mid - seq_w * 0.65))
        # y_min = int(max(0, y_mid - seq_h * 0.4))
        # x_max = int(min(vid_width, x_mid + seq_w * 0.65))
        # y_max = int(min(vid_height, y_mid + seq_h * 0.8))
        # Enlarge the detected box to a square crop around the face center.
        crop_size = int(max(seq_w * 1.35, seq_h * 1.35))
        x_min = int(max(0, x_mid - crop_size * 0.5))
        y_min = int(max(0, y_mid - crop_size * 0.45))
        x_max = int(min(vid_width, x_min + crop_size))
        y_max = int(min(vid_height, y_min + crop_size))
        frame_face = frame[y_min:y_max, x_min:x_max]
        print(y_min, y_max, x_min, x_max)
        # cv2.imshow("s", frame_face)
        # cv2.waitKey(20)
        frame_kps = detect_face_mesh(frame_face)
        # Store the landmarks in full-frame coordinates.
        pts_3d[frame_index] = frame_kps + np.array([x_min, y_min, 0])
    cap.release()  # release the video capture
    return pts_3d, ms_list


def step0_keypoints(video_in_path, out_path):
    video_out_path = os.path.join(out_path, "01.mp4")
    # Step 1: convert the video to 25 FPS and strip the audio track.
    ffmpeg_cmd = "ffmpeg -i {} -r 25 -an -y {}".format(video_in_path, video_out_path)
    os.system(ffmpeg_cmd)

    # front_video_path = video_in_path
    cap = cv2.VideoCapture(video_out_path)
    vid_width = cap.get(cv2.CAP_PROP_FRAME_WIDTH)    # frame width
    vid_height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)  # frame height
    frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    cap.release()
    print("Frame count of the converted video:", frames)

    pts_3d, _ = ExtractFromVideo(video_out_path)
    if type(pts_3d) is np.ndarray and len(pts_3d) == frames:
        print("Keypoints extracted.")
    else:
        # Guard against failed extraction (ExtractFromVideo returns an int error code).
        raise ValueError("Keypoint extraction failed; please check the video and retry.")
    # Temporally smooth the landmarks with a 7-tap weighted average.
    pts_3d = pts_3d.reshape(len(pts_3d), -1)
    smooth_array_ = smooth_array(pts_3d, weight=[0.03, 0.06, 0.11, 0.6, 0.11, 0.06, 0.03])
    pts_3d = smooth_array_.reshape(len(pts_3d), 478, 3)
    return pts_3d, vid_width, vid_height
def step1_crop_mouth(pts_3d, vid_width, vid_height):
    # Crop rectangle around the mouth region for every frame.
    list_source_crop_rect = [crop_mouth(source_pts[main_keypoints_index], vid_width, vid_height)
                             for source_pts in pts_3d]
    list_source_crop_rect = np.array(list_source_crop_rect).reshape(len(pts_3d), -1)
    face_size = (list_source_crop_rect[:, 2] - list_source_crop_rect[:, 0]).mean() / 2.0 + \
                (list_source_crop_rect[:, 3] - list_source_crop_rect[:, 1]).mean() / 2.0
    face_size = int(face_size) // 2 * 2
    face_mid = (list_source_crop_rect[:, 2:] + list_source_crop_rect[:, 0:2]) / 2.

    # Step 1: smooth the crop-rectangle transition. HTML video playback can have inconsistent
    # frame rates and may not align precisely from frame to frame, so the cropping rectangle is
    # smoothed over time to compensate for potential misalignment.
    face_mid = smooth_array(face_mid, weight=[0.20, 0.20, 0.20, 0.20, 0.20])
    face_mid = face_mid.astype(int)
    if face_mid[:, 0].max() + face_size / 2 > vid_width or face_mid[:, 1].max() + face_size / 2 > vid_height:
        raise ValueError("The face region extends beyond the video frame; please provide a valid video and retry.")
    list_source_crop_rect = np.concatenate([face_mid - face_size // 2, face_mid + face_size // 2], axis=1)

    standard_size = 128
    list_standard_v = []
    for frame_index in range(len(list_source_crop_rect)):
        source_pts = pts_3d[frame_index]
        source_crop_rect = list_source_crop_rect[frame_index]
        print(source_crop_rect)
        # Normalize the landmarks into the fixed-size crop.
        standard_v = get_image(source_pts, source_crop_rect, input_type="mediapipe", resize=standard_size)
        list_standard_v.append(standard_v)
    return list_source_crop_rect, list_standard_v


def step2_generate_obj(list_source_crop_rect, list_standard_v, out_path):
    from mini_live.obj.obj_utils import generateRenderInfo, generateWrapModel
    render_verts, render_face = generateRenderInfo()
    face_pts_mean = render_verts[:478, :3].copy()
    wrapModel_verts, wrapModel_face = generateWrapModel()

    # Compute the per-frame transform matrices and the personalized mean face.
    from talkingface.run_utils import calc_face_mat
    mat_list, _, face_pts_mean_personal_primer = calc_face_mat(np.array(list_standard_v), face_pts_mean)

    from mini_live.obj.wrap_utils import newWrapModel
    face_wrap_entity = newWrapModel(wrapModel_verts, face_pts_mean_personal_primer)

    # Write the wrapped face mesh as an OBJ file (each vertex carries two extra attributes).
    with open(os.path.join(out_path, "face3D.obj"), "w") as f:
        for i in face_wrap_entity:
            f.write("v {:.3f} {:.3f} {:.3f} {:.02f} {:.0f}\n".format(i[0], i[1], i[2], i[3], i[4]))
        for i in range(len(wrapModel_face) // 3):
            f.write("f {0} {1} {2}\n".format(wrapModel_face[3 * i] + 1,
                                             wrapModel_face[3 * i + 1] + 1,
                                             wrapModel_face[3 * i + 2] + 1))

    json_data = []
    for frame_index in range(len(list_source_crop_rect)):
        source_crop_rect = list_source_crop_rect[frame_index]
        standard_v = list_standard_v[frame_index]
        standard_v = standard_v[index_wrap, :2].flatten().tolist()
        mat = mat_list[frame_index].T.flatten().tolist()
        # Round the 4x4 matrix to 5 decimals and the 2D points to 1 decimal to keep the JSON small.
        standard_v_rounded = [round(i, 5) for i in mat] + [round(i, 1) for i in standard_v]
        print(len(standard_v_rounded), 16 + 209 * 2)  # sanity check: 16 matrix values + 209 2D points
        json_data.append({"rect": source_crop_rect.tolist(), "points": standard_v_rounded})
        # print(json_data)
        # break
    with open(os.path.join(out_path, "json_data.json"), "w") as f:
        json.dump(json_data, f)


def step3_generate_ref_tensor(list_source_crop_rect, list_standard_v, out_path):
    from talkingface.render_model_mini import RenderModel_Mini
    renderModel_mini = RenderModel_Mini()
    renderModel_mini.loadModel("checkpoint/DINet_mini/epoch_40.pth")

    from talkingface.data.few_shot_dataset import select_ref_index
    from talkingface.utils import draw_mouth_maps
    driven_keypoints = np.array(list_standard_v)[:, main_keypoints_index]
    # Select n_ref reference frames from the current video.
    ref_img_index_list = select_ref_index(driven_keypoints, n_ref=3, ratio=0.33)

    ref_img_list = []
    video_path = os.path.join(out_path, "01.mp4")
    cap_input = cv2.VideoCapture(video_path)
    for index in ref_img_index_list:
        cap_input.set(cv2.CAP_PROP_POS_FRAMES, index)  # seek to the reference frame
        ret, frame = cap_input.read()
        ref_face_edge = draw_mouth_maps(driven_keypoints[index], size=(128, 128))
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        ref_img = get_image(frame, list_source_crop_rect[index], input_type="image", resize=128)
        ref_face_edge = cv2.resize(ref_face_edge, (128, 128))
        ref_img = cv2.resize(ref_img, (128, 128))
        # Crop the 128x128 images to the central 72x56 mouth region and stack the edge map as a 4th channel.
        w_pad = int((128 - 72) / 2)
        h_pad = int((128 - 56) / 2)
        ref_img = np.concatenate([ref_img[h_pad:-h_pad, w_pad:-w_pad],
                                  ref_face_edge[h_pad:-h_pad, w_pad:-w_pad, :1]], axis=2)
        # cv2.imshow("ss", ref_img[:, :, ::-1])
        # cv2.waitKey(-1)
        ref_img_list.append(ref_img)
    ref_img = np.concatenate(ref_img_list, axis=2)

    ref_tensor = torch.from_numpy(ref_img / 255.).float().permute(2, 0, 1).unsqueeze(0).cuda()
    renderModel_mini.net.ref_input(ref_tensor)

    ref_in_feature = renderModel_mini.net.infer_model.ref_in_feature
    print("ref_in_feature size:", ref_in_feature.size())
    ref_in_feature = ref_in_feature.detach().squeeze(0).cpu().float().numpy().flatten()
    print("ref_in_feature shape:", ref_in_feature.shape)
    np.savetxt(os.path.join(out_path, 'ref_data.txt'), ref_in_feature, fmt='%.8f')


def main():
    # Check the number of command-line arguments.
    if len(sys.argv) != 2:
        print("Usage: python data_preparation_web.py <video_path>")
        sys.exit(1)  # exit if the argument count is wrong

    # Read the input video path.
    video_in_path = sys.argv[1]
    # video_in_path = r"E:\data\video\video/5.mp4"
    out_path = "web_demo/static/assets"
    pts_3d, vid_width, vid_height = step0_keypoints(video_in_path, out_path)
    list_source_crop_rect, list_standard_v = step1_crop_mouth(pts_3d, vid_width, vid_height)
    step2_generate_obj(list_source_crop_rect, list_standard_v, out_path)
    step3_generate_ref_tensor(list_source_crop_rect, list_standard_v, out_path)


if __name__ == "__main__":
    main()
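# --- Usage note (illustrative; output paths are the hard-coded defaults above) ---
# Typical invocation, assuming the checkpoint checkpoint/DINet_mini/epoch_40.pth
# and a CUDA device are available:
#
#     python data_preparation_web.py /path/to/input_video.mp4
#
# On success the following assets are written to web_demo/static/assets/:
#     01.mp4          the input re-encoded at 25 FPS with the audio track removed
#     face3D.obj      the personalized wrapped face mesh
#     json_data.json  per-frame crop rectangles, transform matrices and 2D landmarks
#     ref_data.txt    the flattened reference feature produced by RenderModel_Mini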