* chore: fix v2v lip normalization and add smooth to non relative motion

---------

Co-authored-by: zhangdingyun <zhangdingyun@kuaishou.com>
Jianzhu Guo 2024-07-25 17:56:40 +08:00 committed by GitHub
parent d654a014da
commit 172b852127
2 changed files with 28 additions and 12 deletions


@@ -26,8 +26,8 @@ class ArgumentConfig(PrintableConfig):
     flag_normalize_lip: bool = True # whether to let the lip to close state before animation, only take effect when flag_eye_retargeting and flag_lip_retargeting is False
     flag_source_video_eye_retargeting: bool = False # when the input is a source video, whether to let the eye-open scalar of each frame to be the same as the first source frame before the animation, only take effect when flag_eye_retargeting and flag_lip_retargeting is False, may cause the inter-frame jittering
     flag_video_editing_head_rotation: bool = False # when the input is a source video, whether to inherit the relative head rotation from the driving video
-    flag_eye_retargeting: bool = False # not recommend to be True, WIP
-    flag_lip_retargeting: bool = False # not recommend to be True, WIP
+    flag_eye_retargeting: bool = False # not recommend to be True, WIP; whether to transfer the eyes-open ratio of each driving frame to the source image or the corresponding source frame
+    flag_lip_retargeting: bool = False # not recommend to be True, WIP; whether to transfer the lip-open ratio of each driving frame to the source image or the corresponding source frame
     flag_stitching: bool = True # recommend to True if head movement is small, False if head movement is large
     flag_relative_motion: bool = True # whether to use relative motion
     flag_pasteback: bool = True # whether to paste-back/stitch the animated face cropping from the face-cropping space to the original image space
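
Note: a minimal sketch of how these flags interact after this commit, using a hypothetical stand-in dataclass (MotionFlags is illustrative, not the project's ArgumentConfig):

from dataclasses import dataclass

@dataclass
class MotionFlags:
    # Mirrors the defaults shown in the hunk above.
    flag_normalize_lip: bool = True
    flag_relative_motion: bool = True
    flag_eye_retargeting: bool = False  # WIP, not recommended to enable
    flag_lip_retargeting: bool = False  # WIP, not recommended to enable

flags = MotionFlags()
# Per the docstrings above and the pipeline hunks below, lip normalization
# only takes effect when both retargeting flags are off and motion is relative.
normalize_lip_active = (
    flags.flag_normalize_lip
    and flags.flag_relative_motion
    and not flags.flag_eye_retargeting
    and not flags.flag_lip_retargeting
)
print(normalize_lip_active)  # True with the defaults in this diff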


@@ -193,12 +193,20 @@ class LivePortraitPipeline(object):
             I_s_lst = self.live_portrait_wrapper.prepare_videos(img_crop_256x256_lst)
             source_template_dct = self.make_motion_template(I_s_lst, c_s_eyes_lst, c_s_lip_lst, output_fps=source_fps)

-            x_d_exp_lst = [source_template_dct['motion'][i]['exp'] + driving_template_dct['motion'][i]['exp'] - driving_template_dct['motion'][0]['exp'] for i in range(n_frames)]
-            x_d_exp_lst_smooth = smooth(x_d_exp_lst, source_template_dct['motion'][0]['exp'].shape, device, inf_cfg.driving_smooth_observation_variance)
-            if inf_cfg.flag_video_editing_head_rotation:
-                key_r = 'R' if 'R' in driving_template_dct['motion'][0].keys() else 'R_d' # compatible with previous keys
-                x_d_r_lst = [(np.dot(driving_template_dct['motion'][i][key_r], driving_template_dct['motion'][0][key_r].transpose(0, 2, 1))) @ source_template_dct['motion'][i]['R'] for i in range(n_frames)]
-                x_d_r_lst_smooth = smooth(x_d_r_lst, source_template_dct['motion'][0]['R'].shape, device, inf_cfg.driving_smooth_observation_variance)
+            key_r = 'R' if 'R' in driving_template_dct['motion'][0].keys() else 'R_d' # compatible with previous keys
+            if inf_cfg.flag_relative_motion:
+                x_d_exp_lst = [source_template_dct['motion'][i]['exp'] + driving_template_dct['motion'][i]['exp'] - driving_template_dct['motion'][0]['exp'] for i in range(n_frames)]
+                x_d_exp_lst_smooth = smooth(x_d_exp_lst, source_template_dct['motion'][0]['exp'].shape, device, inf_cfg.driving_smooth_observation_variance)
+                if inf_cfg.flag_video_editing_head_rotation:
+                    x_d_r_lst = [(np.dot(driving_template_dct['motion'][i][key_r], driving_template_dct['motion'][0][key_r].transpose(0, 2, 1))) @ source_template_dct['motion'][i]['R'] for i in range(n_frames)]
+                    x_d_r_lst_smooth = smooth(x_d_r_lst, source_template_dct['motion'][0]['R'].shape, device, inf_cfg.driving_smooth_observation_variance)
+            else:
+                x_d_exp_lst = [driving_template_dct['motion'][i]['exp'] for i in range(n_frames)]
+                x_d_exp_lst_smooth = smooth(x_d_exp_lst, source_template_dct['motion'][0]['exp'].shape, device, inf_cfg.driving_smooth_observation_variance)
+                if inf_cfg.flag_video_editing_head_rotation:
+                    x_d_r_lst = [driving_template_dct['motion'][i][key_r] for i in range(n_frames)]
+                    x_d_r_lst_smooth = smooth(x_d_r_lst, source_template_dct['motion'][0]['R'].shape, device, inf_cfg.driving_smooth_observation_variance)
+
         else:  # if the input is a source image, process it only once
             if inf_cfg.flag_do_crop:
                 crop_info = self.cropper.crop_source_image(source_rgb_lst[0], crop_cfg)
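
Note: this hunk hoists key_r out of the head-rotation branch and, the point of the commit, applies smooth() to the non-relative expression and rotation lists as well. A toy sketch of the two expression modes, with illustrative names and expression arrays shaped like the pipeline's (1, 21, 3):

import numpy as np

def driving_exp_list(src_exp, drv_exp, relative):
    # Relative: add the driving delta (frame i minus frame 0) onto each
    # source frame's expression. Non-relative: use the driving expression as-is.
    n = len(drv_exp)
    if relative:
        return [src_exp[i] + drv_exp[i] - drv_exp[0] for i in range(n)]
    return [drv_exp[i] for i in range(n)]

rng = np.random.default_rng(0)
src = [rng.normal(size=(1, 21, 3)) for _ in range(3)]
drv = [rng.normal(size=(1, 21, 3)) for _ in range(3)]

rel = driving_exp_list(src, drv, relative=True)
assert np.allclose(rel[0], src[0])  # at frame 0 the driving delta is zero
# In both modes the pipeline then smooths the list over time; before this
# commit only the relative branch was smoothed.
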
@@ -217,7 +225,7 @@ class LivePortraitPipeline(object):
             x_s = self.live_portrait_wrapper.transform_keypoint(x_s_info)

             # let lip-open scalar to be 0 at first
-            if flag_normalize_lip and source_lmk is not None:
+            if flag_normalize_lip and inf_cfg.flag_relative_motion and source_lmk is not None:
                 c_d_lip_before_animation = [0.]
                 combined_lip_ratio_tensor_before_animation = self.live_portrait_wrapper.calc_combined_lip_ratio(c_d_lip_before_animation, source_lmk)
                 if combined_lip_ratio_tensor_before_animation[0][0] >= inf_cfg.lip_normalize_threshold:
@@ -244,11 +252,13 @@ class LivePortraitPipeline(object):
                 x_s = self.live_portrait_wrapper.transform_keypoint(x_s_info)

                 # let lip-open scalar to be 0 at first if the input is a video
-                if flag_normalize_lip and source_lmk is not None:
+                if flag_normalize_lip and inf_cfg.flag_relative_motion and source_lmk is not None:
                     c_d_lip_before_animation = [0.]
                     combined_lip_ratio_tensor_before_animation = self.live_portrait_wrapper.calc_combined_lip_ratio(c_d_lip_before_animation, source_lmk)
                     if combined_lip_ratio_tensor_before_animation[0][0] >= inf_cfg.lip_normalize_threshold:
                         lip_delta_before_animation = self.live_portrait_wrapper.retarget_lip(x_s, combined_lip_ratio_tensor_before_animation)
+                    else:
+                        lip_delta_before_animation = None

                 # let eye-open scalar to be the same as the first frame if the latter is eye-open state
                 if flag_source_video_eye_retargeting and source_lmk is not None:
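
Note: both lip-normalization hunks add the same gate, and the video path now leaves lip_delta_before_animation explicitly None when the threshold check fails. A tiny sketch of the gate with illustrative names:

def should_normalize_lip(flag_normalize_lip, flag_relative_motion, source_lmk):
    # Mirrors the new condition: normalization is skipped entirely in
    # non-relative mode or when no source landmarks are available.
    return bool(flag_normalize_lip and flag_relative_motion and source_lmk is not None)

print(should_normalize_lip(True, True, [[0.0, 0.0]]))   # True: close the lips first
print(should_normalize_lip(True, False, [[0.0, 0.0]]))  # False: absolute motion, skip
print(should_normalize_lip(True, True, None))           # False: no landmarks
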
@@ -284,8 +294,14 @@ class LivePortraitPipeline(object):
                     scale_new = x_s_info['scale'] if flag_is_source_video else x_s_info['scale'] * (x_d_i_info['scale'] / x_d_0_info['scale'])
                     t_new = x_s_info['t'] if flag_is_source_video else x_s_info['t'] + (x_d_i_info['t'] - x_d_0_info['t'])
                 else:
-                    R_new = R_d_i
-                    delta_new = x_d_i_info['exp']
+                    if flag_is_source_video:
+                        if inf_cfg.flag_video_editing_head_rotation:
+                            R_new = x_d_r_lst_smooth[i]
+                        else:
+                            R_new = R_s
+                    else:
+                        R_new = R_d_i
+                    delta_new = x_d_exp_lst_smooth[i] if flag_is_source_video else x_d_i_info['exp']
                     scale_new = x_s_info['scale']
                     t_new = x_d_i_info['t']
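
Note: the final hunk replaces the unconditional R_new = R_d_i with a source-type-aware choice that consumes the smoothed lists built earlier. A compact sketch of the selection, with stand-in names:

import numpy as np

def choose_new_rotation(is_source_video, edit_head_rotation, R_smooth_i, R_s, R_d_i):
    # Video source: use the smoothed editing rotation when head-rotation
    # editing is on, otherwise keep the source frame's own rotation.
    # Image source: unchanged behavior, use the driving frame's rotation.
    if is_source_video:
        return R_smooth_i if edit_head_rotation else R_s
    return R_d_i

R_s = np.eye(3)
R_d_i = np.diag([1.0, -1.0, -1.0])  # toy rotation: 180 degrees about x
assert choose_new_rotation(True, False, None, R_s, R_d_i) is R_s
assert choose_new_rotation(False, False, None, R_s, R_d_i) is R_d_i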