From 172b852127b1f2f6595786e4d6af8f0675dd8c25 Mon Sep 17 00:00:00 2001
From: Jianzhu Guo
Date: Thu, 25 Jul 2024 17:56:40 +0800
Subject: [PATCH] fix: v2v (#217)

* chore: fix v2v lip normalization and add smooth to non relative motion

* chore: fix v2v lip normalization and add smooth to non relative motion

* chore: fix v2v lip normalization and add smooth to non relative motion

---------

Co-authored-by: zhangdingyun
---
 src/config/argument_config.py |  4 ++--
 src/live_portrait_pipeline.py | 36 +++++++++++++++++++++++++----------
 2 files changed, 28 insertions(+), 12 deletions(-)

diff --git a/src/config/argument_config.py b/src/config/argument_config.py
index 6653f9c..5930dbe 100644
--- a/src/config/argument_config.py
+++ b/src/config/argument_config.py
@@ -26,8 +26,8 @@ class ArgumentConfig(PrintableConfig):
     flag_normalize_lip: bool = True  # whether to let the lip to close state before animation, only take effect when flag_eye_retargeting and flag_lip_retargeting is False
     flag_source_video_eye_retargeting: bool = False  # when the input is a source video, whether to let the eye-open scalar of each frame to be the same as the first source frame before the animation, only take effect when flag_eye_retargeting and flag_lip_retargeting is False, may cause the inter-frame jittering
     flag_video_editing_head_rotation: bool = False  # when the input is a source video, whether to inherit the relative head rotation from the driving video
-    flag_eye_retargeting: bool = False  # not recommend to be True, WIP
-    flag_lip_retargeting: bool = False  # not recommend to be True, WIP
+    flag_eye_retargeting: bool = False  # not recommend to be True, WIP; whether to transfer the eyes-open ratio of each driving frame to the source image or the corresponding source frame
+    flag_lip_retargeting: bool = False  # not recommend to be True, WIP; whether to transfer the lip-open ratio of each driving frame to the source image or the corresponding source frame
     flag_stitching: bool = True  # recommend to True if head movement is small, False if head movement is large
     flag_relative_motion: bool = True  # whether to use relative motion
     flag_pasteback: bool = True  # whether to paste-back/stitch the animated face cropping from the face-cropping space to the original image space
diff --git a/src/live_portrait_pipeline.py b/src/live_portrait_pipeline.py
index 9eccc8e..95c2a50 100644
--- a/src/live_portrait_pipeline.py
+++ b/src/live_portrait_pipeline.py
@@ -193,12 +193,20 @@ class LivePortraitPipeline(object):
             I_s_lst = self.live_portrait_wrapper.prepare_videos(img_crop_256x256_lst)
             source_template_dct = self.make_motion_template(I_s_lst, c_s_eyes_lst, c_s_lip_lst, output_fps=source_fps)
 
-            x_d_exp_lst = [source_template_dct['motion'][i]['exp'] + driving_template_dct['motion'][i]['exp'] - driving_template_dct['motion'][0]['exp'] for i in range(n_frames)]
-            x_d_exp_lst_smooth = smooth(x_d_exp_lst, source_template_dct['motion'][0]['exp'].shape, device, inf_cfg.driving_smooth_observation_variance)
-            if inf_cfg.flag_video_editing_head_rotation:
-                key_r = 'R' if 'R' in driving_template_dct['motion'][0].keys() else 'R_d'  # compatible with previous keys
-                x_d_r_lst = [(np.dot(driving_template_dct['motion'][i][key_r], driving_template_dct['motion'][0][key_r].transpose(0, 2, 1))) @ source_template_dct['motion'][i]['R'] for i in range(n_frames)]
-                x_d_r_lst_smooth = smooth(x_d_r_lst, source_template_dct['motion'][0]['R'].shape, device, inf_cfg.driving_smooth_observation_variance)
+            key_r = 'R' if 'R' in driving_template_dct['motion'][0].keys() else 'R_d'  # compatible with previous keys
+            if inf_cfg.flag_relative_motion:
+                x_d_exp_lst = [source_template_dct['motion'][i]['exp'] + driving_template_dct['motion'][i]['exp'] - driving_template_dct['motion'][0]['exp'] for i in range(n_frames)]
+                x_d_exp_lst_smooth = smooth(x_d_exp_lst, source_template_dct['motion'][0]['exp'].shape, device, inf_cfg.driving_smooth_observation_variance)
+                if inf_cfg.flag_video_editing_head_rotation:
+                    x_d_r_lst = [(np.dot(driving_template_dct['motion'][i][key_r], driving_template_dct['motion'][0][key_r].transpose(0, 2, 1))) @ source_template_dct['motion'][i]['R'] for i in range(n_frames)]
+                    x_d_r_lst_smooth = smooth(x_d_r_lst, source_template_dct['motion'][0]['R'].shape, device, inf_cfg.driving_smooth_observation_variance)
+            else:
+                x_d_exp_lst = [driving_template_dct['motion'][i]['exp'] for i in range(n_frames)]
+                x_d_exp_lst_smooth = smooth(x_d_exp_lst, source_template_dct['motion'][0]['exp'].shape, device, inf_cfg.driving_smooth_observation_variance)
+                if inf_cfg.flag_video_editing_head_rotation:
+                    x_d_r_lst = [driving_template_dct['motion'][i][key_r] for i in range(n_frames)]
+                    x_d_r_lst_smooth = smooth(x_d_r_lst, source_template_dct['motion'][0]['R'].shape, device, inf_cfg.driving_smooth_observation_variance)
+
         else:  # if the input is a source image, process it only once
             if inf_cfg.flag_do_crop:
                 crop_info = self.cropper.crop_source_image(source_rgb_lst[0], crop_cfg)
@@ -217,7 +225,7 @@ class LivePortraitPipeline(object):
             x_s = self.live_portrait_wrapper.transform_keypoint(x_s_info)

             # let lip-open scalar to be 0 at first
-            if flag_normalize_lip and source_lmk is not None:
+            if flag_normalize_lip and inf_cfg.flag_relative_motion and source_lmk is not None:
                 c_d_lip_before_animation = [0.]
                 combined_lip_ratio_tensor_before_animation = self.live_portrait_wrapper.calc_combined_lip_ratio(c_d_lip_before_animation, source_lmk)
                 if combined_lip_ratio_tensor_before_animation[0][0] >= inf_cfg.lip_normalize_threshold:
@@ -244,11 +252,13 @@ class LivePortraitPipeline(object):
                 x_s = self.live_portrait_wrapper.transform_keypoint(x_s_info)

                 # let lip-open scalar to be 0 at first if the input is a video
-                if flag_normalize_lip and source_lmk is not None:
+                if flag_normalize_lip and inf_cfg.flag_relative_motion and source_lmk is not None:
                     c_d_lip_before_animation = [0.]
                     combined_lip_ratio_tensor_before_animation = self.live_portrait_wrapper.calc_combined_lip_ratio(c_d_lip_before_animation, source_lmk)
                     if combined_lip_ratio_tensor_before_animation[0][0] >= inf_cfg.lip_normalize_threshold:
                         lip_delta_before_animation = self.live_portrait_wrapper.retarget_lip(x_s, combined_lip_ratio_tensor_before_animation)
+                    else:
+                        lip_delta_before_animation = None

                 # let eye-open scalar to be the same as the first frame if the latter is eye-open state
                 if flag_source_video_eye_retargeting and source_lmk is not None:
@@ -284,8 +294,14 @@ class LivePortraitPipeline(object):
                 scale_new = x_s_info['scale'] if flag_is_source_video else x_s_info['scale'] * (x_d_i_info['scale'] / x_d_0_info['scale'])
                 t_new = x_s_info['t'] if flag_is_source_video else x_s_info['t'] + (x_d_i_info['t'] - x_d_0_info['t'])
             else:
-                R_new = R_d_i
-                delta_new = x_d_i_info['exp']
+                if flag_is_source_video:
+                    if inf_cfg.flag_video_editing_head_rotation:
+                        R_new = x_d_r_lst_smooth[i]
+                    else:
+                        R_new = R_s
+                else:
+                    R_new = R_d_i
+                delta_new = x_d_exp_lst_smooth[i] if flag_is_source_video else x_d_i_info['exp']
                 scale_new = x_s_info['scale']
                 t_new = x_d_i_info['t']
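
For reference (not part of the patch): a minimal sketch of the relative vs. absolute expression-target construction that the diff above introduces. moving_average_smooth is a hypothetical stand-in for the pipeline's smooth() helper (which additionally takes a target shape, a device, and an observation variance), the motion-dict layout only mirrors the *_template_dct['motion'] entries shown in the diff, and the (1, 21, 3) shapes are illustrative. The point of the added else branch is that non-relative (absolute) driving motion is now also smoothed, per the commit message "add smooth to non relative motion", to reduce inter-frame jitter.

import numpy as np

def moving_average_smooth(seq, window=3):
    # hypothetical stand-in smoother: simple moving average over the frame axis
    arr = np.stack(seq, axis=0)                 # (n_frames, ...)
    flat = arr.reshape(arr.shape[0], -1)        # flatten everything but the frame axis
    kernel = np.ones(window) / window
    sm = np.apply_along_axis(lambda col: np.convolve(col, kernel, mode='same'), 0, flat)
    return list(sm.reshape(arr.shape))

def build_exp_targets(source_motion, driving_motion, relative=True):
    # source_motion / driving_motion mimic *_template_dct['motion'] in the diff:
    # a list of per-frame dicts holding an 'exp' array
    n_frames = len(driving_motion)
    if relative:
        # relative mode: offset each source frame by the driving delta w.r.t. frame 0
        exp_lst = [source_motion[i]['exp'] + driving_motion[i]['exp'] - driving_motion[0]['exp']
                   for i in range(n_frames)]
    else:
        # absolute mode (the branch this patch adds): take the driving expression
        # directly, but still smooth it to suppress inter-frame jitter
        exp_lst = [driving_motion[i]['exp'] for i in range(n_frames)]
    return moving_average_smooth(exp_lst)

# toy usage with random expression arrays
rng = np.random.default_rng(0)
src = [{'exp': rng.normal(size=(1, 21, 3))} for _ in range(5)]
drv = [{'exp': rng.normal(size=(1, 21, 3))} for _ in range(5)]
x_d_exp_lst_smooth = build_exp_targets(src, drv, relative=False)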