From 200f84dd1fb0b91bb684aa72099c5886ea614425 Mon Sep 17 00:00:00 2001
From: zhangdingyun
Date: Tue, 13 Aug 2024 18:23:30 +0800
Subject: [PATCH] feat: update

---
 src/config/argument_config.py |  8 ++++----
 src/live_portrait_pipeline.py | 26 ++++++++++++++------------
 src/utils/cropper.py          |  6 +++---
 3 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/src/config/argument_config.py b/src/config/argument_config.py
index 6599ce5..11fd3df 100644
--- a/src/config/argument_config.py
+++ b/src/config/argument_config.py
@@ -13,8 +13,8 @@ from .base_config import PrintableConfig, make_abs_path
 @dataclass(repr=False) # use repr from PrintableConfig
 class ArgumentConfig(PrintableConfig):
     ########## input arguments ##########
-    source: Annotated[str, tyro.conf.arg(aliases=["-s"])] = make_abs_path('../../assets/examples/source/s3.jpg') # path to the source portrait (human/animal) or video (human)
-    driving: Annotated[str, tyro.conf.arg(aliases=["-d"])] = make_abs_path('../../assets/examples/driving/d3.jpg') # path to driving video or template (.pkl format)
+    source: Annotated[str, tyro.conf.arg(aliases=["-s"])] = make_abs_path('../../assets/examples/driving/d0.mp4') # path to the source portrait (human/animal) or video (human)
+    driving: Annotated[str, tyro.conf.arg(aliases=["-d"])] = make_abs_path('../../assets/examples/driving/d6.pkl') # path to driving video or template (.pkl format)
     output_dir: Annotated[str, tyro.conf.arg(aliases=["-o"])] = 'animations/' # directory to save output video
 
     ########## inference arguments ##########
@@ -22,7 +22,7 @@ class ArgumentConfig(PrintableConfig):
     flag_crop_driving_video: bool = False # whether to crop the driving video, if the given driving info is a video
     device_id: int = 0 # gpu device id
     flag_force_cpu: bool = False # force cpu inference, WIP!
-    flag_normalize_lip: bool = True # whether to let the lip to close state before animation, only take effect when flag_eye_retargeting and flag_lip_retargeting is False
+    flag_normalize_lip: bool = False # whether to let the lip to close state before animation, only take effect when flag_eye_retargeting and flag_lip_retargeting is False
     flag_source_video_eye_retargeting: bool = False # when the input is a source video, whether to let the eye-open scalar of each frame to be the same as the first source frame before the animation, only take effect when flag_eye_retargeting and flag_lip_retargeting is False, may cause the inter-frame jittering
     flag_video_editing_head_rotation: bool = False # when the input is a source video, whether to inherit the relative head rotation from the driving video
     flag_eye_retargeting: bool = False # not recommend to be True, WIP; whether to transfer the eyes-open ratio of each driving frame to the source image or the corresponding source frame
@@ -35,7 +35,7 @@ class ArgumentConfig(PrintableConfig):
     driving_multiplier: float = 1.0 # be used only when driving_option is "expression-friendly"
     driving_smooth_observation_variance: float = 3e-7 # smooth strength scalar for the animated video when the input is a source video, the larger the number, the smoother the animated video; too much smoothness would result in loss of motion accuracy
     audio_priority: Literal['source', 'driving'] = 'driving' # whether to use the audio from source or driving video
-    animation_region: Literal["exp", "pose", "lip", "eyes", "all"] = "pose" # the region where the animation was performed, "exp" means the expression, "pose" means the head pose
+    animation_region: Literal["exp", "pose", "lip", "eyes", "all"] = "eyes" # the region where the animation was performed, "exp" means the expression, "pose" means the head pose
     ########## source crop arguments ##########
     det_thresh: float = 0.15 # detection threshold
     scale: float = 2.3 # the ratio of face area is smaller if scale is larger
diff --git a/src/live_portrait_pipeline.py b/src/live_portrait_pipeline.py
index dd66b89..e5067cc 100644
--- a/src/live_portrait_pipeline.py
+++ b/src/live_portrait_pipeline.py
@@ -290,6 +290,7 @@ class LivePortraitPipeline(object):
                 R_d_0 = R_d_i
                 x_d_0_info = x_d_i_info
 
+            delta_new = x_s_info['exp'].clone()
             if inf_cfg.flag_relative_motion:
                 if flag_is_source_video:
                     if inf_cfg.flag_video_editing_head_rotation:
@@ -298,21 +299,18 @@
                         R_new = R_s
                 else:
                     if inf_cfg.animation_region == "all" or inf_cfg.animation_region == "pose":
-                        delta_new = x_s_info['exp']
+                        R_new = (R_d_i @ R_d_0.permute(0, 2, 1)) @ R_s
                     else:
                         R_new = R_s
 
                 if inf_cfg.animation_region == "all" or inf_cfg.animation_region == "exp":
                     delta_new = x_d_exp_lst_smooth[i] if flag_is_source_video else x_s_info['exp'] + (x_d_i_info['exp'] - x_d_0_info['exp'])
                 elif inf_cfg.animation_region == "lip":
-                    delta_new = x_s_info['exp']
                     for lip_idx in [14, 17, 19, 20]:
-                        delta_new[:, lip_idx, :] += (x_d_i_info['exp'][:, lip_idx, :] - x_d_0_info['exp'][:, lip_idx, :])
+                        delta_new[:, lip_idx, :] = x_d_exp_lst_smooth[i][lip_idx, :] if flag_is_source_video else (x_s_info['exp'] + (x_d_i_info['exp'] - x_d_0_info['exp']))[:, lip_idx, :]
                 elif inf_cfg.animation_region == "eyes":
-                    delta_new = x_s_info['exp']
                     for eyes_idx in [11, 13, 15, 16]:
-                        delta_new[:, eyes_idx, :] += (x_d_i_info['exp'][:, eyes_idx, :] - x_d_0_info['exp'][:, eyes_idx, :])
-
+                        delta_new[:, eyes_idx, :] = x_d_exp_lst_smooth[i][eyes_idx, :] if flag_is_source_video else (x_s_info['exp'] + (x_d_i_info['exp'] - x_d_0_info['exp']))[:, eyes_idx, :]
                 if inf_cfg.animation_region == "all":
                     scale_new = x_s_info['scale'] if flag_is_source_video else x_s_info['scale'] * (x_d_i_info['scale'] / x_d_0_info['scale'])
                 else:
@@ -329,20 +327,24 @@ class LivePortraitPipeline(object):
                         R_new = R_s
                 else:
                     if inf_cfg.animation_region == "all" or inf_cfg.animation_region == "pose":
-                        delta_new = x_s_info['exp']
+                        R_new = R_d_i
                     else:
                         R_new = R_s
 
                 if inf_cfg.animation_region == "all" or inf_cfg.animation_region == "exp":
-                    delta_new = x_d_exp_lst_smooth[i] if flag_is_source_video else x_d_i_info['exp']
+                    # delta_new = x_d_exp_lst_smooth[i] if flag_is_source_video else x_d_i_info['exp']
+                    for idx in [1,2,6,11,12,13,14,15,16,17,18,19,20]:
+                        delta_new[:, idx, :] = x_d_exp_lst_smooth[i][idx, :] if flag_is_source_video else x_d_i_info['exp'][:, idx, :]
+                    delta_new[:, 3:5, 1] = x_d_exp_lst_smooth[i][3:5, 1] if flag_is_source_video else x_d_i_info['exp'][:, 3:5, 1]
+                    delta_new[:, 5, 2] = x_d_exp_lst_smooth[i][5, 2] if flag_is_source_video else x_d_i_info['exp'][:, 5, 2]
+                    delta_new[:, 8, 2] = x_d_exp_lst_smooth[i][8, 2] if flag_is_source_video else x_d_i_info['exp'][:, 8, 2]
+                    delta_new[:, 9, 1:] = x_d_exp_lst_smooth[i][9, 1:] if flag_is_source_video else x_d_i_info['exp'][:, 9, 1:]
                 elif inf_cfg.animation_region == "lip":
-                    delta_new = x_s_info['exp']
                     for lip_idx in [14, 17, 19, 20]:
-                        delta_new[:, lip_idx, :] = x_d_i_info['exp'][:, lip_idx, :]
+                        delta_new[:, lip_idx, :] = x_d_exp_lst_smooth[i][lip_idx, :] if flag_is_source_video else x_d_i_info['exp'][:, lip_idx, :]
                 elif inf_cfg.animation_region == "eyes":
-                    delta_new = x_s_info['exp']
                     for eyes_idx in [11, 13, 15, 16]:
-                        delta_new[:, eyes_idx, :] = x_d_i_info['exp'][:, eyes_idx, :]
+                        delta_new[:, eyes_idx, :] = x_d_exp_lst_smooth[i][eyes_idx, :] if flag_is_source_video else x_d_i_info['exp'][:, eyes_idx, :]
                 scale_new = x_s_info['scale']
                 if inf_cfg.animation_region == "all" or inf_cfg.animation_region == "pose":
                     t_new = x_d_i_info['t']
diff --git a/src/utils/cropper.py b/src/utils/cropper.py
index 97e26c7..10b4370 100644
--- a/src/utils/cropper.py
+++ b/src/utils/cropper.py
@@ -66,14 +66,14 @@ class Cropper(object):
             providers=face_analysis_wrapper_provider,
         )
         self.face_analysis_wrapper.prepare(ctx_id=device_id, det_size=(512, 512), det_thresh=self.crop_cfg.det_thresh)
-        self.face_analysis_wrapper.warmup()
+        # self.face_analysis_wrapper.warmup()
 
         self.human_landmark_runner = HumanLandmark(
             ckpt_path=self.crop_cfg.landmark_ckpt_path,
             onnx_provider=device,
             device_id=device_id,
         )
-        self.human_landmark_runner.warmup()
+        # self.human_landmark_runner.warmup()
 
         if self.image_type == "animal_face":
             from .animal_landmark_runner import XPoseRunner as AnimalLandmarkRunner
@@ -83,7 +83,7 @@
             embeddings_cache_path=self.crop_cfg.xpose_embedding_cache_path,
             flag_use_half_precision=kwargs.get("flag_use_half_precision", True),
         )
-        self.animal_landmark_runner.warmup()
+        # self.animal_landmark_runner.warmup()
 
     def update_config(self, user_args):
         for k, v in user_args.items():
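
Not part of the patch: the live_portrait_pipeline.py hunks above switch from rebuilding delta_new inside each animation_region branch to starting every frame from a clone of the source expression and overwriting only selected implicit-keypoint components. A minimal standalone sketch of the absolute-motion "exp" branch is given below; the tensor shapes and placeholder values are assumptions for illustration (assuming LivePortrait's (1, 21, 3) expression tensors), and only the indexing mirrors the patch.

import torch

# placeholder source/driving expression tensors, shaped like x_s_info['exp'] / x_d_i_info['exp'] (assumed (1, 21, 3))
x_s_exp = torch.zeros(1, 21, 3)
x_d_exp = torch.rand(1, 21, 3)

# start from a copy of the source expression, as the patch now does before the region branches
delta_new = x_s_exp.clone()

# transfer only the selected implicit-keypoint components from the driving expression,
# leaving the remaining source components untouched (mirrors the "exp" branch indexing)
for idx in [1, 2, 6, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]:
    delta_new[:, idx, :] = x_d_exp[:, idx, :]
delta_new[:, 3:5, 1] = x_d_exp[:, 3:5, 1]
delta_new[:, 5, 2] = x_d_exp[:, 5, 2]
delta_new[:, 8, 2] = x_d_exp[:, 8, 2]
delta_new[:, 9, 1:] = x_d_exp[:, 9, 1:]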