From 3f45c776da13321d5f0c85c8ded805741680f583 Mon Sep 17 00:00:00 2001
From: zhangdingyun
Date: Wed, 14 Aug 2024 15:56:55 +0800
Subject: [PATCH] feat: update

---
 app.py                         | 110 ++++++++++++++++++++++++++++++---
 src/config/argument_config.py  |   7 +--
 src/config/inference_config.py |   1 -
 src/gradio_pipeline.py         |  19 ++--
 src/live_portrait_pipeline.py  |  81 ++++++++++++------------
 5 files changed, 159 insertions(+), 59 deletions(-)

diff --git a/app.py b/app.py
index 35a57f2..8068eee 100644
--- a/app.py
+++ b/app.py
@@ -85,12 +85,12 @@ data_examples_i2v = [
     [osp.join(example_portrait_dir, "s2.jpg"), osp.join(example_video_dir, "d13.mp4"), True, True, True, True],
 ]
 data_examples_v2v = [
-    [osp.join(example_portrait_dir, "s13.mp4"), osp.join(example_video_dir, "d0.mp4"), True, True, True, False, False, 3e-7],
+    [osp.join(example_portrait_dir, "s13.mp4"), osp.join(example_video_dir, "d0.mp4"), True, True, True, False, 3e-7],
     # [osp.join(example_portrait_dir, "s14.mp4"), osp.join(example_video_dir, "d18.mp4"), True, True, True, False, False, 3e-7],
     # [osp.join(example_portrait_dir, "s15.mp4"), osp.join(example_video_dir, "d19.mp4"), True, True, True, False, False, 3e-7],
-    [osp.join(example_portrait_dir, "s18.mp4"), osp.join(example_video_dir, "d6.mp4"), True, True, True, False, False, 3e-7],
+    [osp.join(example_portrait_dir, "s18.mp4"), osp.join(example_video_dir, "d6.mp4"), True, True, True, False, 3e-7],
     # [osp.join(example_portrait_dir, "s19.mp4"), osp.join(example_video_dir, "d6.mp4"), True, True, True, False, False, 3e-7],
-    [osp.join(example_portrait_dir, "s20.mp4"), osp.join(example_video_dir, "d0.mp4"), True, True, True, False, False, 3e-7],
+    [osp.join(example_portrait_dir, "s20.mp4"), osp.join(example_video_dir, "d0.mp4"), True, True, True, False, 3e-7],
 ]
 #################### interface logic ####################
 
@@ -126,6 +126,75 @@ output_video = gr.Video(autoplay=False)
 output_video_paste_back = gr.Video(autoplay=False)
 output_video_i2v = gr.Video(autoplay=False)
 output_video_concat_i2v = gr.Video(autoplay=False)
+output_image_i2i = gr.Image(type="numpy")
+output_image_concat_i2i = gr.Image(type="numpy")
+
+"""
+Expression controlled by each keypoint index and dimension:
+(0,0): top of head tilts left/right
+(0,1): top of head tilts up/down
+(0,2): top of head tilts forward/backward
+(1,0): eyebrows up/down, eyes left/right
+(1,1): eyebrows up/down, eyes up/down
+(1,2): mouth and eye movement
+(2,0): eyebrows up/down, eyes left/right
+(2,1): eyebrows up/down, eyes up/down
+(2,2): mouth movement
+(3,0): left cheek fatter/thinner, eyebrows up/down
+(3,1): left cheek up/down, eyebrows up/down
+(3,2): left cheek forward/backward, causes distortion
+(4,0): right cheek fatter/thinner
+(4,1): right cheek up/down
+(4,2): right cheek forward/backward, causes distortion
+(5,0): head translates left/right
+(5,1): head translates up/down
+(5,2): mouth movement
+(6,0): mouth movement
+(6,1): mouth movement
+(6,2): mouth movement
+(7,0): right cheek fatter/thinner
+(7,1): right cheek up/down
+(7,2): right cheek forward/backward
+(8,0): right cheek fatter/thinner
+(8,1): right cheek up/down
+(8,2): mouth movement
+(9,0): chin fatter/thinner
+(9,1): mouth movement
+(9,2): eye movement
+(10,0): left-side scaling
+(10,1): left-side scaling, eye movement
+(10,2): chin scaling
+(11,0): left eye turns left/right
+(11,1): left eye opens/closes (up/down)
+(11,2): left eye forward/backward
+(12,0): mouth movement
+(12,1): no obvious effect
+(12,2): mouth movement
+(13,0): eye movement
+(13,1): eye movement
+(13,2): eye movement
+(14,0): mouth movement
+(14,1): mouth movement
+(14,2): mouth movement
+(15,0): eye movement
+(15,1): eye movement, mouth movement
+(15,2): eye movement
+(16,0): eyes
+(16,1): right eye opens/closes, mouth movement
+(16,2): eye movement
+(17,0): mouth movement, eye movement
+(17,1): mouth movement, eye movement
+(17,2): pout, flatten the mouth
+(18,0): gaze direction
+(18,1): eyes up/down
+(18,2): mouth movement, eye movement
+(19,0): mouth corners pulled (grimace)
+(19,1): mouth opens/closes
+(19,2): lips roll inward/outward
+(20,0): mouth curves downward
+(20,1): show teeth, close teeth
+(20,2): mouth pulled down, "O"-shaped mouth
+"""
 
 
 with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta Sans")])) as demo:
@@ -196,6 +265,19 @@ with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta San
                         inputs=[driving_video_input],
                         cache_examples=False,
                     )
+                with gr.TabItem("🖼️ Driving Image") as v_tab_image:
+                    with gr.Accordion(open=True, label="Driving Image"):
+                        driving_image_input = gr.Image(type="filepath")
+                        gr.Examples(
+                            examples=[
+ [osp.join(example_video_dir, "d3.jpg")], + [osp.join(example_video_dir, "d9.jpg")], + [osp.join(example_video_dir, "d11.jpg")], + ], + inputs=[driving_image_input], + cache_examples=False, + ) + with gr.TabItem("📁 Driving Pickle") as v_tab_pickle: with gr.Accordion(open=True, label="Driving Pickle"): driving_video_pickle_input = gr.File(type="filepath", file_types=[".pkl"]) @@ -212,8 +294,9 @@ with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta San ) v_tab_selection = gr.Textbox(visible=False) - v_tab_pickle.select(lambda: "Pickle", None, v_tab_selection) v_tab_video.select(lambda: "Video", None, v_tab_selection) + v_tab_image.select(lambda: "Image", None, v_tab_selection) + v_tab_pickle.select(lambda: "Pickle", None, v_tab_selection) # with gr.Accordion(open=False, label="Animation Instructions"): # gr.Markdown(load_description("assets/gradio/gradio_description_animation.md")) with gr.Accordion(open=True, label="Cropping Options for Driving Video"): @@ -229,9 +312,9 @@ with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta San flag_relative_input = gr.Checkbox(value=True, label="relative motion") flag_remap_input = gr.Checkbox(value=True, label="paste-back") flag_stitching_input = gr.Checkbox(value=True, label="stitching") + animation_region = gr.Radio(["exp", "pose", "lip", "eyes", "all"], value="all", label="animation region") driving_option_input = gr.Radio(['expression-friendly', 'pose-friendly'], value="expression-friendly", label="driving option (i2v)") driving_multiplier = gr.Number(value=1.0, label="driving multiplier (i2v)", minimum=0.0, maximum=2.0, step=0.02) - flag_video_editing_head_rotation = gr.Checkbox(value=False, label="relative head rotation (v2v)") driving_smooth_observation_variance = gr.Number(value=3e-7, label="motion smooth strength (v2v)", minimum=1e-11, maximum=1e-2, step=1e-8) gr.Markdown(load_description("assets/gradio/gradio_description_animate_clear.md")) @@ -244,8 +327,14 @@ with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta San with gr.Column(): with gr.Accordion(open=True, label="The animated video"): output_video_concat_i2v.render() + with gr.Column(): + with gr.Accordion(open=True, label="The animated image in the original image space"): + output_image_i2i.render() + with gr.Column(): + with gr.Accordion(open=True, label="The animated image"): + output_image_concat_i2i.render() with gr.Row(): - process_button_reset = gr.ClearButton([source_image_input, source_video_input, driving_video_pickle_input, driving_video_input, output_video_i2v, output_video_concat_i2v], value="🧹 Clear") + process_button_reset = gr.ClearButton([source_image_input, source_video_input, driving_video_pickle_input, driving_video_input, driving_image_input, output_video_i2v, output_video_concat_i2v, output_image_i2i, output_image_concat_i2i], value="🧹 Clear") with gr.Row(): # Examples @@ -279,7 +368,6 @@ with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta San flag_do_crop_input, flag_remap_input, flag_crop_driving_video_input, - flag_video_editing_head_rotation, driving_smooth_observation_variance, ], outputs=[output_image, output_image_paste_back], @@ -413,16 +501,17 @@ with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta San inputs=[ source_image_input, source_video_input, - driving_video_pickle_input, driving_video_input, + driving_image_input, + driving_video_pickle_input, flag_relative_input, flag_do_crop_input, flag_remap_input, flag_stitching_input, + 
animation_region, driving_option_input, driving_multiplier, flag_crop_driving_video_input, - flag_video_editing_head_rotation, scale, vx_ratio, vy_ratio, @@ -433,10 +522,11 @@ with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta San tab_selection, v_tab_selection, ], - outputs=[output_video_i2v, output_video_concat_i2v], + outputs=[output_video_i2v, output_video_concat_i2v, output_image_i2i, output_image_concat_i2i], show_progress=True ) + retargeting_input_image.change( fn=gradio_pipeline.init_retargeting_image, inputs=[retargeting_source_scale, eye_retargeting_slider, lip_retargeting_slider, retargeting_input_image], diff --git a/src/config/argument_config.py b/src/config/argument_config.py index 11fd3df..d6e6916 100644 --- a/src/config/argument_config.py +++ b/src/config/argument_config.py @@ -13,8 +13,8 @@ from .base_config import PrintableConfig, make_abs_path @dataclass(repr=False) # use repr from PrintableConfig class ArgumentConfig(PrintableConfig): ########## input arguments ########## - source: Annotated[str, tyro.conf.arg(aliases=["-s"])] = make_abs_path('../../assets/examples/driving/d0.mp4') # path to the source portrait (human/animal) or video (human) - driving: Annotated[str, tyro.conf.arg(aliases=["-d"])] = make_abs_path('../../assets/examples/driving/d6.pkl') # path to driving video or template (.pkl format) + source: Annotated[str, tyro.conf.arg(aliases=["-s"])] = make_abs_path('../../assets/examples/source/s18.mp4') # path to the source portrait (human/animal) or video (human) + driving: Annotated[str, tyro.conf.arg(aliases=["-d"])] = make_abs_path('../../assets/examples/driving/d3.jpg') # path to driving video or template (.pkl format) output_dir: Annotated[str, tyro.conf.arg(aliases=["-o"])] = 'animations/' # directory to save output video ########## inference arguments ########## @@ -24,7 +24,6 @@ class ArgumentConfig(PrintableConfig): flag_force_cpu: bool = False # force cpu inference, WIP! 
flag_normalize_lip: bool = False # whether to let the lip to close state before animation, only take effect when flag_eye_retargeting and flag_lip_retargeting is False flag_source_video_eye_retargeting: bool = False # when the input is a source video, whether to let the eye-open scalar of each frame to be the same as the first source frame before the animation, only take effect when flag_eye_retargeting and flag_lip_retargeting is False, may cause the inter-frame jittering - flag_video_editing_head_rotation: bool = False # when the input is a source video, whether to inherit the relative head rotation from the driving video flag_eye_retargeting: bool = False # not recommend to be True, WIP; whether to transfer the eyes-open ratio of each driving frame to the source image or the corresponding source frame flag_lip_retargeting: bool = False # not recommend to be True, WIP; whether to transfer the lip-open ratio of each driving frame to the source image or the corresponding source frame flag_stitching: bool = True # recommend to True if head movement is small, False if head movement is large or the source image is an animal @@ -35,7 +34,7 @@ class ArgumentConfig(PrintableConfig): driving_multiplier: float = 1.0 # be used only when driving_option is "expression-friendly" driving_smooth_observation_variance: float = 3e-7 # smooth strength scalar for the animated video when the input is a source video, the larger the number, the smoother the animated video; too much smoothness would result in loss of motion accuracy audio_priority: Literal['source', 'driving'] = 'driving' # whether to use the audio from source or driving video - animation_region: Literal["exp", "pose", "lip", "eyes", "all"] = "eyes" # the region where the animation was performed, "exp" means the expression, "pose" means the head pose + animation_region: Literal["exp", "pose", "lip", "eyes", "all"] = "all" # the region where the animation was performed, "exp" means the expression, "pose" means the head pose ########## source crop arguments ########## det_thresh: float = 0.15 # detection threshold scale: float = 2.3 # the ratio of face area is smaller if scale is larger diff --git a/src/config/inference_config.py b/src/config/inference_config.py index c56f01a..d1b5572 100644 --- a/src/config/inference_config.py +++ b/src/config/inference_config.py @@ -34,7 +34,6 @@ class InferenceConfig(PrintableConfig): device_id: int = 0 flag_normalize_lip: bool = True flag_source_video_eye_retargeting: bool = False - flag_video_editing_head_rotation: bool = False flag_eye_retargeting: bool = False flag_lip_retargeting: bool = False flag_stitching: bool = True diff --git a/src/gradio_pipeline.py b/src/gradio_pipeline.py index 38dcde5..5a9398a 100644 --- a/src/gradio_pipeline.py +++ b/src/gradio_pipeline.py @@ -146,16 +146,18 @@ class GradioPipeline(LivePortraitPipeline): self, input_source_image_path=None, input_source_video_path=None, - input_driving_video_pickle_path=None, input_driving_video_path=None, + input_driving_image_path=None, + input_driving_video_pickle_path=None, flag_relative_input=True, flag_do_crop_input=True, flag_remap_input=True, flag_stitching_input=True, + animation_region="all", driving_option_input="pose-friendly", driving_multiplier=1.0, flag_crop_driving_video_input=True, - flag_video_editing_head_rotation=False, + # flag_video_editing_head_rotation=False, scale=2.3, vx_ratio=0.0, vy_ratio=-0.125, @@ -177,6 +179,8 @@ class GradioPipeline(LivePortraitPipeline): if v_tab_selection == 'Video': input_driving_path = 
input_driving_video_path + elif v_tab_selection == 'Image': + input_driving_path = input_driving_image_path elif v_tab_selection == 'Pickle': input_driving_path = input_driving_video_pickle_path else: @@ -195,10 +199,10 @@ class GradioPipeline(LivePortraitPipeline): 'flag_do_crop': flag_do_crop_input, 'flag_pasteback': flag_remap_input, 'flag_stitching': flag_stitching_input, + 'animation_region': animation_region, 'driving_option': driving_option_input, 'driving_multiplier': driving_multiplier, 'flag_crop_driving_video': flag_crop_driving_video_input, - 'flag_video_editing_head_rotation': flag_video_editing_head_rotation, 'scale': scale, 'vx_ratio': vx_ratio, 'vy_ratio': vy_ratio, @@ -211,10 +215,13 @@ class GradioPipeline(LivePortraitPipeline): self.args = update_args(self.args, args_user) self.live_portrait_wrapper.update_config(self.args.__dict__) self.cropper.update_config(self.args.__dict__) - # video driven animation - video_path, video_path_concat = self.execute(self.args) + + output_path, output_path_concat = self.execute(self.args) gr.Info("Run successfully!", duration=2) - return video_path, video_path_concat + if output_path.endswith(".jpg"): + return None, None, output_path, output_path_concat + else: + return output_path, output_path_concat, None, None else: raise gr.Error("Please upload the source portrait or source video, and driving video 🤗🤗🤗", duration=5) diff --git a/src/live_portrait_pipeline.py b/src/live_portrait_pipeline.py index e5067cc..0ec24ab 100644 --- a/src/live_portrait_pipeline.py +++ b/src/live_portrait_pipeline.py @@ -112,8 +112,12 @@ class LivePortraitPipeline(object): c_d_lip_lst = driving_template_dct['c_lip_lst'] if 'c_lip_lst' in driving_template_dct.keys() else driving_template_dct['c_d_lip_lst'] driving_n_frames = driving_template_dct['n_frames'] flag_is_driving_video = True if driving_n_frames > 1 else False - if flag_is_source_video: + # if flag_is_source_video and not flag_is_driving_video: + # raise Exception(f"Animating a source video with a driving image is not supported!") + if flag_is_source_video and flag_is_driving_video: n_frames = min(len(source_rgb_lst), driving_n_frames) # minimum number as the number of the animated frames + elif flag_is_source_video and not flag_is_driving_video: + n_frames = len(source_rgb_lst) else: n_frames = driving_n_frames @@ -134,8 +138,10 @@ class LivePortraitPipeline(object): driving_rgb_lst = load_video(args.driving) elif is_image(args.driving): flag_is_driving_video = False + # if flag_is_source_video: + # raise Exception(f"Animating a source video with a driving image is not supported!") driving_img_rgb = load_image_rgb(args.driving) - output_fps = 1 + output_fps = 25 log(f"Load driving image from {args.driving}") driving_rgb_lst = [driving_img_rgb] else: @@ -143,9 +149,11 @@ class LivePortraitPipeline(object): ######## make motion template ######## log("Start making driving motion template...") driving_n_frames = len(driving_rgb_lst) - if flag_is_source_video: + if flag_is_source_video and flag_is_driving_video: n_frames = min(len(source_rgb_lst), driving_n_frames) # minimum number as the number of the animated frames driving_rgb_lst = driving_rgb_lst[:n_frames] + elif flag_is_source_video and not flag_is_driving_video: + n_frames = len(source_rgb_lst) else: n_frames = driving_n_frames if inf_cfg.flag_crop_driving_video or (not is_square_video(args.driving)): @@ -207,15 +215,23 @@ class LivePortraitPipeline(object): if inf_cfg.flag_relative_motion: x_d_exp_lst = [source_template_dct['motion'][i]['exp'] + 
driving_template_dct['motion'][i]['exp'] - driving_template_dct['motion'][0]['exp'] for i in range(n_frames)] x_d_exp_lst_smooth = smooth(x_d_exp_lst, source_template_dct['motion'][0]['exp'].shape, device, inf_cfg.driving_smooth_observation_variance) - if inf_cfg.flag_video_editing_head_rotation: + if inf_cfg.animation_region == "all" or inf_cfg.animation_region == "pose": x_d_r_lst = [(np.dot(driving_template_dct['motion'][i][key_r], driving_template_dct['motion'][0][key_r].transpose(0, 2, 1))) @ source_template_dct['motion'][i]['R'] for i in range(n_frames)] x_d_r_lst_smooth = smooth(x_d_r_lst, source_template_dct['motion'][0]['R'].shape, device, inf_cfg.driving_smooth_observation_variance) else: - x_d_exp_lst = [driving_template_dct['motion'][i]['exp'] for i in range(n_frames)] - x_d_exp_lst_smooth = smooth(x_d_exp_lst, source_template_dct['motion'][0]['exp'].shape, device, inf_cfg.driving_smooth_observation_variance) - if inf_cfg.flag_video_editing_head_rotation: - x_d_r_lst = [driving_template_dct['motion'][i][key_r] for i in range(n_frames)] - x_d_r_lst_smooth = smooth(x_d_r_lst, source_template_dct['motion'][0]['R'].shape, device, inf_cfg.driving_smooth_observation_variance) + if flag_is_driving_video: + x_d_exp_lst = [driving_template_dct['motion'][i]['exp'] for i in range(n_frames)] + x_d_exp_lst_smooth = smooth(x_d_exp_lst, source_template_dct['motion'][0]['exp'].shape, device, inf_cfg.driving_smooth_observation_variance) + else: + x_d_exp_lst = [driving_template_dct['motion'][0]['exp']] + x_d_exp_lst_smooth = [torch.tensor(x_d_exp[0], dtype=torch.float32, device=device) for x_d_exp in x_d_exp_lst]*n_frames + if inf_cfg.animation_region == "all" or inf_cfg.animation_region == "pose": + if flag_is_driving_video: + x_d_r_lst = [driving_template_dct['motion'][i][key_r] for i in range(n_frames)] + x_d_r_lst_smooth = smooth(x_d_r_lst, source_template_dct['motion'][0]['R'].shape, device, inf_cfg.driving_smooth_observation_variance) + else: + x_d_r_lst = [driving_template_dct['motion'][0][key_r]] + x_d_r_lst_smooth = [torch.tensor(x_d_r[0], dtype=torch.float32, device=device) for x_d_r in x_d_r_lst]*n_frames else: # if the input is a source image, process it only once if inf_cfg.flag_do_crop: @@ -281,8 +297,10 @@ class LivePortraitPipeline(object): if inf_cfg.flag_pasteback and inf_cfg.flag_do_crop and inf_cfg.flag_stitching: # prepare for paste back mask_ori_float = prepare_paste_back(inf_cfg.mask_crop, source_M_c2o_lst[i], dsize=(source_rgb_lst[i].shape[1], source_rgb_lst[i].shape[0])) - - x_d_i_info = driving_template_dct['motion'][i] + if flag_is_source_video and not flag_is_driving_video: + x_d_i_info = driving_template_dct['motion'][0] + else: + x_d_i_info = driving_template_dct['motion'][i] x_d_i_info = dct2device(x_d_i_info, device) R_d_i = x_d_i_info['R'] if 'R' in x_d_i_info.keys() else x_d_i_info['R_d'] # compatible with previous keys @@ -292,24 +310,17 @@ class LivePortraitPipeline(object): delta_new = x_s_info['exp'].clone() if inf_cfg.flag_relative_motion: - if flag_is_source_video: - if inf_cfg.flag_video_editing_head_rotation: - R_new = x_d_r_lst_smooth[i] - else: - R_new = R_s + if inf_cfg.animation_region == "all" or inf_cfg.animation_region == "pose": + R_new = x_d_r_lst_smooth[i] if flag_is_source_video else (R_d_i @ R_d_0.permute(0, 2, 1)) @ R_s else: - if inf_cfg.animation_region == "all" or inf_cfg.animation_region == "pose": - - R_new = (R_d_i @ R_d_0.permute(0, 2, 1)) @ R_s - else: - R_new = R_s + R_new = R_s if inf_cfg.animation_region == "all" or 
inf_cfg.animation_region == "exp": delta_new = x_d_exp_lst_smooth[i] if flag_is_source_video else x_s_info['exp'] + (x_d_i_info['exp'] - x_d_0_info['exp']) elif inf_cfg.animation_region == "lip": - for lip_idx in [14, 17, 19, 20]: + for lip_idx in [6, 12, 14, 17, 19, 20]: delta_new[:, lip_idx, :] = x_d_exp_lst_smooth[i][lip_idx, :] if flag_is_source_video else (x_s_info['exp'] + (x_d_i_info['exp'] - x_d_0_info['exp']))[:, lip_idx, :] elif inf_cfg.animation_region == "eyes": - for eyes_idx in [11, 13, 15, 16]: + for eyes_idx in [11, 13, 15, 16, 18]: delta_new[:, eyes_idx, :] = x_d_exp_lst_smooth[i][eyes_idx, :] if flag_is_source_video else (x_s_info['exp'] + (x_d_i_info['exp'] - x_d_0_info['exp']))[:, eyes_idx, :] if inf_cfg.animation_region == "all": scale_new = x_s_info['scale'] if flag_is_source_video else x_s_info['scale'] * (x_d_i_info['scale'] / x_d_0_info['scale']) @@ -320,19 +331,11 @@ class LivePortraitPipeline(object): else: t_new = x_s_info['t'] else: - if flag_is_source_video: - if inf_cfg.flag_video_editing_head_rotation: - R_new = x_d_r_lst_smooth[i] - else: - R_new = R_s + if inf_cfg.animation_region == "all" or inf_cfg.animation_region == "pose": + R_new = x_d_r_lst_smooth[i] if flag_is_source_video else R_d_i else: - if inf_cfg.animation_region == "all" or inf_cfg.animation_region == "pose": - - R_new = R_d_i - else: - R_new = R_s + R_new = R_s if inf_cfg.animation_region == "all" or inf_cfg.animation_region == "exp": - # delta_new = x_d_exp_lst_smooth[i] if flag_is_source_video else x_d_i_info['exp'] for idx in [1,2,6,11,12,13,14,15,16,17,18,19,20]: delta_new[:, idx, :] = x_d_exp_lst_smooth[i][idx, :] if flag_is_source_video else x_d_i_info['exp'][:, idx, :] delta_new[:, 3:5, 1] = x_d_exp_lst_smooth[i][3:5, 1] if flag_is_source_video else x_d_i_info['exp'][:, 3:5, 1] @@ -340,10 +343,10 @@ class LivePortraitPipeline(object): delta_new[:, 8, 2] = x_d_exp_lst_smooth[i][8, 2] if flag_is_source_video else x_d_i_info['exp'][:, 8, 2] delta_new[:, 9, 1:] = x_d_exp_lst_smooth[i][9, 1:] if flag_is_source_video else x_d_i_info['exp'][:, 9, 1:] elif inf_cfg.animation_region == "lip": - for lip_idx in [14, 17, 19, 20]: + for lip_idx in [6, 12, 14, 17, 19, 20]: delta_new[:, lip_idx, :] = x_d_exp_lst_smooth[i][lip_idx, :] if flag_is_source_video else x_d_i_info['exp'][:, lip_idx, :] elif inf_cfg.animation_region == "eyes": - for eyes_idx in [11, 13, 15, 16]: + for eyes_idx in [11, 13, 15, 16, 18]: delta_new[:, eyes_idx, :] = x_d_exp_lst_smooth[i][eyes_idx, :] if flag_is_source_video else x_d_i_info['exp'][:, eyes_idx, :] scale_new = x_s_info['scale'] if inf_cfg.animation_region == "all" or inf_cfg.animation_region == "pose": @@ -421,12 +424,14 @@ class LivePortraitPipeline(object): wfp_concat = None ######### build the final concatenation result ######### # driving frame | source frame | generation - if flag_is_source_video: + if flag_is_source_video and flag_is_driving_video: frames_concatenated = concat_frames(driving_rgb_crop_256x256_lst, img_crop_256x256_lst, I_p_lst) + elif flag_is_source_video and not flag_is_driving_video: + frames_concatenated = concat_frames(driving_rgb_crop_256x256_lst*n_frames, img_crop_256x256_lst, I_p_lst) else: frames_concatenated = concat_frames(driving_rgb_crop_256x256_lst, [img_crop_256x256], I_p_lst) - if flag_is_driving_video: + if flag_is_driving_video or (flag_is_source_video and not flag_is_driving_video): flag_source_has_audio = flag_is_source_video and has_audio_stream(args.source) flag_driving_has_audio = (not flag_load_from_template) and 
has_audio_stream(args.driving)