From 085b535020a8d8dfff92e7341c1dc88eef6e1ebd Mon Sep 17 00:00:00 2001
From: guojianzhu
Date: Sat, 20 Jul 2024 17:55:25 +0800
Subject: [PATCH] chore: change default audio, smooth, autoplay

---
 app.py                         | 26 +++++++++++++-------------
 src/config/argument_config.py  |  2 +-
 src/config/inference_config.py |  2 +-
 src/gradio_pipeline.py         |  2 +-
 src/live_portrait_pipeline.py  |  6 ++++--
 src/utils/filter.py            |  2 +-
 6 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/app.py b/app.py
index f618263..9b59a4d 100644
--- a/app.py
+++ b/app.py
@@ -69,12 +69,12 @@ data_examples_i2v = [
     [osp.join(example_portrait_dir, "s2.jpg"), osp.join(example_video_dir, "d13.mp4"), True, True, True, True],
 ]
 data_examples_v2v = [
-    [osp.join(example_portrait_dir, "s13.mp4"), osp.join(example_video_dir, "d0.mp4"), True, True, True, False, False, 3e-6],
-    # [osp.join(example_portrait_dir, "s14.mp4"), osp.join(example_video_dir, "d18.mp4"), True, True, True, False, False, 3e-6],
-    # [osp.join(example_portrait_dir, "s15.mp4"), osp.join(example_video_dir, "d19.mp4"), True, True, True, False, False, 3e-6],
-    [osp.join(example_portrait_dir, "s18.mp4"), osp.join(example_video_dir, "d6.mp4"), True, True, True, False, False, 3e-6],
-    # [osp.join(example_portrait_dir, "s19.mp4"), osp.join(example_video_dir, "d6.mp4"), True, True, True, False, False, 3e-6],
-    [osp.join(example_portrait_dir, "s20.mp4"), osp.join(example_video_dir, "d0.mp4"), True, True, True, False, False, 3e-6],
+    [osp.join(example_portrait_dir, "s13.mp4"), osp.join(example_video_dir, "d0.mp4"), True, True, True, False, False, 1e-7],
+    # [osp.join(example_portrait_dir, "s14.mp4"), osp.join(example_video_dir, "d18.mp4"), True, True, True, False, False, 1e-7],
+    # [osp.join(example_portrait_dir, "s15.mp4"), osp.join(example_video_dir, "d19.mp4"), True, True, True, False, False, 1e-7],
+    [osp.join(example_portrait_dir, "s18.mp4"), osp.join(example_video_dir, "d6.mp4"), True, True, True, False, False, 1e-7],
+    # [osp.join(example_portrait_dir, "s19.mp4"), osp.join(example_video_dir, "d6.mp4"), True, True, True, False, False, 1e-7],
+    [osp.join(example_portrait_dir, "s20.mp4"), osp.join(example_video_dir, "d0.mp4"), True, True, True, False, False, 1e-7],
 ]
 
 #################### interface logic ####################
@@ -84,10 +84,10 @@ lip_retargeting_slider = gr.Slider(minimum=0, maximum=0.8, step=0.01, label="tar
 retargeting_input_image = gr.Image(type="filepath")
 output_image = gr.Image(type="numpy")
 output_image_paste_back = gr.Image(type="numpy")
-output_video_i2v = gr.Video(autoplay=True)
-output_video_concat_i2v = gr.Video(autoplay=True)
-output_video_v2v = gr.Video(autoplay=True)
-output_video_concat_v2v = gr.Video(autoplay=True)
+output_video_i2v = gr.Video(autoplay=False)
+output_video_concat_i2v = gr.Video(autoplay=False)
+# output_video_v2v = gr.Video(autoplay=False)
+# output_video_concat_v2v = gr.Video(autoplay=False)
 
 
 with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta Sans")])) as demo:
@@ -135,7 +135,7 @@ with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta San
                     with gr.Accordion(open=True, label="Cropping Options for Source Image or Video"):
                         with gr.Row():
                             flag_do_crop_input = gr.Checkbox(value=True, label="do crop (source)")
-                            scale = gr.Number(value=2.3, label="source crop scale", minimum=1.8, maximum=2.9, step=0.05)
+                            scale = gr.Number(value=2.3, label="source crop scale", minimum=1.8, maximum=3.2, step=0.05)
                             vx_ratio = gr.Number(value=0.0, label="source crop x", minimum=-0.5, maximum=0.5, step=0.01)
                             vy_ratio = gr.Number(value=-0.125, label="source crop y", minimum=-0.5, maximum=0.5, step=0.01)
 
@@ -158,7 +158,7 @@ with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta San
                     with gr.Accordion(open=True, label="Cropping Options for Driving Video"):
                         with gr.Row():
                             flag_crop_driving_video_input = gr.Checkbox(value=False, label="do crop (driving)")
-                            scale_crop_driving_video = gr.Number(value=2.2, label="driving crop scale", minimum=1.8, maximum=2.9, step=0.05)
+                            scale_crop_driving_video = gr.Number(value=2.2, label="driving crop scale", minimum=1.8, maximum=3.2, step=0.05)
                             vx_ratio_crop_driving_video = gr.Number(value=0.0, label="driving crop x", minimum=-0.5, maximum=0.5, step=0.01)
                             vy_ratio_crop_driving_video = gr.Number(value=-0.1, label="driving crop y", minimum=-0.5, maximum=0.5, step=0.01)
 
@@ -168,7 +168,7 @@ with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta San
                     flag_relative_input = gr.Checkbox(value=True, label="relative motion")
                     flag_remap_input = gr.Checkbox(value=True, label="paste-back")
                     flag_video_editing_head_rotation = gr.Checkbox(value=False, label="relative head rotation (v2v)")
-                    driving_smooth_observation_variance = gr.Number(value=3e-6, label="motion smooth strength (v2v)", minimum=1e-11, maximum=1e-2, step=1e-11)
+                    driving_smooth_observation_variance = gr.Number(value=1e-7, label="motion smooth strength (v2v)", minimum=1e-11, maximum=1e-2, step=1e-8)
 
     gr.Markdown(load_description("assets/gradio/gradio_description_animate_clear.md"))
     with gr.Row():
diff --git a/src/config/argument_config.py b/src/config/argument_config.py
index 7a130e8..08d17a7 100644
--- a/src/config/argument_config.py
+++ b/src/config/argument_config.py
@@ -32,7 +32,7 @@ class ArgumentConfig(PrintableConfig):
     flag_relative_motion: bool = True  # whether to use relative motion
     flag_pasteback: bool = True  # whether to paste-back/stitch the animated face cropping from the face-cropping space to the original image space
     flag_do_crop: bool = True  # whether to crop the source portrait or video to the face-cropping space
-    driving_smooth_observation_variance: float = 3e-6  # smooth strength scalar for the animated video when the input is a source video, the larger the number, the smoother the animated video; too much smoothness would result in loss of motion accuracy
+    driving_smooth_observation_variance: float = 1e-7  # smooth strength scalar for the animated video when the input is a source video, the larger the number, the smoother the animated video; too much smoothness would result in loss of motion accuracy
 
     ########## source crop arguments ##########
     scale: float = 2.3  # the ratio of face area is smaller if scale is larger
diff --git a/src/config/inference_config.py b/src/config/inference_config.py
index adb313f..48bf88c 100644
--- a/src/config/inference_config.py
+++ b/src/config/inference_config.py
@@ -41,7 +41,7 @@ class InferenceConfig(PrintableConfig):
     # NOT EXPORTED PARAMS
     lip_normalize_threshold: float = 0.03  # threshold for flag_normalize_lip
     source_video_eye_retargeting_threshold: float = 0.18  # threshold for eyes retargeting if the input is a source video
-    driving_smooth_observation_variance: float = 3e-6  # smooth strength scalar for the animated video when the input is a source video, the larger the number, the smoother the animated video; too much smoothness would result in loss of motion accuracy
+    driving_smooth_observation_variance: float = 1e-7  # smooth strength scalar for the animated video when the input is a source video, the larger the number, the smoother the animated video; too much smoothness would result in loss of motion accuracy
     anchor_frame: int = 0  # TO IMPLEMENT
 
     input_shape: Tuple[int, int] = (256, 256)  # input shape
diff --git a/src/gradio_pipeline.py b/src/gradio_pipeline.py
index 7b01370..cbe898e 100644
--- a/src/gradio_pipeline.py
+++ b/src/gradio_pipeline.py
@@ -48,7 +48,7 @@ class GradioPipeline(LivePortraitPipeline):
         scale_crop_driving_video=2.2,
         vx_ratio_crop_driving_video=0.0,
         vy_ratio_crop_driving_video=-0.1,
-        driving_smooth_observation_variance=3e-6,
+        driving_smooth_observation_variance=1e-7,
         tab_selection=None,
     ):
         """ for video-driven potrait animation or video editing
diff --git a/src/live_portrait_pipeline.py b/src/live_portrait_pipeline.py
index 59fc9e1..e344ffc 100644
--- a/src/live_portrait_pipeline.py
+++ b/src/live_portrait_pipeline.py
@@ -367,7 +367,8 @@ class LivePortraitPipeline(object):
         if flag_source_has_audio or flag_driving_has_audio:
             # final result with concatenation
             wfp_concat_with_audio = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}_concat_with_audio.mp4')
-            audio_from_which_video = args.source if flag_source_has_audio else args.driving
+            # audio_from_which_video = args.source if flag_source_has_audio else args.driving  # default source audio
+            audio_from_which_video = args.driving if flag_driving_has_audio else args.source  # default driving audio
             log(f"Audio is selected from {audio_from_which_video}, concat mode")
             add_audio_to_video(wfp_concat, audio_from_which_video, wfp_concat_with_audio)
             os.replace(wfp_concat_with_audio, wfp_concat)
@@ -383,7 +384,8 @@ class LivePortraitPipeline(object):
         ######### build the final result #########
         if flag_source_has_audio or flag_driving_has_audio:
             wfp_with_audio = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}_with_audio.mp4')
-            audio_from_which_video = args.source if flag_source_has_audio else args.driving
+            # audio_from_which_video = args.source if flag_source_has_audio else args.driving  # default source audio
+            audio_from_which_video = args.driving if flag_driving_has_audio else args.source  # default driving audio
             log(f"Audio is selected from {audio_from_which_video}")
             add_audio_to_video(wfp, audio_from_which_video, wfp_with_audio)
             os.replace(wfp_with_audio, wfp)
diff --git a/src/utils/filter.py b/src/utils/filter.py
index 2ee6abc..5238f49 100644
--- a/src/utils/filter.py
+++ b/src/utils/filter.py
@@ -5,7 +5,7 @@ import numpy as np
 from pykalman import KalmanFilter
 
 
-def smooth(x_d_lst, shape, device, observation_variance=3e-6, process_variance=1e-5):
+def smooth(x_d_lst, shape, device, observation_variance=1e-7, process_variance=1e-5):
     x_d_lst_reshape = [x.reshape(-1) for x in x_d_lst]
     x_d_stacked = np.vstack(x_d_lst_reshape)
     kf = KalmanFilter(
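
Notes (not part of the patch):

The driving_smooth_observation_variance default drops from 3e-6 to 1e-7 across the configs, the Gradio UI, and src/utils/filter.py. In the Kalman smoother this value is the observation covariance: a smaller value tells the filter to trust the per-frame measurements more, so the animated motion tracks the driving video more tightly; larger values smooth harder at the cost of motion accuracy, as the inline comments warn. A minimal sketch of that trade-off, assuming pykalman is installed and using a synthetic 1-D track instead of the repository's motion tensors (smooth_1d and the sine signal are invented here for illustration):

import numpy as np
from pykalman import KalmanFilter

def smooth_1d(measurements, observation_variance, process_variance=1e-5):
    # Identity dynamics (pykalman's default transition matrix), echoing the
    # setup in src/utils/filter.py: each state is modeled as the previous
    # state plus process noise.
    kf = KalmanFilter(
        initial_state_mean=measurements[0],
        transition_covariance=process_variance * np.eye(1),
        observation_covariance=observation_variance * np.eye(1),
    )
    smoothed_means, _ = kf.smooth(measurements.reshape(-1, 1))
    return smoothed_means.ravel()

rng = np.random.default_rng(0)
t = np.linspace(0.0, 2.0 * np.pi, 120)
raw = np.sin(t) + rng.normal(scale=0.05, size=t.shape)  # a jittery motion track

old_default = smooth_1d(raw, observation_variance=3e-6)  # smooths harder
new_default = smooth_1d(raw, observation_variance=1e-7)  # stays nearer the raw track
print("mean |old - raw|:", np.abs(old_default - raw).mean())
print("mean |new - raw|:", np.abs(new_default - raw).mean())

The new default trades some jitter suppression for motion fidelity, consistent with the inline comment that too much smoothness loses motion accuracy.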
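
The audio default flips as well: the old ternary in src/live_portrait_pipeline.py preferred the source video's audio whenever the source had any; the new one prefers the driving video's audio and falls back to the source. Condensed into a hypothetical helper purely for illustration (pick_audio_source is not a name in the repository):

def pick_audio_source(source, driving, driving_has_audio):
    # New behavior from this patch: the driving video's audio wins when
    # present; otherwise fall back to the source video's audio. Callers
    # guard that at least one of the two carries an audio track.
    return driving if driving_has_audio else source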