mirror of https://github.com/KwaiVGI/LivePortrait.git
synced 2024-12-22 12:22:38 +00:00

chore: change default audio, smooth, autoplay

parent 51543310a2
commit 085b535020

app.py (26 changed lines)
@@ -69,12 +69,12 @@ data_examples_i2v = [
     [osp.join(example_portrait_dir, "s2.jpg"), osp.join(example_video_dir, "d13.mp4"), True, True, True, True],
 ]
 data_examples_v2v = [
-    [osp.join(example_portrait_dir, "s13.mp4"), osp.join(example_video_dir, "d0.mp4"), True, True, True, False, False, 3e-6],
-    # [osp.join(example_portrait_dir, "s14.mp4"), osp.join(example_video_dir, "d18.mp4"), True, True, True, False, False, 3e-6],
-    # [osp.join(example_portrait_dir, "s15.mp4"), osp.join(example_video_dir, "d19.mp4"), True, True, True, False, False, 3e-6],
-    [osp.join(example_portrait_dir, "s18.mp4"), osp.join(example_video_dir, "d6.mp4"), True, True, True, False, False, 3e-6],
-    # [osp.join(example_portrait_dir, "s19.mp4"), osp.join(example_video_dir, "d6.mp4"), True, True, True, False, False, 3e-6],
-    [osp.join(example_portrait_dir, "s20.mp4"), osp.join(example_video_dir, "d0.mp4"), True, True, True, False, False, 3e-6],
+    [osp.join(example_portrait_dir, "s13.mp4"), osp.join(example_video_dir, "d0.mp4"), True, True, True, False, False, 1e-7],
+    # [osp.join(example_portrait_dir, "s14.mp4"), osp.join(example_video_dir, "d18.mp4"), True, True, True, False, False, 1e-7],
+    # [osp.join(example_portrait_dir, "s15.mp4"), osp.join(example_video_dir, "d19.mp4"), True, True, True, False, False, 1e-7],
+    [osp.join(example_portrait_dir, "s18.mp4"), osp.join(example_video_dir, "d6.mp4"), True, True, True, False, False, 1e-7],
+    # [osp.join(example_portrait_dir, "s19.mp4"), osp.join(example_video_dir, "d6.mp4"), True, True, True, False, False, 1e-7],
+    [osp.join(example_portrait_dir, "s20.mp4"), osp.join(example_video_dir, "d0.mp4"), True, True, True, False, False, 1e-7],
 ]

 #################### interface logic ####################

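Only the trailing float of each v2v example row changes, from 3e-6 to 1e-7. Judging from the motion-smooth-strength control updated later in this diff, that column is each example's driving_smooth_observation_variance, so the bundled examples now default to lighter motion smoothing.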
@@ -84,10 +84,10 @@ lip_retargeting_slider = gr.Slider(minimum=0, maximum=0.8, step=0.01, label="tar
 retargeting_input_image = gr.Image(type="filepath")
 output_image = gr.Image(type="numpy")
 output_image_paste_back = gr.Image(type="numpy")
-output_video_i2v = gr.Video(autoplay=True)
-output_video_concat_i2v = gr.Video(autoplay=True)
-output_video_v2v = gr.Video(autoplay=True)
-output_video_concat_v2v = gr.Video(autoplay=True)
+output_video_i2v = gr.Video(autoplay=False)
+output_video_concat_i2v = gr.Video(autoplay=False)
+# output_video_v2v = gr.Video(autoplay=False)
+# output_video_concat_v2v = gr.Video(autoplay=False)


 with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta Sans")])) as demo:
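The two image-to-video outputs stop autoplaying, and the v2v pair is left commented out in the new code. For readers unfamiliar with the flag, here is a minimal, self-contained sketch, not the app's actual layout (the labels are invented), of video outputs that wait for a manual play click; autoplay is a standard gr.Video parameter:

import gradio as gr

with gr.Blocks() as demo:
    # autoplay=False: the browser renders the result paused
    output_video_i2v = gr.Video(autoplay=False, label="animated video")
    output_video_concat_i2v = gr.Video(autoplay=False, label="animated video (concat)")

if __name__ == "__main__":
    demo.launch()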
@@ -135,7 +135,7 @@ with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta San
 with gr.Accordion(open=True, label="Cropping Options for Source Image or Video"):
     with gr.Row():
         flag_do_crop_input = gr.Checkbox(value=True, label="do crop (source)")
-        scale = gr.Number(value=2.3, label="source crop scale", minimum=1.8, maximum=2.9, step=0.05)
+        scale = gr.Number(value=2.3, label="source crop scale", minimum=1.8, maximum=3.2, step=0.05)
         vx_ratio = gr.Number(value=0.0, label="source crop x", minimum=-0.5, maximum=0.5, step=0.01)
         vy_ratio = gr.Number(value=-0.125, label="source crop y", minimum=-0.5, maximum=0.5, step=0.01)

@@ -158,7 +158,7 @@ with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta San
 with gr.Accordion(open=True, label="Cropping Options for Driving Video"):
     with gr.Row():
         flag_crop_driving_video_input = gr.Checkbox(value=False, label="do crop (driving)")
-        scale_crop_driving_video = gr.Number(value=2.2, label="driving crop scale", minimum=1.8, maximum=2.9, step=0.05)
+        scale_crop_driving_video = gr.Number(value=2.2, label="driving crop scale", minimum=1.8, maximum=3.2, step=0.05)
         vx_ratio_crop_driving_video = gr.Number(value=0.0, label="driving crop x", minimum=-0.5, maximum=0.5, step=0.01)
         vy_ratio_crop_driving_video = gr.Number(value=-0.1, label="driving crop y", minimum=-0.5, maximum=0.5, step=0.01)

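Together with the previous hunk, both crop-scale spinners keep their defaults (2.3 for the source, 2.2 for the driving video) but raise the UI maximum from 2.9 to 3.2. Since a larger scale shrinks the face-area ratio of the crop (per the scale comment in ArgumentConfig below), this permits noticeably wider crops than before.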
@@ -168,7 +168,7 @@ with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta San
 flag_relative_input = gr.Checkbox(value=True, label="relative motion")
 flag_remap_input = gr.Checkbox(value=True, label="paste-back")
 flag_video_editing_head_rotation = gr.Checkbox(value=False, label="relative head rotation (v2v)")
-driving_smooth_observation_variance = gr.Number(value=3e-6, label="motion smooth strength (v2v)", minimum=1e-11, maximum=1e-2, step=1e-11)
+driving_smooth_observation_variance = gr.Number(value=1e-7, label="motion smooth strength (v2v)", minimum=1e-11, maximum=1e-2, step=1e-8)

 gr.Markdown(load_description("assets/gradio/gradio_description_animate_clear.md"))
 with gr.Row():
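Two separate changes land in this one control: the default value drops from 3e-6 to 1e-7, and the step widens from 1e-11 to 1e-8. Per the comment on this field in the config hunks below, a larger value gives a smoother animated video at the cost of motion accuracy, so the new default favors fidelity. The coarser step also makes the spinner usable at the new scale: at 1e-11 per click, nudging a 1e-7 value by even ten percent would take a thousand clicks.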
In the ArgumentConfig dataclass (file header lost in this mirror view; likely src/config/argument_config.py):

@@ -32,7 +32,7 @@ class ArgumentConfig(PrintableConfig):
     flag_relative_motion: bool = True  # whether to use relative motion
     flag_pasteback: bool = True  # whether to paste-back/stitch the animated face cropping from the face-cropping space to the original image space
     flag_do_crop: bool = True  # whether to crop the source portrait or video to the face-cropping space
-    driving_smooth_observation_variance: float = 3e-6  # smooth strength scalar for the animated video when the input is a source video; the larger the number, the smoother the animated video, but too much smoothness causes loss of motion accuracy
+    driving_smooth_observation_variance: float = 1e-7  # smooth strength scalar for the animated video when the input is a source video; the larger the number, the smoother the animated video, but too much smoothness causes loss of motion accuracy

     ########## source crop arguments ##########
     scale: float = 2.3  # the ratio of face area is smaller if scale is larger
In the InferenceConfig dataclass (likely src/config/inference_config.py):

@@ -41,7 +41,7 @@ class InferenceConfig(PrintableConfig):
     # NOT EXPORTED PARAMS
     lip_normalize_threshold: float = 0.03  # threshold for flag_normalize_lip
     source_video_eye_retargeting_threshold: float = 0.18  # threshold for eyes retargeting if the input is a source video
-    driving_smooth_observation_variance: float = 3e-6  # smooth strength scalar for the animated video when the input is a source video; the larger the number, the smoother the animated video, but too much smoothness causes loss of motion accuracy
+    driving_smooth_observation_variance: float = 1e-7  # smooth strength scalar for the animated video when the input is a source video; the larger the number, the smoother the animated video, but too much smoothness causes loss of motion accuracy
     anchor_frame: int = 0  # TO IMPLEMENT

     input_shape: Tuple[int, int] = (256, 256)  # input shape
In GradioPipeline (likely src/gradio_pipeline.py):

@@ -48,7 +48,7 @@ class GradioPipeline(LivePortraitPipeline):
         scale_crop_driving_video=2.2,
         vx_ratio_crop_driving_video=0.0,
         vy_ratio_crop_driving_video=-0.1,
-        driving_smooth_observation_variance=3e-6,
+        driving_smooth_observation_variance=1e-7,
         tab_selection=None,
     ):
         """ for video-driven portrait animation or video editing
In LivePortraitPipeline (likely src/live_portrait_pipeline.py):

@@ -367,7 +367,8 @@ class LivePortraitPipeline(object):
         if flag_source_has_audio or flag_driving_has_audio:
             # final result with concatenation
             wfp_concat_with_audio = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}_concat_with_audio.mp4')
-            audio_from_which_video = args.source if flag_source_has_audio else args.driving
+            # audio_from_which_video = args.source if flag_source_has_audio else args.driving  # default source audio
+            audio_from_which_video = args.driving if flag_driving_has_audio else args.source  # default driving audio
             log(f"Audio is selected from {audio_from_which_video}, concat mode")
             add_audio_to_video(wfp_concat, audio_from_which_video, wfp_concat_with_audio)
             os.replace(wfp_concat_with_audio, wfp_concat)
@@ -383,7 +384,8 @@ class LivePortraitPipeline(object):
         ######### build the final result #########
         if flag_source_has_audio or flag_driving_has_audio:
             wfp_with_audio = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}_with_audio.mp4')
-            audio_from_which_video = args.source if flag_source_has_audio else args.driving
+            # audio_from_which_video = args.source if flag_source_has_audio else args.driving  # default source audio
+            audio_from_which_video = args.driving if flag_driving_has_audio else args.source  # default driving audio
             log(f"Audio is selected from {audio_from_which_video}")
             add_audio_to_video(wfp, audio_from_which_video, wfp_with_audio)
             os.replace(wfp_with_audio, wfp)
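Both audio hunks make the same flip: the old ternary preferred the source's track whenever the source had audio, while the new one prefers the driving video's track whenever the driving video has audio. The two expressions disagree only when both inputs carry audio. A tiny self-contained sketch of the decision (the helper is hypothetical, written only to show the truth table):

def pick_audio(source, driving, source_has_audio, driving_has_audio):
    # old default: prefer the source's audio track
    old = source if source_has_audio else driving
    # new default: prefer the driving video's audio track
    new = driving if driving_has_audio else source
    return old, new

# The two defaults differ only when both inputs have audio:
print(pick_audio("s.mp4", "d.mp4", True, True))   # ('s.mp4', 'd.mp4')
print(pick_audio("s.mp4", "d.mp4", True, False))  # ('s.mp4', 's.mp4')
print(pick_audio("s.mp4", "d.mp4", False, True))  # ('d.mp4', 'd.mp4')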
In the smoothing utility (likely src/utils/filter.py):

@@ -5,7 +5,7 @@ import numpy as np
 from pykalman import KalmanFilter


-def smooth(x_d_lst, shape, device, observation_variance=3e-6, process_variance=1e-5):
+def smooth(x_d_lst, shape, device, observation_variance=1e-7, process_variance=1e-5):
     x_d_lst_reshape = [x.reshape(-1) for x in x_d_lst]
     x_d_stacked = np.vstack(x_d_lst_reshape)
     kf = KalmanFilter(
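For intuition on why a smaller observation_variance means weaker smoothing: in a Kalman smoother the observation covariance encodes how noisy the measurements are assumed to be, so shrinking it tells the filter to trust the raw per-frame motion more and deviate from it less. A minimal sketch in the spirit of the smooth() function above (the helper name and the identity-covariance setup are assumptions; pykalman defaults unspecified transition and observation matrices to identities):

import numpy as np
from pykalman import KalmanFilter

def smooth_sketch(x_d_stacked, observation_variance=1e-7, process_variance=1e-5):
    # x_d_stacked: (n_frames, dim) array of flattened per-frame motion params
    dim = x_d_stacked.shape[1]
    kf = KalmanFilter(
        initial_state_mean=x_d_stacked[0],
        n_dim_obs=dim,
        transition_covariance=process_variance * np.eye(dim),
        observation_covariance=observation_variance * np.eye(dim),
    )
    smoothed_means, _ = kf.smooth(x_d_stacked)
    return smoothed_means

# toy usage: a noisy 1-D ramp, smoothed with both defaults
traj = np.linspace(0, 1, 50)[:, None] + np.random.randn(50, 1) * 0.01
tight = smooth_sketch(traj, observation_variance=1e-7)  # hugs the raw data
loose = smooth_sketch(traj, observation_variance=3e-6)  # smooths harder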