* chore: fix v2v lip normalization and add smooth to non relative motion

---------

Co-authored-by: zhangdingyun <zhangdingyun@kuaishou.com>
Jianzhu Guo 2024-07-25 17:56:40 +08:00 committed by GitHub
parent d654a014da
commit 172b852127
2 changed files with 28 additions and 12 deletions

@@ -26,8 +26,8 @@ class ArgumentConfig(PrintableConfig):
     flag_normalize_lip: bool = True  # whether to let the lip to close state before animation, only take effect when flag_eye_retargeting and flag_lip_retargeting is False
     flag_source_video_eye_retargeting: bool = False  # when the input is a source video, whether to let the eye-open scalar of each frame to be the same as the first source frame before the animation, only take effect when flag_eye_retargeting and flag_lip_retargeting is False, may cause the inter-frame jittering
     flag_video_editing_head_rotation: bool = False  # when the input is a source video, whether to inherit the relative head rotation from the driving video
-    flag_eye_retargeting: bool = False  # not recommend to be True, WIP
-    flag_lip_retargeting: bool = False  # not recommend to be True, WIP
+    flag_eye_retargeting: bool = False  # not recommend to be True, WIP; whether to transfer the eyes-open ratio of each driving frame to the source image or the corresponding source frame
+    flag_lip_retargeting: bool = False  # not recommend to be True, WIP; whether to transfer the lip-open ratio of each driving frame to the source image or the corresponding source frame
     flag_stitching: bool = True  # recommend to True if head movement is small, False if head movement is large
     flag_relative_motion: bool = True  # whether to use relative motion
     flag_pasteback: bool = True  # whether to paste-back/stitch the animated face cropping from the face-cropping space to the original image space
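
The comments above document each flag in isolation; the pipeline hunks below change how they interact (lip normalization now also requires flag_relative_motion). Below is a self-contained toy sketch of the relevant flags and how a non-relative run would be configured; ToyArgumentConfig is a stand-in for illustration, not the repository's PrintableConfig subclass.

# Self-contained sketch (not the repository class): a stand-in for ArgumentConfig showing how
# the flags above are meant to combine after this commit. Field names mirror the diff; the
# dataclass itself is a toy.
from dataclasses import dataclass

@dataclass
class ToyArgumentConfig:
    flag_normalize_lip: bool = True                 # only effective together with flag_relative_motion now
    flag_video_editing_head_rotation: bool = False  # honored in both relative and absolute branches
    flag_relative_motion: bool = True               # switches between the two expression paths below
    flag_stitching: bool = True

args = ToyArgumentConfig(flag_relative_motion=False, flag_normalize_lip=True)
# With relative motion disabled, the pipeline hunks below skip lip normalization entirely
# and instead smooth the absolute driving expressions (and rotations, if enabled).
print(args)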

@@ -193,12 +193,20 @@ class LivePortraitPipeline(object):
             I_s_lst = self.live_portrait_wrapper.prepare_videos(img_crop_256x256_lst)
             source_template_dct = self.make_motion_template(I_s_lst, c_s_eyes_lst, c_s_lip_lst, output_fps=source_fps)
-            x_d_exp_lst = [source_template_dct['motion'][i]['exp'] + driving_template_dct['motion'][i]['exp'] - driving_template_dct['motion'][0]['exp'] for i in range(n_frames)]
-            x_d_exp_lst_smooth = smooth(x_d_exp_lst, source_template_dct['motion'][0]['exp'].shape, device, inf_cfg.driving_smooth_observation_variance)
-            if inf_cfg.flag_video_editing_head_rotation:
-                key_r = 'R' if 'R' in driving_template_dct['motion'][0].keys() else 'R_d'  # compatible with previous keys
-                x_d_r_lst = [(np.dot(driving_template_dct['motion'][i][key_r], driving_template_dct['motion'][0][key_r].transpose(0, 2, 1))) @ source_template_dct['motion'][i]['R'] for i in range(n_frames)]
-                x_d_r_lst_smooth = smooth(x_d_r_lst, source_template_dct['motion'][0]['R'].shape, device, inf_cfg.driving_smooth_observation_variance)
+            key_r = 'R' if 'R' in driving_template_dct['motion'][0].keys() else 'R_d'  # compatible with previous keys
+            if inf_cfg.flag_relative_motion:
+                x_d_exp_lst = [source_template_dct['motion'][i]['exp'] + driving_template_dct['motion'][i]['exp'] - driving_template_dct['motion'][0]['exp'] for i in range(n_frames)]
+                x_d_exp_lst_smooth = smooth(x_d_exp_lst, source_template_dct['motion'][0]['exp'].shape, device, inf_cfg.driving_smooth_observation_variance)
+                if inf_cfg.flag_video_editing_head_rotation:
+                    x_d_r_lst = [(np.dot(driving_template_dct['motion'][i][key_r], driving_template_dct['motion'][0][key_r].transpose(0, 2, 1))) @ source_template_dct['motion'][i]['R'] for i in range(n_frames)]
+                    x_d_r_lst_smooth = smooth(x_d_r_lst, source_template_dct['motion'][0]['R'].shape, device, inf_cfg.driving_smooth_observation_variance)
+            else:
+                x_d_exp_lst = [driving_template_dct['motion'][i]['exp'] for i in range(n_frames)]
+                x_d_exp_lst_smooth = smooth(x_d_exp_lst, source_template_dct['motion'][0]['exp'].shape, device, inf_cfg.driving_smooth_observation_variance)
+                if inf_cfg.flag_video_editing_head_rotation:
+                    x_d_r_lst = [driving_template_dct['motion'][i][key_r] for i in range(n_frames)]
+                    x_d_r_lst_smooth = smooth(x_d_r_lst, source_template_dct['motion'][0]['R'].shape, device, inf_cfg.driving_smooth_observation_variance)
         else:  # if the input is a source image, process it only once
             if inf_cfg.flag_do_crop:
                 crop_info = self.cropper.crop_source_image(source_rgb_lst[0], crop_cfg)
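
A note on the hunk above: with flag_relative_motion on, the driving expression is applied as a per-frame delta on top of the source video's own expression (exp_s[i] + exp_d[i] - exp_d[0]); the new else branch uses the driving expression as-is, and both paths now go through smooth(). Below is a self-contained toy of the two paths; the moving-average smooth_stub is only a stand-in for the repository's smooth(), which takes a target shape, a device, and an observation variance.

# Toy illustration (not repository code) of the relative vs. non-relative expression paths.
import numpy as np

def smooth_stub(seq, win=3):
    # Stand-in smoother: simple moving average over the frame axis.
    arr = np.stack(seq)                              # (n_frames, ...)
    kernel = np.ones(win) / win
    out = np.apply_along_axis(lambda v: np.convolve(v, kernel, mode='same'), 0, arr)
    return [out[i] for i in range(len(seq))]

n_frames = 5
exp_s = [np.random.randn(21, 3) for _ in range(n_frames)]  # per-frame source expression (toy shape)
exp_d = [np.random.randn(21, 3) for _ in range(n_frames)]  # per-frame driving expression (toy shape)

flag_relative_motion = False
if flag_relative_motion:
    # relative: driving motion applied as a delta w.r.t. its first frame
    x_d_exp_lst = [exp_s[i] + exp_d[i] - exp_d[0] for i in range(n_frames)]
else:
    # absolute: driving expression taken directly (the branch this commit adds smoothing to)
    x_d_exp_lst = [exp_d[i] for i in range(n_frames)]

x_d_exp_lst_smooth = smooth_stub(x_d_exp_lst)
print(len(x_d_exp_lst_smooth), x_d_exp_lst_smooth[0].shape)
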
@@ -217,7 +225,7 @@ class LivePortraitPipeline(object):
             x_s = self.live_portrait_wrapper.transform_keypoint(x_s_info)
             # let lip-open scalar to be 0 at first
-            if flag_normalize_lip and source_lmk is not None:
+            if flag_normalize_lip and inf_cfg.flag_relative_motion and source_lmk is not None:
                 c_d_lip_before_animation = [0.]
                 combined_lip_ratio_tensor_before_animation = self.live_portrait_wrapper.calc_combined_lip_ratio(c_d_lip_before_animation, source_lmk)
                 if combined_lip_ratio_tensor_before_animation[0][0] >= inf_cfg.lip_normalize_threshold:
@@ -244,11 +252,13 @@ class LivePortraitPipeline(object):
             x_s = self.live_portrait_wrapper.transform_keypoint(x_s_info)
             # let lip-open scalar to be 0 at first if the input is a video
-            if flag_normalize_lip and source_lmk is not None:
+            if flag_normalize_lip and inf_cfg.flag_relative_motion and source_lmk is not None:
                 c_d_lip_before_animation = [0.]
                 combined_lip_ratio_tensor_before_animation = self.live_portrait_wrapper.calc_combined_lip_ratio(c_d_lip_before_animation, source_lmk)
                 if combined_lip_ratio_tensor_before_animation[0][0] >= inf_cfg.lip_normalize_threshold:
                     lip_delta_before_animation = self.live_portrait_wrapper.retarget_lip(x_s, combined_lip_ratio_tensor_before_animation)
+            else:
+                lip_delta_before_animation = None

             # let eye-open scalar to be the same as the first frame if the latter is eye-open state
             if flag_source_video_eye_retargeting and source_lmk is not None:
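
The added else above is the v2v lip-normalization fix: when normalization is skipped (which now includes the case where flag_relative_motion is off), lip_delta_before_animation is still assigned, so the variable is bound on that path for the rest of the pipeline. A toy sketch of the guard pattern, with illustrative stand-in values rather than repository defaults:

# Toy sketch (not repository code) of the guard pattern introduced above.
flag_normalize_lip = True
flag_relative_motion = False       # e.g. absolute driving motion
source_lmk_available = True
lip_normalize_threshold = 0.03     # illustrative stand-in for inf_cfg.lip_normalize_threshold
combined_lip_ratio = 0.10          # illustrative stand-in for calc_combined_lip_ratio(...)[0][0]

lip_delta_before_animation = None  # defined up front in this toy for safety
if flag_normalize_lip and flag_relative_motion and source_lmk_available:
    if combined_lip_ratio >= lip_normalize_threshold:
        lip_delta_before_animation = "lip delta"  # stand-in for retarget_lip(x_s, ...)
else:
    lip_delta_before_animation = None  # the added branch: normalization skipped, value stays None

print(lip_delta_before_animation)      # downstream code can now test `is not None`
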
@@ -284,8 +294,14 @@ class LivePortraitPipeline(object):
                     scale_new = x_s_info['scale'] if flag_is_source_video else x_s_info['scale'] * (x_d_i_info['scale'] / x_d_0_info['scale'])
                     t_new = x_s_info['t'] if flag_is_source_video else x_s_info['t'] + (x_d_i_info['t'] - x_d_0_info['t'])
                 else:
-                    R_new = R_d_i
-                    delta_new = x_d_i_info['exp']
+                    if flag_is_source_video:
+                        if inf_cfg.flag_video_editing_head_rotation:
+                            R_new = x_d_r_lst_smooth[i]
+                        else:
+                            R_new = R_s
+                    else:
+                        R_new = R_d_i
+                    delta_new = x_d_exp_lst_smooth[i] if flag_is_source_video else x_d_i_info['exp']
                     scale_new = x_s_info['scale']
                     t_new = x_d_i_info['t']
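
To summarize the non-relative branch above: for a source video, head rotation comes from the smoothed driving rotations when flag_video_editing_head_rotation is set and otherwise stays at the source rotation R_s, while the expression comes from the smoothed per-frame list; for a source image, the current driving frame's rotation and expression are used directly, as before. A small stand-alone restatement of that selection (names mirror the diff; this is not the pipeline code itself):

# Toy restatement (not repository code) of the rotation/expression selection added above.
def pick_absolute_motion(i, flag_is_source_video, flag_video_editing_head_rotation,
                         R_s, R_d_i, x_d_r_lst_smooth, x_d_exp_lst_smooth, x_d_i_exp):
    if flag_is_source_video:
        # source video: smoothed driving rotation if head-rotation editing is on, else keep R_s
        R_new = x_d_r_lst_smooth[i] if flag_video_editing_head_rotation else R_s
    else:
        # source image: current driving frame's rotation, as before
        R_new = R_d_i
    # expression: smoothed per-frame list for videos, raw driving expression for images
    delta_new = x_d_exp_lst_smooth[i] if flag_is_source_video else x_d_i_exp
    return R_new, delta_new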