diff --git a/src/config/argument_config.py b/src/config/argument_config.py
index 5930dbe..435ab05 100644
--- a/src/config/argument_config.py
+++ b/src/config/argument_config.py
@@ -3,11 +3,10 @@
 """
 All configs for user
 """
-
 from dataclasses import dataclass
 import tyro
 from typing_extensions import Annotated
-from typing import Optional
+from typing import Optional, Literal
 from .base_config import PrintableConfig, make_abs_path
 
 
@@ -33,13 +32,15 @@ class ArgumentConfig(PrintableConfig):
     flag_pasteback: bool = True  # whether to paste-back/stitch the animated face cropping from the face-cropping space to the original image space
     flag_do_crop: bool = True  # whether to crop the source portrait or video to the face-cropping space
     driving_smooth_observation_variance: float = 3e-7  # smooth strength scalar for the animated video when the input is a source video, the larger the number, the smoother the animated video; too much smoothness would result in loss of motion accuracy
-
+    audio_priority: Literal['source', 'driving'] = 'driving'  # whether to keep the audio track from the source or the driving video
     ########## source crop arguments ##########
     det_thresh: float = 0.15  # detection threshold
     scale: float = 2.3  # the ratio of face area is smaller if scale is larger
     vx_ratio: float = 0  # the ratio to move the face to left or right in cropping space
     vy_ratio: float = -0.125  # the ratio to move the face to up or down in cropping space
     flag_do_rot: bool = True  # whether to conduct the rotation when flag_do_crop is True
+    source_max_dim: int = 1280  # the maximum dimension (height or width) of the source image or video; you can raise it to a larger value, e.g., 1920
+    source_division: int = 2  # ensure the height and width of the source image or video are divisible by this number
 
     ########## driving crop arguments ##########
     scale_crop_driving_video: float = 2.2  # scale factor for cropping driving video
diff --git a/src/config/inference_config.py b/src/config/inference_config.py
index c1f8653..aa06203 100644
--- a/src/config/inference_config.py
+++ b/src/config/inference_config.py
@@ -37,11 +37,13 @@ class InferenceConfig(PrintableConfig):
     flag_do_rot: bool = True
     flag_force_cpu: bool = False
     flag_do_torch_compile: bool = False
+    driving_smooth_observation_variance: float = 3e-7  # smooth strength scalar for the animated video when the input is a source video, the larger the number, the smoother the animated video; too much smoothness would result in loss of motion accuracy
+    source_max_dim: int = 1280  # the maximum dimension (height or width) of the source image or video
+    source_division: int = 2  # ensure the height and width of the source image or video are divisible by this number
 
     # NOT EXPORTED PARAMS
     lip_normalize_threshold: float = 0.03  # threshold for flag_normalize_lip
     source_video_eye_retargeting_threshold: float = 0.18  # threshold for eyes retargeting if the input is a source video
-    driving_smooth_observation_variance: float = 3e-7  # smooth strength scalar for the animated video when the input is a source video, the larger the number, the smoother the animated video; too much smoothness would result in loss of motion accuracy
     anchor_frame: int = 0  # TO IMPLEMENT
 
     input_shape: Tuple[int, int] = (256, 256)  # input shape
@@ -51,5 +53,3 @@ class InferenceConfig(PrintableConfig):
     mask_crop: ndarray = field(default_factory=lambda: cv2.imread(make_abs_path('../utils/resources/mask_template.png'), cv2.IMREAD_COLOR))
     size_gif: int = 256  # default gif size, TO IMPLEMENT
 
-    source_max_dim: int = 1280  # the max dim of height and width of source image or video
-    source_division: int = 2  # make sure the height and width of source image or video can be divided by this number
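A note on the three knobs added to `ArgumentConfig` (and mirrored in `InferenceConfig`): `audio_priority` feeds the audio selection in `live_portrait_pipeline.py` below, while `source_max_dim` and `source_division` are consumed when the source input is resized, plausibly via the `resize_to_limit` helper already imported from `src/utils/io.py` in the pipeline. A minimal sketch of the intended resize contract, assuming a cap-then-trim behavior; this is an illustration, not a copy of the actual implementation:

```python
import cv2
import numpy as np

def resize_to_limit_sketch(img: np.ndarray, max_dim: int = 1280, division: int = 2) -> np.ndarray:
    """Illustrative only: cap the longer side at `max_dim`, then trim so both
    sides are divisible by `division`."""
    h, w = img.shape[:2]
    if max_dim > 0 and max(h, w) > max_dim:
        scale = max_dim / max(h, w)  # shrink while preserving the aspect ratio
        img = cv2.resize(img, (int(w * scale), int(h * scale)))
    division = max(division, 1)
    new_h = img.shape[0] - (img.shape[0] % division)  # round down to a multiple
    new_w = img.shape[1] - (img.shape[1] % division)
    return img[:new_h, :new_w] if (new_h and new_w) else img
```

Trimming to a multiple of `source_division` matters because downstream warping and video-encoding stages typically require even frame dimensions.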
diff --git a/src/live_portrait_pipeline.py b/src/live_portrait_pipeline.py
index 95c2a50..683a1ce 100644
--- a/src/live_portrait_pipeline.py
+++ b/src/live_portrait_pipeline.py
@@ -19,9 +19,9 @@ from .config.crop_config import CropConfig
 from .utils.cropper import Cropper
 from .utils.camera import get_rotation_matrix
 from .utils.video import images2video, concat_frames, get_fps, add_audio_to_video, has_audio_stream
-from .utils.crop import _transform_img, prepare_paste_back, paste_back
+from .utils.crop import prepare_paste_back, paste_back
 from .utils.io import load_image_rgb, load_video, resize_to_limit, dump, load
-from .utils.helper import mkdir, basename, dct2device, is_video, is_template, remove_suffix, is_image
+from .utils.helper import mkdir, basename, dct2device, is_video, is_template, remove_suffix, is_image, is_square_video
 from .utils.filter import smooth
 from .utils.rprint import rlog as log
 # from .utils.viz import viz_lmk
@@ -137,7 +137,7 @@ class LivePortraitPipeline(object):
                 driving_rgb_lst = driving_rgb_lst[:n_frames]
             else:
                 n_frames = driving_n_frames
-            if inf_cfg.flag_crop_driving_video:
+            if inf_cfg.flag_crop_driving_video or (not is_square_video(args.driving)):
                 ret_d = self.cropper.crop_driving_video(driving_rgb_lst)
                 log(f'Driving video is cropped, {len(ret_d["frame_crop_lst"])} frames are processed.')
                 if len(ret_d["frame_crop_lst"]) is not n_frames:
@@ -382,8 +382,7 @@ class LivePortraitPipeline(object):
         if flag_source_has_audio or flag_driving_has_audio:
             # final result with concatenation
             wfp_concat_with_audio = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}_concat_with_audio.mp4')
-            # audio_from_which_video = args.source if flag_source_has_audio else args.driving  # default source audio
-            audio_from_which_video = args.driving if flag_driving_has_audio else args.source  # default driving audio
+            audio_from_which_video = args.driving if ((flag_driving_has_audio and args.audio_priority == 'driving') or (not flag_source_has_audio)) else args.source
             log(f"Audio is selected from {audio_from_which_video}, concat mode")
             add_audio_to_video(wfp_concat, audio_from_which_video, wfp_concat_with_audio)
             os.replace(wfp_concat_with_audio, wfp_concat)
@@ -399,8 +398,7 @@ class LivePortraitPipeline(object):
         ######### build the final result #########
         if flag_source_has_audio or flag_driving_has_audio:
             wfp_with_audio = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}_with_audio.mp4')
-            # audio_from_which_video = args.source if flag_source_has_audio else args.driving  # default source audio
-            audio_from_which_video = args.driving if flag_driving_has_audio else args.source  # default driving audio
+            audio_from_which_video = args.driving if ((flag_driving_has_audio and args.audio_priority == 'driving') or (not flag_source_has_audio)) else args.source
             log(f"Audio is selected from {audio_from_which_video}")
             add_audio_to_video(wfp, audio_from_which_video, wfp_with_audio)
             os.replace(wfp_with_audio, wfp)
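Two behavioral changes here are worth spelling out. First, the driving video is now cropped whenever it is not square, even with `flag_crop_driving_video` off, presumably because the downstream motion pipeline expects a square driving crop. Second, the hard-coded "default driving audio" rule is replaced by the `audio_priority` setting. Restated as a standalone helper, hypothetical and written out only to make the branching explicit; the patch inlines the conditional:

```python
def select_audio_source(args, flag_source_has_audio: bool, flag_driving_has_audio: bool) -> str:
    """Pick the file whose audio track is muxed into the output.

    Only reached when at least one input has audio (the callers guard with
    `flag_source_has_audio or flag_driving_has_audio`).
    """
    if (flag_driving_has_audio and args.audio_priority == 'driving') or not flag_source_has_audio:
        return args.driving  # preferred by the user, or the only audible input
    return args.source       # audio_priority == 'source', or the driving clip is silent
```

Note the fallback: with `audio_priority='source'` but a silent source, the driving audio is still used, and vice versa, so the option expresses a preference rather than a hard requirement.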
diff --git a/src/utils/cropper.py b/src/utils/cropper.py
index c42e74b..67cbf26 100644
--- a/src/utils/cropper.py
+++ b/src/utils/cropper.py
@@ -135,6 +135,7 @@ class Cropper(object):
 
         return lmk
 
+    # TODO: support skipping frame with NO FACE
     def crop_source_video(self, source_rgb_lst, crop_cfg: CropConfig, **kwargs):
         """Tracking based landmarks/alignment and cropping"""
         trajectory = Trajectory()
@@ -157,8 +158,10 @@ class Cropper(object):
                 lmk = self.landmark_runner.run(frame_rgb, lmk)
                 trajectory.start, trajectory.end = idx, idx
             else:
+                # TODO: add IOU check for tracking
                 lmk = self.landmark_runner.run(frame_rgb, trajectory.lmk_lst[-1])
                 trajectory.end = idx
+
             trajectory.lmk_lst.append(lmk)
 
             # crop the face
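The `# TODO: add IOU check for tracking` marks a known weakness of seeding each frame's landmarks from the previous frame: the track can drift, or latch onto the wrong face after a scene change. A sketch of what such a check could look like; `bbox_from_lmk`, `iou`, and the 0.5 threshold are illustrative and not part of this patch:

```python
import numpy as np

def bbox_from_lmk(lmk: np.ndarray) -> np.ndarray:
    """Axis-aligned [x0, y0, x1, y1] box around an (N, 2) landmark array."""
    return np.concatenate([lmk.min(axis=0), lmk.max(axis=0)])

def iou(a: np.ndarray, b: np.ndarray) -> float:
    """Intersection-over-union of two [x0, y0, x1, y1] boxes."""
    x0, y0 = np.maximum(a[:2], b[:2])
    x1, y1 = np.minimum(a[2:], b[2:])
    inter = max(0.0, x1 - x0) * max(0.0, y1 - y0)
    union = ((a[2] - a[0]) * (a[3] - a[1])
             + (b[2] - b[0]) * (b[3] - b[1]) - inter)
    return inter / (union + 1e-8)

# In the else-branch one could then re-detect instead of tracking:
#     if iou(bbox_from_lmk(lmk), bbox_from_lmk(trajectory.lmk_lst[-1])) < 0.5:
#         ...fall back to full face detection for this frame...
```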