diff --git a/src/config/argument_config.py b/src/config/argument_config.py
index 5930dbe..435ab05 100644
--- a/src/config/argument_config.py
+++ b/src/config/argument_config.py
@@ -3,11 +3,10 @@
 """
 All configs for user
 """
-
 from dataclasses import dataclass
 import tyro
 from typing_extensions import Annotated
-from typing import Optional
+from typing import Optional, Literal
 from .base_config import PrintableConfig, make_abs_path
 
 
@@ -33,13 +32,15 @@ class ArgumentConfig(PrintableConfig):
     flag_pasteback: bool = True  # whether to paste-back/stitch the animated face cropping from the face-cropping space to the original image space
     flag_do_crop: bool = True  # whether to crop the source portrait or video to the face-cropping space
     driving_smooth_observation_variance: float = 3e-7  # smooth strength scalar for the animated video when the input is a source video, the larger the number, the smoother the animated video; too much smoothness would result in loss of motion accuracy
-
+    audio_priority: Literal['source', 'driving'] = 'driving'  # whether to keep the audio track from the source or the driving video
     ########## source crop arguments ##########
     det_thresh: float = 0.15  # detection threshold
     scale: float = 2.3  # the ratio of face area is smaller if scale is larger
     vx_ratio: float = 0  # the ratio to move the face to left or right in cropping space
     vy_ratio: float = -0.125  # the ratio to move the face to up or down in cropping space
     flag_do_rot: bool = True  # whether to conduct the rotation when flag_do_crop is True
+    source_max_dim: int = 1280  # the maximum dimension (height or width) of the source image or video; you can raise it to a larger value, e.g., 1920
+    source_division: int = 2  # ensure the height and width of the source image or video are divisible by this number
 
     ########## driving crop arguments ##########
     scale_crop_driving_video: float = 2.2  # scale factor for cropping driving video
diff --git a/src/config/inference_config.py b/src/config/inference_config.py
index c1f8653..aa06203 100644
--- a/src/config/inference_config.py
+++ b/src/config/inference_config.py
@@ -37,11 +37,13 @@ class InferenceConfig(PrintableConfig):
     flag_do_rot: bool = True
     flag_force_cpu: bool = False
     flag_do_torch_compile: bool = False
+    driving_smooth_observation_variance: float = 3e-7  # smooth strength scalar for the animated video when the input is a source video, the larger the number, the smoother the animated video; too much smoothness would result in loss of motion accuracy
+    source_max_dim: int = 1280  # the maximum dimension (height or width) of the source image or video
+    source_division: int = 2  # ensure the height and width of the source image or video are divisible by this number
 
     # NOT EXPORTED PARAMS
     lip_normalize_threshold: float = 0.03  # threshold for flag_normalize_lip
     source_video_eye_retargeting_threshold: float = 0.18  # threshold for eyes retargeting if the input is a source video
-    driving_smooth_observation_variance: float = 3e-7  # smooth strength scalar for the animated video when the input is a source video, the larger the number, the smoother the animated video; too much smoothness would result in loss of motion accuracy
     anchor_frame: int = 0  # TO IMPLEMENT
 
     input_shape: Tuple[int, int] = (256, 256)  # input shape
@@ -51,5 +53,3 @@ class InferenceConfig(PrintableConfig):
     mask_crop: ndarray = field(default_factory=lambda: cv2.imread(make_abs_path('../utils/resources/mask_template.png'), cv2.IMREAD_COLOR))
     size_gif: int = 256  # default gif size, TO IMPLEMENT
 
-    source_max_dim: int = 1280  # the max dim of height and width of source image or video
-    source_division: int = 2  # make sure the height and width of source image or video can be divided by this number
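A note on the three knobs added to `ArgumentConfig` (and mirrored in `InferenceConfig`): `audio_priority` feeds the audio selection in `live_portrait_pipeline.py` below, while `source_max_dim` and `source_division` are consumed when the source input is resized, plausibly via the `resize_to_limit` helper already imported from `src/utils/io.py` in the pipeline. A minimal sketch of the intended resize contract, assuming a cap-then-trim behavior; this is an illustration, not a copy of the actual implementation:

```python
import cv2
import numpy as np

def resize_to_limit_sketch(img: np.ndarray, max_dim: int = 1280, division: int = 2) -> np.ndarray:
    """Illustrative only: cap the longer side at `max_dim`, then trim so both
    sides are divisible by `division`."""
    h, w = img.shape[:2]
    if max_dim > 0 and max(h, w) > max_dim:
        scale = max_dim / max(h, w)  # shrink while preserving the aspect ratio
        img = cv2.resize(img, (int(w * scale), int(h * scale)))
    division = max(division, 1)
    new_h = img.shape[0] - (img.shape[0] % division)  # round down to a multiple
    new_w = img.shape[1] - (img.shape[1] % division)
    return img[:new_h, :new_w] if (new_h and new_w) else img
```

Trimming to a multiple of `source_division` matters because downstream warping and video-encoding stages typically require even frame dimensions.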
diff --git a/src/live_portrait_pipeline.py b/src/live_portrait_pipeline.py
index 95c2a50..683a1ce 100644
--- a/src/live_portrait_pipeline.py
+++ b/src/live_portrait_pipeline.py
@@ -19,9 +19,9 @@ from .config.crop_config import CropConfig
 from .utils.cropper import Cropper
 from .utils.camera import get_rotation_matrix
 from .utils.video import images2video, concat_frames, get_fps, add_audio_to_video, has_audio_stream
-from .utils.crop import _transform_img, prepare_paste_back, paste_back
+from .utils.crop import prepare_paste_back, paste_back
 from .utils.io import load_image_rgb, load_video, resize_to_limit, dump, load
-from .utils.helper import mkdir, basename, dct2device, is_video, is_template, remove_suffix, is_image
+from .utils.helper import mkdir, basename, dct2device, is_video, is_template, remove_suffix, is_image, is_square_video
 from .utils.filter import smooth
 from .utils.rprint import rlog as log
 # from .utils.viz import viz_lmk
@@ -137,7 +137,7 @@ class LivePortraitPipeline(object):
                 driving_rgb_lst = driving_rgb_lst[:n_frames]
             else:
                 n_frames = driving_n_frames
-            if inf_cfg.flag_crop_driving_video:
+            if inf_cfg.flag_crop_driving_video or (not is_square_video(args.driving)):
                 ret_d = self.cropper.crop_driving_video(driving_rgb_lst)
                 log(f'Driving video is cropped, {len(ret_d["frame_crop_lst"])} frames are processed.')
                 if len(ret_d["frame_crop_lst"]) is not n_frames:
@@ -382,8 +382,7 @@ class LivePortraitPipeline(object):
         if flag_source_has_audio or flag_driving_has_audio:
             # final result with concatenation
             wfp_concat_with_audio = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}_concat_with_audio.mp4')
-            # audio_from_which_video = args.source if flag_source_has_audio else args.driving  # default source audio
-            audio_from_which_video = args.driving if flag_driving_has_audio else args.source  # default driving audio
+            audio_from_which_video = args.driving if ((flag_driving_has_audio and args.audio_priority == 'driving') or (not flag_source_has_audio)) else args.source
             log(f"Audio is selected from {audio_from_which_video}, concat mode")
             add_audio_to_video(wfp_concat, audio_from_which_video, wfp_concat_with_audio)
             os.replace(wfp_concat_with_audio, wfp_concat)
@@ -399,8 +398,7 @@ class LivePortraitPipeline(object):
         ######### build the final result #########
         if flag_source_has_audio or flag_driving_has_audio:
             wfp_with_audio = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}_with_audio.mp4')
-            # audio_from_which_video = args.source if flag_source_has_audio else args.driving  # default source audio
-            audio_from_which_video = args.driving if flag_driving_has_audio else args.source  # default driving audio
+            audio_from_which_video = args.driving if ((flag_driving_has_audio and args.audio_priority == 'driving') or (not flag_source_has_audio)) else args.source
             log(f"Audio is selected from {audio_from_which_video}")
             add_audio_to_video(wfp, audio_from_which_video, wfp_with_audio)
             os.replace(wfp_with_audio, wfp)
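Two behavioral changes here are worth spelling out. First, the driving video is now cropped whenever it is not square, even with `flag_crop_driving_video` off, presumably because the downstream motion pipeline expects a square driving crop. Second, the hard-coded "default driving audio" rule is replaced by the `audio_priority` setting. Restated as a standalone helper, hypothetical and written out only to make the branching explicit; the patch inlines the conditional:

```python
def select_audio_source(args, flag_source_has_audio: bool, flag_driving_has_audio: bool) -> str:
    """Pick the file whose audio track is muxed into the output.

    Only reached when at least one input has audio (the callers guard with
    `flag_source_has_audio or flag_driving_has_audio`).
    """
    if (flag_driving_has_audio and args.audio_priority == 'driving') or not flag_source_has_audio:
        return args.driving  # preferred by the user, or the only audible input
    return args.source       # audio_priority == 'source', or the driving clip is silent
```

Note the fallback: with `audio_priority='source'` but a silent source, the driving audio is still used, and vice versa, so the option expresses a preference rather than a hard requirement.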
diff --git a/src/utils/cropper.py b/src/utils/cropper.py
index c42e74b..67cbf26 100644
--- a/src/utils/cropper.py
+++ b/src/utils/cropper.py
@@ -135,6 +135,7 @@ class Cropper(object):
 
         return lmk
 
+    # TODO: support skipping frame with NO FACE
     def crop_source_video(self, source_rgb_lst, crop_cfg: CropConfig, **kwargs):
         """Tracking based landmarks/alignment and cropping"""
         trajectory = Trajectory()
@@ -157,8 +158,10 @@ class Cropper(object):
                 lmk = self.landmark_runner.run(frame_rgb, lmk)
                 trajectory.start, trajectory.end = idx, idx
             else:
+                # TODO: add IOU check for tracking
                 lmk = self.landmark_runner.run(frame_rgb, trajectory.lmk_lst[-1])
                 trajectory.end = idx
+
             trajectory.lmk_lst.append(lmk)
 
             # crop the face
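The `# TODO: add IOU check for tracking` marks a known weakness of seeding each frame's landmarks from the previous frame: the track can drift, or latch onto the wrong face after a scene change. A sketch of what such a check could look like; `bbox_from_lmk`, `iou`, and the 0.5 threshold are illustrative and not part of this patch:

```python
import numpy as np

def bbox_from_lmk(lmk: np.ndarray) -> np.ndarray:
    """Axis-aligned [x0, y0, x1, y1] box around an (N, 2) landmark array."""
    return np.concatenate([lmk.min(axis=0), lmk.max(axis=0)])

def iou(a: np.ndarray, b: np.ndarray) -> float:
    """Intersection-over-union of two [x0, y0, x1, y1] boxes."""
    x0, y0 = np.maximum(a[:2], b[:2])
    x1, y1 = np.minimum(a[2:], b[2:])
    inter = max(0.0, x1 - x0) * max(0.0, y1 - y0)
    union = ((a[2] - a[0]) * (a[3] - a[1])
             + (b[2] - b[0]) * (b[3] - b[1]) - inter)
    return inter / (union + 1e-8)

# In the else-branch one could then re-detect instead of tracking:
#     if iou(bbox_from_lmk(lmk), bbox_from_lmk(trajectory.lmk_lst[-1])) < 0.5:
#         ...fall back to full face detection for this frame...
```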