This commit is contained in:
萌狼蓝天 2024-08-06 19:43:55 +08:00
commit 00c5b1ce16
107 changed files with 6020 additions and 0 deletions

17
.gitignore vendored Normal file
View File

@ -0,0 +1,17 @@
# Byte-compiled / optimized / DLL files
__pycache__/
**/__pycache__/
*.py[cod]
**/*.py[cod]
*$py.class
# Model weights
**/*.pth
**/*.onnx
# Ipython notebook
*.ipynb
# Temporary files or benchmark resources
animations/*
tmp/*

8
.idea/.gitignore vendored Normal file
View File

@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

12
.idea/LivePortrait.iml Normal file
View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">
<option name="format" value="PLAIN" />
<option name="myDocStringFormat" value="Plain" />
</component>
</module>

88
.idea/inspectionProfiles/Project_Default.xml Normal file
View File

@ -0,0 +1,88 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredPackages">
<value>
<list size="68">
<item index="0" class="java.lang.String" itemvalue="pandas" />
<item index="1" class="java.lang.String" itemvalue="pycryptodome" />
<item index="2" class="java.lang.String" itemvalue="requests" />
<item index="3" class="java.lang.String" itemvalue="urllib3" />
<item index="4" class="java.lang.String" itemvalue="Flask_Cors" />
<item index="5" class="java.lang.String" itemvalue="Jinja2" />
<item index="6" class="java.lang.String" itemvalue="Flask" />
<item index="7" class="java.lang.String" itemvalue="PyMySQL" />
<item index="8" class="java.lang.String" itemvalue="picamera" />
<item index="9" class="java.lang.String" itemvalue="face_recognition_models" />
<item index="10" class="java.lang.String" itemvalue="dlib" />
<item index="11" class="java.lang.String" itemvalue="mysqlclient" />
<item index="12" class="java.lang.String" itemvalue="tzlocal" />
<item index="13" class="java.lang.String" itemvalue="greenlet" />
<item index="14" class="java.lang.String" itemvalue="python-dateutil" />
<item index="15" class="java.lang.String" itemvalue="psycopg2" />
<item index="16" class="java.lang.String" itemvalue="h11" />
<item index="17" class="java.lang.String" itemvalue="MarkupSafe" />
<item index="18" class="java.lang.String" itemvalue="atlastk" />
<item index="19" class="java.lang.String" itemvalue="django-snapshot" />
<item index="20" class="java.lang.String" itemvalue="starlette" />
<item index="21" class="java.lang.String" itemvalue="certifi" />
<item index="22" class="java.lang.String" itemvalue="anyio" />
<item index="23" class="java.lang.String" itemvalue="uvicorn" />
<item index="24" class="java.lang.String" itemvalue="xlrd" />
<item index="25" class="java.lang.String" itemvalue="pydantic" />
<item index="26" class="java.lang.String" itemvalue="markup" />
<item index="27" class="java.lang.String" itemvalue="Werkzeug" />
<item index="28" class="java.lang.String" itemvalue="asgiref" />
<item index="29" class="java.lang.String" itemvalue="cryptography" />
<item index="30" class="java.lang.String" itemvalue="orjson" />
<item index="31" class="java.lang.String" itemvalue="typing-extensions" />
<item index="32" class="java.lang.String" itemvalue="loguru" />
<item index="33" class="java.lang.String" itemvalue="click" />
<item index="34" class="java.lang.String" itemvalue="APScheduler" />
<item index="35" class="java.lang.String" itemvalue="simplejson" />
<item index="36" class="java.lang.String" itemvalue="prettytable" />
<item index="37" class="java.lang.String" itemvalue="aioredis" />
<item index="38" class="java.lang.String" itemvalue="charset-normalizer" />
<item index="39" class="java.lang.String" itemvalue="snapshot" />
<item index="40" class="java.lang.String" itemvalue="idna" />
<item index="41" class="java.lang.String" itemvalue="PyJWT" />
<item index="42" class="java.lang.String" itemvalue="rsa" />
<item index="43" class="java.lang.String" itemvalue="async-timeout" />
<item index="44" class="java.lang.String" itemvalue="SQLAlchemy" />
<item index="45" class="java.lang.String" itemvalue="cffi" />
<item index="46" class="java.lang.String" itemvalue="wcwidth" />
<item index="47" class="java.lang.String" itemvalue="numpy" />
<item index="48" class="java.lang.String" itemvalue="pyasn1" />
<item index="49" class="java.lang.String" itemvalue="importlib-metadata" />
<item index="50" class="java.lang.String" itemvalue="sniffio" />
<item index="51" class="java.lang.String" itemvalue="tortoise" />
<item index="52" class="java.lang.String" itemvalue="zipp" />
<item index="53" class="java.lang.String" itemvalue="pyecharts" />
<item index="54" class="java.lang.String" itemvalue="itsdangerous" />
<item index="55" class="java.lang.String" itemvalue="python-jose" />
<item index="56" class="java.lang.String" itemvalue="tzdata" />
<item index="57" class="java.lang.String" itemvalue="ecdsa" />
<item index="58" class="java.lang.String" itemvalue="python-multipart" />
<item index="59" class="java.lang.String" itemvalue="pytz-deprecation-shim" />
<item index="60" class="java.lang.String" itemvalue="fastapi" />
<item index="61" class="java.lang.String" itemvalue="trustme" />
<item index="62" class="java.lang.String" itemvalue="colorama" />
<item index="63" class="java.lang.String" itemvalue="pytz" />
<item index="64" class="java.lang.String" itemvalue="asyncmy" />
<item index="65" class="java.lang.String" itemvalue="openpyxl" />
<item index="66" class="java.lang.String" itemvalue="pytest-runner" />
<item index="67" class="java.lang.String" itemvalue="pytest" />
</list>
</value>
</option>
</inspection_tool>
<inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
<option name="ignoredErrors">
<list>
<option value="N802" />
</list>
</option>
</inspection_tool>
</profile>
</component>

6
.idea/inspectionProfiles/profiles_settings.xml Normal file
View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

10
.idea/misc.xml Normal file
View File

@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Black">
<option name="sdkName" value="LivePortrait" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="LivePortrait" project-jdk-type="Python SDK" />
<component name="PythonCompatibilityInspectionAdvertiser">
<option name="version" value="3" />
</component>
</project>

8
.idea/modules.xml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/LivePortrait.iml" filepath="$PROJECT_DIR$/.idea/LivePortrait.iml" />
</modules>
</component>
</project>

6
.idea/vcs.xml Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="Git" />
</component>
</project>

19
.vscode/settings.json vendored Normal file
View File

@ -0,0 +1,19 @@
{
"[python]": {
"editor.tabSize": 4
},
"files.eol": "\n",
"files.insertFinalNewline": true,
"files.trimFinalNewlines": true,
"files.trimTrailingWhitespace": true,
"files.exclude": {
"**/.git": true,
"**/.svn": true,
"**/.hg": true,
"**/CVS": true,
"**/.DS_Store": true,
"**/Thumbs.db": true,
"**/*.crswap": true,
"**/__pycache__": true
}
}

21
LICENSE Normal file
View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2024 Kuaishou Visual Generation and Interaction Center
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

154
app.py Normal file
View File

@ -0,0 +1,154 @@
# coding: utf-8
"""
The entry point of the Gradio demo
"""
import tyro
import gradio as gr
import os.path as osp
from src.utils.helper import load_description
from src.gradio_pipeline import GradioPipeline
from src.config.crop_config import CropConfig
from src.config.argument_config import ArgumentConfig
from src.config.inference_config import InferenceConfig
def partial_fields(target_class, kwargs):
return target_class(**{k: v for k, v in kwargs.items() if hasattr(target_class, k)})
# set tyro theme
tyro.extras.set_accent_color("bright_cyan")
args = tyro.cli(ArgumentConfig)
# specify configs for inference
inference_cfg = partial_fields(InferenceConfig, args.__dict__) # use attributes of args to initialize InferenceConfig
crop_cfg = partial_fields(CropConfig, args.__dict__) # use attributes of args to initialize CropConfig
gradio_pipeline = GradioPipeline(
inference_cfg=inference_cfg,
crop_cfg=crop_cfg,
args=args
)
# assets
title_md = "assets/gradio_title.md"
example_portrait_dir = "assets/examples/source"
example_video_dir = "assets/examples/driving"
data_examples = [
[osp.join(example_portrait_dir, "s9.jpg"), osp.join(example_video_dir, "d0.mp4"), True, True, True, True],
[osp.join(example_portrait_dir, "s6.jpg"), osp.join(example_video_dir, "d0.mp4"), True, True, True, True],
[osp.join(example_portrait_dir, "s10.jpg"), osp.join(example_video_dir, "d5.mp4"), True, True, True, True],
[osp.join(example_portrait_dir, "s5.jpg"), osp.join(example_video_dir, "d6.mp4"), True, True, True, True],
[osp.join(example_portrait_dir, "s7.jpg"), osp.join(example_video_dir, "d7.mp4"), True, True, True, True],
]
#################### interface logic ####################
# Define components first
eye_retargeting_slider = gr.Slider(minimum=0, maximum=0.8, step=0.01, label="target eyes-open ratio")
lip_retargeting_slider = gr.Slider(minimum=0, maximum=0.8, step=0.01, label="target lip-open ratio")
retargeting_input_image = gr.Image(type="numpy")
output_image = gr.Image(type="numpy")
output_image_paste_back = gr.Image(type="numpy")
output_video = gr.Video()
output_video_concat = gr.Video()
with gr.Blocks(theme=gr.themes.Soft()) as demo:
gr.HTML(load_description(title_md))
gr.Markdown(load_description("assets/gradio_description_upload.md"))
with gr.Row():
with gr.Accordion(open=True, label="Source Portrait"):
image_input = gr.Image(type="filepath")
with gr.Accordion(open=True, label="Driving Video"):
video_input = gr.Video()
gr.Markdown(load_description("assets/gradio_description_animation.md"))
with gr.Row():
with gr.Accordion(open=True, label="Animation Options"):
with gr.Row():
flag_relative_input = gr.Checkbox(value=True, label="relative motion")
flag_do_crop_input = gr.Checkbox(value=True, label="do crop")
flag_remap_input = gr.Checkbox(value=True, label="paste-back")
with gr.Row():
with gr.Column():
process_button_animation = gr.Button("🚀 Animate", variant="primary")
with gr.Column():
process_button_reset = gr.ClearButton([image_input, video_input, output_video, output_video_concat], value="🧹 Clear")
with gr.Row():
with gr.Column():
with gr.Accordion(open=True, label="The animated video in the original image space"):
output_video.render()
with gr.Column():
with gr.Accordion(open=True, label="The animated video"):
output_video_concat.render()
with gr.Row():
# Examples
gr.Markdown("## You could choose the examples below ⬇️")
with gr.Row():
gr.Examples(
examples=data_examples,
inputs=[
image_input,
video_input,
flag_relative_input,
flag_do_crop_input,
flag_remap_input
],
examples_per_page=5
)
gr.Markdown(load_description("assets/gradio_description_retargeting.md"))
with gr.Row():
eye_retargeting_slider.render()
lip_retargeting_slider.render()
with gr.Row():
process_button_retargeting = gr.Button("🚗 Retargeting", variant="primary")
process_button_reset_retargeting = gr.ClearButton(
[
eye_retargeting_slider,
lip_retargeting_slider,
retargeting_input_image,
output_image,
output_image_paste_back
],
value="🧹 Clear"
)
with gr.Row():
with gr.Column():
with gr.Accordion(open=True, label="Retargeting Input"):
retargeting_input_image.render()
with gr.Column():
with gr.Accordion(open=True, label="Retargeting Result"):
output_image.render()
with gr.Column():
with gr.Accordion(open=True, label="Paste-back Result"):
output_image_paste_back.render()
# binding functions for buttons
process_button_retargeting.click(
fn=gradio_pipeline.execute_image,
inputs=[eye_retargeting_slider, lip_retargeting_slider],
outputs=[output_image, output_image_paste_back],
show_progress=True
)
process_button_animation.click(
fn=gradio_pipeline.execute_video,
inputs=[
image_input,
video_input,
flag_relative_input,
flag_do_crop_input,
flag_remap_input
],
outputs=[output_video, output_video_concat],
show_progress=True
)
image_input.change(
fn=gradio_pipeline.prepare_retargeting,
inputs=image_input,
outputs=[eye_retargeting_slider, lip_retargeting_slider, retargeting_input_image]
)
##########################################################
demo.launch(
server_name=args.server_name,
server_port=args.server_port,
share=args.share,
)

BIN
assets/docs/inference.gif Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 801 KiB

BIN
assets/docs/showcase.gif Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.3 MiB

BIN
assets/docs/showcase2.gif Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.7 MiB

Binary files not shown (additional binary assets added by this commit).

7
assets/gradio_description_animation.md Normal file
View File

@ -0,0 +1,7 @@
<span style="font-size: 1.2em;">🔥 To animate the source portrait with the driving video, please follow these steps:</span>
<div style="font-size: 1.2em; margin-left: 20px;">
1. Specify the options in the <strong>Animation Options</strong> section. We recommend checking the <strong>do crop</strong> option when facial areas occupy a relatively small portion of your image.
</div>
<div style="font-size: 1.2em; margin-left: 20px;">
2. Press the <strong>🚀 Animate</strong> button. Your animated video will appear in the result block after a few moments.
</div>

1
assets/gradio_description_retargeting.md Normal file
View File

@ -0,0 +1 @@
<span style="font-size: 1.2em;">🔥 To change the target eyes-open and lip-open ratio of the source portrait, please drag the sliders and then click the <strong>🚗 Retargeting</strong> button. The result would be shown in the middle block. You can try running it multiple times. <strong>😊 Set both ratios to 0.8 to see what's going on!</strong> </span>

2
assets/gradio_description_upload.md Normal file
View File

@ -0,0 +1,2 @@
## 🤗 This is the official gradio demo for **LivePortrait**.
<div style="font-size: 1.2em;">Please upload or use the webcam to get a source portrait to the <strong>Source Portrait</strong> field and a driving video to the <strong>Driving Video</strong> field.</div>

10
assets/gradio_title.md Normal file
View File

@ -0,0 +1,10 @@
<div style="display: flex; justify-content: center; align-items: center; text-align: center;">
<div>
<h1>LivePortrait: Efficient Portrait Animation with Stitching and Retargeting Control</h1>
<div style="display: flex; justify-content: center; align-items: center; text-align: center;>
<a href="https://arxiv.org/pdf/2407.03168"><img src="https://img.shields.io/badge/arXiv-2407.03168-red"></a>
<a href="https://liveportrait.github.io"><img src="https://img.shields.io/badge/Project_Page-LivePortrait-green" alt="Project Page"></a>
<a href="https://github.com/KwaiVGI/LivePortrait"><img src="https://img.shields.io/badge/Github-Code-blue"></a>
</div>
</div>
</div>

33
inference.py Normal file
View File

@ -0,0 +1,33 @@
# coding: utf-8
import tyro
from src.config.argument_config import ArgumentConfig
from src.config.inference_config import InferenceConfig
from src.config.crop_config import CropConfig
from src.live_portrait_pipeline import LivePortraitPipeline
def partial_fields(target_class, kwargs):
return target_class(**{k: v for k, v in kwargs.items() if hasattr(target_class, k)})
def main():
# set tyro theme
tyro.extras.set_accent_color("bright_cyan")
args = tyro.cli(ArgumentConfig)
# specify configs for inference
inference_cfg = partial_fields(InferenceConfig, args.__dict__) # use attributes of args to initialize InferenceConfig
crop_cfg = partial_fields(CropConfig, args.__dict__) # use attributes of args to initialize CropConfig
live_portrait_pipeline = LivePortraitPipeline(
inference_cfg=inference_cfg,
crop_cfg=crop_cfg
)
# run
live_portrait_pipeline.execute(args)
if __name__ == '__main__':
main()

View File

144
readme.md Normal file
View File

@ -0,0 +1,144 @@
<h1 align="center">LivePortrait: Efficient Portrait Animation with Stitching and Retargeting Control</h1>
<div align='center'>
<a href='https://github.com/cleardusk' target='_blank'><strong>Jianzhu Guo</strong></a><sup> 1†</sup>&emsp;
<a href='https://github.com/KwaiVGI' target='_blank'><strong>Dingyun Zhang</strong></a><sup> 1,2</sup>&emsp;
<a href='https://github.com/KwaiVGI' target='_blank'><strong>Xiaoqiang Liu</strong></a><sup> 1</sup>&emsp;
<a href='https://github.com/KwaiVGI' target='_blank'><strong>Zhizhou Zhong</strong></a><sup> 1,3</sup>&emsp;
<a href='https://scholar.google.com.hk/citations?user=_8k1ubAAAAAJ' target='_blank'><strong>Yuan Zhang</strong></a><sup> 1</sup>&emsp;
</div>
<div align='center'>
<a href='https://scholar.google.com/citations?user=P6MraaYAAAAJ' target='_blank'><strong>Pengfei Wan</strong></a><sup> 1</sup>&emsp;
<a href='https://openreview.net/profile?id=~Di_ZHANG3' target='_blank'><strong>Di Zhang</strong></a><sup> 1</sup>&emsp;
</div>
<div align='center'>
<sup>1 </sup>Kuaishou Technology&emsp; <sup>2 </sup>University of Science and Technology of China&emsp; <sup>3 </sup>Fudan University&emsp;
</div>
<br>
<div align="center">
<!-- <a href='LICENSE'><img src='https://img.shields.io/badge/license-MIT-yellow'></a> -->
<a href='https://arxiv.org/pdf/2407.03168'><img src='https://img.shields.io/badge/arXiv-LivePortrait-red'></a>
<a href='https://liveportrait.github.io'><img src='https://img.shields.io/badge/Project-LivePortrait-green'></a>
<a href='https://huggingface.co/spaces/KwaiVGI/liveportrait'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue'></a>
</div>
<br>
<p align="center">
<img src="./assets/docs/showcase2.gif" alt="showcase">
<br>
🔥 For more results, visit our <a href="https://liveportrait.github.io/"><strong>homepage</strong></a> 🔥
</p>
## 🔥 Updates
- **`2024/07/04`**: 🔥 We released the initial version of the inference code and models. Continuous updates, stay tuned!
- **`2024/07/04`**: 😊 We released the [homepage](https://liveportrait.github.io) and technical report on [arXiv](https://arxiv.org/pdf/2407.03168).
## Introduction
This repo, named **LivePortrait**, contains the official PyTorch implementation of our paper [LivePortrait: Efficient Portrait Animation with Stitching and Retargeting Control](https://arxiv.org/pdf/2407.03168).
We are actively updating and improving this repository. If you find any bugs or have suggestions, feel free to open issues or submit pull requests (PRs) 💖.
## 🔥 Getting Started
### 1. Clone the code and prepare the environment
```bash
git clone https://github.com/KwaiVGI/LivePortrait
cd LivePortrait
# create env using conda
conda create -n LivePortrait python==3.9.18
conda activate LivePortrait
# install dependencies with pip
pip install -r requirements.txt
```
### 2. Download pretrained weights
Download our pretrained LivePortrait weights and the InsightFace face detection models from [Google Drive](https://drive.google.com/drive/folders/1UtKgzKjFAOmZkhNK-OYT0caJ_w2XAnib) or [Baidu Yun](https://pan.baidu.com/s/1MGctWmNla_vZxDbEp2Dtzw?pwd=z5cn). We have packed all weights into one directory 😊. Unzip and place them in `./pretrained_weights`, ensuring the directory structure is as follows:
```text
pretrained_weights
├── insightface
│ └── models
│ └── buffalo_l
│ ├── 2d106det.onnx
│ └── det_10g.onnx
└── liveportrait
├── base_models
│ ├── appearance_feature_extractor.pth
│ ├── motion_extractor.pth
│ ├── spade_generator.pth
│ └── warping_module.pth
├── landmark.onnx
└── retargeting_models
└── stitching_retargeting_module.pth
```
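If you want to double-check the layout before running inference, the following minimal sketch (not part of the repo; the file list simply mirrors the tree above) verifies that every expected weight file is in place:

```python
# check_weights.py -- illustrative helper, not shipped with LivePortrait.
# It only checks that the files shown in the tree above exist under ./pretrained_weights.
import os

EXPECTED = [
    "insightface/models/buffalo_l/2d106det.onnx",
    "insightface/models/buffalo_l/det_10g.onnx",
    "liveportrait/base_models/appearance_feature_extractor.pth",
    "liveportrait/base_models/motion_extractor.pth",
    "liveportrait/base_models/spade_generator.pth",
    "liveportrait/base_models/warping_module.pth",
    "liveportrait/landmark.onnx",
    "liveportrait/retargeting_models/stitching_retargeting_module.pth",
]

missing = [p for p in EXPECTED if not os.path.isfile(os.path.join("pretrained_weights", p))]
if missing:
    print("Missing files:\n  " + "\n  ".join(missing))
else:
    print("All pretrained weights are in place.")
```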
### 3. Inference 🚀
```bash
python inference.py
```
If the script runs successfully, you will get an output mp4 file named `animations/s6--d0_concat.mp4`. This file includes the following results: driving video, input image, and generated result.
<p align="center">
<img src="./assets/docs/inference.gif" alt="image">
</p>
Or, you can change the input by specifying the `-s` and `-d` arguments:
```bash
python inference.py -s assets/examples/source/s9.jpg -d assets/examples/driving/d0.mp4
# or disable pasting back
python inference.py -s assets/examples/source/s9.jpg -d assets/examples/driving/d0.mp4 --no_flag_pasteback
# more options to see
python inference.py -h
```
**More interesting results can be found in our [Homepage](https://liveportrait.github.io)** 😊
### 4. Gradio interface
We also provide a Gradio interface for a better experience. Just run:
```bash
python app.py
```
### 5. Inference speed evaluation 🚀🚀🚀
We have also provided a script to evaluate the inference speed of each module:
```bash
python speed.py
```
Below are the results of inferring one frame on an RTX 4090 GPU using the native PyTorch framework with `torch.compile`:
| Model | Parameters(M) | Model Size(MB) | Inference(ms) |
|-----------------------------------|:-------------:|:--------------:|:-------------:|
| Appearance Feature Extractor | 0.84 | 3.3 | 0.82 |
| Motion Extractor | 28.12 | 108 | 0.84 |
| Spade Generator | 55.37 | 212 | 7.59 |
| Warping Module | 45.53 | 174 | 5.21 |
| Stitching and Retargeting Modules | 0.23 | 2.3 | 0.31 |
*Note: the listed values of Stitching and Retargeting Modules represent the combined parameter counts and the total sequential inference time of three MLP networks.*
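Since activation layers carry no parameters, the combined 0.23 M figure can be reproduced directly from the MLP layer sizes listed in `src/config/models.yaml`. The sketch below is illustrative only (the exact activation layout of the real modules may differ, which does not affect the parameter count):

```python
# Illustrative: rebuild the three retargeting MLPs with the layer sizes from
# src/config/models.yaml and sum their parameter counts, mirroring the combined
# "Stitching and Retargeting Modules" row reported by speed.py.
import torch.nn as nn

def mlp(sizes):
    layers = []
    for i in range(len(sizes) - 1):
        layers.append(nn.Linear(sizes[i], sizes[i + 1]))
        if i < len(sizes) - 2:
            layers.append(nn.ReLU())
    return nn.Sequential(*layers)

modules = {
    'stitching': mlp([126, 128, 128, 64, 65]),   # input 126, hidden [128, 128, 64], output 65
    'lip':       mlp([65, 128, 128, 64, 63]),
    'eye':       mlp([66, 256, 256, 128, 128, 64, 63]),
}
total = sum(p.numel() for m in modules.values() for p in m.parameters())
print(f"Combined parameters: {total / 1e6:.2f} M")  # ~0.23 M, matching the table above
```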
## Acknowledgements
We would like to thank the contributors of the [FOMM](https://github.com/AliaksandrSiarohin/first-order-model), [Open Facevid2vid](https://github.com/zhanglonghao1992/One-Shot_Free-View_Neural_Talking_Head_Synthesis), [SPADE](https://github.com/NVlabs/SPADE), and [InsightFace](https://github.com/deepinsight/insightface) repositories for their open research and contributions.
## Citation 💖
If you find LivePortrait useful for your research, please 🌟 this repo and cite our work using the following BibTeX:
```bibtex
@article{guo2024live,
title = {LivePortrait: Efficient Portrait Animation with Stitching and Retargeting Control},
author = {Jianzhu Guo and Dingyun Zhang and Xiaoqiang Liu and Zhizhou Zhong and Yuan Zhang and Pengfei Wan and Di Zhang},
year = {2024},
journal = {arXiv preprint arXiv:2407.03168},
}
```

22
requirements.txt Normal file
View File

@ -0,0 +1,22 @@
--extra-index-url https://download.pytorch.org/whl/cu118
torch==2.3.0
torchvision==0.18.0
torchaudio==2.3.0
numpy==1.26.4
pyyaml==6.0.1
opencv-python==4.10.0.84
scipy==1.13.1
imageio==2.34.2
lmdb==1.4.1
tqdm==4.66.4
rich==13.7.1
ffmpeg==1.4
onnxruntime-gpu==1.18.0
onnx==1.16.1
scikit-image==0.24.0
albumentations==1.4.10
matplotlib==3.9.0
imageio-ffmpeg==0.5.1
tyro==0.8.5
gradio==4.37.1

192
speed.py Normal file
View File

@ -0,0 +1,192 @@
# coding: utf-8
"""
Benchmark the inference speed of each module in LivePortrait.
TODO: heavy GPT style, need to refactor
"""
import yaml
import torch
import time
import numpy as np
from src.utils.helper import load_model, concat_feat
from src.config.inference_config import InferenceConfig
def initialize_inputs(batch_size=1):
"""
Generate random input tensors and move them to GPU
"""
feature_3d = torch.randn(batch_size, 32, 16, 64, 64).cuda().half()
kp_source = torch.randn(batch_size, 21, 3).cuda().half()
kp_driving = torch.randn(batch_size, 21, 3).cuda().half()
source_image = torch.randn(batch_size, 3, 256, 256).cuda().half()
generator_input = torch.randn(batch_size, 256, 64, 64).cuda().half()
eye_close_ratio = torch.randn(batch_size, 3).cuda().half()
lip_close_ratio = torch.randn(batch_size, 2).cuda().half()
feat_stitching = concat_feat(kp_source, kp_driving).half()
feat_eye = concat_feat(kp_source, eye_close_ratio).half()
feat_lip = concat_feat(kp_source, lip_close_ratio).half()
inputs = {
'feature_3d': feature_3d,
'kp_source': kp_source,
'kp_driving': kp_driving,
'source_image': source_image,
'generator_input': generator_input,
'feat_stitching': feat_stitching,
'feat_eye': feat_eye,
'feat_lip': feat_lip
}
return inputs
def load_and_compile_models(cfg, model_config):
"""
Load and compile models for inference
"""
appearance_feature_extractor = load_model(cfg.checkpoint_F, model_config, cfg.device_id, 'appearance_feature_extractor')
motion_extractor = load_model(cfg.checkpoint_M, model_config, cfg.device_id, 'motion_extractor')
warping_module = load_model(cfg.checkpoint_W, model_config, cfg.device_id, 'warping_module')
spade_generator = load_model(cfg.checkpoint_G, model_config, cfg.device_id, 'spade_generator')
stitching_retargeting_module = load_model(cfg.checkpoint_S, model_config, cfg.device_id, 'stitching_retargeting_module')
models_with_params = [
('Appearance Feature Extractor', appearance_feature_extractor),
('Motion Extractor', motion_extractor),
('Warping Network', warping_module),
('SPADE Decoder', spade_generator)
]
compiled_models = {}
for name, model in models_with_params:
model = model.half()
model = torch.compile(model, mode='max-autotune') # Optimize for inference
model.eval() # Switch to evaluation mode
compiled_models[name] = model
retargeting_models = ['stitching', 'eye', 'lip']
for retarget in retargeting_models:
module = stitching_retargeting_module[retarget].half()
module = torch.compile(module, mode='max-autotune') # Optimize for inference
module.eval() # Switch to evaluation mode
stitching_retargeting_module[retarget] = module
return compiled_models, stitching_retargeting_module
def warm_up_models(compiled_models, stitching_retargeting_module, inputs):
"""
Warm up models to prepare them for benchmarking
"""
print("Warm up start!")
with torch.no_grad():
for _ in range(10):
compiled_models['Appearance Feature Extractor'](inputs['source_image'])
compiled_models['Motion Extractor'](inputs['source_image'])
compiled_models['Warping Network'](inputs['feature_3d'], inputs['kp_driving'], inputs['kp_source'])
compiled_models['SPADE Decoder'](inputs['generator_input']) # Adjust input as required
stitching_retargeting_module['stitching'](inputs['feat_stitching'])
stitching_retargeting_module['eye'](inputs['feat_eye'])
stitching_retargeting_module['lip'](inputs['feat_lip'])
print("Warm up end!")
def measure_inference_times(compiled_models, stitching_retargeting_module, inputs):
"""
Measure inference times for each model
"""
times = {name: [] for name in compiled_models.keys()}
times['Retargeting Models'] = []
overall_times = []
with torch.no_grad():
for _ in range(100):
torch.cuda.synchronize()
overall_start = time.time()
start = time.time()
compiled_models['Appearance Feature Extractor'](inputs['source_image'])
torch.cuda.synchronize()
times['Appearance Feature Extractor'].append(time.time() - start)
start = time.time()
compiled_models['Motion Extractor'](inputs['source_image'])
torch.cuda.synchronize()
times['Motion Extractor'].append(time.time() - start)
start = time.time()
compiled_models['Warping Network'](inputs['feature_3d'], inputs['kp_driving'], inputs['kp_source'])
torch.cuda.synchronize()
times['Warping Network'].append(time.time() - start)
start = time.time()
compiled_models['SPADE Decoder'](inputs['generator_input']) # Adjust input as required
torch.cuda.synchronize()
times['SPADE Decoder'].append(time.time() - start)
start = time.time()
stitching_retargeting_module['stitching'](inputs['feat_stitching'])
stitching_retargeting_module['eye'](inputs['feat_eye'])
stitching_retargeting_module['lip'](inputs['feat_lip'])
torch.cuda.synchronize()
times['Retargeting Models'].append(time.time() - start)
overall_times.append(time.time() - overall_start)
return times, overall_times
def print_benchmark_results(compiled_models, stitching_retargeting_module, retargeting_models, times, overall_times):
"""
Print benchmark results with average and standard deviation of inference times
"""
average_times = {name: np.mean(times[name]) * 1000 for name in times.keys()}
std_times = {name: np.std(times[name]) * 1000 for name in times.keys()}
for name, model in compiled_models.items():
num_params = sum(p.numel() for p in model.parameters())
num_params_in_millions = num_params / 1e6
print(f"Number of parameters for {name}: {num_params_in_millions:.2f} M")
for index, retarget in enumerate(retargeting_models):
num_params = sum(p.numel() for p in stitching_retargeting_module[retarget].parameters())
num_params_in_millions = num_params / 1e6
print(f"Number of parameters for part_{index} in Stitching and Retargeting Modules: {num_params_in_millions:.2f} M")
for name, avg_time in average_times.items():
std_time = std_times[name]
print(f"Average inference time for {name} over 100 runs: {avg_time:.2f} ms (std: {std_time:.2f} ms)")
def main():
"""
Main function to benchmark speed and model parameters
"""
# Sample input tensors
inputs = initialize_inputs()
# Load configuration
cfg = InferenceConfig(device_id=0)
model_config_path = cfg.models_config
with open(model_config_path, 'r') as file:
model_config = yaml.safe_load(file)
# Load and compile models
compiled_models, stitching_retargeting_module = load_and_compile_models(cfg, model_config)
# Warm up models
warm_up_models(compiled_models, stitching_retargeting_module, inputs)
# Measure inference times
times, overall_times = measure_inference_times(compiled_models, stitching_retargeting_module, inputs)
# Print benchmark results
print_benchmark_results(compiled_models, stitching_retargeting_module, ['stitching', 'eye', 'lip'], times, overall_times)
if __name__ == "__main__":
main()

0
src/config/__init__.py Normal file
View File

44
src/config/argument_config.py Normal file
View File

@ -0,0 +1,44 @@
# coding: utf-8
"""
config for user
"""
import os.path as osp
from dataclasses import dataclass
import tyro
from typing_extensions import Annotated
from .base_config import PrintableConfig, make_abs_path
@dataclass(repr=False) # use repr from PrintableConfig
class ArgumentConfig(PrintableConfig):
########## input arguments ##########
source_image: Annotated[str, tyro.conf.arg(aliases=["-s"])] = make_abs_path('../../assets/examples/source/s6.jpg') # path to the source portrait
driving_info: Annotated[str, tyro.conf.arg(aliases=["-d"])] = make_abs_path('../../assets/examples/driving/d0.mp4') # path to driving video or template (.pkl format)
output_dir: Annotated[str, tyro.conf.arg(aliases=["-o"])] = 'animations/' # directory to save output video
#####################################
########## inference arguments ##########
device_id: int = 0
flag_lip_zero: bool = True # whether to let the lips close before the animation; only takes effect when flag_eye_retargeting and flag_lip_retargeting are False
flag_eye_retargeting: bool = False
flag_lip_retargeting: bool = False
flag_stitching: bool = True # we recommend setting it to True!
flag_relative: bool = True # whether to use relative motion
flag_pasteback: bool = True # whether to paste back / stitch the animated face crop from the face-cropping space into the original image space
flag_do_crop: bool = True # whether to crop the source portrait to the face-cropping space
flag_do_rot: bool = True # whether to conduct the rotation when flag_do_crop is True
#########################################
########## crop arguments ##########
dsize: int = 512
scale: float = 2.3
vx_ratio: float = 0 # vx ratio
vy_ratio: float = -0.125 # vy ratio +up, -down
####################################
########## gradio arguments ##########
server_port: Annotated[int, tyro.conf.arg(aliases=["-p"])] = 8890
share: bool = True
server_name: str = "0.0.0.0"

29
src/config/base_config.py Normal file
View File

@ -0,0 +1,29 @@
# coding: utf-8
"""
pretty printing class
"""
from __future__ import annotations
import os.path as osp
from typing import Tuple
def make_abs_path(fn):
return osp.join(osp.dirname(osp.realpath(__file__)), fn)
class PrintableConfig: # pylint: disable=too-few-public-methods
"""Printable Config defining str function"""
def __repr__(self):
lines = [self.__class__.__name__ + ":"]
for key, val in vars(self).items():
if isinstance(val, Tuple):
flattened_val = "["
for item in val:
flattened_val += str(item) + "\n"
flattened_val = flattened_val.rstrip("\n")
val = flattened_val + "]"
lines += f"{key}: {str(val)}".split("\n")
return "\n ".join(lines)

18
src/config/crop_config.py Normal file
View File

@ -0,0 +1,18 @@
# coding: utf-8
"""
parameters used for cropping faces
"""
import os.path as osp
from dataclasses import dataclass
from typing import Union, List
from .base_config import PrintableConfig
@dataclass(repr=False) # use repr from PrintableConfig
class CropConfig(PrintableConfig):
dsize: int = 512 # crop size
scale: float = 2.3 # scale factor
vx_ratio: float = 0 # vx ratio
vy_ratio: float = -0.125 # vy ratio +up, -down

49
src/config/inference_config.py Normal file
View File

@ -0,0 +1,49 @@
# coding: utf-8
"""
config dataclass used for inference
"""
import os.path as osp
from dataclasses import dataclass
from typing import Literal, Tuple
from .base_config import PrintableConfig, make_abs_path
@dataclass(repr=False) # use repr from PrintableConfig
class InferenceConfig(PrintableConfig):
models_config: str = make_abs_path('./models.yaml') # portrait animation config
checkpoint_F: str = make_abs_path('../../pretrained_weights/liveportrait/base_models/appearance_feature_extractor.pth') # path to checkpoint
checkpoint_M: str = make_abs_path('../../pretrained_weights/liveportrait/base_models/motion_extractor.pth') # path to checkpoint
checkpoint_G: str = make_abs_path('../../pretrained_weights/liveportrait/base_models/spade_generator.pth') # path to checkpoint
checkpoint_W: str = make_abs_path('../../pretrained_weights/liveportrait/base_models/warping_module.pth') # path to checkpoint
checkpoint_S: str = make_abs_path('../../pretrained_weights/liveportrait/retargeting_models/stitching_retargeting_module.pth') # path to checkpoint
flag_use_half_precision: bool = True # whether to use half precision
flag_lip_zero: bool = True # whether to let the lips close before the animation; only takes effect when flag_eye_retargeting and flag_lip_retargeting are False
lip_zero_threshold: float = 0.03
flag_eye_retargeting: bool = False
flag_lip_retargeting: bool = False
flag_stitching: bool = True # we recommend setting it to True!
flag_relative: bool = True # whether to use relative motion
anchor_frame: int = 0 # set this value if find_best_frame is True
input_shape: Tuple[int, int] = (256, 256) # input shape
output_format: Literal['mp4', 'gif'] = 'mp4' # output video format
output_fps: int = 30 # fps for output video
crf: int = 15 # crf for output video
flag_write_result: bool = True # whether to write output video
flag_pasteback: bool = True # whether to paste back / stitch the animated face crop from the face-cropping space into the original image space
mask_crop = None
flag_write_gif: bool = False
size_gif: int = 256
ref_max_shape: int = 1280
ref_shape_n: int = 2
device_id: int = 0
flag_do_crop: bool = False # whether to crop the source portrait to the face-cropping space
flag_do_rot: bool = True # whether to conduct the rotation when flag_do_crop is True

43
src/config/models.yaml Normal file
View File

@ -0,0 +1,43 @@
model_params:
appearance_feature_extractor_params: # the F in the paper
image_channel: 3
block_expansion: 64
num_down_blocks: 2
max_features: 512
reshape_channel: 32
reshape_depth: 16
num_resblocks: 6
motion_extractor_params: # the M in the paper
num_kp: 21
backbone: convnextv2_tiny
warping_module_params: # the W in the paper
num_kp: 21
block_expansion: 64
max_features: 512
num_down_blocks: 2
reshape_channel: 32
estimate_occlusion_map: True
dense_motion_params:
block_expansion: 32
max_features: 1024
num_blocks: 5
reshape_depth: 16
compress: 4
spade_generator_params: # the G in the paper
upscale: 2 # represents upsample factor 256x256 -> 512x512
block_expansion: 64
max_features: 512
num_down_blocks: 2
stitching_retargeting_module_params: # the S in the paper
stitching:
input_size: 126 # (21*3)*2
hidden_sizes: [128, 128, 64]
output_size: 65 # (21*3)+2(tx,ty)
lip:
input_size: 65 # (21*3)+2
hidden_sizes: [128, 128, 64]
output_size: 63 # (21*3)
eye:
input_size: 66 # (21*3)+3
hidden_sizes: [256, 256, 128, 128, 64]
output_size: 63 # (21*3)

140
src/gradio_pipeline.py Normal file
View File

@ -0,0 +1,140 @@
# coding: utf-8
"""
Pipeline for gradio
"""
import gradio as gr
from .config.argument_config import ArgumentConfig
from .live_portrait_pipeline import LivePortraitPipeline
from .utils.io import load_img_online
from .utils.rprint import rlog as log
from .utils.crop import prepare_paste_back, paste_back
from .utils.camera import get_rotation_matrix
from .utils.retargeting_utils import calc_eye_close_ratio, calc_lip_close_ratio
def update_args(args, user_args):
"""update the args according to user inputs
"""
for k, v in user_args.items():
if hasattr(args, k):
setattr(args, k, v)
return args
class GradioPipeline(LivePortraitPipeline):
def __init__(self, inference_cfg, crop_cfg, args: ArgumentConfig):
super().__init__(inference_cfg, crop_cfg)
# self.live_portrait_wrapper = self.live_portrait_wrapper
self.args = args
# for single image retargeting
self.start_prepare = False
self.f_s_user = None
self.x_c_s_info_user = None
self.x_s_user = None
self.source_lmk_user = None
self.mask_ori = None
self.img_rgb = None
self.crop_M_c2o = None
def execute_video(
self,
input_image_path,
input_video_path,
flag_relative_input,
flag_do_crop_input,
flag_remap_input,
):
""" for video driven potrait animation
"""
if input_image_path is not None and input_video_path is not None:
args_user = {
'source_image': input_image_path,
'driving_info': input_video_path,
'flag_relative': flag_relative_input,
'flag_do_crop': flag_do_crop_input,
'flag_pasteback': flag_remap_input,
}
# update config from user input
self.args = update_args(self.args, args_user)
self.live_portrait_wrapper.update_config(self.args.__dict__)
self.cropper.update_config(self.args.__dict__)
# video driven animation
video_path, video_path_concat = self.execute(self.args)
gr.Info("Run successfully!", duration=2)
return video_path, video_path_concat,
else:
raise gr.Error("The input source portrait or driving video hasn't been prepared yet 💥!", duration=5)
def execute_image(self, input_eye_ratio: float, input_lip_ratio: float):
""" for single image retargeting
"""
if input_eye_ratio is None or input_lip_ratio is None:
raise gr.Error("Invalid ratio input 💥!", duration=5)
elif self.f_s_user is None:
if self.start_prepare:
raise gr.Error(
"The source portrait is under processing 💥! Please wait for a second.",
duration=5
)
else:
raise gr.Error(
"The source portrait hasn't been prepared yet 💥! Please scroll to the top of the page to upload.",
duration=5
)
else:
# ∆_eyes,i = R_eyes(x_s; c_s,eyes, c_d,eyes,i)
combined_eye_ratio_tensor = self.live_portrait_wrapper.calc_combined_eye_ratio([[input_eye_ratio]], self.source_lmk_user)
eyes_delta = self.live_portrait_wrapper.retarget_eye(self.x_s_user, combined_eye_ratio_tensor)
# ∆_lip,i = R_lip(x_s; c_s,lip, c_d,lip,i)
combined_lip_ratio_tensor = self.live_portrait_wrapper.calc_combined_lip_ratio([[input_lip_ratio]], self.source_lmk_user)
lip_delta = self.live_portrait_wrapper.retarget_lip(self.x_s_user, combined_lip_ratio_tensor)
num_kp = self.x_s_user.shape[1]
# default: use x_s
x_d_new = self.x_s_user + eyes_delta.reshape(-1, num_kp, 3) + lip_delta.reshape(-1, num_kp, 3)
# D(W(f_s; x_s, x_d))
out = self.live_portrait_wrapper.warp_decode(self.f_s_user, self.x_s_user, x_d_new)
out = self.live_portrait_wrapper.parse_output(out['out'])[0]
out_to_ori_blend = paste_back(out, self.crop_M_c2o, self.img_rgb, self.mask_ori)
gr.Info("Run successfully!", duration=2)
return out, out_to_ori_blend
def prepare_retargeting(self, input_image_path, flag_do_crop = True):
""" for single image retargeting
"""
if input_image_path is not None:
gr.Info("Upload successfully!", duration=2)
self.start_prepare = True
inference_cfg = self.live_portrait_wrapper.cfg
######## process source portrait ########
img_rgb = load_img_online(input_image_path, mode='rgb', max_dim=1280, n=16)
log(f"Load source image from {input_image_path}.")
crop_info = self.cropper.crop_single_image(img_rgb)
if flag_do_crop:
I_s = self.live_portrait_wrapper.prepare_source(crop_info['img_crop_256x256'])
else:
I_s = self.live_portrait_wrapper.prepare_source(img_rgb)
x_s_info = self.live_portrait_wrapper.get_kp_info(I_s)
R_s = get_rotation_matrix(x_s_info['pitch'], x_s_info['yaw'], x_s_info['roll'])
############################################
# record global info for next time use
self.f_s_user = self.live_portrait_wrapper.extract_feature_3d(I_s)
self.x_s_user = self.live_portrait_wrapper.transform_keypoint(x_s_info)
self.x_s_info_user = x_s_info
self.source_lmk_user = crop_info['lmk_crop']
self.img_rgb = img_rgb
self.crop_M_c2o = crop_info['M_c2o']
self.mask_ori = prepare_paste_back(inference_cfg.mask_crop, crop_info['M_c2o'], dsize=(img_rgb.shape[1], img_rgb.shape[0]))
# update slider
eye_close_ratio = calc_eye_close_ratio(self.source_lmk_user[None])
eye_close_ratio = float(eye_close_ratio.squeeze(0).mean())
lip_close_ratio = calc_lip_close_ratio(self.source_lmk_user[None])
lip_close_ratio = float(lip_close_ratio.squeeze(0).mean())
# for vis
self.I_s_vis = self.live_portrait_wrapper.parse_output(I_s)[0]
return eye_close_ratio, lip_close_ratio, self.I_s_vis
else:
# when press the clear button, go here
return 0.8, 0.8, self.I_s_vis

190
src/live_portrait_pipeline.py Normal file
View File

@ -0,0 +1,190 @@
# coding: utf-8
"""
Pipeline of LivePortrait
"""
# TODO:
# 1. Currently we assume all templates are already cropped; this needs to be revised
# 2. pick sample source + driving images
import cv2
import numpy as np
import pickle
import os.path as osp
from rich.progress import track
from .config.argument_config import ArgumentConfig
from .config.inference_config import InferenceConfig
from .config.crop_config import CropConfig
from .utils.cropper import Cropper
from .utils.camera import get_rotation_matrix
from .utils.video import images2video, concat_frames
from .utils.crop import _transform_img, prepare_paste_back, paste_back
from .utils.retargeting_utils import calc_lip_close_ratio
from .utils.io import load_image_rgb, load_driving_info, resize_to_limit
from .utils.helper import mkdir, basename, dct2cuda, is_video, is_template
from .utils.rprint import rlog as log
from .live_portrait_wrapper import LivePortraitWrapper
def make_abs_path(fn):
return osp.join(osp.dirname(osp.realpath(__file__)), fn)
class LivePortraitPipeline(object):
def __init__(self, inference_cfg: InferenceConfig, crop_cfg: CropConfig):
self.live_portrait_wrapper: LivePortraitWrapper = LivePortraitWrapper(cfg=inference_cfg)
self.cropper = Cropper(crop_cfg=crop_cfg)
def execute(self, args: ArgumentConfig):
inference_cfg = self.live_portrait_wrapper.cfg # for convenience
######## process source portrait ########
img_rgb = load_image_rgb(args.source_image)
img_rgb = resize_to_limit(img_rgb, inference_cfg.ref_max_shape, inference_cfg.ref_shape_n)
log(f"Load source image from {args.source_image}")
crop_info = self.cropper.crop_single_image(img_rgb)
source_lmk = crop_info['lmk_crop']
img_crop, img_crop_256x256 = crop_info['img_crop'], crop_info['img_crop_256x256']
if inference_cfg.flag_do_crop:
I_s = self.live_portrait_wrapper.prepare_source(img_crop_256x256)
else:
I_s = self.live_portrait_wrapper.prepare_source(img_rgb)
x_s_info = self.live_portrait_wrapper.get_kp_info(I_s)
x_c_s = x_s_info['kp']
R_s = get_rotation_matrix(x_s_info['pitch'], x_s_info['yaw'], x_s_info['roll'])
f_s = self.live_portrait_wrapper.extract_feature_3d(I_s)
x_s = self.live_portrait_wrapper.transform_keypoint(x_s_info)
if inference_cfg.flag_lip_zero:
# let lip-open scalar to be 0 at first
c_d_lip_before_animation = [0.]
combined_lip_ratio_tensor_before_animation = self.live_portrait_wrapper.calc_combined_lip_ratio(c_d_lip_before_animation, source_lmk)
if combined_lip_ratio_tensor_before_animation[0][0] < inference_cfg.lip_zero_threshold:
inference_cfg.flag_lip_zero = False
else:
lip_delta_before_animation = self.live_portrait_wrapper.retarget_lip(x_s, combined_lip_ratio_tensor_before_animation)
############################################
######## process driving info ########
if is_video(args.driving_info):
log(f"Load from video file (mp4 mov avi etc...): {args.driving_info}")
# TODO: 这里track一下驱动视频 -> 构建模板
driving_rgb_lst = load_driving_info(args.driving_info)
driving_rgb_lst_256 = [cv2.resize(_, (256, 256)) for _ in driving_rgb_lst]
I_d_lst = self.live_portrait_wrapper.prepare_driving_videos(driving_rgb_lst_256)
n_frames = I_d_lst.shape[0]
if inference_cfg.flag_eye_retargeting or inference_cfg.flag_lip_retargeting:
driving_lmk_lst = self.cropper.get_retargeting_lmk_info(driving_rgb_lst)
input_eye_ratio_lst, input_lip_ratio_lst = self.live_portrait_wrapper.calc_retargeting_ratio(source_lmk, driving_lmk_lst)
elif is_template(args.driving_info):
log(f"Load from video templates {args.driving_info}")
with open(args.driving_info, 'rb') as f:
template_lst, driving_lmk_lst = pickle.load(f)
n_frames = template_lst[0]['n_frames']
input_eye_ratio_lst, input_lip_ratio_lst = self.live_portrait_wrapper.calc_retargeting_ratio(source_lmk, driving_lmk_lst)
else:
raise Exception("Unsupported driving types!")
#########################################
######## prepare for pasteback ########
if inference_cfg.flag_pasteback:
mask_ori = prepare_paste_back(inference_cfg.mask_crop, crop_info['M_c2o'], dsize=(img_rgb.shape[1], img_rgb.shape[0]))
I_p_paste_lst = []
#########################################
I_p_lst = []
R_d_0, x_d_0_info = None, None
for i in track(range(n_frames), description='Animating...', total=n_frames):
if is_video(args.driving_info):
# extract kp info by M
I_d_i = I_d_lst[i]
x_d_i_info = self.live_portrait_wrapper.get_kp_info(I_d_i)
R_d_i = get_rotation_matrix(x_d_i_info['pitch'], x_d_i_info['yaw'], x_d_i_info['roll'])
else:
# from template
x_d_i_info = template_lst[i]
x_d_i_info = dct2cuda(x_d_i_info, inference_cfg.device_id)
R_d_i = x_d_i_info['R_d']
if i == 0:
R_d_0 = R_d_i
x_d_0_info = x_d_i_info
if inference_cfg.flag_relative:
R_new = (R_d_i @ R_d_0.permute(0, 2, 1)) @ R_s
delta_new = x_s_info['exp'] + (x_d_i_info['exp'] - x_d_0_info['exp'])
scale_new = x_s_info['scale'] * (x_d_i_info['scale'] / x_d_0_info['scale'])
t_new = x_s_info['t'] + (x_d_i_info['t'] - x_d_0_info['t'])
else:
R_new = R_d_i
delta_new = x_d_i_info['exp']
scale_new = x_s_info['scale']
t_new = x_d_i_info['t']
t_new[..., 2].fill_(0) # zero tz
x_d_i_new = scale_new * (x_c_s @ R_new + delta_new) + t_new
# Algorithm 1:
if not inference_cfg.flag_stitching and not inference_cfg.flag_eye_retargeting and not inference_cfg.flag_lip_retargeting:
# without stitching or retargeting
if inference_cfg.flag_lip_zero:
x_d_i_new += lip_delta_before_animation.reshape(-1, x_s.shape[1], 3)
else:
pass
elif inference_cfg.flag_stitching and not inference_cfg.flag_eye_retargeting and not inference_cfg.flag_lip_retargeting:
# with stitching and without retargeting
if inference_cfg.flag_lip_zero:
x_d_i_new = self.live_portrait_wrapper.stitching(x_s, x_d_i_new) + lip_delta_before_animation.reshape(-1, x_s.shape[1], 3)
else:
x_d_i_new = self.live_portrait_wrapper.stitching(x_s, x_d_i_new)
else:
eyes_delta, lip_delta = None, None
if inference_cfg.flag_eye_retargeting:
c_d_eyes_i = input_eye_ratio_lst[i]
combined_eye_ratio_tensor = self.live_portrait_wrapper.calc_combined_eye_ratio(c_d_eyes_i, source_lmk)
# ∆_eyes,i = R_eyes(x_s; c_s,eyes, c_d,eyes,i)
eyes_delta = self.live_portrait_wrapper.retarget_eye(x_s, combined_eye_ratio_tensor)
if inference_cfg.flag_lip_retargeting:
c_d_lip_i = input_lip_ratio_lst[i]
combined_lip_ratio_tensor = self.live_portrait_wrapper.calc_combined_lip_ratio(c_d_lip_i, source_lmk)
# ∆_lip,i = R_lip(x_s; c_s,lip, c_d,lip,i)
lip_delta = self.live_portrait_wrapper.retarget_lip(x_s, combined_lip_ratio_tensor)
if inference_cfg.flag_relative: # use x_s
x_d_i_new = x_s + \
(eyes_delta.reshape(-1, x_s.shape[1], 3) if eyes_delta is not None else 0) + \
(lip_delta.reshape(-1, x_s.shape[1], 3) if lip_delta is not None else 0)
else: # use x_d,i
x_d_i_new = x_d_i_new + \
(eyes_delta.reshape(-1, x_s.shape[1], 3) if eyes_delta is not None else 0) + \
(lip_delta.reshape(-1, x_s.shape[1], 3) if lip_delta is not None else 0)
if inference_cfg.flag_stitching:
x_d_i_new = self.live_portrait_wrapper.stitching(x_s, x_d_i_new)
out = self.live_portrait_wrapper.warp_decode(f_s, x_s, x_d_i_new)
I_p_i = self.live_portrait_wrapper.parse_output(out['out'])[0]
I_p_lst.append(I_p_i)
if inference_cfg.flag_pasteback:
I_p_i_to_ori_blend = paste_back(I_p_i, crop_info['M_c2o'], img_rgb, mask_ori)
I_p_paste_lst.append(I_p_i_to_ori_blend)
mkdir(args.output_dir)
wfp_concat = None
if is_video(args.driving_info):
frames_concatenated = concat_frames(I_p_lst, driving_rgb_lst, img_crop_256x256)
# save the (driving frames, source image, animated frames) result
wfp_concat = osp.join(args.output_dir, f'{basename(args.source_image)}--{basename(args.driving_info)}_concat.mp4')
images2video(frames_concatenated, wfp=wfp_concat)
# save the animated result
wfp = osp.join(args.output_dir, f'{basename(args.source_image)}--{basename(args.driving_info)}.mp4')
if inference_cfg.flag_pasteback:
images2video(I_p_paste_lst, wfp=wfp)
else:
images2video(I_p_lst, wfp=wfp)
return wfp, wfp_concat

307
src/live_portrait_wrapper.py Normal file
View File

@ -0,0 +1,307 @@
# coding: utf-8
"""
Wrapper for LivePortrait core functions
"""
import os.path as osp
import numpy as np
import cv2
import torch
import yaml
from .utils.timer import Timer
from .utils.helper import load_model, concat_feat
from .utils.camera import headpose_pred_to_degree, get_rotation_matrix
from .utils.retargeting_utils import calc_eye_close_ratio, calc_lip_close_ratio
from .config.inference_config import InferenceConfig
from .utils.rprint import rlog as log
class LivePortraitWrapper(object):
def __init__(self, cfg: InferenceConfig):
model_config = yaml.load(open(cfg.models_config, 'r'), Loader=yaml.SafeLoader)
# init F
self.appearance_feature_extractor = load_model(cfg.checkpoint_F, model_config, cfg.device_id, 'appearance_feature_extractor')
log(f'Load appearance_feature_extractor done.')
# init M
self.motion_extractor = load_model(cfg.checkpoint_M, model_config, cfg.device_id, 'motion_extractor')
log(f'Load motion_extractor done.')
# init W
self.warping_module = load_model(cfg.checkpoint_W, model_config, cfg.device_id, 'warping_module')
log(f'Load warping_module done.')
# init G
self.spade_generator = load_model(cfg.checkpoint_G, model_config, cfg.device_id, 'spade_generator')
log(f'Load spade_generator done.')
# init S and R
if cfg.checkpoint_S is not None and osp.exists(cfg.checkpoint_S):
self.stitching_retargeting_module = load_model(cfg.checkpoint_S, model_config, cfg.device_id, 'stitching_retargeting_module')
log(f'Load stitching_retargeting_module done.')
else:
self.stitching_retargeting_module = None
self.cfg = cfg
self.device_id = cfg.device_id
self.timer = Timer()
def update_config(self, user_args):
for k, v in user_args.items():
if hasattr(self.cfg, k):
setattr(self.cfg, k, v)
def prepare_source(self, img: np.ndarray) -> torch.Tensor:
""" construct the input as standard
img: HxWx3, uint8, 256x256
"""
h, w = img.shape[:2]
if h != self.cfg.input_shape[0] or w != self.cfg.input_shape[1]:
x = cv2.resize(img, (self.cfg.input_shape[0], self.cfg.input_shape[1]))
else:
x = img.copy()
if x.ndim == 3:
x = x[np.newaxis].astype(np.float32) / 255. # HxWx3 -> 1xHxWx3, normalized to 0~1
elif x.ndim == 4:
x = x.astype(np.float32) / 255. # BxHxWx3, normalized to 0~1
else:
raise ValueError(f'img ndim should be 3 or 4: {x.ndim}')
x = np.clip(x, 0, 1) # clip to 0~1
x = torch.from_numpy(x).permute(0, 3, 1, 2) # 1xHxWx3 -> 1x3xHxW
x = x.cuda(self.device_id)
return x
def prepare_driving_videos(self, imgs) -> torch.Tensor:
""" construct the input as standard
imgs: NxBxHxWx3, uint8
"""
if isinstance(imgs, list):
_imgs = np.array(imgs)[..., np.newaxis] # TxHxWx3x1
elif isinstance(imgs, np.ndarray):
_imgs = imgs
else:
raise ValueError(f'imgs type error: {type(imgs)}')
y = _imgs.astype(np.float32) / 255.
y = np.clip(y, 0, 1) # clip to 0~1
y = torch.from_numpy(y).permute(0, 4, 3, 1, 2) # TxHxWx3x1 -> Tx1x3xHxW
y = y.cuda(self.device_id)
return y
def extract_feature_3d(self, x: torch.Tensor) -> torch.Tensor:
""" get the appearance feature of the image by F
x: Bx3xHxW, normalized to 0~1
"""
with torch.no_grad():
with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=self.cfg.flag_use_half_precision):
feature_3d = self.appearance_feature_extractor(x)
return feature_3d.float()
def get_kp_info(self, x: torch.Tensor, **kwargs) -> dict:
""" get the implicit keypoint information
x: Bx3xHxW, normalized to 0~1
flag_refine_info: whether to transform the pose into degrees and reshape the tensor dimensions
return: A dict contains keys: 'pitch', 'yaw', 'roll', 't', 'exp', 'scale', 'kp'
"""
with torch.no_grad():
with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=self.cfg.flag_use_half_precision):
kp_info = self.motion_extractor(x)
if self.cfg.flag_use_half_precision:
# float the dict
for k, v in kp_info.items():
if isinstance(v, torch.Tensor):
kp_info[k] = v.float()
flag_refine_info: bool = kwargs.get('flag_refine_info', True)
if flag_refine_info:
bs = kp_info['kp'].shape[0]
kp_info['pitch'] = headpose_pred_to_degree(kp_info['pitch'])[:, None] # Bx1
kp_info['yaw'] = headpose_pred_to_degree(kp_info['yaw'])[:, None] # Bx1
kp_info['roll'] = headpose_pred_to_degree(kp_info['roll'])[:, None] # Bx1
kp_info['kp'] = kp_info['kp'].reshape(bs, -1, 3) # BxNx3
kp_info['exp'] = kp_info['exp'].reshape(bs, -1, 3) # BxNx3
return kp_info
def get_pose_dct(self, kp_info: dict) -> dict:
pose_dct = dict(
pitch=headpose_pred_to_degree(kp_info['pitch']).item(),
yaw=headpose_pred_to_degree(kp_info['yaw']).item(),
roll=headpose_pred_to_degree(kp_info['roll']).item(),
)
return pose_dct
def get_fs_and_kp_info(self, source_prepared, driving_first_frame):
# get the canonical keypoints of source image by M
source_kp_info = self.get_kp_info(source_prepared, flag_refine_info=True)
source_rotation = get_rotation_matrix(source_kp_info['pitch'], source_kp_info['yaw'], source_kp_info['roll'])
# get the canonical keypoints of first driving frame by M
driving_first_frame_kp_info = self.get_kp_info(driving_first_frame, flag_refine_info=True)
driving_first_frame_rotation = get_rotation_matrix(
driving_first_frame_kp_info['pitch'],
driving_first_frame_kp_info['yaw'],
driving_first_frame_kp_info['roll']
)
# get feature volume by F
source_feature_3d = self.extract_feature_3d(source_prepared)
return source_kp_info, source_rotation, source_feature_3d, driving_first_frame_kp_info, driving_first_frame_rotation
def transform_keypoint(self, kp_info: dict):
"""
transform the implicit keypoints with the pose, shift, and expression deformation
kp: BxNx3
"""
kp = kp_info['kp'] # (bs, k, 3)
pitch, yaw, roll = kp_info['pitch'], kp_info['yaw'], kp_info['roll']
t, exp = kp_info['t'], kp_info['exp']
scale = kp_info['scale']
pitch = headpose_pred_to_degree(pitch)
yaw = headpose_pred_to_degree(yaw)
roll = headpose_pred_to_degree(roll)
bs = kp.shape[0]
if kp.ndim == 2:
num_kp = kp.shape[1] // 3 # Bx(num_kpx3)
else:
num_kp = kp.shape[1] # Bxnum_kpx3
rot_mat = get_rotation_matrix(pitch, yaw, roll) # (bs, 3, 3)
# Eqn.2: s * (R * x_c,s + exp) + t
kp_transformed = kp.view(bs, num_kp, 3) @ rot_mat + exp.view(bs, num_kp, 3)
kp_transformed *= scale[..., None] # (bs, k, 3) * (bs, 1, 1) = (bs, k, 3)
kp_transformed[:, :, 0:2] += t[:, None, 0:2] # remove z, only apply tx ty
return kp_transformed
def retarget_eye(self, kp_source: torch.Tensor, eye_close_ratio: torch.Tensor) -> torch.Tensor:
"""
kp_source: BxNx3
eye_close_ratio: Bx3
Return: Bx(3*num_kp+2)
"""
feat_eye = concat_feat(kp_source, eye_close_ratio)
with torch.no_grad():
delta = self.stitching_retargeting_module['eye'](feat_eye)
return delta
def retarget_lip(self, kp_source: torch.Tensor, lip_close_ratio: torch.Tensor) -> torch.Tensor:
"""
kp_source: BxNx3
lip_close_ratio: Bx2
"""
feat_lip = concat_feat(kp_source, lip_close_ratio)
with torch.no_grad():
delta = self.stitching_retargeting_module['lip'](feat_lip)
return delta
def stitch(self, kp_source: torch.Tensor, kp_driving: torch.Tensor) -> torch.Tensor:
"""
kp_source: BxNx3
kp_driving: BxNx3
Return: Bx(3*num_kp+2)
"""
feat_stiching = concat_feat(kp_source, kp_driving)
with torch.no_grad():
delta = self.stitching_retargeting_module['stitching'](feat_stiching)
return delta
def stitching(self, kp_source: torch.Tensor, kp_driving: torch.Tensor) -> torch.Tensor:
""" conduct the stitching
kp_source: Bxnum_kpx3
kp_driving: Bxnum_kpx3
"""
if self.stitching_retargeting_module is not None:
bs, num_kp = kp_source.shape[:2]
kp_driving_new = kp_driving.clone()
delta = self.stitch(kp_source, kp_driving_new)
delta_exp = delta[..., :3*num_kp].reshape(bs, num_kp, 3) # 1x20x3
delta_tx_ty = delta[..., 3*num_kp:3*num_kp+2].reshape(bs, 1, 2) # 1x1x2
kp_driving_new += delta_exp
kp_driving_new[..., :2] += delta_tx_ty
return kp_driving_new
return kp_driving
def warp_decode(self, feature_3d: torch.Tensor, kp_source: torch.Tensor, kp_driving: torch.Tensor) -> torch.Tensor:
""" get the image after the warping of the implicit keypoints
feature_3d: Bx32x16x64x64, feature volume
kp_source: BxNx3
kp_driving: BxNx3
"""
# Line 18 in Algorithm 1: D(W(f_s; x_s, x_d,i))
with torch.no_grad():
with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=self.cfg.flag_use_half_precision):
# get decoder input
ret_dct = self.warping_module(feature_3d, kp_source=kp_source, kp_driving=kp_driving)
# decode
ret_dct['out'] = self.spade_generator(feature=ret_dct['out'])
# float the dict
if self.cfg.flag_use_half_precision:
for k, v in ret_dct.items():
if isinstance(v, torch.Tensor):
ret_dct[k] = v.float()
return ret_dct
def parse_output(self, out: torch.Tensor) -> np.ndarray:
""" construct the output as standard
return: 1xHxWx3, uint8
"""
out = np.transpose(out.data.cpu().numpy(), [0, 2, 3, 1]) # 1x3xHxW -> 1xHxWx3
out = np.clip(out, 0, 1) # clip to 0~1
out = np.clip(out * 255, 0, 255).astype(np.uint8) # 0~1 -> 0~255
return out
def calc_retargeting_ratio(self, source_lmk, driving_lmk_lst):
input_eye_ratio_lst = []
input_lip_ratio_lst = []
for lmk in driving_lmk_lst:
# for eyes retargeting
input_eye_ratio_lst.append(calc_eye_close_ratio(lmk[None]))
# for lip retargeting
input_lip_ratio_lst.append(calc_lip_close_ratio(lmk[None]))
return input_eye_ratio_lst, input_lip_ratio_lst
def calc_combined_eye_ratio(self, input_eye_ratio, source_lmk):
eye_close_ratio = calc_eye_close_ratio(source_lmk[None])
eye_close_ratio_tensor = torch.from_numpy(eye_close_ratio).float().cuda(self.device_id)
input_eye_ratio_tensor = torch.Tensor([input_eye_ratio[0][0]]).reshape(1, 1).cuda(self.device_id)
# [c_s,eyes, c_d,eyes,i]
combined_eye_ratio_tensor = torch.cat([eye_close_ratio_tensor, input_eye_ratio_tensor], dim=1)
return combined_eye_ratio_tensor
def calc_combined_lip_ratio(self, input_lip_ratio, source_lmk):
lip_close_ratio = calc_lip_close_ratio(source_lmk[None])
lip_close_ratio_tensor = torch.from_numpy(lip_close_ratio).float().cuda(self.device_id)
# [c_s,lip, c_d,lip,i]
input_lip_ratio_tensor = torch.Tensor([input_lip_ratio[0]]).cuda(self.device_id)
if input_lip_ratio_tensor.shape != [1, 1]:
input_lip_ratio_tensor = input_lip_ratio_tensor.reshape(1, 1)
combined_lip_ratio_tensor = torch.cat([lip_close_ratio_tensor, input_lip_ratio_tensor], dim=1)
return combined_lip_ratio_tensor
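A minimal, self-contained sketch of the Eqn. 2 transform that transform_keypoint implements above; the keypoint count and pose values are made up for illustration, and only get_rotation_matrix from src/utils/camera.py is reused.
import torch
from src.utils.camera import get_rotation_matrix
bs, K = 1, 21                                     # keypoint count chosen only for illustration
x_c = torch.rand(bs, K, 3)                        # canonical keypoints x_c,s
exp = torch.rand(bs, K, 3) * 0.01                 # expression deformation
scale = torch.rand(bs, 1)                         # per-image scale s
t = torch.rand(bs, 3)                             # translation t (tz is dropped below)
R = get_rotation_matrix(torch.tensor([10.]), torch.tensor([-5.]), torch.tensor([0.]))  # (1, 3, 3)
# Eqn. 2: x_d = s * (x_c @ R + exp) + t, applying only tx/ty as transform_keypoint does
x_d = (x_c @ R + exp) * scale[..., None]
x_d[:, :, 0:2] += t[:, None, 0:2]
print(x_d.shape)                                  # torch.Size([1, 21, 3])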

0
src/modules/__init__.py Normal file
View File

View File

@ -0,0 +1,48 @@
# coding: utf-8
"""
Appearance extractor (F) defined in the paper, which maps the source image s to a 3D appearance feature volume.
"""
import torch
from torch import nn
from .util import SameBlock2d, DownBlock2d, ResBlock3d
class AppearanceFeatureExtractor(nn.Module):
def __init__(self, image_channel, block_expansion, num_down_blocks, max_features, reshape_channel, reshape_depth, num_resblocks):
super(AppearanceFeatureExtractor, self).__init__()
self.image_channel = image_channel
self.block_expansion = block_expansion
self.num_down_blocks = num_down_blocks
self.max_features = max_features
self.reshape_channel = reshape_channel
self.reshape_depth = reshape_depth
self.first = SameBlock2d(image_channel, block_expansion, kernel_size=(3, 3), padding=(1, 1))
down_blocks = []
for i in range(num_down_blocks):
in_features = min(max_features, block_expansion * (2 ** i))
out_features = min(max_features, block_expansion * (2 ** (i + 1)))
down_blocks.append(DownBlock2d(in_features, out_features, kernel_size=(3, 3), padding=(1, 1)))
self.down_blocks = nn.ModuleList(down_blocks)
self.second = nn.Conv2d(in_channels=out_features, out_channels=max_features, kernel_size=1, stride=1)
self.resblocks_3d = torch.nn.Sequential()
for i in range(num_resblocks):
self.resblocks_3d.add_module('3dr' + str(i), ResBlock3d(reshape_channel, kernel_size=3, padding=1))
def forward(self, source_image):
out = self.first(source_image) # Bx3x256x256 -> Bx64x256x256
for i in range(len(self.down_blocks)):
out = self.down_blocks[i](out)
out = self.second(out)
bs, c, h, w = out.shape # ->Bx512x64x64
f_s = out.view(bs, self.reshape_channel, self.reshape_depth, h, w) # ->Bx32x16x64x64
f_s = self.resblocks_3d(f_s) # ->Bx32x16x64x64
return f_s
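A shape-trace sketch of the extractor above; the hyperparameters and module path are assumptions for illustration (chosen to reproduce the Bx32x16x64x64 comment), not values read from the project's config.
import torch
from src.modules.appearance_feature_extractor import AppearanceFeatureExtractor  # path assumed
F_net = AppearanceFeatureExtractor(image_channel=3, block_expansion=64, num_down_blocks=2,
                                   max_features=512, reshape_channel=32, reshape_depth=16,
                                   num_resblocks=6)
x = torch.rand(1, 3, 256, 256)        # source image, normalized to 0~1
f_s = F_net(x)
print(f_s.shape)                      # torch.Size([1, 32, 16, 64, 64]), the 3D appearance feature volume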

149
src/modules/convnextv2.py Normal file
View File

@ -0,0 +1,149 @@
# coding: utf-8
"""
This module adapts ConvNeXtV2 for the extraction of implicit keypoints, poses, and expression deformation.
"""
import torch
import torch.nn as nn
# from timm.models.layers import trunc_normal_, DropPath
from .util import LayerNorm, DropPath, trunc_normal_, GRN
__all__ = ['convnextv2_tiny']
class Block(nn.Module):
""" ConvNeXtV2 Block.
Args:
dim (int): Number of input channels.
drop_path (float): Stochastic depth rate. Default: 0.0
"""
def __init__(self, dim, drop_path=0.):
super().__init__()
self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv
self.norm = LayerNorm(dim, eps=1e-6)
self.pwconv1 = nn.Linear(dim, 4 * dim) # pointwise/1x1 convs, implemented with linear layers
self.act = nn.GELU()
self.grn = GRN(4 * dim)
self.pwconv2 = nn.Linear(4 * dim, dim)
self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
def forward(self, x):
input = x
x = self.dwconv(x)
x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C)
x = self.norm(x)
x = self.pwconv1(x)
x = self.act(x)
x = self.grn(x)
x = self.pwconv2(x)
x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W)
x = input + self.drop_path(x)
return x
class ConvNeXtV2(nn.Module):
""" ConvNeXt V2
Args:
in_chans (int): Number of input image channels. Default: 3
num_classes (int): Number of classes for classification head. Default: 1000
depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3]
dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768]
drop_path_rate (float): Stochastic depth rate. Default: 0.
head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1.
"""
def __init__(
self,
in_chans=3,
depths=[3, 3, 9, 3],
dims=[96, 192, 384, 768],
drop_path_rate=0.,
**kwargs
):
super().__init__()
self.depths = depths
self.downsample_layers = nn.ModuleList() # stem and 3 intermediate downsampling conv layers
stem = nn.Sequential(
nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4),
LayerNorm(dims[0], eps=1e-6, data_format="channels_first")
)
self.downsample_layers.append(stem)
for i in range(3):
downsample_layer = nn.Sequential(
LayerNorm(dims[i], eps=1e-6, data_format="channels_first"),
nn.Conv2d(dims[i], dims[i+1], kernel_size=2, stride=2),
)
self.downsample_layers.append(downsample_layer)
self.stages = nn.ModuleList() # 4 feature resolution stages, each consisting of multiple residual blocks
dp_rates = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
cur = 0
for i in range(4):
stage = nn.Sequential(
*[Block(dim=dims[i], drop_path=dp_rates[cur + j]) for j in range(depths[i])]
)
self.stages.append(stage)
cur += depths[i]
self.norm = nn.LayerNorm(dims[-1], eps=1e-6) # final norm layer
# NOTE: the output semantic items
num_bins = kwargs.get('num_bins', 66)
num_kp = kwargs.get('num_kp', 24) # the number of implicit keypoints
self.fc_kp = nn.Linear(dims[-1], 3 * num_kp) # implicit keypoints
# print('dims[-1]: ', dims[-1])
self.fc_scale = nn.Linear(dims[-1], 1) # scale
self.fc_pitch = nn.Linear(dims[-1], num_bins) # pitch bins
self.fc_yaw = nn.Linear(dims[-1], num_bins) # yaw bins
self.fc_roll = nn.Linear(dims[-1], num_bins) # roll bins
self.fc_t = nn.Linear(dims[-1], 3) # translation
self.fc_exp = nn.Linear(dims[-1], 3 * num_kp) # expression / delta
def _init_weights(self, m):
if isinstance(m, (nn.Conv2d, nn.Linear)):
trunc_normal_(m.weight, std=.02)
nn.init.constant_(m.bias, 0)
def forward_features(self, x):
for i in range(4):
x = self.downsample_layers[i](x)
x = self.stages[i](x)
return self.norm(x.mean([-2, -1])) # global average pooling, (N, C, H, W) -> (N, C)
def forward(self, x):
x = self.forward_features(x)
# implicit keypoints
kp = self.fc_kp(x)
# pose and expression deformation
pitch = self.fc_pitch(x)
yaw = self.fc_yaw(x)
roll = self.fc_roll(x)
t = self.fc_t(x)
exp = self.fc_exp(x)
scale = self.fc_scale(x)
ret_dct = {
'pitch': pitch,
'yaw': yaw,
'roll': roll,
't': t,
'exp': exp,
'scale': scale,
'kp': kp, # canonical keypoint
}
return ret_dct
def convnextv2_tiny(**kwargs):
model = ConvNeXtV2(depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], **kwargs)
return model
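A quick sketch of what the backbone's output heads return with the defaults above (num_bins=66, num_kp=24); shapes only, no pretrained weights assumed.
import torch
from src.modules.convnextv2 import convnextv2_tiny
net = convnextv2_tiny()                        # defaults: num_bins=66, num_kp=24
out = net(torch.rand(1, 3, 256, 256))
print(out['kp'].shape, out['exp'].shape)       # (1, 72) each: 3 * num_kp, flattened
print(out['pitch'].shape, out['t'].shape, out['scale'].shape)  # (1, 66), (1, 3), (1, 1)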

104
src/modules/dense_motion.py Normal file
View File

@ -0,0 +1,104 @@
# coding: utf-8
"""
The module that predicts a dense motion field from the sparse motion representation given by kp_source and kp_driving
"""
from torch import nn
import torch.nn.functional as F
import torch
from .util import Hourglass, make_coordinate_grid, kp2gaussian
class DenseMotionNetwork(nn.Module):
def __init__(self, block_expansion, num_blocks, max_features, num_kp, feature_channel, reshape_depth, compress, estimate_occlusion_map=True):
super(DenseMotionNetwork, self).__init__()
self.hourglass = Hourglass(block_expansion=block_expansion, in_features=(num_kp+1)*(compress+1), max_features=max_features, num_blocks=num_blocks) # ~60+G
self.mask = nn.Conv3d(self.hourglass.out_filters, num_kp + 1, kernel_size=7, padding=3) # 65G! NOTE: computation cost is large
self.compress = nn.Conv3d(feature_channel, compress, kernel_size=1) # 0.8G
self.norm = nn.BatchNorm3d(compress, affine=True)
self.num_kp = num_kp
self.flag_estimate_occlusion_map = estimate_occlusion_map
if self.flag_estimate_occlusion_map:
self.occlusion = nn.Conv2d(self.hourglass.out_filters*reshape_depth, 1, kernel_size=7, padding=3)
else:
self.occlusion = None
def create_sparse_motions(self, feature, kp_driving, kp_source):
bs, _, d, h, w = feature.shape # (bs, 4, 16, 64, 64)
identity_grid = make_coordinate_grid((d, h, w), ref=kp_source) # (16, 64, 64, 3)
identity_grid = identity_grid.view(1, 1, d, h, w, 3) # (1, 1, d=16, h=64, w=64, 3)
coordinate_grid = identity_grid - kp_driving.view(bs, self.num_kp, 1, 1, 1, 3)
k = coordinate_grid.shape[1]
# NOTE: a first-order flow term is omitted here
driving_to_source = coordinate_grid + kp_source.view(bs, self.num_kp, 1, 1, 1, 3) # (bs, num_kp, d, h, w, 3)
# adding background feature
identity_grid = identity_grid.repeat(bs, 1, 1, 1, 1, 1)
sparse_motions = torch.cat([identity_grid, driving_to_source], dim=1) # (bs, 1+num_kp, d, h, w, 3)
return sparse_motions
def create_deformed_feature(self, feature, sparse_motions):
bs, _, d, h, w = feature.shape
feature_repeat = feature.unsqueeze(1).unsqueeze(1).repeat(1, self.num_kp+1, 1, 1, 1, 1, 1) # (bs, num_kp+1, 1, c, d, h, w)
feature_repeat = feature_repeat.view(bs * (self.num_kp+1), -1, d, h, w) # (bs*(num_kp+1), c, d, h, w)
sparse_motions = sparse_motions.view((bs * (self.num_kp+1), d, h, w, -1)) # (bs*(num_kp+1), d, h, w, 3)
sparse_deformed = F.grid_sample(feature_repeat, sparse_motions, align_corners=False)
sparse_deformed = sparse_deformed.view((bs, self.num_kp+1, -1, d, h, w)) # (bs, num_kp+1, c, d, h, w)
return sparse_deformed
def create_heatmap_representations(self, feature, kp_driving, kp_source):
spatial_size = feature.shape[3:] # (d=16, h=64, w=64)
gaussian_driving = kp2gaussian(kp_driving, spatial_size=spatial_size, kp_variance=0.01) # (bs, num_kp, d, h, w)
gaussian_source = kp2gaussian(kp_source, spatial_size=spatial_size, kp_variance=0.01) # (bs, num_kp, d, h, w)
heatmap = gaussian_driving - gaussian_source # (bs, num_kp, d, h, w)
# adding background feature
zeros = torch.zeros(heatmap.shape[0], 1, spatial_size[0], spatial_size[1], spatial_size[2]).type(heatmap.type()).to(heatmap.device)
heatmap = torch.cat([zeros, heatmap], dim=1)
heatmap = heatmap.unsqueeze(2) # (bs, 1+num_kp, 1, d, h, w)
return heatmap
def forward(self, feature, kp_driving, kp_source):
bs, _, d, h, w = feature.shape # (bs, 32, 16, 64, 64)
feature = self.compress(feature) # (bs, 4, 16, 64, 64)
feature = self.norm(feature) # (bs, 4, 16, 64, 64)
feature = F.relu(feature) # (bs, 4, 16, 64, 64)
out_dict = dict()
# 1. deform 3d feature
sparse_motion = self.create_sparse_motions(feature, kp_driving, kp_source) # (bs, 1+num_kp, d, h, w, 3)
deformed_feature = self.create_deformed_feature(feature, sparse_motion) # (bs, 1+num_kp, c=4, d=16, h=64, w=64)
# 2. (bs, 1+num_kp, d, h, w)
heatmap = self.create_heatmap_representations(deformed_feature, kp_driving, kp_source) # (bs, 1+num_kp, 1, d, h, w)
input = torch.cat([heatmap, deformed_feature], dim=2) # (bs, 1+num_kp, c=5, d=16, h=64, w=64)
input = input.view(bs, -1, d, h, w) # (bs, (1+num_kp)*c=105, d=16, h=64, w=64)
prediction = self.hourglass(input)
mask = self.mask(prediction)
mask = F.softmax(mask, dim=1) # (bs, 1+num_kp, d=16, h=64, w=64)
out_dict['mask'] = mask
mask = mask.unsqueeze(2) # (bs, num_kp+1, 1, d, h, w)
sparse_motion = sparse_motion.permute(0, 1, 5, 2, 3, 4) # (bs, num_kp+1, 3, d, h, w)
deformation = (sparse_motion * mask).sum(dim=1) # (bs, 3, d, h, w) mask take effect in this place
deformation = deformation.permute(0, 2, 3, 4, 1) # (bs, d, h, w, 3)
out_dict['deformation'] = deformation
if self.flag_estimate_occlusion_map:
bs, _, d, h, w = prediction.shape
prediction_reshape = prediction.view(bs, -1, h, w)
occlusion_map = torch.sigmoid(self.occlusion(prediction_reshape)) # Bx1x64x64
out_dict['occlusion_map'] = occlusion_map
return out_dict
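The final flow is a per-voxel convex combination of the num_kp+1 candidate flows, weighted by the softmaxed mask; a toy-sized illustration follows (the shapes are not the real configuration).
import torch
bs, K1, d, h, w = 1, 4, 2, 4, 4                    # K1 = num_kp + 1 candidate flows, toy sizes
sparse_motion = torch.rand(bs, K1, 3, d, h, w)     # candidate backward flows per keypoint (plus background)
mask = torch.softmax(torch.rand(bs, K1, d, h, w), dim=1).unsqueeze(2)   # weights sum to 1 over K1
deformation = (sparse_motion * mask).sum(dim=1)    # (bs, 3, d, h, w): blended flow
deformation = deformation.permute(0, 2, 3, 4, 1)   # (bs, d, h, w, 3), ready for F.grid_sample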

View File

@ -0,0 +1,35 @@
# coding: utf-8
"""
Motion extractor(M), which directly predicts the canonical keypoints, head pose and expression deformation of the input image
"""
from torch import nn
import torch
from .convnextv2 import convnextv2_tiny
from .util import filter_state_dict
model_dict = {
'convnextv2_tiny': convnextv2_tiny,
}
class MotionExtractor(nn.Module):
def __init__(self, **kwargs):
super(MotionExtractor, self).__init__()
# default backbone is convnextv2_tiny
backbone = kwargs.get('backbone', 'convnextv2_tiny')
self.detector = model_dict.get(backbone)(**kwargs)
def load_pretrained(self, init_path: str):
if init_path not in (None, ''):
state_dict = torch.load(init_path, map_location=lambda storage, loc: storage)['model']
state_dict = filter_state_dict(state_dict, remove_name='head')
ret = self.detector.load_state_dict(state_dict, strict=False)
print(f'Load pretrained model from {init_path}, ret: {ret}')
def forward(self, x):
out = self.detector(x)
return out

View File

@ -0,0 +1,59 @@
# coding: utf-8
"""
SPADE decoder (G) defined in the paper, which takes the warped feature as input and generates the animated image.
"""
import torch
from torch import nn
import torch.nn.functional as F
from .util import SPADEResnetBlock
class SPADEDecoder(nn.Module):
def __init__(self, upscale=1, max_features=256, block_expansion=64, out_channels=64, num_down_blocks=2):
super().__init__()
self.upscale = upscale
# channel count of the warped feature fed into the decoder, i.e. after num_down_blocks downsamplings
input_channels = min(max_features, block_expansion * (2 ** num_down_blocks))
norm_G = 'spadespectralinstance'
label_num_channels = input_channels # 256
self.fc = nn.Conv2d(input_channels, 2 * input_channels, 3, padding=1)
self.G_middle_0 = SPADEResnetBlock(2 * input_channels, 2 * input_channels, norm_G, label_num_channels)
self.G_middle_1 = SPADEResnetBlock(2 * input_channels, 2 * input_channels, norm_G, label_num_channels)
self.G_middle_2 = SPADEResnetBlock(2 * input_channels, 2 * input_channels, norm_G, label_num_channels)
self.G_middle_3 = SPADEResnetBlock(2 * input_channels, 2 * input_channels, norm_G, label_num_channels)
self.G_middle_4 = SPADEResnetBlock(2 * input_channels, 2 * input_channels, norm_G, label_num_channels)
self.G_middle_5 = SPADEResnetBlock(2 * input_channels, 2 * input_channels, norm_G, label_num_channels)
self.up_0 = SPADEResnetBlock(2 * input_channels, input_channels, norm_G, label_num_channels)
self.up_1 = SPADEResnetBlock(input_channels, out_channels, norm_G, label_num_channels)
self.up = nn.Upsample(scale_factor=2)
if self.upscale is None or self.upscale <= 1:
self.conv_img = nn.Conv2d(out_channels, 3, 3, padding=1)
else:
self.conv_img = nn.Sequential(
nn.Conv2d(out_channels, 3 * (2 * 2), kernel_size=3, padding=1),
nn.PixelShuffle(upscale_factor=2)
)
def forward(self, feature):
seg = feature # Bx256x64x64
x = self.fc(feature) # Bx512x64x64
x = self.G_middle_0(x, seg)
x = self.G_middle_1(x, seg)
x = self.G_middle_2(x, seg)
x = self.G_middle_3(x, seg)
x = self.G_middle_4(x, seg)
x = self.G_middle_5(x, seg)
x = self.up(x) # Bx512x64x64 -> Bx512x128x128
x = self.up_0(x, seg) # Bx512x128x128 -> Bx256x128x128
x = self.up(x) # Bx256x128x128 -> Bx256x256x256
x = self.up_1(x, seg) # Bx256x256x256 -> Bx64x256x256
x = self.conv_img(F.leaky_relu(x, 2e-1)) # Bx64x256x256 -> Bx3xHxW
x = torch.sigmoid(x) # Bx3xHxW
return x
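A shape-trace sketch of the decoder; the module path is assumed, and upscale=2 is picked only to exercise the PixelShuffle branch.
import torch
from src.modules.spade_generator import SPADEDecoder   # path assumed
G = SPADEDecoder(upscale=2)            # with the defaults, the decoder expects a Bx256x64x64 feature
feat = torch.rand(1, 256, 64, 64)      # warped feature produced by the warping module
img = G(feat)
print(img.shape)                       # torch.Size([1, 3, 512, 512]), values in [0, 1] after sigmoid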

View File

@ -0,0 +1,38 @@
# coding: utf-8
"""
Stitching module(S) and two retargeting modules(R) defined in the paper.
- The stitching module pastes the animated portrait back into the original image space without pixel misalignment, such as in
the stitching region.
- The eyes retargeting module is designed to address the issue of incomplete eye closure during cross-id reenactment, especially
when a person with small eyes drives a person with larger eyes.
- The lip retargeting module is designed similarly to the eye retargeting module, and can also normalize the input by ensuring that
the lips are in a closed state, which facilitates better animation driving.
"""
from torch import nn
class StitchingRetargetingNetwork(nn.Module):
def __init__(self, input_size, hidden_sizes, output_size):
super(StitchingRetargetingNetwork, self).__init__()
layers = []
for i in range(len(hidden_sizes)):
if i == 0:
layers.append(nn.Linear(input_size, hidden_sizes[i]))
else:
layers.append(nn.Linear(hidden_sizes[i - 1], hidden_sizes[i]))
layers.append(nn.ReLU(inplace=True))
layers.append(nn.Linear(hidden_sizes[-1], output_size))
self.mlp = nn.Sequential(*layers)
def initialize_weights_to_zero(self):
for m in self.modules():
if isinstance(m, nn.Linear):
nn.init.zeros_(m.weight)
nn.init.zeros_(m.bias)
def forward(self, x):
return self.mlp(x)
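A sketch of the stitching variant's tensor sizes; the keypoint count and hidden sizes below are assumptions for illustration and are not taken from the project's config.
import torch
from src.modules.stitching_retargeting_network import StitchingRetargetingNetwork  # path assumed
K = 21                                                    # number of implicit keypoints, assumed
stitcher = StitchingRetargetingNetwork(input_size=2 * 3 * K, hidden_sizes=[128, 128, 64],
                                       output_size=3 * K + 2)
feat = torch.rand(1, 2 * 3 * K)                           # concat of flattened source and driving keypoints
delta = stitcher(feat)
print(delta.shape)                                        # (1, 3*K + 2): per-keypoint offsets plus a global (tx, ty)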

441
src/modules/util.py Normal file
View File

@ -0,0 +1,441 @@
# coding: utf-8
"""
This file defines various neural network modules and utility functions, including convolutional and residual blocks,
normalizations, and functions for spatial transformation and tensor manipulation.
"""
from torch import nn
import torch.nn.functional as F
import torch
import torch.nn.utils.spectral_norm as spectral_norm
import math
import warnings
def kp2gaussian(kp, spatial_size, kp_variance):
"""
Transform a keypoint into gaussian like representation
"""
mean = kp
coordinate_grid = make_coordinate_grid(spatial_size, mean)
number_of_leading_dimensions = len(mean.shape) - 1
shape = (1,) * number_of_leading_dimensions + coordinate_grid.shape
coordinate_grid = coordinate_grid.view(*shape)
repeats = mean.shape[:number_of_leading_dimensions] + (1, 1, 1, 1)
coordinate_grid = coordinate_grid.repeat(*repeats)
# Preprocess kp shape
shape = mean.shape[:number_of_leading_dimensions] + (1, 1, 1, 3)
mean = mean.view(*shape)
mean_sub = (coordinate_grid - mean)
out = torch.exp(-0.5 * (mean_sub ** 2).sum(-1) / kp_variance)
return out
def make_coordinate_grid(spatial_size, ref, **kwargs):
d, h, w = spatial_size
x = torch.arange(w).type(ref.dtype).to(ref.device)
y = torch.arange(h).type(ref.dtype).to(ref.device)
z = torch.arange(d).type(ref.dtype).to(ref.device)
# NOTE: must be right-down-in
x = (2 * (x / (w - 1)) - 1) # the x axis faces to the right
y = (2 * (y / (h - 1)) - 1) # the y axis faces to the bottom
z = (2 * (z / (d - 1)) - 1) # the z axis faces to the inner
yy = y.view(1, -1, 1).repeat(d, 1, w)
xx = x.view(1, 1, -1).repeat(d, h, 1)
zz = z.view(-1, 1, 1).repeat(1, h, w)
meshed = torch.cat([xx.unsqueeze_(3), yy.unsqueeze_(3), zz.unsqueeze_(3)], 3)
return meshed
class ConvT2d(nn.Module):
"""
Upsampling block for use in decoder.
"""
def __init__(self, in_features, out_features, kernel_size=3, stride=2, padding=1, output_padding=1):
super(ConvT2d, self).__init__()
self.convT = nn.ConvTranspose2d(in_features, out_features, kernel_size=kernel_size, stride=stride,
padding=padding, output_padding=output_padding)
self.norm = nn.InstanceNorm2d(out_features)
def forward(self, x):
out = self.convT(x)
out = self.norm(out)
out = F.leaky_relu(out)
return out
class ResBlock3d(nn.Module):
"""
Res block, preserve spatial resolution.
"""
def __init__(self, in_features, kernel_size, padding):
super(ResBlock3d, self).__init__()
self.conv1 = nn.Conv3d(in_channels=in_features, out_channels=in_features, kernel_size=kernel_size, padding=padding)
self.conv2 = nn.Conv3d(in_channels=in_features, out_channels=in_features, kernel_size=kernel_size, padding=padding)
self.norm1 = nn.BatchNorm3d(in_features, affine=True)
self.norm2 = nn.BatchNorm3d(in_features, affine=True)
def forward(self, x):
out = self.norm1(x)
out = F.relu(out)
out = self.conv1(out)
out = self.norm2(out)
out = F.relu(out)
out = self.conv2(out)
out += x
return out
class UpBlock3d(nn.Module):
"""
Upsampling block for use in decoder.
"""
def __init__(self, in_features, out_features, kernel_size=3, padding=1, groups=1):
super(UpBlock3d, self).__init__()
self.conv = nn.Conv3d(in_channels=in_features, out_channels=out_features, kernel_size=kernel_size,
padding=padding, groups=groups)
self.norm = nn.BatchNorm3d(out_features, affine=True)
def forward(self, x):
out = F.interpolate(x, scale_factor=(1, 2, 2))
out = self.conv(out)
out = self.norm(out)
out = F.relu(out)
return out
class DownBlock2d(nn.Module):
"""
Downsampling block for use in encoder.
"""
def __init__(self, in_features, out_features, kernel_size=3, padding=1, groups=1):
super(DownBlock2d, self).__init__()
self.conv = nn.Conv2d(in_channels=in_features, out_channels=out_features, kernel_size=kernel_size, padding=padding, groups=groups)
self.norm = nn.BatchNorm2d(out_features, affine=True)
self.pool = nn.AvgPool2d(kernel_size=(2, 2))
def forward(self, x):
out = self.conv(x)
out = self.norm(out)
out = F.relu(out)
out = self.pool(out)
return out
class DownBlock3d(nn.Module):
"""
Downsampling block for use in encoder.
"""
def __init__(self, in_features, out_features, kernel_size=3, padding=1, groups=1):
super(DownBlock3d, self).__init__()
'''
self.conv = nn.Conv3d(in_channels=in_features, out_channels=out_features, kernel_size=kernel_size,
padding=padding, groups=groups, stride=(1, 2, 2))
'''
self.conv = nn.Conv3d(in_channels=in_features, out_channels=out_features, kernel_size=kernel_size,
padding=padding, groups=groups)
self.norm = nn.BatchNorm3d(out_features, affine=True)
self.pool = nn.AvgPool3d(kernel_size=(1, 2, 2))
def forward(self, x):
out = self.conv(x)
out = self.norm(out)
out = F.relu(out)
out = self.pool(out)
return out
class SameBlock2d(nn.Module):
"""
Simple block, preserve spatial resolution.
"""
def __init__(self, in_features, out_features, groups=1, kernel_size=3, padding=1, lrelu=False):
super(SameBlock2d, self).__init__()
self.conv = nn.Conv2d(in_channels=in_features, out_channels=out_features, kernel_size=kernel_size, padding=padding, groups=groups)
self.norm = nn.BatchNorm2d(out_features, affine=True)
if lrelu:
self.ac = nn.LeakyReLU()
else:
self.ac = nn.ReLU()
def forward(self, x):
out = self.conv(x)
out = self.norm(out)
out = self.ac(out)
return out
class Encoder(nn.Module):
"""
Hourglass Encoder
"""
def __init__(self, block_expansion, in_features, num_blocks=3, max_features=256):
super(Encoder, self).__init__()
down_blocks = []
for i in range(num_blocks):
down_blocks.append(DownBlock3d(in_features if i == 0 else min(max_features, block_expansion * (2 ** i)), min(max_features, block_expansion * (2 ** (i + 1))), kernel_size=3, padding=1))
self.down_blocks = nn.ModuleList(down_blocks)
def forward(self, x):
outs = [x]
for down_block in self.down_blocks:
outs.append(down_block(outs[-1]))
return outs
class Decoder(nn.Module):
"""
Hourglass Decoder
"""
def __init__(self, block_expansion, in_features, num_blocks=3, max_features=256):
super(Decoder, self).__init__()
up_blocks = []
for i in range(num_blocks)[::-1]:
in_filters = (1 if i == num_blocks - 1 else 2) * min(max_features, block_expansion * (2 ** (i + 1)))
out_filters = min(max_features, block_expansion * (2 ** i))
up_blocks.append(UpBlock3d(in_filters, out_filters, kernel_size=3, padding=1))
self.up_blocks = nn.ModuleList(up_blocks)
self.out_filters = block_expansion + in_features
self.conv = nn.Conv3d(in_channels=self.out_filters, out_channels=self.out_filters, kernel_size=3, padding=1)
self.norm = nn.BatchNorm3d(self.out_filters, affine=True)
def forward(self, x):
out = x.pop()
for up_block in self.up_blocks:
out = up_block(out)
skip = x.pop()
out = torch.cat([out, skip], dim=1)
out = self.conv(out)
out = self.norm(out)
out = F.relu(out)
return out
class Hourglass(nn.Module):
"""
Hourglass architecture.
"""
def __init__(self, block_expansion, in_features, num_blocks=3, max_features=256):
super(Hourglass, self).__init__()
self.encoder = Encoder(block_expansion, in_features, num_blocks, max_features)
self.decoder = Decoder(block_expansion, in_features, num_blocks, max_features)
self.out_filters = self.decoder.out_filters
def forward(self, x):
return self.decoder(self.encoder(x))
class SPADE(nn.Module):
def __init__(self, norm_nc, label_nc):
super().__init__()
self.param_free_norm = nn.InstanceNorm2d(norm_nc, affine=False)
nhidden = 128
self.mlp_shared = nn.Sequential(
nn.Conv2d(label_nc, nhidden, kernel_size=3, padding=1),
nn.ReLU())
self.mlp_gamma = nn.Conv2d(nhidden, norm_nc, kernel_size=3, padding=1)
self.mlp_beta = nn.Conv2d(nhidden, norm_nc, kernel_size=3, padding=1)
def forward(self, x, segmap):
normalized = self.param_free_norm(x)
segmap = F.interpolate(segmap, size=x.size()[2:], mode='nearest')
actv = self.mlp_shared(segmap)
gamma = self.mlp_gamma(actv)
beta = self.mlp_beta(actv)
out = normalized * (1 + gamma) + beta
return out
class SPADEResnetBlock(nn.Module):
def __init__(self, fin, fout, norm_G, label_nc, use_se=False, dilation=1):
super().__init__()
# Attributes
self.learned_shortcut = (fin != fout)
fmiddle = min(fin, fout)
self.use_se = use_se
# create conv layers
self.conv_0 = nn.Conv2d(fin, fmiddle, kernel_size=3, padding=dilation, dilation=dilation)
self.conv_1 = nn.Conv2d(fmiddle, fout, kernel_size=3, padding=dilation, dilation=dilation)
if self.learned_shortcut:
self.conv_s = nn.Conv2d(fin, fout, kernel_size=1, bias=False)
# apply spectral norm if specified
if 'spectral' in norm_G:
self.conv_0 = spectral_norm(self.conv_0)
self.conv_1 = spectral_norm(self.conv_1)
if self.learned_shortcut:
self.conv_s = spectral_norm(self.conv_s)
# define normalization layers
self.norm_0 = SPADE(fin, label_nc)
self.norm_1 = SPADE(fmiddle, label_nc)
if self.learned_shortcut:
self.norm_s = SPADE(fin, label_nc)
def forward(self, x, seg1):
x_s = self.shortcut(x, seg1)
dx = self.conv_0(self.actvn(self.norm_0(x, seg1)))
dx = self.conv_1(self.actvn(self.norm_1(dx, seg1)))
out = x_s + dx
return out
def shortcut(self, x, seg1):
if self.learned_shortcut:
x_s = self.conv_s(self.norm_s(x, seg1))
else:
x_s = x
return x_s
def actvn(self, x):
return F.leaky_relu(x, 2e-1)
def filter_state_dict(state_dict, remove_name='fc'):
new_state_dict = {}
for key in state_dict:
if remove_name in key:
continue
new_state_dict[key] = state_dict[key]
return new_state_dict
class GRN(nn.Module):
""" GRN (Global Response Normalization) layer
"""
def __init__(self, dim):
super().__init__()
self.gamma = nn.Parameter(torch.zeros(1, 1, 1, dim))
self.beta = nn.Parameter(torch.zeros(1, 1, 1, dim))
def forward(self, x):
Gx = torch.norm(x, p=2, dim=(1, 2), keepdim=True)
Nx = Gx / (Gx.mean(dim=-1, keepdim=True) + 1e-6)
return self.gamma * (x * Nx) + self.beta + x
class LayerNorm(nn.Module):
r""" LayerNorm that supports two data formats: channels_last (default) or channels_first.
The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
shape (batch_size, height, width, channels) while channels_first corresponds to inputs
with shape (batch_size, channels, height, width).
"""
def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
super().__init__()
self.weight = nn.Parameter(torch.ones(normalized_shape))
self.bias = nn.Parameter(torch.zeros(normalized_shape))
self.eps = eps
self.data_format = data_format
if self.data_format not in ["channels_last", "channels_first"]:
raise NotImplementedError
self.normalized_shape = (normalized_shape, )
def forward(self, x):
if self.data_format == "channels_last":
return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
elif self.data_format == "channels_first":
u = x.mean(1, keepdim=True)
s = (x - u).pow(2).mean(1, keepdim=True)
x = (x - u) / torch.sqrt(s + self.eps)
x = self.weight[:, None, None] * x + self.bias[:, None, None]
return x
def _no_grad_trunc_normal_(tensor, mean, std, a, b):
# Cut & paste from PyTorch official master until it's in a few official releases - RW
# Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
def norm_cdf(x):
# Computes standard normal cumulative distribution function
return (1. + math.erf(x / math.sqrt(2.))) / 2.
if (mean < a - 2 * std) or (mean > b + 2 * std):
warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
"The distribution of values may be incorrect.",
stacklevel=2)
with torch.no_grad():
# Values are generated by using a truncated uniform distribution and
# then using the inverse CDF for the normal distribution.
# Get upper and lower cdf values
l = norm_cdf((a - mean) / std)
u = norm_cdf((b - mean) / std)
# Uniformly fill tensor with values from [l, u], then translate to
# [2l-1, 2u-1].
tensor.uniform_(2 * l - 1, 2 * u - 1)
# Use inverse cdf transform for normal distribution to get truncated
# standard normal
tensor.erfinv_()
# Transform to proper mean, std
tensor.mul_(std * math.sqrt(2.))
tensor.add_(mean)
# Clamp to ensure it's in the proper range
tensor.clamp_(min=a, max=b)
return tensor
def drop_path(x, drop_prob=0., training=False, scale_by_keep=True):
""" Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
'survival rate' as the argument.
"""
if drop_prob == 0. or not training:
return x
keep_prob = 1 - drop_prob
shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
if keep_prob > 0.0 and scale_by_keep:
random_tensor.div_(keep_prob)
return x * random_tensor
class DropPath(nn.Module):
""" Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
"""
def __init__(self, drop_prob=None, scale_by_keep=True):
super(DropPath, self).__init__()
self.drop_prob = drop_prob
self.scale_by_keep = scale_by_keep
def forward(self, x):
return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
return _no_grad_trunc_normal_(tensor, mean, std, a, b)
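A minimal check of make_coordinate_grid's conventions (right-down-in axes normalized to [-1, 1]); ref only supplies the dtype and device here.
import torch
from src.modules.util import make_coordinate_grid
ref = torch.zeros(1)                               # any tensor works; only dtype/device are read
grid = make_coordinate_grid((16, 64, 64), ref=ref) # (d, h, w, 3) with (x, y, z) per voxel
print(grid.shape)                                  # torch.Size([16, 64, 64, 3])
print(grid[0, 0, 0], grid[-1, -1, -1])             # tensor([-1., -1., -1.]) and tensor([1., 1., 1.])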

View File

@ -0,0 +1,77 @@
# coding: utf-8
"""
Warping field estimator(W) defined in the paper, which generates a warping field using the implicit
keypoint representations x_s and x_d, and employs this flow field to warp the source feature volume f_s.
"""
from torch import nn
import torch.nn.functional as F
from .util import SameBlock2d
from .dense_motion import DenseMotionNetwork
class WarpingNetwork(nn.Module):
def __init__(
self,
num_kp,
block_expansion,
max_features,
num_down_blocks,
reshape_channel,
estimate_occlusion_map=False,
dense_motion_params=None,
**kwargs
):
super(WarpingNetwork, self).__init__()
self.upscale = kwargs.get('upscale', 1)
self.flag_use_occlusion_map = kwargs.get('flag_use_occlusion_map', True)
if dense_motion_params is not None:
self.dense_motion_network = DenseMotionNetwork(
num_kp=num_kp,
feature_channel=reshape_channel,
estimate_occlusion_map=estimate_occlusion_map,
**dense_motion_params
)
else:
self.dense_motion_network = None
self.third = SameBlock2d(max_features, block_expansion * (2 ** num_down_blocks), kernel_size=(3, 3), padding=(1, 1), lrelu=True)
self.fourth = nn.Conv2d(in_channels=block_expansion * (2 ** num_down_blocks), out_channels=block_expansion * (2 ** num_down_blocks), kernel_size=1, stride=1)
self.estimate_occlusion_map = estimate_occlusion_map
def deform_input(self, inp, deformation):
return F.grid_sample(inp, deformation, align_corners=False)
def forward(self, feature_3d, kp_driving, kp_source):
if self.dense_motion_network is not None:
# Feature warper, Transforming feature representation according to deformation and occlusion
dense_motion = self.dense_motion_network(
feature=feature_3d, kp_driving=kp_driving, kp_source=kp_source
)
if 'occlusion_map' in dense_motion:
occlusion_map = dense_motion['occlusion_map'] # Bx1x64x64
else:
occlusion_map = None
deformation = dense_motion['deformation'] # Bx16x64x64x3
out = self.deform_input(feature_3d, deformation) # Bx32x16x64x64
bs, c, d, h, w = out.shape # Bx32x16x64x64
out = out.view(bs, c * d, h, w) # -> Bx512x64x64
out = self.third(out) # -> Bx256x64x64
out = self.fourth(out) # -> Bx256x64x64
if self.flag_use_occlusion_map and (occlusion_map is not None):
out = out * occlusion_map
ret_dct = {
'occlusion_map': occlusion_map,
'deformation': deformation,
'out': out,
}
return ret_dct

65
src/template_maker.py Normal file
View File

@ -0,0 +1,65 @@
# coding: utf-8
"""
Make video template
"""
import os
import cv2
import numpy as np
import pickle
from rich.progress import track
from .utils.cropper import Cropper
from .utils.io import load_driving_info
from .utils.camera import get_rotation_matrix
from .utils.helper import mkdir, basename
from .utils.rprint import rlog as log
from .config.crop_config import CropConfig
from .config.inference_config import InferenceConfig
from .live_portrait_wrapper import LivePortraitWrapper
class TemplateMaker:
def __init__(self, inference_cfg: InferenceConfig, crop_cfg: CropConfig):
self.live_portrait_wrapper: LivePortraitWrapper = LivePortraitWrapper(cfg=inference_cfg)
self.cropper = Cropper(crop_cfg=crop_cfg)
def make_motion_template(self, video_fp: str, output_path: str, **kwargs):
""" make video template (.pkl format)
video_fp: driving video file path
output_path: where to save the pickle file
"""
driving_rgb_lst = load_driving_info(video_fp)
driving_rgb_lst = [cv2.resize(_, (256, 256)) for _ in driving_rgb_lst]
driving_lmk_lst = self.cropper.get_retargeting_lmk_info(driving_rgb_lst)
I_d_lst = self.live_portrait_wrapper.prepare_driving_videos(driving_rgb_lst)
n_frames = I_d_lst.shape[0]
templates = []
for i in track(range(n_frames), description='Making templates...', total=n_frames):
I_d_i = I_d_lst[i]
x_d_i_info = self.live_portrait_wrapper.get_kp_info(I_d_i)
R_d_i = get_rotation_matrix(x_d_i_info['pitch'], x_d_i_info['yaw'], x_d_i_info['roll'])
# collect s_d, R_d, δ_d and t_d for inference
template_dct = {
'n_frames': n_frames,
'frames_index': i,
}
template_dct['scale'] = x_d_i_info['scale'].cpu().numpy().astype(np.float32)
template_dct['R_d'] = R_d_i.cpu().numpy().astype(np.float32)
template_dct['exp'] = x_d_i_info['exp'].cpu().numpy().astype(np.float32)
template_dct['t'] = x_d_i_info['t'].cpu().numpy().astype(np.float32)
templates.append(template_dct)
mkdir(output_path)
# Save the dictionary as a pickle file
pickle_fp = os.path.join(output_path, f'{basename(video_fp)}.pkl')
with open(pickle_fp, 'wb') as f:
pickle.dump([templates, driving_lmk_lst], f)
log(f"Template saved at {pickle_fp}")

0
src/utils/__init__.py Normal file
View File

75
src/utils/camera.py Normal file
View File

@ -0,0 +1,75 @@
# coding: utf-8
"""
functions for processing and transforming 3D facial keypoints
"""
import numpy as np
import torch
import torch.nn.functional as F
PI = np.pi
def headpose_pred_to_degree(pred):
"""
pred: (bs, 66) or (bs, 1) or others
"""
if pred.ndim > 1 and pred.shape[1] == 66:
# NOTE: the average is modified to 97.5
device = pred.device
idx_tensor = [idx for idx in range(0, 66)]
idx_tensor = torch.FloatTensor(idx_tensor).to(device)
pred = F.softmax(pred, dim=1)
degree = torch.sum(pred*idx_tensor, axis=1) * 3 - 97.5
return degree
return pred
def get_rotation_matrix(pitch_, yaw_, roll_):
""" the input is in degree
"""
# calculate the rotation matrix: vps @ rot
# transform to radian
pitch = pitch_ / 180 * PI
yaw = yaw_ / 180 * PI
roll = roll_ / 180 * PI
device = pitch.device
if pitch.ndim == 1:
pitch = pitch.unsqueeze(1)
if yaw.ndim == 1:
yaw = yaw.unsqueeze(1)
if roll.ndim == 1:
roll = roll.unsqueeze(1)
# calculate the euler matrix
bs = pitch.shape[0]
ones = torch.ones([bs, 1]).to(device)
zeros = torch.zeros([bs, 1]).to(device)
x, y, z = pitch, yaw, roll
rot_x = torch.cat([
ones, zeros, zeros,
zeros, torch.cos(x), -torch.sin(x),
zeros, torch.sin(x), torch.cos(x)
], dim=1).reshape([bs, 3, 3])
rot_y = torch.cat([
torch.cos(y), zeros, torch.sin(y),
zeros, ones, zeros,
-torch.sin(y), zeros, torch.cos(y)
], dim=1).reshape([bs, 3, 3])
rot_z = torch.cat([
torch.cos(z), -torch.sin(z), zeros,
torch.sin(z), torch.cos(z), zeros,
zeros, zeros, ones
], dim=1).reshape([bs, 3, 3])
rot = rot_z @ rot_y @ rot_x
return rot.permute(0, 2, 1) # transpose
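A small sanity-check sketch of the two helpers above: the 66-bin head-pose logits are converted to degrees by a softmax expectation, and the returned matrix is orthonormal (it is returned transposed so that keypoints can be multiplied as kp @ R).
import torch
from src.utils.camera import headpose_pred_to_degree, get_rotation_matrix
logits = torch.randn(1, 66)                          # raw head-pose bins
deg = headpose_pred_to_degree(logits)                # expectation over bins, mapped to [-97.5, 97.5] degrees
R = get_rotation_matrix(deg, torch.zeros(1), torch.zeros(1))   # (1, 3, 3)
print(deg.shape, R.shape)
print(torch.allclose(R @ R.transpose(1, 2), torch.eye(3).expand(1, 3, 3), atol=1e-5))  # True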

412
src/utils/crop.py Normal file
View File

@ -0,0 +1,412 @@
# coding: utf-8
"""
cropping function and the related preprocess functions for cropping
"""
import numpy as np
import os.path as osp
from math import sin, cos, acos, degrees
import cv2; cv2.setNumThreads(0); cv2.ocl.setUseOpenCL(False) # NOTE: enforce single thread
from .rprint import rprint as print
DTYPE = np.float32
CV2_INTERP = cv2.INTER_LINEAR
def make_abs_path(fn):
return osp.join(osp.dirname(osp.realpath(__file__)), fn)
def _transform_img(img, M, dsize, flags=CV2_INTERP, borderMode=None):
""" conduct similarity or affine transformation to the image, do not do border operation!
img:
M: 2x3 matrix or 3x3 matrix
dsize: target shape (width, height)
"""
if isinstance(dsize, tuple) or isinstance(dsize, list):
_dsize = tuple(dsize)
else:
_dsize = (dsize, dsize)
if borderMode is not None:
return cv2.warpAffine(img, M[:2, :], dsize=_dsize, flags=flags, borderMode=borderMode, borderValue=(0, 0, 0))
else:
return cv2.warpAffine(img, M[:2, :], dsize=_dsize, flags=flags)
def _transform_pts(pts, M):
""" conduct similarity or affine transformation to the pts
pts: Nx2 ndarray
M: 2x3 matrix or 3x3 matrix
return: Nx2
"""
return pts @ M[:2, :2].T + M[:2, 2]
def parse_pt2_from_pt101(pt101, use_lip=True):
"""
parsing the 2 points according to the 101 points, which cancels the roll
"""
# the former version used the eye center, but it was not robust; now interpolation is used
pt_left_eye = np.mean(pt101[[39, 42, 45, 48]], axis=0) # left eye center
pt_right_eye = np.mean(pt101[[51, 54, 57, 60]], axis=0) # right eye center
if use_lip:
# use lip
pt_center_eye = (pt_left_eye + pt_right_eye) / 2
pt_center_lip = (pt101[75] + pt101[81]) / 2
pt2 = np.stack([pt_center_eye, pt_center_lip], axis=0)
else:
pt2 = np.stack([pt_left_eye, pt_right_eye], axis=0)
return pt2
def parse_pt2_from_pt106(pt106, use_lip=True):
"""
parsing the 2 points according to the 106 points, which cancels the roll
"""
pt_left_eye = np.mean(pt106[[33, 35, 40, 39]], axis=0) # left eye center
pt_right_eye = np.mean(pt106[[87, 89, 94, 93]], axis=0) # right eye center
if use_lip:
# use lip
pt_center_eye = (pt_left_eye + pt_right_eye) / 2
pt_center_lip = (pt106[52] + pt106[61]) / 2
pt2 = np.stack([pt_center_eye, pt_center_lip], axis=0)
else:
pt2 = np.stack([pt_left_eye, pt_right_eye], axis=0)
return pt2
def parse_pt2_from_pt203(pt203, use_lip=True):
"""
parsing the 2 points according to the 203 points, which cancels the roll
"""
pt_left_eye = np.mean(pt203[[0, 6, 12, 18]], axis=0) # left eye center
pt_right_eye = np.mean(pt203[[24, 30, 36, 42]], axis=0) # right eye center
if use_lip:
# use lip
pt_center_eye = (pt_left_eye + pt_right_eye) / 2
pt_center_lip = (pt203[48] + pt203[66]) / 2
pt2 = np.stack([pt_center_eye, pt_center_lip], axis=0)
else:
pt2 = np.stack([pt_left_eye, pt_right_eye], axis=0)
return pt2
def parse_pt2_from_pt68(pt68, use_lip=True):
"""
parsing the 2 points according to the 68 points, which cancels the roll
"""
lm_idx = np.array([31, 37, 40, 43, 46, 49, 55], dtype=np.int32) - 1
if use_lip:
pt5 = np.stack([
np.mean(pt68[lm_idx[[1, 2]], :], 0), # left eye
np.mean(pt68[lm_idx[[3, 4]], :], 0), # right eye
pt68[lm_idx[0], :], # nose
pt68[lm_idx[5], :], # lip
pt68[lm_idx[6], :] # lip
], axis=0)
pt2 = np.stack([
(pt5[0] + pt5[1]) / 2,
(pt5[3] + pt5[4]) / 2
], axis=0)
else:
pt2 = np.stack([
np.mean(pt68[lm_idx[[1, 2]], :], 0), # left eye
np.mean(pt68[lm_idx[[3, 4]], :], 0), # right eye
], axis=0)
return pt2
def parse_pt2_from_pt5(pt5, use_lip=True):
"""
parsing the 2 points according to the 5 points, which cancels the roll
"""
if use_lip:
pt2 = np.stack([
(pt5[0] + pt5[1]) / 2,
(pt5[3] + pt5[4]) / 2
], axis=0)
else:
pt2 = np.stack([
pt5[0],
pt5[1]
], axis=0)
return pt2
def parse_pt2_from_pt_x(pts, use_lip=True):
if pts.shape[0] == 101:
pt2 = parse_pt2_from_pt101(pts, use_lip=use_lip)
elif pts.shape[0] == 106:
pt2 = parse_pt2_from_pt106(pts, use_lip=use_lip)
elif pts.shape[0] == 68:
pt2 = parse_pt2_from_pt68(pts, use_lip=use_lip)
elif pts.shape[0] == 5:
pt2 = parse_pt2_from_pt5(pts, use_lip=use_lip)
elif pts.shape[0] == 203:
pt2 = parse_pt2_from_pt203(pts, use_lip=use_lip)
elif pts.shape[0] > 101:
# take the first 101 points
pt2 = parse_pt2_from_pt101(pts[:101], use_lip=use_lip)
else:
raise Exception(f'Unknown shape: {pts.shape}')
if not use_lip:
# NOTE: to stay compatible with the later code, pt2 needs to be rotated 90 degrees clockwise manually
v = pt2[1] - pt2[0]
pt2[1, 0] = pt2[0, 0] - v[1]
pt2[1, 1] = pt2[0, 1] + v[0]
return pt2
def parse_rect_from_landmark(
pts,
scale=1.5,
need_square=True,
vx_ratio=0,
vy_ratio=0,
use_deg_flag=False,
**kwargs
):
"""parsing center, size, angle from 101/68/5/x landmarks
vx_ratio: the offset ratio along the pupil axis x-axis, multiplied by size
vy_ratio: the offset ratio along the pupil axis y-axis, multiplied by size, which is used to contain more forehead area
judge with pts.shape
"""
pt2 = parse_pt2_from_pt_x(pts, use_lip=kwargs.get('use_lip', True))
uy = pt2[1] - pt2[0]
l = np.linalg.norm(uy)
if l <= 1e-3:
uy = np.array([0, 1], dtype=DTYPE)
else:
uy /= l
ux = np.array((uy[1], -uy[0]), dtype=DTYPE)
# the rotation degree of the x-axis, the clockwise is positive, the counterclockwise is negative (image coordinate system)
# print(uy)
# print(ux)
angle = acos(ux[0])
if ux[1] < 0:
angle = -angle
# rotation matrix
M = np.array([ux, uy])
# calculate the size which contains the angle degree of the bbox, and the center
center0 = np.mean(pts, axis=0)
rpts = (pts - center0) @ M.T # (M @ P.T).T = P @ M.T
lt_pt = np.min(rpts, axis=0)
rb_pt = np.max(rpts, axis=0)
center1 = (lt_pt + rb_pt) / 2
size = rb_pt - lt_pt
if need_square:
m = max(size[0], size[1])
size[0] = m
size[1] = m
size *= scale # scale size
center = center0 + ux * center1[0] + uy * center1[1] # counterclockwise rotation, equivalent to M.T @ center1.T
center = center + ux * (vx_ratio * size) + uy * \
(vy_ratio * size) # considering the offset in vx and vy direction
if use_deg_flag:
angle = degrees(angle)
return center, size, angle
def parse_bbox_from_landmark(pts, **kwargs):
center, size, angle = parse_rect_from_landmark(pts, **kwargs)
cx, cy = center
w, h = size
# calculate the vertex positions before rotation
bbox = np.array([
[cx-w/2, cy-h/2], # left, top
[cx+w/2, cy-h/2],
[cx+w/2, cy+h/2], # right, bottom
[cx-w/2, cy+h/2]
], dtype=DTYPE)
# construct rotation matrix
bbox_rot = bbox.copy()
R = np.array([
[np.cos(angle), -np.sin(angle)],
[np.sin(angle), np.cos(angle)]
], dtype=DTYPE)
# calculate the relative position of each vertex from the rotation center, then rotate these positions, and finally add the coordinates of the rotation center
bbox_rot = (bbox_rot - center) @ R.T + center
return {
'center': center, # 2x1
'size': size, # scalar
'angle': angle, # rad, counterclockwise
'bbox': bbox, # 4x2
'bbox_rot': bbox_rot, # 4x2
}
def crop_image_by_bbox(img, bbox, lmk=None, dsize=512, angle=None, flag_rot=False, **kwargs):
left, top, right, bot = bbox
if int(right - left) != int(bot - top):
print(f'right-left {right-left} != bot-top {bot-top}')
size = right - left
src_center = np.array([(left + right) / 2, (top + bot) / 2], dtype=DTYPE)
tgt_center = np.array([dsize / 2, dsize / 2], dtype=DTYPE)
s = dsize / size # scale
if flag_rot and angle is not None:
costheta, sintheta = cos(angle), sin(angle)
cx, cy = src_center[0], src_center[1] # ori center
tcx, tcy = tgt_center[0], tgt_center[1] # target center
# need to infer
M_o2c = np.array(
[[s * costheta, s * sintheta, tcx - s * (costheta * cx + sintheta * cy)],
[-s * sintheta, s * costheta, tcy - s * (-sintheta * cx + costheta * cy)]],
dtype=DTYPE
)
else:
M_o2c = np.array(
[[s, 0, tgt_center[0] - s * src_center[0]],
[0, s, tgt_center[1] - s * src_center[1]]],
dtype=DTYPE
)
if flag_rot and angle is None:
print('angle is None, but flag_rotate is True', style="bold yellow")
img_crop = _transform_img(img, M_o2c, dsize=dsize, borderMode=kwargs.get('borderMode', None))
lmk_crop = _transform_pts(lmk, M_o2c) if lmk is not None else None
M_o2c = np.vstack([M_o2c, np.array([0, 0, 1], dtype=DTYPE)])
M_c2o = np.linalg.inv(M_o2c)
# cv2.imwrite('crop.jpg', img_crop)
return {
'img_crop': img_crop,
'lmk_crop': lmk_crop,
'M_o2c': M_o2c,
'M_c2o': M_c2o,
}
def _estimate_similar_transform_from_pts(
pts,
dsize,
scale=1.5,
vx_ratio=0,
vy_ratio=-0.1,
flag_do_rot=True,
**kwargs
):
""" calculate the affine matrix of the cropped image from sparse points, the original image to the cropped image, the inverse is the cropped image to the original image
pts: landmark, 101 or 68 points or other points, Nx2
scale: the larger scale factor, the smaller face ratio
vx_ratio: x shift
vy_ratio: y shift, the smaller the y shift, the lower the face region
flag_do_rot: if true, conduct rotation correction
"""
center, size, angle = parse_rect_from_landmark(
pts, scale=scale, vx_ratio=vx_ratio, vy_ratio=vy_ratio,
use_lip=kwargs.get('use_lip', True)
)
s = dsize / size[0] # scale
tgt_center = np.array([dsize / 2, dsize / 2], dtype=DTYPE) # center of dsize
if flag_do_rot:
costheta, sintheta = cos(angle), sin(angle)
cx, cy = center[0], center[1] # ori center
tcx, tcy = tgt_center[0], tgt_center[1] # target center
# need to infer
M_INV = np.array(
[[s * costheta, s * sintheta, tcx - s * (costheta * cx + sintheta * cy)],
[-s * sintheta, s * costheta, tcy - s * (-sintheta * cx + costheta * cy)]],
dtype=DTYPE
)
else:
M_INV = np.array(
[[s, 0, tgt_center[0] - s * center[0]],
[0, s, tgt_center[1] - s * center[1]]],
dtype=DTYPE
)
M_INV_H = np.vstack([M_INV, np.array([0, 0, 1])])
M = np.linalg.inv(M_INV_H)
# M_INV is from the original image to the cropped image, M is from the cropped image to the original image
return M_INV, M[:2, ...]
def crop_image(img, pts: np.ndarray, **kwargs):
dsize = kwargs.get('dsize', 224)
scale = kwargs.get('scale', 1.5) # 1.5 | 1.6
vy_ratio = kwargs.get('vy_ratio', -0.1) # -0.0625 | -0.1
M_INV, _ = _estimate_similar_transform_from_pts(
pts,
dsize=dsize,
scale=scale,
vy_ratio=vy_ratio,
flag_do_rot=kwargs.get('flag_do_rot', True),
)
if img is None:
M_INV_H = np.vstack([M_INV, np.array([0, 0, 1], dtype=DTYPE)])
M = np.linalg.inv(M_INV_H)
ret_dct = {
'M': M[:2, ...], # from the cropped image to the original image (inverse of M_INV)
'M_o2c': M[:2, ...], # NOTE: despite the key name, this also stores the cropped-to-original transform here
'img_crop': None,
'pt_crop': None,
}
return ret_dct
img_crop = _transform_img(img, M_INV, dsize) # origin to crop
pt_crop = _transform_pts(pts, M_INV)
M_o2c = np.vstack([M_INV, np.array([0, 0, 1], dtype=DTYPE)])
M_c2o = np.linalg.inv(M_o2c)
ret_dct = {
'M_o2c': M_o2c, # from the original image to the cropped image 3x3
'M_c2o': M_c2o, # from the cropped image to the original image 3x3
'img_crop': img_crop, # the cropped image
'pt_crop': pt_crop, # the landmarks of the cropped image
}
return ret_dct
def average_bbox_lst(bbox_lst):
if len(bbox_lst) == 0:
return None
bbox_arr = np.array(bbox_lst)
return np.mean(bbox_arr, axis=0).tolist()
def prepare_paste_back(mask_crop, crop_M_c2o, dsize):
"""prepare mask for later image paste back
"""
if mask_crop is None:
mask_crop = cv2.imread(make_abs_path('./resources/mask_template.png'), cv2.IMREAD_COLOR)
mask_ori = _transform_img(mask_crop, crop_M_c2o, dsize)
mask_ori = mask_ori.astype(np.float32) / 255.
return mask_ori
def paste_back(image_to_processed, crop_M_c2o, rgb_ori, mask_ori):
"""paste back the image
"""
dsize = (rgb_ori.shape[1], rgb_ori.shape[0])
result = _transform_img(image_to_processed, crop_M_c2o, dsize=dsize)
result = np.clip(mask_ori * result + (1 - mask_ori) * rgb_ori, 0, 255).astype(np.uint8)
return result
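A minimal end-to-end sketch of crop_image with a blank image and a hypothetical 5-point landmark set (two eyes, nose, two mouth corners); the point coordinates are made up for illustration.
import numpy as np
from src.utils.crop import crop_image
img = np.zeros((512, 512, 3), dtype=np.uint8)
pts = np.array([[200, 220], [300, 220], [250, 270], [215, 320], [285, 320]], dtype=np.float32)
ret = crop_image(img, pts, dsize=256, scale=2.3, vy_ratio=-0.125)
print(ret['img_crop'].shape)                       # (256, 256, 3)
print(ret['M_o2c'].shape, ret['M_c2o'].shape)      # (3, 3) each; one is the inverse of the other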

145
src/utils/cropper.py Normal file
View File

@ -0,0 +1,145 @@
# coding: utf-8
import gradio as gr
import numpy as np
import os.path as osp
from typing import List, Union, Tuple
from dataclasses import dataclass, field
import cv2; cv2.setNumThreads(0); cv2.ocl.setUseOpenCL(False)
from .landmark_runner import LandmarkRunner
from .face_analysis_diy import FaceAnalysisDIY
from .helper import prefix
from .crop import crop_image, crop_image_by_bbox, parse_bbox_from_landmark, average_bbox_lst
from .timer import Timer
from .rprint import rlog as log
from .io import load_image_rgb
from .video import VideoWriter, get_fps, change_video_fps
def make_abs_path(fn):
return osp.join(osp.dirname(osp.realpath(__file__)), fn)
@dataclass
class Trajectory:
start: int = -1 # start frame, inclusive
end: int = -1 # end frame, inclusive
lmk_lst: Union[Tuple, List, np.ndarray] = field(default_factory=list) # lmk list
bbox_lst: Union[Tuple, List, np.ndarray] = field(default_factory=list) # bbox list
frame_rgb_lst: Union[Tuple, List, np.ndarray] = field(default_factory=list) # frame list
frame_rgb_crop_lst: Union[Tuple, List, np.ndarray] = field(default_factory=list) # frame crop list
class Cropper(object):
def __init__(self, **kwargs) -> None:
device_id = kwargs.get('device_id', 0)
self.landmark_runner = LandmarkRunner(
ckpt_path=make_abs_path('../../pretrained_weights/liveportrait/landmark.onnx'),
onnx_provider='cuda',
device_id=device_id
)
self.landmark_runner.warmup()
self.face_analysis_wrapper = FaceAnalysisDIY(
name='buffalo_l',
root=make_abs_path('../../pretrained_weights/insightface'),
providers=["CUDAExecutionProvider"]
)
self.face_analysis_wrapper.prepare(ctx_id=device_id, det_size=(512, 512))
self.face_analysis_wrapper.warmup()
self.crop_cfg = kwargs.get('crop_cfg', None)
def update_config(self, user_args):
for k, v in user_args.items():
if hasattr(self.crop_cfg, k):
setattr(self.crop_cfg, k, v)
def crop_single_image(self, obj, **kwargs):
direction = kwargs.get('direction', 'large-small')
# crop and align a single image
if isinstance(obj, str):
img_rgb = load_image_rgb(obj)
elif isinstance(obj, np.ndarray):
img_rgb = obj
src_face = self.face_analysis_wrapper.get(
img_rgb,
flag_do_landmark_2d_106=True,
direction=direction
)
if len(src_face) == 0:
log('No face detected in the source image.')
raise gr.Error("No face detected in the source image 💥!", duration=5)
raise Exception("No face detected in the source image!")
elif len(src_face) > 1:
log(f'More than one face detected in the image, only pick one face by rule {direction}.')
src_face = src_face[0]
pts = src_face.landmark_2d_106
# crop the face
ret_dct = crop_image(
img_rgb, # ndarray
pts, # 106x2 or Nx2
dsize=kwargs.get('dsize', 512),
scale=kwargs.get('scale', 2.3),
vy_ratio=kwargs.get('vy_ratio', -0.15),
)
# update a 256x256 version for network input or else
ret_dct['img_crop_256x256'] = cv2.resize(ret_dct['img_crop'], (256, 256), interpolation=cv2.INTER_AREA)
ret_dct['pt_crop_256x256'] = ret_dct['pt_crop'] * 256 / kwargs.get('dsize', 512)
recon_ret = self.landmark_runner.run(img_rgb, pts)
lmk = recon_ret['pts']
ret_dct['lmk_crop'] = lmk
return ret_dct
def get_retargeting_lmk_info(self, driving_rgb_lst):
# TODO: implement a tracking-based version
driving_lmk_lst = []
for driving_image in driving_rgb_lst:
ret_dct = self.crop_single_image(driving_image)
driving_lmk_lst.append(ret_dct['lmk_crop'])
return driving_lmk_lst
def make_video_clip(self, driving_rgb_lst, output_path, output_fps=30, **kwargs):
trajectory = Trajectory()
direction = kwargs.get('direction', 'large-small')
for idx, driving_image in enumerate(driving_rgb_lst):
if idx == 0 or trajectory.start == -1:
src_face = self.face_analysis_wrapper.get(
driving_image,
flag_do_landmark_2d_106=True,
direction=direction
)
if len(src_face) == 0:
# No face detected in the driving_image
continue
elif len(src_face) > 1:
log(f'More than one face detected in the driving frame_{idx}, only pick one face by rule {direction}.')
src_face = src_face[0]
pts = src_face.landmark_2d_106
lmk_203 = self.landmark_runner.run(driving_image, pts)['pts']
trajectory.start, trajectory.end = idx, idx
else:
lmk_203 = self.landmark_runner.run(driving_image, trajectory.lmk_lst[-1])['pts']
trajectory.end = idx
trajectory.lmk_lst.append(lmk_203)
ret_bbox = parse_bbox_from_landmark(lmk_203, scale=self.crop_cfg.globalscale, vy_ratio=self.crop_cfg.vy_ratio)['bbox']
bbox = [ret_bbox[0, 0], ret_bbox[0, 1], ret_bbox[2, 0], ret_bbox[2, 1]] # 4,
trajectory.bbox_lst.append(bbox) # bbox
trajectory.frame_rgb_lst.append(driving_image)
global_bbox = average_bbox_lst(trajectory.bbox_lst)
for idx, (frame_rgb, lmk) in enumerate(zip(trajectory.frame_rgb_lst, trajectory.lmk_lst)):
ret_dct = crop_image_by_bbox(
frame_rgb, global_bbox, lmk=lmk,
dsize=self.crop_cfg.dsize, flag_rot=self.crop_cfg.flag_rot, borderValue=self.crop_cfg.borderValue
)
frame_rgb_crop = ret_dct['img_crop']

View File

@ -0,0 +1,20 @@
# coding: utf-8
# pylint: disable=wrong-import-position
"""InsightFace: A Face Analysis Toolkit."""
from __future__ import absolute_import
try:
#import mxnet as mx
import onnxruntime
except ImportError:
raise ImportError(
"Unable to import dependency onnxruntime. "
)
__version__ = '0.7.3'
from . import model_zoo
from . import utils
from . import app
from . import data

View File

@ -0,0 +1 @@
from .face_analysis import *

View File

@ -0,0 +1,49 @@
import numpy as np
from numpy.linalg import norm as l2norm
#from easydict import EasyDict
class Face(dict):
def __init__(self, d=None, **kwargs):
if d is None:
d = {}
if kwargs:
d.update(**kwargs)
for k, v in d.items():
setattr(self, k, v)
# Class attributes
#for k in self.__class__.__dict__.keys():
# if not (k.startswith('__') and k.endswith('__')) and not k in ('update', 'pop'):
# setattr(self, k, getattr(self, k))
def __setattr__(self, name, value):
if isinstance(value, (list, tuple)):
value = [self.__class__(x)
if isinstance(x, dict) else x for x in value]
elif isinstance(value, dict) and not isinstance(value, self.__class__):
value = self.__class__(value)
super(Face, self).__setattr__(name, value)
super(Face, self).__setitem__(name, value)
__setitem__ = __setattr__
def __getattr__(self, name):
return None
@property
def embedding_norm(self):
if self.embedding is None:
return None
return l2norm(self.embedding)
@property
def normed_embedding(self):
if self.embedding is None:
return None
return self.embedding / self.embedding_norm
@property
def sex(self):
if self.gender is None:
return None
return 'M' if self.gender==1 else 'F'
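Face above is a dict with attribute-style access; any attribute that was never set resolves to None via __getattr__, which is what the derived properties rely on. A minimal usage sketch (the import path follows the upstream insightface package and may differ for this vendored copy; the values are made up):

import numpy as np
from insightface.app.common import Face  # adjust to the vendored module path if needed

face = Face(bbox=np.array([10, 20, 110, 120], dtype=np.float32), det_score=0.98)
print(face.bbox, face['det_score'])  # attribute and key access are interchangeable
print(face.embedding)                # unset attribute -> None
print(face.embedding_norm)           # derived property also returns None
face.gender = 1
print(face.sex)                      # 'M'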

View File

@ -0,0 +1,110 @@
# -*- coding: utf-8 -*-
# @Organization : insightface.ai
# @Author : Jia Guo
# @Time : 2021-05-04
# @Function :
from __future__ import division
import glob
import os.path as osp
import numpy as np
import onnxruntime
from numpy.linalg import norm
from ..model_zoo import model_zoo
from ..utils import ensure_available
from .common import Face
DEFAULT_MP_NAME = 'buffalo_l'
__all__ = ['FaceAnalysis']
class FaceAnalysis:
def __init__(self, name=DEFAULT_MP_NAME, root='~/.insightface', allowed_modules=None, **kwargs):
onnxruntime.set_default_logger_severity(3)
self.models = {}
self.model_dir = ensure_available('models', name, root=root)
onnx_files = glob.glob(osp.join(self.model_dir, '*.onnx'))
onnx_files = sorted(onnx_files)
for onnx_file in onnx_files:
model = model_zoo.get_model(onnx_file, **kwargs)
if model is None:
print('model not recognized:', onnx_file)
elif allowed_modules is not None and model.taskname not in allowed_modules:
print('model ignore:', onnx_file, model.taskname)
del model
elif model.taskname not in self.models and (allowed_modules is None or model.taskname in allowed_modules):
# print('find model:', onnx_file, model.taskname, model.input_shape, model.input_mean, model.input_std)
self.models[model.taskname] = model
else:
print('duplicated model task type, ignore:', onnx_file, model.taskname)
del model
assert 'detection' in self.models
self.det_model = self.models['detection']
def prepare(self, ctx_id, det_thresh=0.5, det_size=(640, 640)):
self.det_thresh = det_thresh
assert det_size is not None
# print('set det-size:', det_size)
self.det_size = det_size
for taskname, model in self.models.items():
if taskname=='detection':
model.prepare(ctx_id, input_size=det_size, det_thresh=det_thresh)
else:
model.prepare(ctx_id)
def get(self, img, max_num=0):
bboxes, kpss = self.det_model.detect(img,
max_num=max_num,
metric='default')
if bboxes.shape[0] == 0:
return []
ret = []
for i in range(bboxes.shape[0]):
bbox = bboxes[i, 0:4]
det_score = bboxes[i, 4]
kps = None
if kpss is not None:
kps = kpss[i]
face = Face(bbox=bbox, kps=kps, det_score=det_score)
for taskname, model in self.models.items():
if taskname=='detection':
continue
model.get(img, face)
ret.append(face)
return ret
def draw_on(self, img, faces):
import cv2
dimg = img.copy()
for i in range(len(faces)):
face = faces[i]
box = face.bbox.astype(int)
color = (0, 0, 255)
cv2.rectangle(dimg, (box[0], box[1]), (box[2], box[3]), color, 2)
if face.kps is not None:
kps = face.kps.astype(int)
#print(landmark.shape)
for l in range(kps.shape[0]):
color = (0, 0, 255)
if l == 0 or l == 3:
color = (0, 255, 0)
cv2.circle(dimg, (kps[l][0], kps[l][1]), 1, color,
2)
if face.gender is not None and face.age is not None:
cv2.putText(dimg,'%s,%d'%(face.sex,face.age), (box[0]-1, box[1]-4),cv2.FONT_HERSHEY_COMPLEX,0.7,(0,255,0),1)
#for key, value in face.items():
# if key.startswith('landmark_3d'):
# print(key, value.shape)
# print(value[0:10,:])
# lmk = np.round(value).astype(np.int)
# for l in range(lmk.shape[0]):
# color = (255, 0, 0)
# cv2.circle(dimg, (lmk[l][0], lmk[l][1]), 1, color,
# 2)
return dimg
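Typical use of FaceAnalysis as defined above: construct it, call prepare() once, then get() per image. A minimal sketch; it assumes the 'buffalo_l' model pack can be fetched to ~/.insightface, that 'face.jpg' exists, and it uses the upstream import path rather than the vendored one:

import cv2
from insightface.app import FaceAnalysis  # adjust to the vendored module path if needed

app = FaceAnalysis(name='buffalo_l')
app.prepare(ctx_id=-1, det_size=(640, 640))  # ctx_id < 0 forces CPUExecutionProvider
img = cv2.imread('face.jpg')                 # hypothetical input image
faces = app.get(img)
print('faces found:', len(faces))
cv2.imwrite('face_out.jpg', app.draw_on(img, faces))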

View File

@ -0,0 +1,2 @@
from .image import get_image
from .pickle_object import get_object

View File

@ -0,0 +1,27 @@
import cv2
import os
import os.path as osp
from pathlib import Path
class ImageCache:
data = {}
def get_image(name, to_rgb=False):
key = (name, to_rgb)
if key in ImageCache.data:
return ImageCache.data[key]
images_dir = osp.join(Path(__file__).parent.absolute(), 'images')
ext_names = ['.jpg', '.png', '.jpeg']
image_file = None
for ext_name in ext_names:
_image_file = osp.join(images_dir, "%s%s"%(name, ext_name))
if osp.exists(_image_file):
image_file = _image_file
break
assert image_file is not None, '%s not found'%name
img = cv2.imread(image_file)
if to_rgb:
img = img[:,:,::-1]
ImageCache.data[key] = img
return img

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 21 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 44 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 77 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 126 KiB

View File

@ -0,0 +1,17 @@
import cv2
import os
import os.path as osp
from pathlib import Path
import pickle
def get_object(name):
objects_dir = osp.join(Path(__file__).parent.absolute(), 'objects')
if not name.endswith('.pkl'):
name = name+".pkl"
filepath = osp.join(objects_dir, name)
if not osp.exists(filepath):
return None
with open(filepath, 'rb') as f:
obj = pickle.load(f)
return obj

View File

@ -0,0 +1,71 @@
import pickle
import numpy as np
import os
import os.path as osp
import sys
import mxnet as mx
class RecBuilder():
def __init__(self, path, image_size=(112, 112)):
self.path = path
self.image_size = image_size
self.widx = 0
self.wlabel = 0
self.max_label = -1
assert not osp.exists(path), '%s exists' % path
os.makedirs(path)
self.writer = mx.recordio.MXIndexedRecordIO(os.path.join(path, 'train.idx'),
os.path.join(path, 'train.rec'),
'w')
self.meta = []
def add(self, imgs):
#!!! img should be BGR!!!!
#assert label >= 0
#assert label > self.last_label
assert len(imgs) > 0
label = self.wlabel
for img in imgs:
idx = self.widx
image_meta = {'image_index': idx, 'image_classes': [label]}
header = mx.recordio.IRHeader(0, label, idx, 0)
if isinstance(img, np.ndarray):
s = mx.recordio.pack_img(header,img,quality=95,img_fmt='.jpg')
else:
s = mx.recordio.pack(header, img)
self.writer.write_idx(idx, s)
self.meta.append(image_meta)
self.widx += 1
self.max_label = label
self.wlabel += 1
def add_image(self, img, label):
#!!! img should be BGR!!!!
#assert label >= 0
#assert label > self.last_label
idx = self.widx
header = mx.recordio.IRHeader(0, label, idx, 0)
if isinstance(label, list):
idlabel = label[0]
else:
idlabel = label
image_meta = {'image_index': idx, 'image_classes': [idlabel]}
if isinstance(img, np.ndarray):
s = mx.recordio.pack_img(header,img,quality=95,img_fmt='.jpg')
else:
s = mx.recordio.pack(header, img)
self.writer.write_idx(idx, s)
self.meta.append(image_meta)
self.widx += 1
self.max_label = max(self.max_label, idlabel)
def close(self):
with open(osp.join(self.path, 'train.meta'), 'wb') as pfile:
pickle.dump(self.meta, pfile, protocol=pickle.HIGHEST_PROTOCOL)
print('stat:', self.widx, self.wlabel)
with open(os.path.join(self.path, 'property'), 'w') as f:
f.write("%d,%d,%d\n" % (self.max_label+1, self.image_size[0], self.image_size[1]))
f.write("%d\n" % (self.widx))

View File

@ -0,0 +1,6 @@
from .model_zoo import get_model
from .arcface_onnx import ArcFaceONNX
from .retinaface import RetinaFace
from .scrfd import SCRFD
from .landmark import Landmark
from .attribute import Attribute

View File

@ -0,0 +1,92 @@
# -*- coding: utf-8 -*-
# @Organization : insightface.ai
# @Author : Jia Guo
# @Time : 2021-05-04
# @Function :
from __future__ import division
import numpy as np
import cv2
import onnx
import onnxruntime
from ..utils import face_align
__all__ = [
'ArcFaceONNX',
]
class ArcFaceONNX:
def __init__(self, model_file=None, session=None):
assert model_file is not None
self.model_file = model_file
self.session = session
self.taskname = 'recognition'
find_sub = False
find_mul = False
model = onnx.load(self.model_file)
graph = model.graph
for nid, node in enumerate(graph.node[:8]):
#print(nid, node.name)
if node.name.startswith('Sub') or node.name.startswith('_minus'):
find_sub = True
if node.name.startswith('Mul') or node.name.startswith('_mul'):
find_mul = True
if find_sub and find_mul:
#mxnet arcface model
input_mean = 0.0
input_std = 1.0
else:
input_mean = 127.5
input_std = 127.5
self.input_mean = input_mean
self.input_std = input_std
#print('input mean and std:', self.input_mean, self.input_std)
if self.session is None:
self.session = onnxruntime.InferenceSession(self.model_file, None)
input_cfg = self.session.get_inputs()[0]
input_shape = input_cfg.shape
input_name = input_cfg.name
self.input_size = tuple(input_shape[2:4][::-1])
self.input_shape = input_shape
outputs = self.session.get_outputs()
output_names = []
for out in outputs:
output_names.append(out.name)
self.input_name = input_name
self.output_names = output_names
assert len(self.output_names)==1
self.output_shape = outputs[0].shape
def prepare(self, ctx_id, **kwargs):
if ctx_id<0:
self.session.set_providers(['CPUExecutionProvider'])
def get(self, img, face):
aimg = face_align.norm_crop(img, landmark=face.kps, image_size=self.input_size[0])
face.embedding = self.get_feat(aimg).flatten()
return face.embedding
def compute_sim(self, feat1, feat2):
from numpy.linalg import norm
feat1 = feat1.ravel()
feat2 = feat2.ravel()
sim = np.dot(feat1, feat2) / (norm(feat1) * norm(feat2))
return sim
def get_feat(self, imgs):
if not isinstance(imgs, list):
imgs = [imgs]
input_size = self.input_size
blob = cv2.dnn.blobFromImages(imgs, 1.0 / self.input_std, input_size,
(self.input_mean, self.input_mean, self.input_mean), swapRB=True)
net_out = self.session.run(self.output_names, {self.input_name: blob})[0]
return net_out
def forward(self, batch_data):
blob = (batch_data - self.input_mean) / self.input_std
net_out = self.session.run(self.output_names, {self.input_name: blob})[0]
return net_out
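compute_sim above is plain cosine similarity between two embedding vectors; the same number can be reproduced directly with numpy. A small self-contained sketch with random vectors standing in for real 512-d embeddings:

import numpy as np

rng = np.random.default_rng(0)
feat1 = rng.normal(size=512).astype(np.float32)
feat2 = rng.normal(size=512).astype(np.float32)
cos_sim = float(np.dot(feat1, feat2) / (np.linalg.norm(feat1) * np.linalg.norm(feat2)))
print(round(cos_sim, 4))  # close to 1.0 -> same identity, near 0 -> unrelated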

View File

@ -0,0 +1,94 @@
# -*- coding: utf-8 -*-
# @Organization : insightface.ai
# @Author : Jia Guo
# @Time : 2021-06-19
# @Function :
from __future__ import division
import numpy as np
import cv2
import onnx
import onnxruntime
from ..utils import face_align
__all__ = [
'Attribute',
]
class Attribute:
def __init__(self, model_file=None, session=None):
assert model_file is not None
self.model_file = model_file
self.session = session
find_sub = False
find_mul = False
model = onnx.load(self.model_file)
graph = model.graph
for nid, node in enumerate(graph.node[:8]):
#print(nid, node.name)
if node.name.startswith('Sub') or node.name.startswith('_minus'):
find_sub = True
if node.name.startswith('Mul') or node.name.startswith('_mul'):
find_mul = True
if nid<3 and node.name=='bn_data':
find_sub = True
find_mul = True
if find_sub and find_mul:
#mxnet arcface model
input_mean = 0.0
input_std = 1.0
else:
input_mean = 127.5
input_std = 128.0
self.input_mean = input_mean
self.input_std = input_std
#print('input mean and std:', model_file, self.input_mean, self.input_std)
if self.session is None:
self.session = onnxruntime.InferenceSession(self.model_file, None)
input_cfg = self.session.get_inputs()[0]
input_shape = input_cfg.shape
input_name = input_cfg.name
self.input_size = tuple(input_shape[2:4][::-1])
self.input_shape = input_shape
outputs = self.session.get_outputs()
output_names = []
for out in outputs:
output_names.append(out.name)
self.input_name = input_name
self.output_names = output_names
assert len(self.output_names)==1
output_shape = outputs[0].shape
#print('init output_shape:', output_shape)
if output_shape[1]==3:
self.taskname = 'genderage'
else:
self.taskname = 'attribute_%d'%output_shape[1]
def prepare(self, ctx_id, **kwargs):
if ctx_id<0:
self.session.set_providers(['CPUExecutionProvider'])
def get(self, img, face):
bbox = face.bbox
w, h = (bbox[2] - bbox[0]), (bbox[3] - bbox[1])
center = (bbox[2] + bbox[0]) / 2, (bbox[3] + bbox[1]) / 2
rotate = 0
_scale = self.input_size[0] / (max(w, h)*1.5)
#print('param:', img.shape, bbox, center, self.input_size, _scale, rotate)
aimg, M = face_align.transform(img, center, self.input_size[0], _scale, rotate)
input_size = tuple(aimg.shape[0:2][::-1])
#assert input_size==self.input_size
blob = cv2.dnn.blobFromImage(aimg, 1.0/self.input_std, input_size, (self.input_mean, self.input_mean, self.input_mean), swapRB=True)
pred = self.session.run(self.output_names, {self.input_name : blob})[0][0]
if self.taskname=='genderage':
assert len(pred)==3
gender = np.argmax(pred[:2])
age = int(np.round(pred[2]*100))
face['gender'] = gender
face['age'] = age
return gender, age
else:
return pred
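For the 'genderage' head the raw output is three numbers: two gender scores and a normalised age, and get() above converts them into face['gender'] and face['age']. The same post-processing in isolation (the pred values are invented for illustration):

import numpy as np

pred = np.array([0.1, 2.3, 0.27], dtype=np.float32)  # hypothetical model output
gender = int(np.argmax(pred[:2]))                     # 1 -> male, 0 -> female
age = int(np.round(pred[2] * 100))                    # age is predicted on a 0..1 scale
print(gender, age)                                    # 1 27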

View File

@ -0,0 +1,114 @@
import time
import numpy as np
import onnxruntime
import cv2
import onnx
from onnx import numpy_helper
from ..utils import face_align
class INSwapper():
def __init__(self, model_file=None, session=None):
self.model_file = model_file
self.session = session
model = onnx.load(self.model_file)
graph = model.graph
self.emap = numpy_helper.to_array(graph.initializer[-1])
self.input_mean = 0.0
self.input_std = 255.0
#print('input mean and std:', model_file, self.input_mean, self.input_std)
if self.session is None:
self.session = onnxruntime.InferenceSession(self.model_file, None)
inputs = self.session.get_inputs()
self.input_names = []
for inp in inputs:
self.input_names.append(inp.name)
outputs = self.session.get_outputs()
output_names = []
for out in outputs:
output_names.append(out.name)
self.output_names = output_names
assert len(self.output_names)==1
output_shape = outputs[0].shape
input_cfg = inputs[0]
input_shape = input_cfg.shape
self.input_shape = input_shape
# print('inswapper-shape:', self.input_shape)
self.input_size = tuple(input_shape[2:4][::-1])
def forward(self, img, latent):
img = (img - self.input_mean) / self.input_std
pred = self.session.run(self.output_names, {self.input_names[0]: img, self.input_names[1]: latent})[0]
return pred
def get(self, img, target_face, source_face, paste_back=True):
face_mask = np.zeros((img.shape[0], img.shape[1]), np.uint8)
cv2.fillPoly(face_mask, np.array([target_face.landmark_2d_106[[1,9,10,11,12,13,14,15,16,2,3,4,5,6,7,8,0,24,23,22,21,20,19,18,32,31,30,29,28,27,26,25,17,101,105,104,103,51,49,48,43]].astype('int64')]), 1)
aimg, M = face_align.norm_crop2(img, target_face.kps, self.input_size[0])
blob = cv2.dnn.blobFromImage(aimg, 1.0 / self.input_std, self.input_size,
(self.input_mean, self.input_mean, self.input_mean), swapRB=True)
latent = source_face.normed_embedding.reshape((1,-1))
latent = np.dot(latent, self.emap)
latent /= np.linalg.norm(latent)
pred = self.session.run(self.output_names, {self.input_names[0]: blob, self.input_names[1]: latent})[0]
#print(latent.shape, latent.dtype, pred.shape)
img_fake = pred.transpose((0,2,3,1))[0]
bgr_fake = np.clip(255 * img_fake, 0, 255).astype(np.uint8)[:,:,::-1]
if not paste_back:
return bgr_fake, M
else:
target_img = img
fake_diff = bgr_fake.astype(np.float32) - aimg.astype(np.float32)
fake_diff = np.abs(fake_diff).mean(axis=2)
fake_diff[:2,:] = 0
fake_diff[-2:,:] = 0
fake_diff[:,:2] = 0
fake_diff[:,-2:] = 0
IM = cv2.invertAffineTransform(M)
img_white = np.full((aimg.shape[0],aimg.shape[1]), 255, dtype=np.float32)
bgr_fake = cv2.warpAffine(bgr_fake, IM, (target_img.shape[1], target_img.shape[0]), borderValue=0.0)
img_white = cv2.warpAffine(img_white, IM, (target_img.shape[1], target_img.shape[0]), borderValue=0.0)
fake_diff = cv2.warpAffine(fake_diff, IM, (target_img.shape[1], target_img.shape[0]), borderValue=0.0)
img_white[img_white>20] = 255
fthresh = 10
fake_diff[fake_diff<fthresh] = 0
fake_diff[fake_diff>=fthresh] = 255
img_mask = img_white
mask_h_inds, mask_w_inds = np.where(img_mask==255)
mask_h = np.max(mask_h_inds) - np.min(mask_h_inds)
mask_w = np.max(mask_w_inds) - np.min(mask_w_inds)
mask_size = int(np.sqrt(mask_h*mask_w))
k = max(mask_size//10, 10)
#k = max(mask_size//20, 6)
#k = 6
kernel = np.ones((k,k),np.uint8)
img_mask = cv2.erode(img_mask,kernel,iterations = 1)
kernel = np.ones((2,2),np.uint8)
fake_diff = cv2.dilate(fake_diff,kernel,iterations = 1)
face_mask = cv2.erode(face_mask,np.ones((11,11),np.uint8),iterations = 1)
fake_diff[face_mask==1] = 255
k = max(mask_size//20, 5)
#k = 3
#k = 3
kernel_size = (k, k)
blur_size = tuple(2*i+1 for i in kernel_size)
img_mask = cv2.GaussianBlur(img_mask, blur_size, 0)
k = 5
kernel_size = (k, k)
blur_size = tuple(2*i+1 for i in kernel_size)
fake_diff = cv2.blur(fake_diff, (11,11), 0)
##fake_diff = cv2.GaussianBlur(fake_diff, blur_size, 0)
# print('blur_size: ', blur_size)
# fake_diff = cv2.blur(fake_diff, (21, 21), 0) # blur_size
img_mask /= 255
fake_diff /= 255
# img_mask = fake_diff
img_mask = img_mask*fake_diff
img_mask = np.reshape(img_mask, [img_mask.shape[0],img_mask.shape[1],1])
fake_merged = img_mask * bgr_fake + (1-img_mask) * target_img.astype(np.float32)
fake_merged = fake_merged.astype(np.uint8)
return fake_merged
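The paste-back branch above reduces to a soft alpha blend: a blurred 0..1 mask weights the generated face against the original frame. The core composite, pulled out as a sketch with synthetic arrays:

import numpy as np

h, w = 4, 4
bgr_fake = np.full((h, w, 3), 200, dtype=np.float32)   # stand-in for the swapped face
target_img = np.full((h, w, 3), 50, dtype=np.float32)  # stand-in for the original frame
img_mask = np.full((h, w, 1), 0.75, dtype=np.float32)  # blurred mask in [0, 1]
merged = (img_mask * bgr_fake + (1 - img_mask) * target_img).astype(np.uint8)
print(merged[0, 0])  # [162 162 162] -> 75% generated face, 25% original frame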

View File

@ -0,0 +1,114 @@
# -*- coding: utf-8 -*-
# @Organization : insightface.ai
# @Author : Jia Guo
# @Time : 2021-05-04
# @Function :
from __future__ import division
import numpy as np
import cv2
import onnx
import onnxruntime
from ..utils import face_align
from ..utils import transform
from ..data import get_object
__all__ = [
'Landmark',
]
class Landmark:
def __init__(self, model_file=None, session=None):
assert model_file is not None
self.model_file = model_file
self.session = session
find_sub = False
find_mul = False
model = onnx.load(self.model_file)
graph = model.graph
for nid, node in enumerate(graph.node[:8]):
#print(nid, node.name)
if node.name.startswith('Sub') or node.name.startswith('_minus'):
find_sub = True
if node.name.startswith('Mul') or node.name.startswith('_mul'):
find_mul = True
if nid<3 and node.name=='bn_data':
find_sub = True
find_mul = True
if find_sub and find_mul:
#mxnet arcface model
input_mean = 0.0
input_std = 1.0
else:
input_mean = 127.5
input_std = 128.0
self.input_mean = input_mean
self.input_std = input_std
#print('input mean and std:', model_file, self.input_mean, self.input_std)
if self.session is None:
self.session = onnxruntime.InferenceSession(self.model_file, None)
input_cfg = self.session.get_inputs()[0]
input_shape = input_cfg.shape
input_name = input_cfg.name
self.input_size = tuple(input_shape[2:4][::-1])
self.input_shape = input_shape
outputs = self.session.get_outputs()
output_names = []
for out in outputs:
output_names.append(out.name)
self.input_name = input_name
self.output_names = output_names
assert len(self.output_names)==1
output_shape = outputs[0].shape
self.require_pose = False
#print('init output_shape:', output_shape)
if output_shape[1]==3309:
self.lmk_dim = 3
self.lmk_num = 68
self.mean_lmk = get_object('meanshape_68.pkl')
self.require_pose = True
else:
self.lmk_dim = 2
self.lmk_num = output_shape[1]//self.lmk_dim
self.taskname = 'landmark_%dd_%d'%(self.lmk_dim, self.lmk_num)
def prepare(self, ctx_id, **kwargs):
if ctx_id<0:
self.session.set_providers(['CPUExecutionProvider'])
def get(self, img, face):
bbox = face.bbox
w, h = (bbox[2] - bbox[0]), (bbox[3] - bbox[1])
center = (bbox[2] + bbox[0]) / 2, (bbox[3] + bbox[1]) / 2
rotate = 0
_scale = self.input_size[0] / (max(w, h)*1.5)
#print('param:', img.shape, bbox, center, self.input_size, _scale, rotate)
aimg, M = face_align.transform(img, center, self.input_size[0], _scale, rotate)
input_size = tuple(aimg.shape[0:2][::-1])
#assert input_size==self.input_size
blob = cv2.dnn.blobFromImage(aimg, 1.0/self.input_std, input_size, (self.input_mean, self.input_mean, self.input_mean), swapRB=True)
pred = self.session.run(self.output_names, {self.input_name : blob})[0][0]
if pred.shape[0] >= 3000:
pred = pred.reshape((-1, 3))
else:
pred = pred.reshape((-1, 2))
if self.lmk_num < pred.shape[0]:
pred = pred[self.lmk_num*-1:,:]
pred[:, 0:2] += 1
pred[:, 0:2] *= (self.input_size[0] // 2)
if pred.shape[1] == 3:
pred[:, 2] *= (self.input_size[0] // 2)
IM = cv2.invertAffineTransform(M)
pred = face_align.trans_points(pred, IM)
face[self.taskname] = pred
if self.require_pose:
P = transform.estimate_affine_matrix_3d23d(self.mean_lmk, pred)
s, R, t = transform.P2sRt(P)
rx, ry, rz = transform.matrix2angle(R)
pose = np.array( [rx, ry, rz], dtype=np.float32 )
face['pose'] = pose #pitch, yaw, roll
return pred
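The landmark head predicts points in the model-sized crop; get() above maps them back to original-image coordinates with the inverse of the crop transform. The same round trip on a single synthetic point (the matrix M here is invented, not a real crop transform):

import cv2
import numpy as np

M = np.array([[0.5, 0.0, 10.0],
              [0.0, 0.5, 20.0]], dtype=np.float32)  # hypothetical image -> crop transform
IM = cv2.invertAffineTransform(M)
pt_in_crop = np.array([96.0, 96.0, 1.0], dtype=np.float32)  # centre of a 192x192 crop
pt_in_image = IM @ pt_in_crop
print(pt_in_image)  # [172. 152.] -> where that landmark sits in the source image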

View File

@ -0,0 +1,103 @@
"""
This code file mainly comes from https://github.com/dmlc/gluon-cv/blob/master/gluoncv/model_zoo/model_store.py
"""
from __future__ import print_function
__all__ = ['get_model_file']
import os
import zipfile
import glob
from ..utils import download, check_sha1
_model_sha1 = {
name: checksum
for checksum, name in [
('95be21b58e29e9c1237f229dae534bd854009ce0', 'arcface_r100_v1'),
('', 'arcface_mfn_v1'),
('39fd1e087a2a2ed70a154ac01fecaa86c315d01b', 'retinaface_r50_v1'),
('2c9de8116d1f448fd1d4661f90308faae34c990a', 'retinaface_mnet025_v1'),
('0db1d07921d005e6c9a5b38e059452fc5645e5a4', 'retinaface_mnet025_v2'),
('7dd8111652b7aac2490c5dcddeb268e53ac643e6', 'genderage_v1'),
]
}
base_repo_url = 'https://insightface.ai/files/'
_url_format = '{repo_url}models/{file_name}.zip'
def short_hash(name):
if name not in _model_sha1:
raise ValueError(
'Pretrained model for {name} is not available.'.format(name=name))
return _model_sha1[name][:8]
def find_params_file(dir_path):
if not os.path.exists(dir_path):
return None
paths = glob.glob("%s/*.params" % dir_path)
if len(paths) == 0:
return None
paths = sorted(paths)
return paths[-1]
def get_model_file(name, root=os.path.join('~', '.insightface', 'models')):
r"""Return location for the pretrained on local file system.
This function will download from online model zoo when model cannot be found or has mismatch.
The root directory will be created if it doesn't exist.
Parameters
----------
name : str
Name of the model.
root : str, default '~/.insightface/models'
Location for keeping the model parameters.
Returns
-------
file_path
Path to the requested pretrained model file.
"""
file_name = name
root = os.path.expanduser(root)
dir_path = os.path.join(root, name)
file_path = find_params_file(dir_path)
#file_path = os.path.join(root, file_name + '.params')
sha1_hash = _model_sha1[name]
if file_path is not None:
if check_sha1(file_path, sha1_hash):
return file_path
else:
print(
'Mismatch in the content of model file detected. Downloading again.'
)
else:
print('Model file is not found. Downloading.')
if not os.path.exists(root):
os.makedirs(root)
if not os.path.exists(dir_path):
os.makedirs(dir_path)
zip_file_path = os.path.join(root, file_name + '.zip')
repo_url = base_repo_url
if repo_url[-1] != '/':
repo_url = repo_url + '/'
download(_url_format.format(repo_url=repo_url, file_name=file_name),
path=zip_file_path,
overwrite=True)
with zipfile.ZipFile(zip_file_path) as zf:
zf.extractall(dir_path)
os.remove(zip_file_path)
file_path = find_params_file(dir_path)
if check_sha1(file_path, sha1_hash):
return file_path
else:
raise ValueError(
'Downloaded file has different hash. Please try again.')

View File

@ -0,0 +1,97 @@
# -*- coding: utf-8 -*-
# @Organization : insightface.ai
# @Author : Jia Guo
# @Time : 2021-05-04
# @Function :
import os
import os.path as osp
import glob
import onnxruntime
from .arcface_onnx import *
from .retinaface import *
#from .scrfd import *
from .landmark import *
from .attribute import Attribute
from .inswapper import INSwapper
from ..utils import download_onnx
__all__ = ['get_model']
class PickableInferenceSession(onnxruntime.InferenceSession):
# This is a wrapper to make the InferenceSession class picklable.
def __init__(self, model_path, **kwargs):
super().__init__(model_path, **kwargs)
self.model_path = model_path
def __getstate__(self):
return {'model_path': self.model_path}
def __setstate__(self, values):
model_path = values['model_path']
self.__init__(model_path)
class ModelRouter:
def __init__(self, onnx_file):
self.onnx_file = onnx_file
def get_model(self, **kwargs):
session = PickableInferenceSession(self.onnx_file, **kwargs)
# print(f'Applied providers: {session._providers}, with options: {session._provider_options}')
inputs = session.get_inputs()
input_cfg = inputs[0]
input_shape = input_cfg.shape
outputs = session.get_outputs()
if len(outputs)>=5:
return RetinaFace(model_file=self.onnx_file, session=session)
elif input_shape[2]==192 and input_shape[3]==192:
return Landmark(model_file=self.onnx_file, session=session)
elif input_shape[2]==96 and input_shape[3]==96:
return Attribute(model_file=self.onnx_file, session=session)
elif len(inputs)==2 and input_shape[2]==128 and input_shape[3]==128:
return INSwapper(model_file=self.onnx_file, session=session)
elif input_shape[2]==input_shape[3] and input_shape[2]>=112 and input_shape[2]%16==0:
return ArcFaceONNX(model_file=self.onnx_file, session=session)
else:
#raise RuntimeError('error on model routing')
return None
def find_onnx_file(dir_path):
if not os.path.exists(dir_path):
return None
paths = glob.glob("%s/*.onnx" % dir_path)
if len(paths) == 0:
return None
paths = sorted(paths)
return paths[-1]
def get_default_providers():
return ['CUDAExecutionProvider', 'CPUExecutionProvider']
def get_default_provider_options():
return None
def get_model(name, **kwargs):
root = kwargs.get('root', '~/.insightface')
root = os.path.expanduser(root)
model_root = osp.join(root, 'models')
allow_download = kwargs.get('download', False)
download_zip = kwargs.get('download_zip', False)
if not name.endswith('.onnx'):
model_dir = os.path.join(model_root, name)
model_file = find_onnx_file(model_dir)
if model_file is None:
return None
else:
model_file = name
if not osp.exists(model_file) and allow_download:
model_file = download_onnx('models', model_file, root=root, download_zip=download_zip)
assert osp.exists(model_file), 'model_file %s should exist'%model_file
assert osp.isfile(model_file), 'model_file %s should be a file'%model_file
router = ModelRouter(model_file)
providers = kwargs.get('providers', get_default_providers())
provider_options = kwargs.get('provider_options', get_default_provider_options())
model = router.get_model(providers=providers, provider_options=provider_options)
return model
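get_model above routes an ONNX file to the right wrapper class by inspecting its input and output shapes. A usage sketch; 'det_10g.onnx' is a hypothetical local file that must already exist, and the providers list pins CPU execution:

det = get_model('det_10g.onnx', providers=['CPUExecutionProvider'])
if det is not None:
    det.prepare(ctx_id=-1, input_size=(640, 640), det_thresh=0.5)
    print(det.taskname)  # 'detection' for a RetinaFace/SCRFD-style detector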

View File

@ -0,0 +1,301 @@
# -*- coding: utf-8 -*-
# @Organization : insightface.ai
# @Author : Jia Guo
# @Time : 2021-09-18
# @Function :
from __future__ import division
import datetime
import numpy as np
import onnx
import onnxruntime
import os
import os.path as osp
import cv2
import sys
def softmax(z):
assert len(z.shape) == 2
s = np.max(z, axis=1)
s = s[:, np.newaxis] # necessary step to do broadcasting
e_x = np.exp(z - s)
div = np.sum(e_x, axis=1)
div = div[:, np.newaxis]  # ditto
return e_x / div
def distance2bbox(points, distance, max_shape=None):
"""Decode distance prediction to bounding box.
Args:
points (Tensor): Shape (n, 2), [x, y].
distance (Tensor): Distance from the given point to 4
boundaries (left, top, right, bottom).
max_shape (tuple): Shape of the image.
Returns:
Tensor: Decoded bboxes.
"""
x1 = points[:, 0] - distance[:, 0]
y1 = points[:, 1] - distance[:, 1]
x2 = points[:, 0] + distance[:, 2]
y2 = points[:, 1] + distance[:, 3]
if max_shape is not None:
x1 = x1.clamp(min=0, max=max_shape[1])
y1 = y1.clamp(min=0, max=max_shape[0])
x2 = x2.clamp(min=0, max=max_shape[1])
y2 = y2.clamp(min=0, max=max_shape[0])
return np.stack([x1, y1, x2, y2], axis=-1)
def distance2kps(points, distance, max_shape=None):
"""Decode distance prediction to bounding box.
Args:
points (Tensor): Shape (n, 2), [x, y].
distance (Tensor): Distance from the given point to 4
boundaries (left, top, right, bottom).
max_shape (tuple): Shape of the image.
Returns:
Tensor: Decoded bboxes.
"""
preds = []
for i in range(0, distance.shape[1], 2):
px = points[:, i%2] + distance[:, i]
py = points[:, i%2+1] + distance[:, i+1]
if max_shape is not None:
px = px.clamp(min=0, max=max_shape[1])
py = py.clamp(min=0, max=max_shape[0])
preds.append(px)
preds.append(py)
return np.stack(preds, axis=-1)
class RetinaFace:
def __init__(self, model_file=None, session=None):
import onnxruntime
self.model_file = model_file
self.session = session
self.taskname = 'detection'
if self.session is None:
assert self.model_file is not None
assert osp.exists(self.model_file)
self.session = onnxruntime.InferenceSession(self.model_file, None)
self.center_cache = {}
self.nms_thresh = 0.4
self.det_thresh = 0.5
self._init_vars()
def _init_vars(self):
input_cfg = self.session.get_inputs()[0]
input_shape = input_cfg.shape
#print(input_shape)
if isinstance(input_shape[2], str):
self.input_size = None
else:
self.input_size = tuple(input_shape[2:4][::-1])
#print('image_size:', self.image_size)
input_name = input_cfg.name
self.input_shape = input_shape
outputs = self.session.get_outputs()
output_names = []
for o in outputs:
output_names.append(o.name)
self.input_name = input_name
self.output_names = output_names
self.input_mean = 127.5
self.input_std = 128.0
#print(self.output_names)
#assert len(outputs)==10 or len(outputs)==15
self.use_kps = False
self._anchor_ratio = 1.0
self._num_anchors = 1
if len(outputs)==6:
self.fmc = 3
self._feat_stride_fpn = [8, 16, 32]
self._num_anchors = 2
elif len(outputs)==9:
self.fmc = 3
self._feat_stride_fpn = [8, 16, 32]
self._num_anchors = 2
self.use_kps = True
elif len(outputs)==10:
self.fmc = 5
self._feat_stride_fpn = [8, 16, 32, 64, 128]
self._num_anchors = 1
elif len(outputs)==15:
self.fmc = 5
self._feat_stride_fpn = [8, 16, 32, 64, 128]
self._num_anchors = 1
self.use_kps = True
def prepare(self, ctx_id, **kwargs):
if ctx_id<0:
self.session.set_providers(['CPUExecutionProvider'])
nms_thresh = kwargs.get('nms_thresh', None)
if nms_thresh is not None:
self.nms_thresh = nms_thresh
det_thresh = kwargs.get('det_thresh', None)
if det_thresh is not None:
self.det_thresh = det_thresh
input_size = kwargs.get('input_size', None)
if input_size is not None:
if self.input_size is not None:
print('warning: det_size is already set in detection model, ignore')
else:
self.input_size = input_size
def forward(self, img, threshold):
scores_list = []
bboxes_list = []
kpss_list = []
input_size = tuple(img.shape[0:2][::-1])
blob = cv2.dnn.blobFromImage(img, 1.0/self.input_std, input_size, (self.input_mean, self.input_mean, self.input_mean), swapRB=True)
net_outs = self.session.run(self.output_names, {self.input_name : blob})
input_height = blob.shape[2]
input_width = blob.shape[3]
fmc = self.fmc
for idx, stride in enumerate(self._feat_stride_fpn):
scores = net_outs[idx]
bbox_preds = net_outs[idx+fmc]
bbox_preds = bbox_preds * stride
if self.use_kps:
kps_preds = net_outs[idx+fmc*2] * stride
height = input_height // stride
width = input_width // stride
K = height * width
key = (height, width, stride)
if key in self.center_cache:
anchor_centers = self.center_cache[key]
else:
#solution-1, c style:
#anchor_centers = np.zeros( (height, width, 2), dtype=np.float32 )
#for i in range(height):
# anchor_centers[i, :, 1] = i
#for i in range(width):
# anchor_centers[:, i, 0] = i
#solution-2:
#ax = np.arange(width, dtype=np.float32)
#ay = np.arange(height, dtype=np.float32)
#xv, yv = np.meshgrid(np.arange(width), np.arange(height))
#anchor_centers = np.stack([xv, yv], axis=-1).astype(np.float32)
#solution-3:
anchor_centers = np.stack(np.mgrid[:height, :width][::-1], axis=-1).astype(np.float32)
#print(anchor_centers.shape)
anchor_centers = (anchor_centers * stride).reshape( (-1, 2) )
if self._num_anchors>1:
anchor_centers = np.stack([anchor_centers]*self._num_anchors, axis=1).reshape( (-1,2) )
if len(self.center_cache)<100:
self.center_cache[key] = anchor_centers
pos_inds = np.where(scores>=threshold)[0]
bboxes = distance2bbox(anchor_centers, bbox_preds)
pos_scores = scores[pos_inds]
pos_bboxes = bboxes[pos_inds]
scores_list.append(pos_scores)
bboxes_list.append(pos_bboxes)
if self.use_kps:
kpss = distance2kps(anchor_centers, kps_preds)
#kpss = kps_preds
kpss = kpss.reshape( (kpss.shape[0], -1, 2) )
pos_kpss = kpss[pos_inds]
kpss_list.append(pos_kpss)
return scores_list, bboxes_list, kpss_list
def detect(self, img, input_size = None, max_num=0, metric='default'):
assert input_size is not None or self.input_size is not None
input_size = self.input_size if input_size is None else input_size
im_ratio = float(img.shape[0]) / img.shape[1]
model_ratio = float(input_size[1]) / input_size[0]
if im_ratio>model_ratio:
new_height = input_size[1]
new_width = int(new_height / im_ratio)
else:
new_width = input_size[0]
new_height = int(new_width * im_ratio)
det_scale = float(new_height) / img.shape[0]
resized_img = cv2.resize(img, (new_width, new_height))
det_img = np.zeros( (input_size[1], input_size[0], 3), dtype=np.uint8 )
det_img[:new_height, :new_width, :] = resized_img
scores_list, bboxes_list, kpss_list = self.forward(det_img, self.det_thresh)
scores = np.vstack(scores_list)
scores_ravel = scores.ravel()
order = scores_ravel.argsort()[::-1]
bboxes = np.vstack(bboxes_list) / det_scale
if self.use_kps:
kpss = np.vstack(kpss_list) / det_scale
pre_det = np.hstack((bboxes, scores)).astype(np.float32, copy=False)
pre_det = pre_det[order, :]
keep = self.nms(pre_det)
det = pre_det[keep, :]
if self.use_kps:
kpss = kpss[order,:,:]
kpss = kpss[keep,:,:]
else:
kpss = None
if max_num > 0 and det.shape[0] > max_num:
area = (det[:, 2] - det[:, 0]) * (det[:, 3] -
det[:, 1])
img_center = img.shape[0] // 2, img.shape[1] // 2
offsets = np.vstack([
(det[:, 0] + det[:, 2]) / 2 - img_center[1],
(det[:, 1] + det[:, 3]) / 2 - img_center[0]
])
offset_dist_squared = np.sum(np.power(offsets, 2.0), 0)
if metric=='max':
values = area
else:
values = area - offset_dist_squared * 2.0 # some extra weight on the centering
bindex = np.argsort(
values)[::-1] # some extra weight on the centering
bindex = bindex[0:max_num]
det = det[bindex, :]
if kpss is not None:
kpss = kpss[bindex, :]
return det, kpss
def nms(self, dets):
thresh = self.nms_thresh
x1 = dets[:, 0]
y1 = dets[:, 1]
x2 = dets[:, 2]
y2 = dets[:, 3]
scores = dets[:, 4]
areas = (x2 - x1 + 1) * (y2 - y1 + 1)
order = scores.argsort()[::-1]
keep = []
while order.size > 0:
i = order[0]
keep.append(i)
xx1 = np.maximum(x1[i], x1[order[1:]])
yy1 = np.maximum(y1[i], y1[order[1:]])
xx2 = np.minimum(x2[i], x2[order[1:]])
yy2 = np.minimum(y2[i], y2[order[1:]])
w = np.maximum(0.0, xx2 - xx1 + 1)
h = np.maximum(0.0, yy2 - yy1 + 1)
inter = w * h
ovr = inter / (areas[i] + areas[order[1:]] - inter)
inds = np.where(ovr <= thresh)[0]
order = order[inds + 1]
return keep
def get_retinaface(name, download=False, root='~/.insightface/models', **kwargs):
if not download:
assert os.path.exists(name)
return RetinaFace(name)
else:
from .model_store import get_model_file
_file = get_model_file("retinaface_%s" % name, root=root)
return RetinaFace(_file)
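distance2bbox above turns per-anchor (left, top, right, bottom) offsets into corner-format boxes. A tiny worked example that mirrors the arithmetic with two anchor centres (the numbers are arbitrary):

import numpy as np

points = np.array([[8.0, 8.0], [16.0, 8.0]], dtype=np.float32)  # anchor centres
distance = np.array([[4.0, 4.0, 4.0, 4.0], [2.0, 3.0, 6.0, 5.0]], dtype=np.float32)
boxes = np.stack([points[:, 0] - distance[:, 0], points[:, 1] - distance[:, 1],
                  points[:, 0] + distance[:, 2], points[:, 1] + distance[:, 3]], axis=-1)
print(boxes)  # [[ 4.  4. 12. 12.] [14.  5. 22. 13.]]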

View File

@ -0,0 +1,348 @@
# -*- coding: utf-8 -*-
# @Organization : insightface.ai
# @Author : Jia Guo
# @Time : 2021-05-04
# @Function :
from __future__ import division
import datetime
import numpy as np
import onnx
import onnxruntime
import os
import os.path as osp
import cv2
import sys
def softmax(z):
assert len(z.shape) == 2
s = np.max(z, axis=1)
s = s[:, np.newaxis] # necessary step to do broadcasting
e_x = np.exp(z - s)
div = np.sum(e_x, axis=1)
div = div[:, np.newaxis]  # ditto
return e_x / div
def distance2bbox(points, distance, max_shape=None):
"""Decode distance prediction to bounding box.
Args:
points (Tensor): Shape (n, 2), [x, y].
distance (Tensor): Distance from the given point to 4
boundaries (left, top, right, bottom).
max_shape (tuple): Shape of the image.
Returns:
Tensor: Decoded bboxes.
"""
x1 = points[:, 0] - distance[:, 0]
y1 = points[:, 1] - distance[:, 1]
x2 = points[:, 0] + distance[:, 2]
y2 = points[:, 1] + distance[:, 3]
if max_shape is not None:
x1 = x1.clamp(min=0, max=max_shape[1])
y1 = y1.clamp(min=0, max=max_shape[0])
x2 = x2.clamp(min=0, max=max_shape[1])
y2 = y2.clamp(min=0, max=max_shape[0])
return np.stack([x1, y1, x2, y2], axis=-1)
def distance2kps(points, distance, max_shape=None):
"""Decode distance prediction to bounding box.
Args:
points (Tensor): Shape (n, 2), [x, y].
distance (Tensor): Distance from the given point to 4
boundaries (left, top, right, bottom).
max_shape (tuple): Shape of the image.
Returns:
Tensor: Decoded bboxes.
"""
preds = []
for i in range(0, distance.shape[1], 2):
px = points[:, i%2] + distance[:, i]
py = points[:, i%2+1] + distance[:, i+1]
if max_shape is not None:
px = px.clamp(min=0, max=max_shape[1])
py = py.clamp(min=0, max=max_shape[0])
preds.append(px)
preds.append(py)
return np.stack(preds, axis=-1)
class SCRFD:
def __init__(self, model_file=None, session=None):
import onnxruntime
self.model_file = model_file
self.session = session
self.taskname = 'detection'
self.batched = False
if self.session is None:
assert self.model_file is not None
assert osp.exists(self.model_file)
self.session = onnxruntime.InferenceSession(self.model_file, None)
self.center_cache = {}
self.nms_thresh = 0.4
self.det_thresh = 0.5
self._init_vars()
def _init_vars(self):
input_cfg = self.session.get_inputs()[0]
input_shape = input_cfg.shape
#print(input_shape)
if isinstance(input_shape[2], str):
self.input_size = None
else:
self.input_size = tuple(input_shape[2:4][::-1])
#print('image_size:', self.image_size)
input_name = input_cfg.name
self.input_shape = input_shape
outputs = self.session.get_outputs()
if len(outputs[0].shape) == 3:
self.batched = True
output_names = []
for o in outputs:
output_names.append(o.name)
self.input_name = input_name
self.output_names = output_names
self.input_mean = 127.5
self.input_std = 128.0
#print(self.output_names)
#assert len(outputs)==10 or len(outputs)==15
self.use_kps = False
self._anchor_ratio = 1.0
self._num_anchors = 1
if len(outputs)==6:
self.fmc = 3
self._feat_stride_fpn = [8, 16, 32]
self._num_anchors = 2
elif len(outputs)==9:
self.fmc = 3
self._feat_stride_fpn = [8, 16, 32]
self._num_anchors = 2
self.use_kps = True
elif len(outputs)==10:
self.fmc = 5
self._feat_stride_fpn = [8, 16, 32, 64, 128]
self._num_anchors = 1
elif len(outputs)==15:
self.fmc = 5
self._feat_stride_fpn = [8, 16, 32, 64, 128]
self._num_anchors = 1
self.use_kps = True
def prepare(self, ctx_id, **kwargs):
if ctx_id<0:
self.session.set_providers(['CPUExecutionProvider'])
nms_thresh = kwargs.get('nms_thresh', None)
if nms_thresh is not None:
self.nms_thresh = nms_thresh
det_thresh = kwargs.get('det_thresh', None)
if det_thresh is not None:
self.det_thresh = det_thresh
input_size = kwargs.get('input_size', None)
if input_size is not None:
if self.input_size is not None:
print('warning: det_size is already set in scrfd model, ignore')
else:
self.input_size = input_size
def forward(self, img, threshold):
scores_list = []
bboxes_list = []
kpss_list = []
input_size = tuple(img.shape[0:2][::-1])
blob = cv2.dnn.blobFromImage(img, 1.0/self.input_std, input_size, (self.input_mean, self.input_mean, self.input_mean), swapRB=True)
net_outs = self.session.run(self.output_names, {self.input_name : blob})
input_height = blob.shape[2]
input_width = blob.shape[3]
fmc = self.fmc
for idx, stride in enumerate(self._feat_stride_fpn):
# If model support batch dim, take first output
if self.batched:
scores = net_outs[idx][0]
bbox_preds = net_outs[idx + fmc][0]
bbox_preds = bbox_preds * stride
if self.use_kps:
kps_preds = net_outs[idx + fmc * 2][0] * stride
# If model doesn't support batching take output as is
else:
scores = net_outs[idx]
bbox_preds = net_outs[idx + fmc]
bbox_preds = bbox_preds * stride
if self.use_kps:
kps_preds = net_outs[idx + fmc * 2] * stride
height = input_height // stride
width = input_width // stride
K = height * width
key = (height, width, stride)
if key in self.center_cache:
anchor_centers = self.center_cache[key]
else:
#solution-1, c style:
#anchor_centers = np.zeros( (height, width, 2), dtype=np.float32 )
#for i in range(height):
# anchor_centers[i, :, 1] = i
#for i in range(width):
# anchor_centers[:, i, 0] = i
#solution-2:
#ax = np.arange(width, dtype=np.float32)
#ay = np.arange(height, dtype=np.float32)
#xv, yv = np.meshgrid(np.arange(width), np.arange(height))
#anchor_centers = np.stack([xv, yv], axis=-1).astype(np.float32)
#solution-3:
anchor_centers = np.stack(np.mgrid[:height, :width][::-1], axis=-1).astype(np.float32)
#print(anchor_centers.shape)
anchor_centers = (anchor_centers * stride).reshape( (-1, 2) )
if self._num_anchors>1:
anchor_centers = np.stack([anchor_centers]*self._num_anchors, axis=1).reshape( (-1,2) )
if len(self.center_cache)<100:
self.center_cache[key] = anchor_centers
pos_inds = np.where(scores>=threshold)[0]
bboxes = distance2bbox(anchor_centers, bbox_preds)
pos_scores = scores[pos_inds]
pos_bboxes = bboxes[pos_inds]
scores_list.append(pos_scores)
bboxes_list.append(pos_bboxes)
if self.use_kps:
kpss = distance2kps(anchor_centers, kps_preds)
#kpss = kps_preds
kpss = kpss.reshape( (kpss.shape[0], -1, 2) )
pos_kpss = kpss[pos_inds]
kpss_list.append(pos_kpss)
return scores_list, bboxes_list, kpss_list
def detect(self, img, input_size = None, max_num=0, metric='default'):
assert input_size is not None or self.input_size is not None
input_size = self.input_size if input_size is None else input_size
im_ratio = float(img.shape[0]) / img.shape[1]
model_ratio = float(input_size[1]) / input_size[0]
if im_ratio>model_ratio:
new_height = input_size[1]
new_width = int(new_height / im_ratio)
else:
new_width = input_size[0]
new_height = int(new_width * im_ratio)
det_scale = float(new_height) / img.shape[0]
resized_img = cv2.resize(img, (new_width, new_height))
det_img = np.zeros( (input_size[1], input_size[0], 3), dtype=np.uint8 )
det_img[:new_height, :new_width, :] = resized_img
scores_list, bboxes_list, kpss_list = self.forward(det_img, self.det_thresh)
scores = np.vstack(scores_list)
scores_ravel = scores.ravel()
order = scores_ravel.argsort()[::-1]
bboxes = np.vstack(bboxes_list) / det_scale
if self.use_kps:
kpss = np.vstack(kpss_list) / det_scale
pre_det = np.hstack((bboxes, scores)).astype(np.float32, copy=False)
pre_det = pre_det[order, :]
keep = self.nms(pre_det)
det = pre_det[keep, :]
if self.use_kps:
kpss = kpss[order,:,:]
kpss = kpss[keep,:,:]
else:
kpss = None
if max_num > 0 and det.shape[0] > max_num:
area = (det[:, 2] - det[:, 0]) * (det[:, 3] -
det[:, 1])
img_center = img.shape[0] // 2, img.shape[1] // 2
offsets = np.vstack([
(det[:, 0] + det[:, 2]) / 2 - img_center[1],
(det[:, 1] + det[:, 3]) / 2 - img_center[0]
])
offset_dist_squared = np.sum(np.power(offsets, 2.0), 0)
if metric=='max':
values = area
else:
values = area - offset_dist_squared * 2.0 # some extra weight on the centering
bindex = np.argsort(
values)[::-1] # some extra weight on the centering
bindex = bindex[0:max_num]
det = det[bindex, :]
if kpss is not None:
kpss = kpss[bindex, :]
return det, kpss
def nms(self, dets):
thresh = self.nms_thresh
x1 = dets[:, 0]
y1 = dets[:, 1]
x2 = dets[:, 2]
y2 = dets[:, 3]
scores = dets[:, 4]
areas = (x2 - x1 + 1) * (y2 - y1 + 1)
order = scores.argsort()[::-1]
keep = []
while order.size > 0:
i = order[0]
keep.append(i)
xx1 = np.maximum(x1[i], x1[order[1:]])
yy1 = np.maximum(y1[i], y1[order[1:]])
xx2 = np.minimum(x2[i], x2[order[1:]])
yy2 = np.minimum(y2[i], y2[order[1:]])
w = np.maximum(0.0, xx2 - xx1 + 1)
h = np.maximum(0.0, yy2 - yy1 + 1)
inter = w * h
ovr = inter / (areas[i] + areas[order[1:]] - inter)
inds = np.where(ovr <= thresh)[0]
order = order[inds + 1]
return keep
def get_scrfd(name, download=False, root='~/.insightface/models', **kwargs):
if not download:
assert os.path.exists(name)
return SCRFD(name)
else:
from .model_store import get_model_file
_file = get_model_file("scrfd_%s" % name, root=root)
return SCRFD(_file)
def scrfd_2p5gkps(**kwargs):
return get_scrfd("2p5gkps", download=True, **kwargs)
if __name__ == '__main__':
import glob
detector = SCRFD(model_file='./det.onnx')
detector.prepare(-1)
img_paths = ['tests/data/t1.jpg']
for img_path in img_paths:
img = cv2.imread(img_path)
for _ in range(1):
ta = datetime.datetime.now()
#bboxes, kpss = detector.detect(img, 0.5, input_size = (640, 640))
bboxes, kpss = detector.detect(img, 0.5)
tb = datetime.datetime.now()
print('all cost:', (tb-ta).total_seconds()*1000)
print(img_path, bboxes.shape)
if kpss is not None:
print(kpss.shape)
for i in range(bboxes.shape[0]):
bbox = bboxes[i]
x1,y1,x2,y2,score = bbox.astype(int)
cv2.rectangle(img, (x1,y1) , (x2,y2) , (255,0,0) , 2)
if kpss is not None:
kps = kpss[i]
for kp in kps:
kp = kp.astype(int)
cv2.circle(img, tuple(kp) , 1, (0,0,255) , 2)
filename = img_path.split('/')[-1]
print('output:', filename)
cv2.imwrite('./outputs/%s'%filename, img)
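detect() above resizes the frame to the model size while preserving aspect ratio, zero-pads the rest, and later divides the predicted boxes by det_scale to express them in original-image coordinates. The scale bookkeeping in isolation, for a 1280x720 frame fed to a 640x640 detector:

img_h, img_w = 720, 1280
input_w, input_h = 640, 640
im_ratio = img_h / img_w         # 0.5625
model_ratio = input_h / input_w  # 1.0
if im_ratio > model_ratio:
    new_h, new_w = input_h, int(input_h / im_ratio)
else:
    new_w, new_h = input_w, int(input_w * im_ratio)
det_scale = new_h / img_h
print(new_w, new_h, det_scale)   # 640 360 0.5 -> detections are divided by 0.5 afterwards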

View File

@ -0,0 +1,6 @@
from __future__ import absolute_import
from .storage import download, ensure_available, download_onnx
from .filesystem import get_model_dir
from .filesystem import makedirs, try_import_dali
from .constant import *

View File

@ -0,0 +1,3 @@
DEFAULT_MP_NAME = 'buffalo_l'

View File

@ -0,0 +1,95 @@
"""
This code file mainly comes from https://github.com/dmlc/gluon-cv/blob/master/gluoncv/utils/download.py
"""
import os
import hashlib
import requests
from tqdm import tqdm
def check_sha1(filename, sha1_hash):
"""Check whether the sha1 hash of the file content matches the expected hash.
Parameters
----------
filename : str
Path to the file.
sha1_hash : str
Expected sha1 hash in hexadecimal digits.
Returns
-------
bool
Whether the file content matches the expected hash.
"""
sha1 = hashlib.sha1()
with open(filename, 'rb') as f:
while True:
data = f.read(1048576)
if not data:
break
sha1.update(data)
sha1_file = sha1.hexdigest()
l = min(len(sha1_file), len(sha1_hash))
return sha1.hexdigest()[0:l] == sha1_hash[0:l]
def download_file(url, path=None, overwrite=False, sha1_hash=None):
"""Download an given URL
Parameters
----------
url : str
URL to download
path : str, optional
Destination path to store downloaded file. By default stores to the
current directory with same name as in url.
overwrite : bool, optional
Whether to overwrite destination file if already exists.
sha1_hash : str, optional
Expected sha1 hash in hexadecimal digits. Will ignore existing file when hash is specified
but doesn't match.
Returns
-------
str
The file path of the downloaded file.
"""
if path is None:
fname = url.split('/')[-1]
else:
path = os.path.expanduser(path)
if os.path.isdir(path):
fname = os.path.join(path, url.split('/')[-1])
else:
fname = path
if overwrite or not os.path.exists(fname) or (
sha1_hash and not check_sha1(fname, sha1_hash)):
dirname = os.path.dirname(os.path.abspath(os.path.expanduser(fname)))
if not os.path.exists(dirname):
os.makedirs(dirname)
print('Downloading %s from %s...' % (fname, url))
r = requests.get(url, stream=True)
if r.status_code != 200:
raise RuntimeError("Failed downloading url %s" % url)
total_length = r.headers.get('content-length')
with open(fname, 'wb') as f:
if total_length is None: # no content length header
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
else:
total_length = int(total_length)
for chunk in tqdm(r.iter_content(chunk_size=1024),
total=int(total_length / 1024. + 0.5),
unit='KB',
unit_scale=False,
dynamic_ncols=True):
f.write(chunk)
if sha1_hash and not check_sha1(fname, sha1_hash):
raise UserWarning('File {} is downloaded but the content hash does not match. ' \
'The repo may be outdated or download may be incomplete. ' \
'If the "repo_url" is overridden, consider switching to ' \
'the default repo.'.format(fname))
return fname
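check_sha1 above compares only as many hex digits as the shorter of the two strings, so a truncated prefix is enough to validate a download. A quick self-contained check against a file created on the spot (it relies on check_sha1 as defined above):

import hashlib
import os
import tempfile

payload = b'insightface'
expected_prefix = hashlib.sha1(payload).hexdigest()[:8]  # truncated reference hash
with tempfile.NamedTemporaryFile(delete=False) as f:
    f.write(payload)
    path = f.name
print(check_sha1(path, expected_prefix))  # True - only the common prefix is compared
os.remove(path)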

View File

@ -0,0 +1,103 @@
import cv2
import numpy as np
from skimage import transform as trans
arcface_dst = np.array(
[[38.2946, 51.6963], [73.5318, 51.5014], [56.0252, 71.7366],
[41.5493, 92.3655], [70.7299, 92.2041]],
dtype=np.float32)
def estimate_norm(lmk, image_size=112,mode='arcface'):
assert lmk.shape == (5, 2)
assert image_size%112==0 or image_size%128==0
if image_size%112==0:
ratio = float(image_size)/112.0
diff_x = 0
else:
ratio = float(image_size)/128.0
diff_x = 8.0*ratio
dst = arcface_dst * ratio
dst[:,0] += diff_x
tform = trans.SimilarityTransform()
tform.estimate(lmk, dst)
M = tform.params[0:2, :]
return M
def norm_crop(img, landmark, image_size=112, mode='arcface'):
M = estimate_norm(landmark, image_size, mode)
warped = cv2.warpAffine(img, M, (image_size, image_size), borderValue=0.0)
return warped
def norm_crop2(img, landmark, image_size=112, mode='arcface'):
M = estimate_norm(landmark, image_size, mode)
warped = cv2.warpAffine(img, M, (image_size, image_size), borderValue=0.0)
return warped, M
def square_crop(im, S):
if im.shape[0] > im.shape[1]:
height = S
width = int(float(im.shape[1]) / im.shape[0] * S)
scale = float(S) / im.shape[0]
else:
width = S
height = int(float(im.shape[0]) / im.shape[1] * S)
scale = float(S) / im.shape[1]
resized_im = cv2.resize(im, (width, height))
det_im = np.zeros((S, S, 3), dtype=np.uint8)
det_im[:resized_im.shape[0], :resized_im.shape[1], :] = resized_im
return det_im, scale
def transform(data, center, output_size, scale, rotation):
scale_ratio = scale
rot = float(rotation) * np.pi / 180.0
#translation = (output_size/2-center[0]*scale_ratio, output_size/2-center[1]*scale_ratio)
t1 = trans.SimilarityTransform(scale=scale_ratio)
cx = center[0] * scale_ratio
cy = center[1] * scale_ratio
t2 = trans.SimilarityTransform(translation=(-1 * cx, -1 * cy))
t3 = trans.SimilarityTransform(rotation=rot)
t4 = trans.SimilarityTransform(translation=(output_size / 2,
output_size / 2))
t = t1 + t2 + t3 + t4
M = t.params[0:2]
cropped = cv2.warpAffine(data,
M, (output_size, output_size),
borderValue=0.0)
return cropped, M
def trans_points2d(pts, M):
new_pts = np.zeros(shape=pts.shape, dtype=np.float32)
for i in range(pts.shape[0]):
pt = pts[i]
new_pt = np.array([pt[0], pt[1], 1.], dtype=np.float32)
new_pt = np.dot(M, new_pt)
#print('new_pt', new_pt.shape, new_pt)
new_pts[i] = new_pt[0:2]
return new_pts
def trans_points3d(pts, M):
scale = np.sqrt(M[0][0] * M[0][0] + M[0][1] * M[0][1])
#print(scale)
new_pts = np.zeros(shape=pts.shape, dtype=np.float32)
for i in range(pts.shape[0]):
pt = pts[i]
new_pt = np.array([pt[0], pt[1], 1.], dtype=np.float32)
new_pt = np.dot(M, new_pt)
#print('new_pt', new_pt.shape, new_pt)
new_pts[i][0:2] = new_pt[0:2]
new_pts[i][2] = pts[i][2] * scale
return new_pts
def trans_points(pts, M):
if pts.shape[1] == 2:
return trans_points2d(pts, M)
else:
return trans_points3d(pts, M)
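estimate_norm and norm_crop above align a face by fitting a similarity transform from the five detected keypoints to the canonical ArcFace template. A sketch using the helpers defined above; the keypoints are the template itself shifted by (100, 50), so the estimated warp should be a pure translation:

import numpy as np

lmk = arcface_dst + np.array([100.0, 50.0], dtype=np.float32)  # 5 shifted template points
M = estimate_norm(lmk, image_size=112)
print(np.round(M, 2))  # roughly [[1. 0. -100.] [0. 1. -50.]] -> undo the shift, no rotation or scale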

View File

@ -0,0 +1,157 @@
"""
This code file mainly comes from https://github.com/dmlc/gluon-cv/blob/master/gluoncv/utils/filesystem.py
"""
import os
import os.path as osp
import errno
def get_model_dir(name, root='~/.insightface'):
root = os.path.expanduser(root)
model_dir = osp.join(root, 'models', name)
return model_dir
def makedirs(path):
"""Create directory recursively if not exists.
Similar to `mkdir -p`, you can skip checking existence before this function.
Parameters
----------
path : str
Path of the desired dir
"""
try:
os.makedirs(path)
except OSError as exc:
if exc.errno != errno.EEXIST:
raise
def try_import(package, message=None):
"""Try import specified package, with custom message support.
Parameters
----------
package : str
The name of the targeting package.
message : str, default is None
If not None, this function will raise customized error message when import error is found.
Returns
-------
module if found, raise ImportError otherwise
"""
try:
return __import__(package)
except ImportError as e:
if not message:
raise e
raise ImportError(message)
def try_import_cv2():
"""Try import cv2 at runtime.
Returns
-------
cv2 module if found. Raise ImportError otherwise
"""
msg = "cv2 is required, you can install by package manager, e.g. 'apt-get', \
or `pip install opencv-python --user` (note that this is unofficial PYPI package)."
return try_import('cv2', msg)
def try_import_mmcv():
"""Try import mmcv at runtime.
Returns
-------
mmcv module if found. Raise ImportError otherwise
"""
msg = "mmcv is required, you can install by first `pip install Cython --user` \
and then `pip install mmcv --user` (note that this is unofficial PYPI package)."
return try_import('mmcv', msg)
def try_import_rarfile():
"""Try import rarfile at runtime.
Returns
-------
rarfile module if found. Raise ImportError otherwise
"""
msg = "rarfile is required, you can install by first `sudo apt-get install unrar` \
and then `pip install rarfile --user` (note that this is unofficial PYPI package)."
return try_import('rarfile', msg)
def import_try_install(package, extern_url=None):
"""Try import the specified package.
If the package not installed, try use pip to install and import if success.
Parameters
----------
package : str
The name of the package trying to import.
extern_url : str or None, optional
The external url if package is not hosted on PyPI.
For example, you can install a package using:
"pip install git+http://github.com/user/repo/tarball/master/egginfo=xxx".
In this case, you can pass the url to the extern_url.
Returns
-------
<class 'Module'>
The imported python module.
"""
try:
return __import__(package)
except ImportError:
try:
from pip import main as pipmain
except ImportError:
from pip._internal import main as pipmain
# trying to install package
url = package if extern_url is None else extern_url
pipmain(['install', '--user',
url]) # will raise SystemExit Error if fails
# trying to load again
try:
return __import__(package)
except ImportError:
import sys
import site
user_site = site.getusersitepackages()
if user_site not in sys.path:
sys.path.append(user_site)
return __import__(package)
return __import__(package)
def try_import_dali():
"""Try import NVIDIA DALI at runtime.
"""
try:
dali = __import__('nvidia.dali', fromlist=['pipeline', 'ops', 'types'])
dali.Pipeline = dali.pipeline.Pipeline
except ImportError:
class dali:
class Pipeline:
def __init__(self):
raise NotImplementedError(
"DALI not found, please check if you installed it correctly."
)
return dali

View File

@ -0,0 +1,52 @@
import os
import os.path as osp
import zipfile
from .download import download_file
BASE_REPO_URL = 'https://github.com/deepinsight/insightface/releases/download/v0.7'
def download(sub_dir, name, force=False, root='~/.insightface'):
_root = os.path.expanduser(root)
dir_path = os.path.join(_root, sub_dir, name)
if osp.exists(dir_path) and not force:
return dir_path
print('download_path:', dir_path)
zip_file_path = os.path.join(_root, sub_dir, name + '.zip')
model_url = "%s/%s.zip"%(BASE_REPO_URL, name)
download_file(model_url,
path=zip_file_path,
overwrite=True)
if not os.path.exists(dir_path):
os.makedirs(dir_path)
with zipfile.ZipFile(zip_file_path) as zf:
zf.extractall(dir_path)
#os.remove(zip_file_path)
return dir_path
def ensure_available(sub_dir, name, root='~/.insightface'):
return download(sub_dir, name, force=False, root=root)
def download_onnx(sub_dir, model_file, force=False, root='~/.insightface', download_zip=False):
_root = os.path.expanduser(root)
model_root = osp.join(_root, sub_dir)
new_model_file = osp.join(model_root, model_file)
if osp.exists(new_model_file) and not force:
return new_model_file
if not osp.exists(model_root):
os.makedirs(model_root)
print('download_path:', new_model_file)
if not download_zip:
model_url = "%s/%s"%(BASE_REPO_URL, model_file)
download_file(model_url,
path=new_model_file,
overwrite=True)
else:
model_url = "%s/%s.zip"%(BASE_REPO_URL, model_file)
zip_file_path = new_model_file+".zip"
download_file(model_url,
path=zip_file_path,
overwrite=True)
with zipfile.ZipFile(zip_file_path) as zf:
zf.extractall(model_root)
return new_model_file

View File

@ -0,0 +1,116 @@
import cv2
import math
import numpy as np
from skimage import transform as trans
def transform(data, center, output_size, scale, rotation):
scale_ratio = scale
rot = float(rotation) * np.pi / 180.0
#translation = (output_size/2-center[0]*scale_ratio, output_size/2-center[1]*scale_ratio)
t1 = trans.SimilarityTransform(scale=scale_ratio)
cx = center[0] * scale_ratio
cy = center[1] * scale_ratio
t2 = trans.SimilarityTransform(translation=(-1 * cx, -1 * cy))
t3 = trans.SimilarityTransform(rotation=rot)
t4 = trans.SimilarityTransform(translation=(output_size / 2,
output_size / 2))
t = t1 + t2 + t3 + t4
M = t.params[0:2]
cropped = cv2.warpAffine(data,
M, (output_size, output_size),
borderValue=0.0)
return cropped, M
def trans_points2d(pts, M):
new_pts = np.zeros(shape=pts.shape, dtype=np.float32)
for i in range(pts.shape[0]):
pt = pts[i]
new_pt = np.array([pt[0], pt[1], 1.], dtype=np.float32)
new_pt = np.dot(M, new_pt)
#print('new_pt', new_pt.shape, new_pt)
new_pts[i] = new_pt[0:2]
return new_pts
def trans_points3d(pts, M):
scale = np.sqrt(M[0][0] * M[0][0] + M[0][1] * M[0][1])
#print(scale)
new_pts = np.zeros(shape=pts.shape, dtype=np.float32)
for i in range(pts.shape[0]):
pt = pts[i]
new_pt = np.array([pt[0], pt[1], 1.], dtype=np.float32)
new_pt = np.dot(M, new_pt)
#print('new_pt', new_pt.shape, new_pt)
new_pts[i][0:2] = new_pt[0:2]
new_pts[i][2] = pts[i][2] * scale
return new_pts
def trans_points(pts, M):
if pts.shape[1] == 2:
return trans_points2d(pts, M)
else:
return trans_points3d(pts, M)
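# Illustrative pairing (not from the source): the 2x3 matrix returned by transform()
# both crops the image and maps original-image landmarks into crop coordinates.
def _crop_with_landmarks(img, center, lmk, output_size=192, scale=1.5):
    crop, M = transform(img, center, output_size, scale, rotation=0)
    lmk_in_crop = trans_points(lmk, M)   # works for (n, 2) or (n, 3) landmarks
    return crop, lmk_in_crop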
def estimate_affine_matrix_3d23d(X, Y):
''' Using least-squares solution
Args:
X: [n, 3]. 3d points(fixed)
Y: [n, 3]. corresponding 3d points(moving). Y = PX
Returns:
P_Affine: (3, 4). Affine camera matrix (the third row is [0, 0, 0, 1]).
'''
X_homo = np.hstack((X, np.ones([X.shape[0],1]))) #n x 4
P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
return P
def P2sRt(P):
''' decompose the affine camera matrix P into scale, rotation and translation
Args:
P: (3, 4). Affine Camera Matrix.
Returns:
s: scale factor.
R: (3, 3). rotation matrix.
t: (3,). translation.
'''
t = P[:, 3]
R1 = P[0:1, :3]
R2 = P[1:2, :3]
s = (np.linalg.norm(R1) + np.linalg.norm(R2))/2.0
r1 = R1/np.linalg.norm(R1)
r2 = R2/np.linalg.norm(R2)
r3 = np.cross(r1, r2)
R = np.concatenate((r1, r2, r3), 0)
return s, R, t
def matrix2angle(R):
''' get three Euler angles from Rotation Matrix
Args:
R: (3,3). rotation matrix
Returns:
x: pitch
y: yaw
z: roll
'''
sy = math.sqrt(R[0,0] * R[0,0] + R[1,0] * R[1,0])
singular = sy < 1e-6
if not singular :
x = math.atan2(R[2,1] , R[2,2])
y = math.atan2(-R[2,0], sy)
z = math.atan2(R[1,0], R[0,0])
else :
x = math.atan2(-R[1,2], R[1,1])
y = math.atan2(-R[2,0], sy)
z = 0
# rx, ry, rz = np.rad2deg(x), np.rad2deg(y), np.rad2deg(z)
rx, ry, rz = x*180/np.pi, y*180/np.pi, z*180/np.pi
return rx, ry, rz
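# Illustrative round-trip check (not from the source): build a synthetic affine camera
# matrix from a known scale/rotation/translation, recover them with P2sRt, and read the
# Euler angles back in degrees with matrix2angle.
def _check_decomposition():
    yaw = np.deg2rad(30.0)
    R_true = np.array([[ np.cos(yaw), 0.0, np.sin(yaw)],
                       [ 0.0,         1.0, 0.0        ],
                       [-np.sin(yaw), 0.0, np.cos(yaw)]])
    P = np.hstack([2.0 * R_true, np.array([[10.0], [5.0], [0.0]])])  # scale=2, t=(10, 5, 0)
    s, R, t = P2sRt(P)
    pitch, yaw_deg, roll = matrix2angle(R)
    print(s, pitch, yaw_deg, roll)   # ~2.0 and roughly (0, 30, 0)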

View File

@ -0,0 +1,79 @@
# coding: utf-8
"""
face detection and alignment using InsightFace
"""
import numpy as np
from .rprint import rlog as log
from .dependencies.insightface.app import FaceAnalysis
from .dependencies.insightface.app.common import Face
from .timer import Timer
def sort_by_direction(faces, direction: str = 'large-small', face_center=None):
if len(faces) <= 0:
return faces
if direction == 'left-right':
return sorted(faces, key=lambda face: face['bbox'][0])
if direction == 'right-left':
return sorted(faces, key=lambda face: face['bbox'][0], reverse=True)
if direction == 'top-bottom':
return sorted(faces, key=lambda face: face['bbox'][1])
if direction == 'bottom-top':
return sorted(faces, key=lambda face: face['bbox'][1], reverse=True)
if direction == 'small-large':
return sorted(faces, key=lambda face: (face['bbox'][2] - face['bbox'][0]) * (face['bbox'][3] - face['bbox'][1]))
if direction == 'large-small':
return sorted(faces, key=lambda face: (face['bbox'][2] - face['bbox'][0]) * (face['bbox'][3] - face['bbox'][1]), reverse=True)
if direction == 'distance-from-retarget-face':
return sorted(faces, key=lambda face: (((face['bbox'][2]+face['bbox'][0])/2-face_center[0])**2+((face['bbox'][3]+face['bbox'][1])/2-face_center[1])**2)**0.5)
return faces
class FaceAnalysisDIY(FaceAnalysis):
def __init__(self, name='buffalo_l', root='~/.insightface', allowed_modules=None, **kwargs):
super().__init__(name=name, root=root, allowed_modules=allowed_modules, **kwargs)
self.timer = Timer()
def get(self, img_bgr, **kwargs):
max_num = kwargs.get('max_num', 0)  # maximum number of faces to detect, 0 means no limit
flag_do_landmark_2d_106 = kwargs.get('flag_do_landmark_2d_106', True)  # whether to run the 106-point 2D landmark model
direction = kwargs.get('direction', 'large-small') # sorting direction
face_center = None
bboxes, kpss = self.det_model.detect(img_bgr, max_num=max_num, metric='default')
if bboxes.shape[0] == 0:
return []
ret = []
for i in range(bboxes.shape[0]):
bbox = bboxes[i, 0:4]
det_score = bboxes[i, 4]
kps = None
if kpss is not None:
kps = kpss[i]
face = Face(bbox=bbox, kps=kps, det_score=det_score)
for taskname, model in self.models.items():
if taskname == 'detection':
continue
if (not flag_do_landmark_2d_106) and taskname == 'landmark_2d_106':
continue
# print(f'taskname: {taskname}')
model.get(img_bgr, face)
ret.append(face)
ret = sort_by_direction(ret, direction, face_center)
return ret
def warmup(self):
self.timer.tic()
img_bgr = np.zeros((512, 512, 3), dtype=np.uint8)
self.get(img_bgr)
elapse = self.timer.toc()
log(f'FaceAnalysisDIY warmup time: {elapse:.3f}s')
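# Illustrative usage sketch (not from the source; the model pack, providers, detector
# size and image path below are assumptions): FaceAnalysisDIY keeps the standard
# insightface prepare/get flow, with extra kwargs for sorting and the 106-point model.
def _demo_face_analysis(image_path='source.jpg'):
    import cv2
    app = FaceAnalysisDIY(name='buffalo_l', root='~/.insightface', providers=['CPUExecutionProvider'])
    app.prepare(ctx_id=0, det_size=(512, 512))
    app.warmup()
    faces = app.get(cv2.imread(image_path), flag_do_landmark_2d_106=True, direction='large-small')
    if faces:
        log(f'largest face bbox: {faces[0]["bbox"]}')
    return faces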

154
src/utils/helper.py Normal file
View File

@ -0,0 +1,154 @@
# coding: utf-8
"""
utility functions and classes to handle feature extraction and model loading
"""
import os
import os.path as osp
import torch
from collections import OrderedDict
from ..modules.spade_generator import SPADEDecoder
from ..modules.warping_network import WarpingNetwork
from ..modules.motion_extractor import MotionExtractor
from ..modules.appearance_feature_extractor import AppearanceFeatureExtractor
from ..modules.stitching_retargeting_network import StitchingRetargetingNetwork
def suffix(filename):
"""a.jpg -> jpg"""
pos = filename.rfind(".")
if pos == -1:
return ""
return filename[pos + 1:]
def prefix(filename):
"""a.jpg -> a"""
pos = filename.rfind(".")
if pos == -1:
return filename
return filename[:pos]
def basename(filename):
"""a/b/c.jpg -> c"""
return prefix(osp.basename(filename))
def is_video(file_path):
if file_path.lower().endswith((".mp4", ".mov", ".avi", ".webm")) or osp.isdir(file_path):
return True
return False
def is_template(file_path):
if file_path.endswith(".pkl"):
return True
return False
def mkdir(d, log=False):
# return `d` itself, so the call can be used inline in a single line
if not osp.exists(d):
os.makedirs(d, exist_ok=True)
if log:
print(f"Make dir: {d}")
return d
def squeeze_tensor_to_numpy(tensor):
out = tensor.data.squeeze(0).cpu().numpy()
return out
def dct2cuda(dct: dict, device_id: int):
for key in dct:
dct[key] = torch.tensor(dct[key]).cuda(device_id)
return dct
def concat_feat(kp_source: torch.Tensor, kp_driving: torch.Tensor) -> torch.Tensor:
"""
kp_source: (bs, k, 3)
kp_driving: (bs, k, 3)
Return: (bs, 2k*3)
"""
bs_src = kp_source.shape[0]
bs_dri = kp_driving.shape[0]
assert bs_src == bs_dri, 'batch size must be equal'
feat = torch.cat([kp_source.view(bs_src, -1), kp_driving.view(bs_dri, -1)], dim=1)
return feat
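# Illustrative shape check (not from the source; k=21 implicit keypoints assumed, as
# used elsewhere in this repo): two (bs, 21, 3) keypoint sets concatenate to (bs, 126).
def _check_concat_feat():
    kp_source = torch.randn(2, 21, 3)
    kp_driving = torch.randn(2, 21, 3)
    assert concat_feat(kp_source, kp_driving).shape == (2, 126)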
def remove_ddp_dumplicate_key(state_dict):
state_dict_new = OrderedDict()
for key in state_dict.keys():
state_dict_new[key.replace('module.', '')] = state_dict[key]
return state_dict_new
def load_model(ckpt_path, model_config, device, model_type):
model_params = model_config['model_params'][f'{model_type}_params']
if model_type == 'appearance_feature_extractor':
model = AppearanceFeatureExtractor(**model_params).cuda(device)
elif model_type == 'motion_extractor':
model = MotionExtractor(**model_params).cuda(device)
elif model_type == 'warping_module':
model = WarpingNetwork(**model_params).cuda(device)
elif model_type == 'spade_generator':
model = SPADEDecoder(**model_params).cuda(device)
elif model_type == 'stitching_retargeting_module':
# Special handling for stitching and retargeting module
config = model_config['model_params']['stitching_retargeting_module_params']
checkpoint = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
stitcher = StitchingRetargetingNetwork(**config.get('stitching'))
stitcher.load_state_dict(remove_ddp_dumplicate_key(checkpoint['retarget_shoulder']))
stitcher = stitcher.cuda(device)
stitcher.eval()
retargetor_lip = StitchingRetargetingNetwork(**config.get('lip'))
retargetor_lip.load_state_dict(remove_ddp_dumplicate_key(checkpoint['retarget_mouth']))
retargetor_lip = retargetor_lip.cuda(device)
retargetor_lip.eval()
retargetor_eye = StitchingRetargetingNetwork(**config.get('eye'))
retargetor_eye.load_state_dict(remove_ddp_dumplicate_key(checkpoint['retarget_eye']))
retargetor_eye = retargetor_eye.cuda(device)
retargetor_eye.eval()
return {
'stitching': stitcher,
'lip': retargetor_lip,
'eye': retargetor_eye
}
else:
raise ValueError(f"Unknown model type: {model_type}")
model.load_state_dict(torch.load(ckpt_path, map_location=lambda storage, loc: storage))
model.eval()
return model
# get coefficients of Eqn. 7
def calculate_transformation(config, s_kp_info, t_0_kp_info, t_i_kp_info, R_s, R_t_0, R_t_i):
if config.relative:
new_rotation = (R_t_i @ R_t_0.permute(0, 2, 1)) @ R_s
new_expression = s_kp_info['exp'] + (t_i_kp_info['exp'] - t_0_kp_info['exp'])
else:
new_rotation = R_t_i
new_expression = t_i_kp_info['exp']
new_translation = s_kp_info['t'] + (t_i_kp_info['t'] - t_0_kp_info['t'])
new_translation[..., 2].fill_(0)  # zero the z-axis translation (no movement along depth)
new_scale = s_kp_info['scale'] * (t_i_kp_info['scale'] / t_0_kp_info['scale'])
return new_rotation, new_expression, new_translation, new_scale
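# Reading off the relative branch above (config.relative == True), the Eqn. 7
# coefficients are composed as:
#   R_new     = (R_{t_i} @ R_{t_0}^T) @ R_s
#   exp_new   = exp_s + (exp_{t_i} - exp_{t_0})
#   t_new     = t_s + (t_{t_i} - t_{t_0}), with the z component zeroed
#   scale_new = scale_s * (scale_{t_i} / scale_{t_0})
# In the absolute branch, the driving rotation and expression are used directly.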
def load_description(fp):
with open(fp, 'r', encoding='utf-8') as f:
content = f.read()
return content

97
src/utils/io.py Normal file
View File

@ -0,0 +1,97 @@
# coding: utf-8
import os
from glob import glob
import os.path as osp
import imageio
import numpy as np
import cv2; cv2.setNumThreads(0); cv2.ocl.setUseOpenCL(False)
def load_image_rgb(image_path: str):
if not osp.exists(image_path):
raise FileNotFoundError(f"Image not found: {image_path}")
img = cv2.imread(image_path, cv2.IMREAD_COLOR)
return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
def load_driving_info(driving_info):
driving_video_ori = []
def load_images_from_directory(directory):
image_paths = sorted(glob(osp.join(directory, '*.png')) + glob(osp.join(directory, '*.jpg')))
return [load_image_rgb(im_path) for im_path in image_paths]
def load_images_from_video(file_path):
reader = imageio.get_reader(file_path)
return [image for idx, image in enumerate(reader)]
if osp.isdir(driving_info):
driving_video_ori = load_images_from_directory(driving_info)
elif osp.isfile(driving_info):
driving_video_ori = load_images_from_video(driving_info)
return driving_video_ori
def contiguous(obj):
if not obj.flags.c_contiguous:
obj = obj.copy(order="C")
return obj
def resize_to_limit(img: np.ndarray, max_dim=1920, n=2):
"""
adjust the size of the image so that its maximum dimension does not exceed max_dim and its width and height are multiples of n.
:param img: the image to be processed.
:param max_dim: the maximum dimension constraint.
:param n: the value that the width and height must be multiples of.
:return: the adjusted image.
"""
h, w = img.shape[:2]
# adjust the size of the image according to the maximum dimension
if max_dim > 0 and max(h, w) > max_dim:
if h > w:
new_h = max_dim
new_w = int(w * (max_dim / h))
else:
new_w = max_dim
new_h = int(h * (max_dim / w))
img = cv2.resize(img, (new_w, new_h))
# ensure that the image dimensions are multiples of n
n = max(n, 1)
new_h = img.shape[0] - (img.shape[0] % n)
new_w = img.shape[1] - (img.shape[1] % n)
if new_h == 0 or new_w == 0:
# when the width or height is less than n, no need to process
return img
if new_h != img.shape[0] or new_w != img.shape[1]:
img = img[:new_h, :new_w]
return img
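# Illustrative worked example (not from the source): a 3000x4000 image with
# max_dim=1920 and n=2 is scaled by 1920/4000 to 1440x1920, which is already a
# multiple of 2 in both dimensions, so nothing is trimmed.
def _check_resize_to_limit():
    img = np.zeros((3000, 4000, 3), dtype=np.uint8)
    assert resize_to_limit(img, max_dim=1920, n=2).shape == (1440, 1920, 3)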
def load_img_online(obj, mode="bgr", **kwargs):
max_dim = kwargs.get("max_dim", 1920)
n = kwargs.get("n", 2)
if isinstance(obj, str):
if mode.lower() == "gray":
img = cv2.imread(obj, cv2.IMREAD_GRAYSCALE)
else:
img = cv2.imread(obj, cv2.IMREAD_COLOR)
else:
img = obj
# Resize image to satisfy constraints
img = resize_to_limit(img, max_dim=max_dim, n=n)
if mode.lower() == "bgr":
return contiguous(img)
elif mode.lower() == "rgb":
return contiguous(img[..., ::-1])
else:
raise Exception(f"Unknown mode {mode}")
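# Illustrative usage sketch (not from the source; the file name is an assumption):
# load_img_online accepts either a path or an already-decoded array, clamps it with
# resize_to_limit, and returns a C-contiguous BGR or RGB array.
def _demo_load_img_online():
    rgb = load_img_online('driving_frame.jpg', mode='rgb', max_dim=1280, n=2)
    frame = np.zeros((720, 1280, 3), dtype=np.uint8)
    bgr = load_img_online(frame, mode='bgr')   # arrays skip imread and only go through resize_to_limit
    return rgb, bgr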

Some files were not shown because too many files have changed in this diff.