在StyleGAN Encoder中,人脸对齐(face_alignment)是一个不可缺少的操作。对没有进行人脸对齐的真实人脸图片,直接运行encode_images.py会大概率遇到不能收敛的情况(即:找不到对应真实人脸的dlatents向量),或者生成的图片只有人脸轮廓,五官相貌一片模糊。
这里的一个启发是,如果打算将StyleGAN这样的神经网络重新训练,用于生成服装样式或者汽车外观设计等领域,对原始图片的标注和对齐(alignment)工作将是十分重要的。一个没有进行对齐预处理的原始数据集,其训练与应用难度和时间消耗将是灾难性的。
那么,StyleGAN Encoder中的人脸对齐是怎样实现的呢?它的大致过程如下:
(1)获取dlib的人脸关键点(landmarks)检测模型“shape_predictor_68_face_landmarks.dat.bz2”,并将其解压缩。在我的Windows 10笔记本上,它被放在这个目录下:C:\用户\HP\.keras\temp;
(2)在原始图片文件目录(.\raw_images)下遍历,读取每个图片文件,获取图片中的每一个人脸的68点landmarks(.\ffhq_dataset\landmarks_detector.py),并对单个人脸进行人脸对齐操作;
(3)在人脸对齐操作中(.\ffhq_dataset\face_alignment.py),首先计算需要裁剪的人脸区域,这一部分计算看起来很复杂,我画了一个图示意,计算的过程与结果大致如下(Pillow的坐标系原点:图像的左上角为 (0, 0) , X轴是从左到右增长的,而Y轴是从上到下增长,因此对应到程序中的坐标,下面的图需要上下翻转才能对应。):
这里,四边形ABCD是下面需要转换到对齐人脸的区域,四边形PQRS是用于裁剪的四边形区域。需要注意,根据计算得到用于裁剪的四边形的顶点,其中P、Q、S三点已经跑到了X轴和Y轴的负值区域(也就是说,跑到了原始图片的像素区域以外);
(4)对于裁剪四边形的顶点落到原始图片像素区域以外的情况,需要对这些外部区域进行数据填充(Pad,采用reflect方式),然后使用高斯滤波对填充的边界区域进行模糊处理,使填充部分与原图平滑过渡(注意:高斯滤波的作用是模糊,而不是锐化);
(5)最后,调用Pillow的transform方法,将四边形ABCD转换成对齐的人脸区域,并保存到文件。
完整的带中文注释的源代码如下:
.\align_images.py
import os
import sys
import bz2
import argparse
from keras.utils import get_file
from ffhq_dataset.face_alignment import image_align
from ffhq_dataset.landmarks_detector import LandmarksDetector
import multiprocessing
LANDMARKS_MODEL_URL = 'http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2'
def unpack_bz2(src_path):
    """Decompress a bz2 file next to itself and return the new file's path.

    The destination path is ``src_path`` with its last four characters
    removed (i.e. the ``.bz2`` suffix is dropped). An existing file at the
    destination is overwritten.

    :param src_path: path of the bz2-compressed file
    :return: path of the decompressed file
    """
    # Read inside a context manager so the compressed file handle is always
    # closed, even when decompression fails part-way through.
    with bz2.BZ2File(src_path) as src:
        data = src.read()
    dst_path = src_path[:-4]
    with open(dst_path, 'wb') as fp:
        fp.write(data)
    return dst_path
def str2bool(value):
    """Convert a command-line string such as ``true``/``false``/``1``/``0`` to a bool.

    ``argparse`` with ``type=bool`` simply calls ``bool()`` on the raw string,
    so every non-empty value -- including ``"False"`` -- becomes ``True``.
    This converter parses the text instead.

    :param value: raw option value (a string, or an actual bool)
    :return: the parsed boolean
    :raises argparse.ArgumentTypeError: if the text is not a recognised boolean
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected, got %r' % (value,))


if __name__ == "__main__":
    # Extracts and aligns all faces from images using DLib and a function from
    # the original FFHQ dataset preparation step.
    # Usage: python align_images.py /raw_images /aligned_images
    parser = argparse.ArgumentParser(description='Align faces from input images', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('raw_dir', help='Directory with raw images for face alignment')
    parser.add_argument('aligned_dir', help='Directory for storing aligned images')
    parser.add_argument('--output_size', default=1024, help='The dimension of images for input to the model', type=int)
    parser.add_argument('--x_scale', default=1, help='Scaling factor for x dimension', type=float)
    parser.add_argument('--y_scale', default=1, help='Scaling factor for y dimension', type=float)
    parser.add_argument('--em_scale', default=0.1, help='Scaling factor for eye-mouth distance', type=float)
    # str2bool (not bool) so that "--use_alpha False" really disables the option.
    parser.add_argument('--use_alpha', default=False, help='Add an alpha channel for masking', type=str2bool)
    args, other_args = parser.parse_known_args()

    # Download the dlib 68-point landmark model (cached by keras) and decompress it.
    landmarks_model_path = unpack_bz2(get_file('shape_predictor_68_face_landmarks.dat.bz2',
                                               LANDMARKS_MODEL_URL, cache_subdir='temp'))
    print('landmark_model_path: ', landmarks_model_path)
    RAW_IMAGES_DIR = args.raw_dir
    ALIGNED_IMAGES_DIR = args.aligned_dir
    # Make sure the output directory exists; otherwise every save would fail.
    os.makedirs(ALIGNED_IMAGES_DIR, exist_ok=True)

    # Build the landmark detector from the decompressed model file.
    landmarks_detector = LandmarksDetector(landmarks_model_path)

    # Walk the raw image directory and process every entry.
    for img_name in os.listdir(RAW_IMAGES_DIR):
        print('Aligning %s ...' % img_name)
        try:
            raw_img_path = os.path.join(RAW_IMAGES_DIR, img_name)
            base_name = os.path.splitext(img_name)[0]
            # Skip images whose first aligned face already exists. The check
            # must look inside the output directory, which is where the
            # aligned files are written, not in the current working directory.
            first_face_path = os.path.join(ALIGNED_IMAGES_DIR, '%s_%02d.png' % (base_name, 1))
            if os.path.isfile(first_face_path):
                continue
            print('Getting landmarks...')
            # One landmark list is produced per face detected in the image.
            for i, face_landmarks in enumerate(landmarks_detector.get_landmarks(raw_img_path), start=1):
                try:
                    print('Starting face alignment...')
                    # Multiple faces are numbered 01, 02, ...
                    face_img_name = '%s_%02d.png' % (base_name, i)
                    aligned_face_path = os.path.join(ALIGNED_IMAGES_DIR, face_img_name)
                    # Align this single face and write it to disk.
                    image_align(raw_img_path, aligned_face_path, face_landmarks,
                                output_size=args.output_size, x_scale=args.x_scale,
                                y_scale=args.y_scale, em_scale=args.em_scale,
                                alpha=args.use_alpha)
                    print('Wrote result %s' % aligned_face_path)
                except Exception as err:
                    # Best effort: report the failure and move on to the next
                    # face. KeyboardInterrupt/SystemExit are deliberately not
                    # caught so the script can still be stopped.
                    print("Exception in face alignment!", err)
        except Exception as err:
            print("Exception in landmark detection!", err)
.\ffhq_dataset\landmarks_detector.py
import dlib
class LandmarksDetector:
    """Find faces in an image with dlib and produce the landmarks of each face."""

    def __init__(self, predictor_model_path):
        """
        :param predictor_model_path: path to shape_predictor_68_face_landmarks.dat file
        """
        # dlib frontal face detector: returns the bounding rectangle of each face.
        self.detector = dlib.get_frontal_face_detector()  # cnn_face_detection_model_v1 also can be used
        # dlib landmark predictor loaded from the given model file.
        self.shape_predictor = dlib.shape_predictor(predictor_model_path)

    def get_landmarks(self, image):
        """Yield, for every detected face, a list of ``(x, y)`` landmark tuples.

        :param image: path of the image file to load
        """
        img = dlib.load_rgb_image(image)
        # The second argument (1) asks the detector to upsample the image once,
        # which helps it find more faces. One entry is returned per face.
        dets = self.detector(img, 1)
        for detection in dets:
            try:
                # Predict the landmark points for this face rectangle and
                # collect their coordinates.
                shape = self.shape_predictor(img, detection)
                face_landmarks = [(item.x, item.y) for item in shape.parts()]
            except Exception as err:
                print("Exception in get_landmarks()!", err)
                continue
            # The yield sits outside the try block (and only Exception is
            # caught above) so that closing the generator early is never
            # intercepted: a bare "except:" around a yield swallows
            # GeneratorExit and then tries to yield again.
            yield face_landmarks
.\ffhq_dataset\face_alignment.py
import numpy as np
import scipy.ndimage
import os
import PIL.Image
# 根据一张人脸的landmarks,找到人脸框并进行缩放、裁剪、padding和转换
def image_align(src_file, dst_file, face_landmarks, output_size=1024, transform_size=4096, enable_padding=True, x_scale=1, y_scale=1, em_scale=0.1, alpha=False):
    """Crop, pad and warp one face into a square aligned image and save it as PNG.

    The crop quadrilateral is derived from the eye and mouth landmarks, the
    image is optionally shrunk, cropped and padded, and the quadrilateral is
    finally mapped onto a square with Pillow's QUAD transform.

    :param src_file: path of the source image; if it does not exist a message
        is printed and nothing is written
    :param dst_file: path of the PNG file to write
    :param face_landmarks: sequence of (x, y) points; indices 36-47 (eyes) and
        48-59 (outer mouth) are used
    :param output_size: side length of the saved image (applied when smaller
        than ``transform_size``)
    :param transform_size: side length of the intermediate QUAD transform result
    :param enable_padding: pad the image when the crop quadrilateral reaches
        outside of it
    :param x_scale: multiplier applied to the horizontal half-axis of the crop
    :param y_scale: multiplier applied to the vertical half-axis of the crop
    :param em_scale: fraction of the eye-to-mouth vector by which the crop
        centre is moved away from the midpoint between the eyes
    :param alpha: when padding is applied, append an alpha channel built from
        the padding mask
    :return: None
    """
    # Align function from FFHQ dataset pre-processing step
    # https://github.com/NVlabs/ffhq-dataset/blob/master/download_ffhq.py
    lm = np.array(face_landmarks)
    lm_eye_left = lm[36 : 42]
    lm_eye_right = lm[42 : 48]
    lm_mouth_outer = lm[48 : 60]

    # Calculate auxiliary vectors.
    eye_left = np.mean(lm_eye_left, axis=0)  # mean (x, y) of the eye points
    eye_right = np.mean(lm_eye_right, axis=0)
    eye_avg = (eye_left + eye_right) * 0.5
    eye_to_eye = eye_right - eye_left
    mouth_left = lm_mouth_outer[0]
    mouth_right = lm_mouth_outer[6]
    mouth_avg = (mouth_left + mouth_right) * 0.5
    eye_to_mouth = mouth_avg - eye_avg

    # Choose oriented crop rectangle.
    # flipud swaps the two components and [-1, 1] negates the first one, i.e.
    # eye_to_mouth rotated by 90 degrees; it is combined with eye_to_eye to
    # get the direction of the crop's horizontal axis.
    x = eye_to_eye - np.flipud(eye_to_mouth) * [-1, 1]
    x /= np.hypot(*x)  # normalise to unit length
    # Scale by the larger of the eye distance and the eye-to-mouth distance.
    x *= max(np.hypot(*eye_to_eye) * 2.0, np.hypot(*eye_to_mouth) * 1.8)
    x *= x_scale
    y = np.flipud(x) * [-y_scale, y_scale]  # x rotated by 90 degrees, scaled by y_scale
    c = eye_avg + eye_to_mouth * em_scale  # centre of the crop
    quad = np.stack([c - x - y, c - x + y, c + x + y, c + x - y])  # four corners, shape (4, 2)
    qsize = np.hypot(*x) * 2  # twice the length of the half-axis x

    # Load in-the-wild image.
    if not os.path.isfile(src_file):
        print('\nCannot find source image. Please run "--wilds" before "--align".')
        return
    # Convert to RGB: the padding code below indexes the pixel array as
    # (height, width, channels) and rebuilds an 'RGB' image, which fails for
    # grayscale, palette or RGBA sources. The context manager closes the file.
    with PIL.Image.open(src_file) as src_img:
        img = src_img.convert('RGB')

    # Shrink.
    # When the quadrilateral is much larger than the output, downscale first.
    shrink = int(np.floor(qsize / output_size * 0.5))
    if shrink > 1:
        rsize = (int(np.rint(float(img.size[0]) / shrink)), int(np.rint(float(img.size[1]) / shrink)))
        # LANCZOS is the filter formerly also exposed as ANTIALIAS, a name
        # that was removed in Pillow 10.
        img = img.resize(rsize, PIL.Image.LANCZOS)
        quad /= shrink
        qsize /= shrink

    # Crop (left, upper, right, lower).
    border = max(int(np.rint(qsize * 0.1)), 3)
    # Smallest axis-aligned rectangle containing the quadrilateral ...
    crop = (int(np.floor(min(quad[:,0]))), int(np.floor(min(quad[:,1]))), int(np.ceil(max(quad[:,0]))), int(np.ceil(max(quad[:,1]))))
    # ... grown by the border and clamped to the image.
    crop = (max(crop[0] - border, 0), max(crop[1] - border, 0), min(crop[2] + border, img.size[0]), min(crop[3] + border, img.size[1]))
    if crop[2] - crop[0] < img.size[0] or crop[3] - crop[1] < img.size[1]:
        img = img.crop(crop)
        quad -= crop[0:2]  # express the corners relative to the cropped image

    # Pad.
    # Amount by which the quadrilateral (plus border) sticks out on each side.
    pad = (int(np.floor(min(quad[:,0]))), int(np.floor(min(quad[:,1]))), int(np.ceil(max(quad[:,0]))), int(np.ceil(max(quad[:,1]))))
    pad = (max(-pad[0] + border, 0), max(-pad[1] + border, 0), max(pad[2] - img.size[0] + border, 0), max(pad[3] - img.size[1] + border, 0))
    if enable_padding and max(pad) > border - 4:
        pad = np.maximum(pad, int(np.rint(qsize * 0.3)))
        # Extend the image on all four sides by reflecting its content.
        img = np.pad(np.float32(img), ((pad[1], pad[3]), (pad[0], pad[2]), (0, 0)), 'reflect')
        h, w, _ = img.shape
        y, x, _ = np.ogrid[:h, :w, :1]
        # mask is 1 at the outer edge of the padded image and falls to 0 (and
        # below) one pad width further in.
        mask = np.maximum(1.0 - np.minimum(np.float32(x) / pad[0], np.float32(w-1-x) / pad[2]), 1.0 - np.minimum(np.float32(y) / pad[1], np.float32(h-1-y) / pad[3]))
        blur = qsize * 0.02
        # Blend in a Gaussian-blurred copy around the padded region ...
        img += (scipy.ndimage.gaussian_filter(img, [blur, blur, 0]) - img) * np.clip(mask * 3.0 + 1.0, 0.0, 1.0)
        # ... and fade towards the per-channel median colour near the edge.
        img += (np.median(img, axis=(0,1)) - img) * np.clip(mask, 0.0, 1.0)
        img = np.uint8(np.clip(np.rint(img), 0, 255))
        if alpha:
            # Alpha channel derived from the mask: transparent near the outer
            # edge of the padding, opaque where the mask is zero or negative.
            mask = 1-np.clip(3.0 * mask, 0.0, 1.0)
            mask = np.uint8(np.clip(np.rint(mask*255), 0, 255))
            img = np.concatenate((img, mask), axis=2)
            img = PIL.Image.fromarray(img, 'RGBA')
        else:
            img = PIL.Image.fromarray(img, 'RGB')
        quad += pad[:2]  # account for the padding added on the left/top

    # Transform.
    # Map the quadrilateral onto a (transform_size, transform_size) square.
    img = img.transform((transform_size, transform_size), PIL.Image.QUAD, (quad + 0.5).flatten(), PIL.Image.BILINEAR)
    if output_size < transform_size:
        img = img.resize((output_size, output_size), PIL.Image.LANCZOS)

    # Save aligned image.
    img.save(dst_file, 'PNG')
(完)