Link to the previous post
My last post was apparently back on July 18th, so I have clearly been slacking. There is actually a lot that needs to be summarized, so this weekend I'll pay off some of that debt.
The previous post covered the overall framework of DBFace; this one focuses on the data preprocessing.
A PyTorch dataset is usually written as a class that inherits from Dataset and defines three methods: __init__(self), __len__(self) and __getitem__(self, index). In DBFace the code looks like this:
class LDataset(Dataset):
    def __init__(self, labelfile, imagesdir, numlandmarks, mean, std, width=800, height=800):
        self.width = width
        self.height = height
        self.numlandmarks = numlandmarks
        self.items = common.load_webface(labelfile, imagesdir, numlandmarks)
        self.mean = mean
        self.std = std

    def __len__(self):
        return len(self.items)

    def __getitem__(self, index):
        ...
The initialization is fairly simple: it mainly stores the passed-in parameters. Note that I modified part of the code here; if you are interested, you can download the original code from GitHub.
The one line in the initializer worth a closer look is this:
self.items = common.load_webface(labelfile, imagesdir, numlandmarks)
Let's look at load_webface:
def load_webface(labelfile, imagesdir, numlandmarks):
    with open(labelfile, "r") as f:
        lines = f.readlines()

    lines = [line.replace("\n", "") for line in lines]
    stage = 0
    facials = []
    file = None
    files = []
    for index, line in enumerate(lines):
        if line.startswith("#"):
            if file is not None:
                files.append([f"{imagesdir}/{file}", parse_facials_webface(facials, numlandmarks)])
            file = line[2:]
            facials = []
        else:
            facials.append([float(item) for item in line.split(" ")])

    if file is not None:
        files.append([f"{imagesdir}/{file}", parse_facials_webface(facials, numlandmarks)])
    return files
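Calling it is straightforward; the paths here are placeholders for wherever you keep the data:

items = load_webface("widerface/train/label.txt", "widerface/train/images", numlandmarks=5)
imgfile, facials = items[0]  # image path plus the parsed boxes/landmarks of that image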
WIDER FACE's label.txt looks like this:
# 0--Parade/0_Parade_marchingband_1_849.jpg
449 330 122 149 488.906 373.643 0.0 542.089 376.442 0.0 515.031 412.83 0.0 485.174 425.893 0.0 538.357 431.491 0.0 0.82
# 0--Parade/0_Parade_Parade_0_904.jpg
361 98 263 339 424.143 251.656 0.0 547.134 232.571 0.0 494.121 325.875 0.0 453.83 368.286 0.0 561.978 342.839 0.0 0.89
# 0--Parade/0_Parade_marchingband_1_799.jpg
...
There is a space after the "#", followed by the image path. Each subsequent line describes one face: x, y, w, h are the coordinates of the top-left corner of the face box and the box's width and height, followed by the landmark coordinates. This should be easy to follow. In parse_facials_webface you have to adjust the parsing to your own number of landmarks, which should be clear once you read the source.
self.items therefore stores, for each image, the path to read it from together with its face boxes and landmark data.
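As a reading aid, here is a minimal, hypothetical parser for a single annotation row in the 5-landmark format shown above. My reading of the per-landmark third value and the trailing number follows the common RetinaFace-style annotations (a per-point flag and a score), so treat those as assumptions:

def parse_row(values, numlandmarks=5):
    # values: one annotation row already converted to floats, e.g.
    # [449, 330, 122, 149, 488.906, 373.643, 0.0, ..., 0.82]
    x, y, w, h = values[0:4]  # top-left corner plus width and height of the face box
    landmarks = []
    for i in range(numlandmarks):
        lx, ly, flag = values[4 + i * 3: 4 + i * 3 + 3]  # x, y and a per-point flag
        landmarks.append((lx, ly, flag))
    score = values[4 + numlandmarks * 3]  # trailing annotation score
    return (x, y, w, h), landmarks, score

parse_facials_webface presumably does essentially this for a whole list of rows, so when you change the landmark count, the slicing above is what has to move.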
Back in the training code, the dataset and DataLoader are built as follows:

# build the dataset, inheriting torch's Dataset class
self.train_dataset = LDataset(labelfile, imagesdir, numlandmarks, mean=self.mean, std=self.std,
                              width=self.width, height=self.height)
self.train_loader = DataLoader(dataset=self.train_dataset, batch_size=self.batch_size, shuffle=True,
                               num_workers=24)

# Adam optimizer with the default weight_decay=0
self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)
self.per_epoch_batchs = len(self.train_loader)
self.iter = 0
self.epochs = 150
The DataLoader wraps the dataset in an iterable that feeds the network a different batch at every step. The optimizer is Adam with the learning rate set earlier; the rate is then warmed up and decayed by the small epoch-indexed lr_scheduer dictionary you will see inside train() below.
In fact a lot of the data preprocessing happens during training itself. Two functions of DBFace's App class drive the training:
def train_epoch(self, epoch):
    for indbatch, (images, heatmap_gt, heatmap_posweight, reg_tlrb, reg_mask, landmark_gt, landmark_mask, num_objs,
                   keep_mask) in enumerate(self.train_loader):

        self.iter += 1

        batch_objs = sum(num_objs)
        batch_size = self.batch_size

        if batch_objs == 0:
            batch_objs = 1

        heatmap_gt = heatmap_gt.to(self.gpu_master)
        heatmap_posweight = heatmap_posweight.to(self.gpu_master)
        keep_mask = keep_mask.to(self.gpu_master)
        reg_tlrb = reg_tlrb.to(self.gpu_master)
        reg_mask = reg_mask.to(self.gpu_master)
        landmark_gt = landmark_gt.to(self.gpu_master)
        landmark_mask = landmark_mask.to(self.gpu_master)
        images = images.to(self.gpu_master)

        hm, tlrb, landmark = self.model(images)

        # squash the heatmap values into the (0, 1) range
        hm = hm.sigmoid()
        hm = torch.clamp(hm, min=1e-4, max=1 - 1e-4)

        # Why take exp of the regressed box coordinates?
        # Because the loss is fitted on the exp-ed values; in other words,
        # what the network actually predicts is log(tlrb)
        tlrb = torch.exp(tlrb)

        hm_loss = self.focal_loss(hm, heatmap_gt, heatmap_posweight, keep_mask=keep_mask) / batch_objs
        reg_loss = self.giou_loss(tlrb, reg_tlrb, reg_mask) * 5  # does this weight need tuning?
        landmark_loss = self.landmark_loss(landmark, landmark_gt, landmark_mask) * 0.1
        loss = hm_loss + reg_loss + landmark_loss

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        epoch_flt = epoch + indbatch / self.per_epoch_batchs

        if indbatch % 10 == 0:
            log.info(
                f"iter: {self.iter}, lr: {self.lr:g}, epoch: {epoch_flt:.2f}, loss: {loss.item():.2f}, hm_loss: {hm_loss.item():.2f}, "
                f"box_loss: {reg_loss.item():.2f}, lmdk_loss: {landmark_loss.item():.5f}"
            )

        if indbatch % 1000 == 0:
            log.info("save hm")
            hm_image = hm[0, 0].cpu().data.numpy()
            common.imwrite(f"{jobdir}/imgs/hm_image.jpg", hm_image * 255)
            common.imwrite(f"{jobdir}/imgs/hm_image_gt.jpg", heatmap_gt[0, 0].cpu().data.numpy() * 255)

            image = np.clip((images[0].permute(1, 2, 0).cpu().data.numpy() * self.std + self.mean) * 255, 0,
                            255).astype(np.uint8)
            outobjs = eval_tool.detect_images_giou_with_netout(hm, tlrb, landmark, threshold=0.1, ibatch=0)

            im1 = image.copy()
            for obj in outobjs:
                common.drawbbox(im1, obj)
            common.imwrite(f"{jobdir}/imgs/train_result.jpg", im1)

def train(self):
    # warm up, then decay
    lr_scheduer = {
        1: 1e-3,
        2: 2e-3,
        3: 1e-3,
        60: 1e-4,
        120: 1e-5
    }

    # train
    self.model.train()
    for epoch in range(self.epochs):
        if epoch in lr_scheduer:
            self.set_lr(lr_scheduer[epoch])

        self.train_epoch(epoch)
        file = f"{jobdir}/models/{epoch + 1}.pth"
        common.mkdirs_from_file_path(file)
        torch.save(self.model.module.state_dict(), file)
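One detail: set_lr itself is not shown in this post. A minimal sketch of what such a helper usually looks like in PyTorch (an assumption, not necessarily the author's exact code):

def set_lr(self, lr):
    # remember the value so the logging in train_epoch can report it,
    # then update every parameter group of the optimizer
    self.lr = lr
    for param_group in self.optimizer.param_groups:
        param_group["lr"] = lr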
Inside these training functions, PyTorch calls LDataset's __getitem__(self, index), and that is where the really interesting preprocessing happens:
def __getitem__(self, index):
    # Get the image path; objs holds the face boxes and landmarks for this image.
    # If the image contains several faces, objs has several entries.
    imgfile, objs = self.items[index]
    image = common.imread(imgfile)

    if image is None:
        log.info("{} is empty, index={}".format(imgfile, index))
        return self[random.randint(0, len(self.items) - 1)]

    keepsize = 12
    # data augmentation
    image, objs = augment.webface(image, objs, self.numlandmarks, self.width, self.height, keepsize=0)

    # Normalization. Because mean and std are fixed, this step can be folded into the
    # wk model generated for the NNIE, done by the network itself, or done on the CPU
    # with NEON acceleration. At this point every single operation has to save time;
    # even 1 ms matters.
    image = ((image / 255.0 - self.mean) / self.std).astype(np.float32)

    posweight_radius = 2  # radius of the Gaussian drawn into heatmap_posweight below

    # the feature map coming out of the FPN has stride = 4; stride = 8 would be faster
    stride = 4
    fm_width = self.width // stride
    fm_height = self.height // stride

    # initialize the target maps; channel counts must match your number of landmarks
    heatmap_gt = np.zeros((1, fm_height, fm_width), np.float32)
    heatmap_posweight = np.zeros((1, fm_height, fm_width), np.float32)
    keep_mask = np.ones((1, fm_height, fm_width), np.float32)
    reg_tlrb = np.zeros((1 * 4, fm_height, fm_width), np.float32)
    reg_mask = np.zeros((1, fm_height, fm_width), np.float32)
    distance_map = np.zeros((1, fm_height, fm_width), np.float32) + 1000

    # I use 25 landmarks with x, y coordinates, so this becomes 25 * 2 = 50 channels
    # landmark_gt = np.zeros((1 * 10, fm_height, fm_width), np.float32)
    # landmark_mask = np.zeros((1, fm_height, fm_width), np.float32)
    landmark_gt = np.zeros((1 * 50, fm_height, fm_width), np.float32)
    landmark_mask = np.zeros((1, fm_height, fm_width), np.float32)

    hassmall = False
    for obj in objs:
        isSmallObj = obj.area < keepsize * keepsize

        if isSmallObj:
            cx, cy = obj.safe_scale_center(1 / stride, fm_width, fm_height)
            keep_mask[0, cy, cx] = 0
            w, h = obj.width / stride, obj.height / stride

            x0 = int(common.clip_value(cx - w // 2, fm_width - 1))
            y0 = int(common.clip_value(cy - h // 2, fm_height - 1))
            x1 = int(common.clip_value(cx + w // 2, fm_width - 1) + 1)
            y1 = int(common.clip_value(cy + h // 2, fm_height - 1) + 1)

            # keep_mask controls which pixels take part in the loss: only face regions
            # contribute, and faces that are too small are masked out entirely
            if x1 - x0 > 0 and y1 - y0 > 0:
                keep_mask[0, y0:y1, x0:x1] = 0
                hassmall = True

    for obj in objs:
        classes = 0
        cx, cy = obj.safe_scale_center(1 / stride, fm_width, fm_height)
        reg_box = np.array(obj.box) / stride  # box coordinates divided by the stride
        isSmallObj = obj.area < keepsize * keepsize

        if isSmallObj:
            if obj.area >= 5 * 5:
                distance_map[classes, cy, cx] = 0
                # each class owns 4 consecutive channels for its box regression
                reg_tlrb[classes * 4:(classes + 1) * 4, cy, cx] = reg_box
                reg_mask[classes, cy, cx] = 1
            continue

        w, h = obj.width / stride, obj.height / stride
        x0 = int(common.clip_value(cx - w // 2, fm_width - 1))
        y0 = int(common.clip_value(cy - h // 2, fm_height - 1))
        x1 = int(common.clip_value(cx + w // 2, fm_width - 1) + 1)
        y1 = int(common.clip_value(cy + h // 2, fm_height - 1) + 1)
        if x1 - x0 > 0 and y1 - y0 > 0:
            keep_mask[0, y0:y1, x0:x1] = 1

        # radius computation follows CornerNet
        w_radius, h_radius = common.truncate_radius((obj.width, obj.height))  # size / (4 * stride)
        gaussian_map = common.draw_truncate_gaussian(heatmap_gt[classes, :, :], (cx, cy), h_radius, w_radius)

        # linearly map the longest side of the face from [miface, mxface]
        # to a positive-sample weight gamma in [1, 11]
        mxface = 300
        miface = 25
        mxline = max(obj.width, obj.height)
        gamma = (mxline - miface) / (mxface - miface) * 10
        gamma = min(max(0, gamma), 10) + 1
        common.draw_gaussian(heatmap_posweight[classes, :, :], (cx, cy), posweight_radius, k=gamma)

        range_expand_x = math.ceil(w_radius)
        range_expand_y = math.ceil(h_radius)

        min_expand_size = 3
        range_expand_x = max(min_expand_size, range_expand_x)
        range_expand_y = max(min_expand_size, range_expand_y)

        icx, icy = cx, cy
        reg_landmark = None
        fill_threshold = 0.3

        # this also has to be adapted to the number of landmarks
        if obj.haslandmark:
            reg_landmark = np.array(obj.x5y5_cat_landmark) / stride
            # x5y5 = [cx] * 5 + [cy] * 5
            x5y5 = [cx] * 25 + [cy] * 25
            rvalue = (reg_landmark - x5y5)
            # landmark_gt[0:10, cy, cx] = np.array(common.log(rvalue)) / 4
            # note the log here
            landmark_gt[0:50, cy, cx] = np.array(common.log(rvalue)) / 4
            landmark_mask[0, cy, cx] = 1

        if not obj.rotate:
            for cx in range(icx - range_expand_x, icx + range_expand_x + 1):
                for cy in range(icy - range_expand_y, icy + range_expand_y + 1):
                    if cx < fm_width and cy < fm_height and cx >= 0 and cy >= 0:
                        my_gaussian_value = 0.9
                        gy, gx = cy - icy + range_expand_y, cx - icx + range_expand_x

                        if gy >= 0 and gy < gaussian_map.shape[0] and gx >= 0 and gx < gaussian_map.shape[1]:
                            my_gaussian_value = gaussian_map[gy, gx]

                        distance = math.sqrt((cx - icx) ** 2 + (cy - icy) ** 2)
                        if my_gaussian_value > fill_threshold or distance <= min_expand_size:
                            already_distance = distance_map[classes, cy, cx]
                            my_mix_distance = (1 - my_gaussian_value) * distance

                            if my_mix_distance > already_distance:
                                continue

                            distance_map[classes, cy, cx] = my_mix_distance
                            reg_tlrb[classes * 4:(classes + 1) * 4, cy, cx] = reg_box
                            reg_mask[classes, cy, cx] = 1

    # if hassmall:
    #     common.imwrite("test_result/keep_mask.jpg", keep_mask[0] * 255)
    #     common.imwrite("test_result/heatmap_gt.jpg", heatmap_gt[0] * 255)
    #     common.imwrite("test_result/keep_ori.jpg", (image * self.std + self.mean) * 255)

    return T.to_tensor(image), heatmap_gt, heatmap_posweight, reg_tlrb, reg_mask, landmark_gt, landmark_mask, len(
        objs), keep_mask
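With width = height = 800 and stride = 4, all target maps come out at 200x200. As a quick sanity check, here is what one sample looks like under my 25-landmark setup (a sketch, assuming a 3-channel input image; train_dataset is an LDataset instance):

sample = train_dataset[0]
names = ["image", "heatmap_gt", "heatmap_posweight", "reg_tlrb", "reg_mask",
         "landmark_gt", "landmark_mask", "num_objs", "keep_mask"]
for name, item in zip(names, sample):
    print(name, getattr(item, "shape", item))
# image is torch.Size([3, 800, 800]); heatmap_gt, heatmap_posweight, reg_mask,
# landmark_mask and keep_mask are (1, 200, 200); reg_tlrb is (4, 200, 200);
# landmark_gt is (50, 200, 200); num_objs is a plain int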
The data augmentation entry point lives in the augment.py script:
def webface(image, objs, numlandmarks, outw=800, outh=800, keepsize=8):
    funcs = [[augmentWithColorJittering, 0.7], [augmentWithFlip, 0.7]]
    random.shuffle(funcs)

    num = len(funcs)
    for n in range(num):
        func, freq = funcs[n]
        if randrf(0, 1) < freq:
            image, objs = func(image, objs)

    if randrf(0, 1) > 0.5:
        image, objs = cubeTransform(image, objs, outw, outh, keepsize=keepsize)
        image, objs = augmentWithCropScaleWebface(image, objs, numlandmarks, outw, outh, 'cube', keepsize=keepsize)
    else:
        image, objs = augmentWithCropScaleWebface(image, objs, numlandmarks, outw, outh, keepsize=keepsize)
    return image, objs
The main operations:
augmentWithColorJittering: color jittering, i.e. random changes to brightness, contrast and saturation
augmentWithFlip: horizontal flip; note that the landmarks also have to be mirrored to match (see the sketch after this list)
augmentWithCropScaleWebface: random crop and scale transform
cubeTransform: cube transform
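The flip case deserves a word: mirroring the x coordinates alone is not enough, because after a horizontal flip the left eye becomes the right eye. A minimal sketch for the common 5-point layout (the point ordering here is an assumption, not necessarily what augmentWithFlip uses):

import numpy as np

def flip_landmarks_5pt(landmarks, image_width):
    # landmarks: array of shape (5, 2), ordered as
    # [left eye, right eye, nose, left mouth corner, right mouth corner]
    flipped = landmarks.copy()
    flipped[:, 0] = image_width - 1 - flipped[:, 0]  # mirror the x coordinate
    swap = [1, 0, 2, 4, 3]  # left/right points exchange roles
    return flipped[swap]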
The next post will cover the network inference part. To be continued...