1. Object Detection Basics
Generating bounding boxes
# bbox is short for bounding box
dog_bbox, cat_bbox = [60, 45, 378, 516], [400, 112, 655, 493]

def bbox_to_rect(bbox, color):
    # Convert a bounding box in (top-left x, top-left y, bottom-right x, bottom-right y)
    # format into the matplotlib format: ((top-left x, top-left y), width, height)
    return d2l.plt.Rectangle(
        xy=(bbox[0], bbox[1]), width=bbox[2]-bbox[0], height=bbox[3]-bbox[1],
        fill=False, edgecolor=color, linewidth=2)
What is an anchor box?
Object detection samples a large number of regions in the input image and adjusts their edges so that the ground-truth bounding boxes of the targets can be predicted more accurately. Anchor boxes are one such sampling scheme: multiple bounding boxes with different sizes and aspect ratios are generated centered on every pixel, and these boxes are called anchor boxes; detection is then carried out on top of them. Anchor boxes are usually chosen in one of three ways: by hand from experience, by K-Means clustering, or learned as hyperparameters.
Generating multiple anchor boxes at once
For an image with height $h$ and width $w$, we generate anchor boxes of different shapes centered on each pixel. Given a size $s \in (0, 1]$ and an aspect ratio $r > 0$, an anchor box has width $ws\sqrt{r}$ and height $hs/\sqrt{r}$ (in the code below, widths and heights are kept relative to the image, i.e. $s\sqrt{r}$ and $s/\sqrt{r}$).
Given a set of sizes $s_1, \dots, s_n$ and a set of aspect ratios $r_1, \dots, r_m$, using every size-ratio combination at every pixel would produce $whnm$ anchor boxes, which is computationally prohibitive. In practice we only keep the combinations containing $s_1$ or $r_1$, i.e.
$(s_1, r_1), (s_1, r_2), \dots, (s_1, r_m), (s_2, r_1), (s_3, r_1), \dots, (s_n, r_1),$
which gives $n + m - 1$ anchor boxes per pixel, so the whole image yields $wh(n + m - 1)$ anchor boxes.
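As a quick sanity check of the count: with the three sizes and three ratios used in the code below, each pixel gets $3 + 3 - 1 = 5$ anchor boxes, so a hypothetical 561x728 image would produce 561 * 728 * 5 = 2,042,040 of them.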
The algorithm below generates the anchor boxes. feature_map is the image tensor with four dimensions N, C, H, W (batch size, channels, height, width); sizes is a list of anchor sizes relative to the image, and ratios is a list of aspect ratios. It first builds the list of n + m - 1 (size, sqrt(ratio)) pairs, then computes the half-widths and half-heights ss1/2 and ss2/2 relative to each center point, and finally combines them with the grid of center coordinates.
The return value is a three-dimensional tensor: the first dimension is 1 (the batch dimension, since the anchors are identical for every image), the second is the number of anchor boxes, and the third has length 4, holding the coordinates of the two corner points (xmin, ymin, xmax, ymax).
import math
import numpy as np
import torch

def MultiBoxPrior(feature_map, sizes=[0.75, 0.5, 0.25], ratios=[1, 2, 0.5]):
    """
    # Implementation following "9.4.1. Generating multiple anchor boxes"; an anchor is (xmin, ymin, xmax, ymax).
    https://zh.d2l.ai/chapter_computer-vision/anchor.html
    Args:
        feature_map: torch tensor, Shape: [N, C, H, W].
        sizes: List of sizes (0~1) of generated MultiBoxPriores.
        ratios: List of aspect ratios (non-negative) of generated MultiBoxPriores.
    Returns:
        anchors of shape (1, num_anchors, 4). The first dimension is 1 because the anchors
        are the same for every image in the batch.
    """
    pairs = []  # pairs of (size, sqrt(ratio))
    # generate the n + m - 1 combinations
    for r in ratios:
        pairs.append([sizes[0], math.sqrt(r)])
    for s in sizes[1:]:
        pairs.append([s, math.sqrt(ratios[0])])

    pairs = np.array(pairs)

    # half-widths and half-heights relative to each center point
    ss1 = pairs[:, 0] * pairs[:, 1]  # size * sqrt(ratio)
    ss2 = pairs[:, 0] / pairs[:, 1]  # size / sqrt(ratio)

    base_anchors = np.stack([-ss1, -ss2, ss1, ss2], axis=1) / 2

    # combine the grid of center points with the base anchors to output h*w*(n+m-1) boxes
    h, w = feature_map.shape[-2:]
    shifts_x = np.arange(0, w) / w
    shifts_y = np.arange(0, h) / h
    shift_x, shift_y = np.meshgrid(shifts_x, shifts_y)
    shift_x = shift_x.reshape(-1)
    shift_y = shift_y.reshape(-1)
    shifts = np.stack((shift_x, shift_y, shift_x, shift_y), axis=1)

    anchors = shifts.reshape((-1, 1, 4)) + base_anchors.reshape((1, -1, 4))

    return torch.tensor(anchors, dtype=torch.float32).view(1, -1, 4)
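A quick shape check (the 561x728 input size is an assumption used only for illustration):

X = torch.zeros(1, 3, 561, 728)   # a dummy batch with one 3-channel image
Y = MultiBoxPrior(X, sizes=[0.75, 0.5, 0.25], ratios=[1, 2, 0.5])
print(Y.shape)                    # torch.Size([1, 2042040, 4])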
The function below visualizes the boxes. The private helper _make_list turns a non-list argument into a list.
def show_bboxes(axes, bboxes, labels=None, colors=None):
    def _make_list(obj, default_values=None):
        if obj is None:
            obj = default_values
        elif not isinstance(obj, (list, tuple)):
            obj = [obj]
        return obj

    labels = _make_list(labels)
    colors = _make_list(colors, ['b', 'g', 'r', 'm', 'c'])
    for i, bbox in enumerate(bboxes):
        color = colors[i % len(colors)]
        rect = bbox_to_rect(bbox.detach().cpu().numpy(), color)
        axes.add_patch(rect)
        if labels and len(labels) > i:
            text_color = 'k' if color == 'w' else 'w'
            axes.text(rect.xy[0], rect.xy[1], labels[i],
                      va='center', ha='center', fontsize=6, color=text_color,
                      bbox=dict(facecolor=color, lw=0))
Intersection over Union (IoU)
Suppose an anchor box covers a target reasonably well. How do we quantify "how well"? We use the intersection over union, a measure of similarity between two sets (assuming the ground-truth box is known). Given two sets $\mathcal{A}$ and $\mathcal{B}$, their Jaccard index is the size of their intersection divided by the size of their union: $J(\mathcal{A}, \mathcal{B}) = \frac{|\mathcal{A} \cap \mathcal{B}|}{|\mathcal{A} \cup \mathcal{B}|}$.
Treating the region inside a bounding box as a set of pixels, the intersection is the set of overlapping pixels and the union is the set of all pixels covered by either box.
torch.clamp takes four arguments: input, min, max, and out=None. It clamps (clips) every element of input into the range [min, max]; it does not normalize the values.
tensor.unsqueeze(k) inserts a new dimension of size 1 at position k. For example, for a tensor of shape (2, 3), unsqueeze(0) gives shape (1, 2, 3) and unsqueeze(1) gives shape (2, 1, 3).
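A tiny check of both operations (the values are arbitrary):

t = torch.tensor([-1.0, 0.5, 2.0])
print(torch.clamp(t, min=0, max=1))   # tensor([0.0000, 0.5000, 1.0000])

x = torch.zeros(2, 3)
print(x.unsqueeze(0).shape)           # torch.Size([1, 2, 3])
print(x.unsqueeze(1).shape)           # torch.Size([2, 1, 3])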
Computing the intersection: for every pair of boxes, the overlap along each axis is the elementwise minimum of the bottom-right corners minus the elementwise maximum of the top-left corners, clamped at zero.
def compute_intersection(set_1, set_2):
    """
    Compute the intersection area between every pair of anchors.
    Args:
        set_1: a tensor of dimensions (n1, 4), each anchor given as (xmin, ymin, xmax, ymax)
        set_2: a tensor of dimensions (n2, 4), each anchor given as (xmin, ymin, xmax, ymax)
    Returns:
        intersection of each of the boxes in set 1 with respect to each of the boxes in set 2, shape: (n1, n2)
    """
    # PyTorch auto-broadcasts singleton dimensions
    lower_bounds = torch.max(set_1[:, :2].unsqueeze(1), set_2[:, :2].unsqueeze(0))  # (n1, n2, 2)
    upper_bounds = torch.min(set_1[:, 2:].unsqueeze(1), set_2[:, 2:].unsqueeze(0))  # (n1, n2, 2)
    intersection_dims = torch.clamp(upper_bounds - lower_bounds, min=0)  # (n1, n2, 2)
    return intersection_dims[:, :, 0] * intersection_dims[:, :, 1]  # (n1, n2)
Compute the Jaccard index (IoU) between the two sets of boxes: the union area is the sum of the two areas minus the intersection area.
def compute_jaccard(set_1, set_2):
    """
    Compute the Jaccard index (IoU) between every pair of anchors.
    Args:
        set_1: a tensor of dimensions (n1, 4), each anchor given as (xmin, ymin, xmax, ymax)
        set_2: a tensor of dimensions (n2, 4), each anchor given as (xmin, ymin, xmax, ymax)
    Returns:
        Jaccard Overlap of each of the boxes in set 1 with respect to each of the boxes in set 2, shape: (n1, n2)
    """
    # Find intersections
    intersection = compute_intersection(set_1, set_2)  # (n1, n2)

    # Find areas of each box in both sets
    areas_set_1 = (set_1[:, 2] - set_1[:, 0]) * (set_1[:, 3] - set_1[:, 1])  # (n1)
    areas_set_2 = (set_2[:, 2] - set_2[:, 0]) * (set_2[:, 3] - set_2[:, 1])  # (n2)

    # Find the union
    # PyTorch auto-broadcasts singleton dimensions
    union = areas_set_1.unsqueeze(1) + areas_set_2.unsqueeze(0) - intersection  # (n1, n2)

    return intersection / union  # (n1, n2)
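A small worked example (the two boxes are made up): two boxes of area 4 that overlap in a 1x1 square have IoU 1 / (4 + 4 - 1) = 1/7:

box_a = torch.tensor([[0.0, 0.0, 2.0, 2.0]])
box_b = torch.tensor([[1.0, 1.0, 3.0, 3.0]])
print(compute_jaccard(box_a, box_b))  # tensor([[0.1429]])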
Labeling anchor boxes
We treat every anchor box as a training example, so each anchor box needs two labels: (1) the class of its assigned ground-truth box and (2) the offset of that ground-truth box relative to the anchor. At test time, i.e. during detection, we predict a class and an offset for every anchor box, then apply the predicted offset to the anchor to obtain the predicted bounding box.
How do we assign ground-truth boxes to anchor boxes?
Suppose the anchor boxes are $A_1, A_2, \dots, A_{n_a}$ and the ground-truth boxes are $B_1, B_2, \dots, B_{n_b}$. We form the IoU matrix $\mathbf{X} \in \mathbb{R}^{n_a \times n_b}$, whose element $x_{ij}$ is the IoU of $A_i$ and $B_j$. We repeatedly pick the largest remaining element of the matrix, assign the corresponding ground-truth box to the corresponding anchor box, and then discard that element's row and column, until every ground-truth box has been assigned. The remaining anchors are assigned a ground-truth box only if their IoU with it exceeds a threshold.
How are the offsets defined?
Because the anchor boxes differ in size and position, we need a normalization that keeps the offsets well-scaled. A common choice is the following: let anchor box $A$ have center $(x_a, y_a)$, width $w_a$ and height $h_a$, and let its assigned ground-truth box $B$ have center $(x_b, y_b)$, width $w_b$ and height $h_b$. The offset of $A$ is labeled as
$$\left( \frac{\frac{x_b - x_a}{w_a} - \mu_x}{\sigma_x},\; \frac{\frac{y_b - y_a}{h_a} - \mu_y}{\sigma_y},\; \frac{\log\frac{w_b}{w_a} - \mu_w}{\sigma_w},\; \frac{\log\frac{h_b}{h_a} - \mu_h}{\sigma_h} \right),$$
with the default constants $\mu_x = \mu_y = \mu_w = \mu_h = 0$, $\sigma_x = \sigma_y = 0.1$ and $\sigma_w = \sigma_h = 0.2$ (hence the factors 10 and 5 in the code below).
In particular, if an anchor box is not assigned any ground-truth box, its label is set to background. Such anchors are called negative anchors; the rest are positive anchors.
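For instance (numbers invented for illustration), if an anchor has center (0.5, 0.5) and width = height = 0.2, and its assigned ground-truth box has center (0.6, 0.7) and width = height = 0.4, the labeled offsets are $\frac{0.1/0.2}{0.1} = 5$, $\frac{0.2/0.2}{0.1} = 10$, $\frac{\log 2}{0.2} \approx 3.47$ and $\frac{\log 2}{0.2} \approx 3.47$.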
Here is an example.
ground_truth holds the ground-truth boxes; the first element of each row is the class, and the remaining four are the coordinates of the top-left and bottom-right corners (normalized to [0, 1]).
bbox_scale = torch.tensor((w, h, w, h), dtype=torch.float32)
ground_truth = torch.tensor([[0, 0.1, 0.08, 0.52, 0.92],
                             [1, 0.55, 0.2, 0.9, 0.88]])
anchors = torch.tensor([[0, 0.1, 0.2, 0.3], [0.15, 0.2, 0.4, 0.4],
                        [0.63, 0.05, 0.88, 0.98], [0.66, 0.45, 0.8, 0.8],
                        [0.57, 0.3, 0.92, 0.9]])

fig = d2l.plt.imshow(img)
show_bboxes(fig.axes, ground_truth[:, 1:] * bbox_scale, ['dog', 'cat'], 'k')
show_bboxes(fig.axes, anchors * bbox_scale, ['0', '1', '2', '3', '4']);
Next we use the MultiBoxTarget function (implemented below) to label the classes and offsets of the anchor boxes; the background class is set to 0.
Below is the implementation of anchor assignment and of the class/offset labeling.
def assign_anchor(bb, anchor, jaccard_threshold=0.5):
    """
    # Assign a ground-truth bounding box to every anchor as described in "9.4.1. Generating
    # multiple anchor boxes"; anchors are given as normalized (xmin, ymin, xmax, ymax).
    https://zh.d2l.ai/chapter_computer-vision/anchor.html
    Args:
        bb: ground-truth bounding boxes, shape: (nb, 4)
        anchor: anchors to be assigned, shape: (na, 4)
        jaccard_threshold: pre-set IoU threshold
    Returns:
        assigned_idx: shape (na, ), index of the ground-truth bb assigned to each anchor, or -1 if none
    """
    na = anchor.shape[0]
    nb = bb.shape[0]
    jaccard = compute_jaccard(anchor, bb).detach().cpu().numpy()  # shape: (na, nb)
    assigned_idx = np.ones(na) * -1  # initialize all assignments to -1

    # first assign one anchor to every ground-truth bb (the threshold is not required here)
    jaccard_cp = jaccard.copy()
    for j in range(nb):
        i = np.argmax(jaccard_cp[:, j])
        assigned_idx[i] = j
        jaccard_cp[i, :] = float("-inf")  # set the row to -inf, i.e. remove this anchor from consideration

    # then handle the anchors that are still unassigned; these must satisfy the threshold
    for i in range(na):
        if assigned_idx[i] == -1:
            j = np.argmax(jaccard[i, :])
            if jaccard[i, j] >= jaccard_threshold:
                assigned_idx[i] = j

    return torch.tensor(assigned_idx, dtype=torch.long)
def xy_to_cxcy(xy):
    """
    Convert anchors from (x_min, y_min, x_max, y_max) form to (center_x, center_y, w, h) form.
    https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Object-Detection/blob/master/utils.py
    Args:
        xy: bounding boxes in boundary coordinates, a tensor of size (n_boxes, 4)
    Returns:
        bounding boxes in center-size coordinates, a tensor of size (n_boxes, 4)
    """
    return torch.cat([(xy[:, 2:] + xy[:, :2]) / 2,  # c_x, c_y
                      xy[:, 2:] - xy[:, :2]], 1)  # w, h
def MultiBoxTarget(anchor, label):
    """
    # Label anchors as described in "9.4.1. Generating multiple anchor boxes";
    # anchors are given as normalized (xmin, ymin, xmax, ymax).
    https://zh.d2l.ai/chapter_computer-vision/anchor.html
    Args:
        anchor: torch tensor, input anchors, usually generated by MultiBoxPrior, shape: (1, number of anchors, 4)
        label: ground-truth labels, shape (bn, max number of ground-truth boxes per image, 5).
            If an image has fewer boxes, the remainder can be padded with -1.
            The last dimension holds [class label, four coordinates].
    Returns:
        A list [bbox_offset, bbox_mask, cls_labels]
        bbox_offset: labeled offset of every anchor, shape (bn, number of anchors * 4)
        bbox_mask: same shape as bbox_offset, a mask over the offsets; entries for negative
            (background) anchors are 0 and entries for positive anchors are 1
        cls_labels: labeled class of every anchor, 0 meaning background, shape (bn, number of anchors)
    """
    assert len(anchor.shape) == 3 and len(label.shape) == 3
    bn = label.shape[0]

    def MultiBoxTarget_one(anc, lab, eps=1e-6):
        """
        Helper for MultiBoxTarget that processes one example of the batch.
        Args:
            anc: shape (number of anchors, 4)
            lab: shape (number of ground-truth boxes, 5), 5 = [class label, four coordinates]
            eps: a tiny value to avoid log(0)
        Returns:
            offset: (number of anchors * 4, )
            bbox_mask: (number of anchors * 4, ), 0 for background, 1 otherwise
            cls_labels: (number of anchors, ), 0 for background
        """
        an = anc.shape[0]
        # assign a ground-truth box (or -1) to every anchor
        assigned_idx = assign_anchor(lab[:, 1:], anc)  # (number of anchors, )
        bbox_mask = ((assigned_idx >= 0).float().unsqueeze(-1)).repeat(1, 4)  # (number of anchors, 4)

        cls_labels = torch.zeros(an, dtype=torch.long)  # 0 means background
        assigned_bb = torch.zeros((an, 4), dtype=torch.float32)  # ground-truth coordinates assigned to each anchor
        for i in range(an):
            bb_idx = assigned_idx[i]
            if bb_idx >= 0:  # not background
                cls_labels[i] = lab[bb_idx, 0].long().item() + 1  # note the +1: class 0 is reserved for background
                assigned_bb[i, :] = lab[bb_idx, 1:]

        # compute the offsets according to the formula above
        center_anc = xy_to_cxcy(anc)  # (center_x, center_y, w, h)
        center_assigned_bb = xy_to_cxcy(assigned_bb)

        offset_xy = 10.0 * (center_assigned_bb[:, :2] - center_anc[:, :2]) / center_anc[:, 2:]
        offset_wh = 5.0 * torch.log(eps + center_assigned_bb[:, 2:] / center_anc[:, 2:])
        offset = torch.cat([offset_xy, offset_wh], dim=1) * bbox_mask  # (number of anchors, 4)

        return offset.view(-1), bbox_mask.view(-1), cls_labels

    # assemble the per-example outputs into batch tensors
    batch_offset = []
    batch_mask = []
    batch_cls_labels = []
    for b in range(bn):
        offset, bbox_mask, cls_labels = MultiBoxTarget_one(anchor[0, :, :], label[b, :, :])
        batch_offset.append(offset)
        batch_mask.append(bbox_mask)
        batch_cls_labels.append(cls_labels)

    bbox_offset = torch.stack(batch_offset)
    bbox_mask = torch.stack(batch_mask)
    cls_labels = torch.stack(batch_cls_labels)

    return [bbox_offset, bbox_mask, cls_labels]
Test:
Since the first dimension must be the batch (image) dimension, we add it with unsqueeze.
labels = MultiBoxTarget(anchors.unsqueeze(dim=0),
                        ground_truth.unsqueeze(dim=0))
The call returns a list: the first item is the offsets, the second is the mask variable (shape: batch size by four times the number of anchors, matching the four offsets of each anchor), and the third is the class label of each anchor.
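A quick check of the shapes, given the 5 anchors and 2 ground-truth boxes above (values omitted):

print(labels[0].shape, labels[1].shape, labels[2].shape)
# torch.Size([1, 20]) torch.Size([1, 20]) torch.Size([1, 5])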
Outputting predicted bounding boxes
Non-maximum suppression (NMS)
When there are many anchor boxes, many similar predicted bounding boxes may appear around the same target, so we remove the redundant ones.
The idea: for a predicted bounding box B, the model computes a probability for every class; the largest probability p determines B's predicted class, and p is called the confidence of B. All non-background predicted boxes are sorted by confidence in descending order into a list L. We take the highest-confidence box remaining in L as the reference, remove every non-reference box whose IoU with it exceeds a threshold, and repeat with the next remaining box until every box in L has served as a reference.
Below is the implementation of MultiBoxDetection.
from collections import namedtuple
Pred_BB_Info = namedtuple("Pred_BB_Info", ["index", "class_id", "confidence", "xyxy"])

def non_max_suppression(bb_info_list, nms_threshold=0.5):
    """
    Non-maximum suppression of predicted bounding boxes.
    Args:
        bb_info_list: list of Pred_BB_Info holding the predicted class, confidence, etc.
        nms_threshold: IoU threshold
    Returns:
        output: list of Pred_BB_Info, keeping only the boxes that survive the filtering
    """
    output = []
    # sort by confidence in descending order
    sorted_bb_info_list = sorted(bb_info_list, key=lambda x: x.confidence, reverse=True)

    # loop and drop the redundant boxes
    while len(sorted_bb_info_list) != 0:
        best = sorted_bb_info_list.pop(0)
        output.append(best)

        if len(sorted_bb_info_list) == 0:
            break

        bb_xyxy = []
        for bb in sorted_bb_info_list:
            bb_xyxy.append(bb.xyxy)

        iou = compute_jaccard(torch.tensor([best.xyxy]),
                              torch.tensor(bb_xyxy))[0]  # shape: (len(sorted_bb_info_list), )

        n = len(sorted_bb_info_list)
        sorted_bb_info_list = [sorted_bb_info_list[i] for i in range(n) if iou[i] <= nms_threshold]
    return output
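A small sanity check with three made-up boxes: box 1 overlaps box 0 heavily and should be suppressed, while box 2 is far away and should survive:

boxes = [
    Pred_BB_Info(index=0, class_id=0, confidence=0.9, xyxy=[0.10, 0.10, 0.50, 0.50]),
    Pred_BB_Info(index=1, class_id=0, confidence=0.8, xyxy=[0.12, 0.12, 0.52, 0.52]),
    Pred_BB_Info(index=2, class_id=1, confidence=0.7, xyxy=[0.60, 0.60, 0.90, 0.90]),
]
print([bb.index for bb in non_max_suppression(boxes, nms_threshold=0.5)])  # [0, 2]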
def MultiBoxDetection(cls_prob, loc_pred, anchor, nms_threshold=0.5):
    """
    # Implementation following "9.4.1. Generating multiple anchor boxes";
    # anchors are given as normalized (xmin, ymin, xmax, ymax).
    https://zh.d2l.ai/chapter_computer-vision/anchor.html
    Args:
        cls_prob: predicted (softmax) class probabilities for each anchor, shape: (bn, number of classes + 1, number of anchors)
        loc_pred: predicted offsets for each anchor, shape: (bn, number of anchors * 4)
        anchor: the default anchors produced by MultiBoxPrior, shape: (1, number of anchors, 4)
        nms_threshold: IoU threshold used in non-maximum suppression
    Returns:
        information for all anchors, shape: (bn, number of anchors, 6)
        each anchor is described by [class_id, confidence, xmin, ymin, xmax, ymax]
        class_id = -1 means background or removed by non-maximum suppression
    """
    assert len(cls_prob.shape) == 3 and len(loc_pred.shape) == 2 and len(anchor.shape) == 3
    bn = cls_prob.shape[0]

    def MultiBoxDetection_one(c_p, l_p, anc, nms_threshold=0.5):
        """
        Helper for MultiBoxDetection that processes one example of the batch.
        Args:
            c_p: (number of classes + 1, number of anchors)
            l_p: (number of anchors * 4, )
            anc: (number of anchors, 4)
            nms_threshold: IoU threshold used in non-maximum suppression
        Return:
            output: (number of anchors, 6)
        """
        pred_bb_num = c_p.shape[1]
        anc = (anc + l_p.view(pred_bb_num, 4)).detach().cpu().numpy()  # apply the predicted offsets (added directly to the corner coordinates here)

        confidence, class_id = torch.max(c_p, 0)
        confidence = confidence.detach().cpu().numpy()
        class_id = class_id.detach().cpu().numpy()

        pred_bb_info = [Pred_BB_Info(
            index=i,
            class_id=class_id[i] - 1,  # positive-class labels start from 0
            confidence=confidence[i],
            xyxy=[*anc[i]])  # xyxy is a list
            for i in range(pred_bb_num)]

        # indices of the boxes kept by NMS
        obj_bb_idx = [bb.index for bb in non_max_suppression(pred_bb_info, nms_threshold)]

        output = []
        for bb in pred_bb_info:
            output.append([
                (bb.class_id if bb.index in obj_bb_idx else -1.0),
                bb.confidence,
                *bb.xyxy
            ])

        return torch.tensor(output)  # shape: (number of anchors, 6)

    batch_output = []
    for b in range(bn):
        batch_output.append(MultiBoxDetection_one(cls_prob[b], loc_pred[b], anchor[0], nms_threshold))

    return torch.stack(batch_output)
Test:
output = MultiBoxDetection(
    cls_probs.unsqueeze(dim=0), offset_preds.unsqueeze(dim=0),
    anchors.unsqueeze(dim=0), nms_threshold=0.5)
output
Output: the inputs all gained a batch dimension (dimension 0). In each row of the output, the first element is the class (-1 means the box was removed), the second is the confidence of the predicted bounding box, and the last four are the top-left and bottom-right coordinates.
tensor([[[ 0.0000, 0.9000, 0.1000, 0.0800, 0.5200, 0.9200],
[-1.0000, 0.8000, 0.0800, 0.2000, 0.5600, 0.9500],
[-1.0000, 0.7000, 0.1500, 0.3000, 0.6200, 0.9100],
[ 1.0000, 0.9000, 0.5500, 0.2000, 0.9000, 0.8800]]])
In practice, predicted bounding boxes with low confidence can also be discarded before non-maximum suppression to reduce the amount of later computation.
Multiscale object detection
In reality, generating anchor boxes for every single pixel easily becomes too expensive.
Ways to reduce the number of anchor boxes:
1. Uniformly sample a small subset of pixels of the input image as centers.
2. Generate anchor boxes of different numbers and sizes at different scales.
For example, when detecting small objects with small anchor boxes we can sample more regions, while for large objects and large anchor boxes fewer regions suffice; a sketch of the idea follows below.
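A rough sketch of the multiscale idea, reusing MultiBoxPrior on a small hypothetical feature map (the 4x4 shape and the size 0.15 are assumptions): each position of the feature map acts as one uniformly spaced center on the original image, so far fewer anchors are produced.

fmap = torch.zeros(1, 10, 4, 4)   # pretend this is a 4x4 feature map with 10 channels
anchors_small = MultiBoxPrior(fmap, sizes=[0.15], ratios=[1, 2, 0.5])
print(anchors_small.shape)        # torch.Size([1, 48, 4]) -> 4 * 4 * 3 anchors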
2. Neural Style Transfer
Style transfer synthesizes a new image from the content of one image and the style of another: one input is the content image and the other is the style image.
The idea works as follows:
We first initialize a composite image, then pick a pretrained convolutional neural network to extract features. Generally, features close to the input layer capture fine details of the image, while features close to the output layer capture its global content.
Take for example a pretrained network with three convolutional layers: the output of the second layer is used as the content feature, and the outputs of the first and third layers are used as the style features. Forward propagation computes the loss, which has three parts: content loss, style loss, and total variation loss; the total variation loss helps reduce noise in the image. The composite image itself is the quantity being optimized, and at the end of training it is returned as the result of style transfer.
Import the required libraries:
%matplotlib inline
import time
import torch
import torch.nn.functional as F
import torchvision
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import sys
sys.path.append("/home/kesci/input")
import d2len9900 as d2l
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # tested on both CPU and GPU
print(device, torch.__version__)
Read the images:
content_img = Image.open('/home/kesci/input/NeuralStyle5603/rainier.jpg')
plt.imshow(content_img);
style_img = Image.open('/home/kesci/input/NeuralStyle5603/autumn_oak.jpg')
plt.imshow(style_img);
Next we preprocess the images: standardize the RGB channels and reshape the result into the (batch, channel, height, width) format the network expects. We also need a postprocessing function that undoes the standardization on the output image and clamps the pixel values back into [0, 1].
rgb_mean = np.array([0.485, 0.456, 0.406])
rgb_std = np.array([0.229, 0.224, 0.225])

def preprocess(PIL_img, image_shape):
    process = torchvision.transforms.Compose([
        torchvision.transforms.Resize(image_shape),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(mean=rgb_mean, std=rgb_std)])
    return process(PIL_img).unsqueeze(dim=0)  # (batch_size, 3, H, W)

def postprocess(img_tensor):
    inv_normalize = torchvision.transforms.Normalize(
        mean=-rgb_mean / rgb_std,
        std=1 / rgb_std)
    to_PIL_image = torchvision.transforms.ToPILImage()
    return to_PIL_image(inv_normalize(img_tensor[0].cpu()).clamp(0, 1))
Load the VGG-19 model to extract features.
VGG-19 has five VGG convolutional blocks. We pick the last convolutional layer of the fourth block as the content layer and the first convolutional layer of each of the five blocks as the style layers. We then extract exactly the layers we need and build a new network with nn.Sequential. Here we assume the pretrained model is loaded with torchvision:
pretrained_net = torchvision.models.vgg19(pretrained=True)  # assumed definition of pretrained_net
style_layers, content_layers = [0, 5, 10, 19, 28], [25]

net_list = []
for i in range(max(content_layers + style_layers) + 1):
    net_list.append(pretrained_net.features[i])
net = torch.nn.Sequential(*net_list)
Since a normal forward pass only gives us the output of the last layer, we run net layer by layer and record the intermediate outputs we need.
def extract_features(X, content_layers, style_layers):
    contents = []
    styles = []
    for i in range(len(net)):
        X = net[i](X)
        if i in style_layers:
            styles.append(X)
        if i in content_layers:
            contents.append(X)
    return contents, styles
Define two functions that extract the content features and the style features, respectively.
def get_contents(image_shape, device):
    content_X = preprocess(content_img, image_shape).to(device)
    contents_Y, _ = extract_features(content_X, content_layers, style_layers)
    return content_X, contents_Y

def get_styles(image_shape, device):
    style_X = preprocess(style_img, image_shape).to(device)
    _, styles_Y = extract_features(style_X, content_layers, style_layers)
    return style_X, styles_Y
Next we compute the three parts of the loss.
The content loss is the mean squared error between the content features of the composite image and of the content image:
def content_loss(Y_hat, Y):
    return F.mse_loss(Y_hat, Y)
The style loss is also a mean squared error, but it is computed on a transformed version of the style-layer outputs. Suppose a style layer outputs one sample with $c$ channels, height $h$ and width $w$. We reshape the output into a matrix $\mathbf{X}$ with $c$ rows and $hw$ columns, so that row $\mathbf{x}_i$ represents the style feature of channel $i$. The Gram matrix $\mathbf{X}\mathbf{X}^\top \in \mathbb{R}^{c \times c}$ then has entries $x_{ij} = \mathbf{x}_i \cdot \mathbf{x}_j$, which express the correlation between the features of channels $i$ and $j$. Because these inner products easily become large, we divide the Gram matrix by the number of elements in $\mathbf{X}$, i.e. $chw$.
def gram(X):
    num_channels, n = X.shape[1], X.shape[2] * X.shape[3]
    X = X.view(num_channels, n)
    return torch.matmul(X, X.t()) / (num_channels * n)

def style_loss(Y_hat, gram_Y):
    return F.mse_loss(gram(Y_hat), gram_Y)
The total variation loss is used to reduce noise (unusually bright or dark pixels) in the composite image. With total variation denoising, if $x_{i,j}$ denotes the pixel value at coordinate $(i, j)$, the loss is
$$\sum_{i,j} \left|x_{i,j} - x_{i+1,j}\right| + \left|x_{i,j} - x_{i,j+1}\right|,$$
which pushes neighboring pixel values to be as close to each other as possible.
def tv_loss(Y_hat):
    return 0.5 * (F.l1_loss(Y_hat[:, :, 1:, :], Y_hat[:, :, :-1, :]) +
                  F.l1_loss(Y_hat[:, :, :, 1:], Y_hat[:, :, :, :-1]))
Finally, the overall style-transfer loss is a weighted sum of the three losses; the weights are hyperparameters that trade off content, style, and noise.
content_weight, style_weight, tv_weight = 1, 1e3, 10

def compute_loss(X, contents_Y_hat, styles_Y_hat, contents_Y, styles_Y_gram):
    # compute the content, style, and total variation losses separately
    contents_l = [content_loss(Y_hat, Y) * content_weight for Y_hat, Y in zip(
        contents_Y_hat, contents_Y)]
    styles_l = [style_loss(Y_hat, Y) * style_weight for Y_hat, Y in zip(
        styles_Y_hat, styles_Y_gram)]
    tv_l = tv_loss(X) * tv_weight
    # sum up all the losses
    l = sum(styles_l) + sum(contents_l) + tv_l
    return contents_l, styles_l, tv_l, l
Finally we create and initialize the composite image, which is the only variable updated during training.
class GeneratedImage(torch.nn.Module):
    def __init__(self, img_shape):
        super(GeneratedImage, self).__init__()
        self.weight = torch.nn.Parameter(torch.rand(*img_shape))

    def forward(self):
        return self.weight

def get_inits(X, device, lr, styles_Y):
    gen_img = GeneratedImage(X.shape).to(device)
    gen_img.weight.data = X.data
    optimizer = torch.optim.Adam(gen_img.parameters(), lr=lr)
    styles_Y_gram = [gram(Y) for Y in styles_Y]
    return gen_img(), styles_Y_gram, optimizer
Training:
def train(X, contents_Y, styles_Y, device, lr, max_epochs, lr_decay_epoch):
    print("training on ", device)
    X, styles_Y_gram, optimizer = get_inits(X, device, lr, styles_Y)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, lr_decay_epoch, gamma=0.1)
    for i in range(max_epochs):
        start = time.time()

        contents_Y_hat, styles_Y_hat = extract_features(
            X, content_layers, style_layers)
        contents_l, styles_l, tv_l, l = compute_loss(
            X, contents_Y_hat, styles_Y_hat, contents_Y, styles_Y_gram)

        optimizer.zero_grad()
        l.backward(retain_graph=True)
        optimizer.step()
        scheduler.step()

        if i % 50 == 0 and i != 0:
            print('epoch %3d, content loss %.2f, style loss %.2f, '
                  'TV loss %.2f, %.2f sec'
                  % (i, sum(contents_l).item(), sum(styles_l).item(), tv_l.item(),
                     time.time() - start))
    return X.detach()
image_shape = (150, 225)
net = net.to(device)
content_X, contents_Y = get_contents(image_shape, device)
style_X, styles_Y = get_styles(image_shape, device)
output = train(content_X, contents_Y, styles_Y, device, 0.01, 500, 200)
plt.imshow(postprocess(output));
3. Batch Normalization and Residual Networks
Batch normalization (BatchNorm)
Batch normalization uses the mean and standard deviation of a minibatch to continually recalibrate the intermediate outputs of the network, making the values of the intermediate outputs across layers more stable.
For a fully connected layer, batch normalization is applied between the affine transformation and the activation function.
The BN transform is
$$\mathrm{BN}(\boldsymbol{x}) = \boldsymbol{\gamma} \odot \frac{\boldsymbol{x} - \boldsymbol{\mu}_\mathcal{B}}{\sqrt{\boldsymbol{\sigma}_\mathcal{B}^2 + \epsilon}} + \boldsymbol{\beta},$$
where $\boldsymbol{\mu}_\mathcal{B}$ and $\boldsymbol{\sigma}_\mathcal{B}^2$ are the mean and variance of the minibatch $\mathcal{B}$, $\epsilon > 0$ is a small constant for numerical stability, and the scale parameter $\boldsymbol{\gamma}$ and shift parameter $\boldsymbol{\beta}$ are learnable.
If a convolutional layer has multiple channels, each channel is normalized separately, and each channel has its own scale and shift parameters, shared across all spatial positions and all examples of the minibatch within that channel.
In short:
During training, the mean and variance are computed per minibatch. At prediction time, the mean and variance of the whole training set are estimated with moving averages.
Concrete implementation:
def batch_norm(is_training, X, gamma, beta, moving_mean, moving_var, eps, momentum):
    # decide whether we are in training mode or prediction mode
    if not is_training:
        # in prediction mode, use the moving-average mean and variance directly
        X_hat = (X - moving_mean) / torch.sqrt(moving_var + eps)
    else:
        assert len(X.shape) in (2, 4)
        if len(X.shape) == 2:
            # fully connected layer: compute mean and variance over the feature dimension
            mean = X.mean(dim=0)
            var = ((X - mean) ** 2).mean(dim=0)
        else:
            # 2D convolutional layer: compute mean and variance per channel (axis=1),
            # keeping X's shape so broadcasting works later
            mean = X.mean(dim=0, keepdim=True).mean(dim=2, keepdim=True).mean(dim=3, keepdim=True)
            var = ((X - mean) ** 2).mean(dim=0, keepdim=True).mean(dim=2, keepdim=True).mean(dim=3, keepdim=True)
        # in training mode, standardize with the current mean and variance
        X_hat = (X - mean) / torch.sqrt(var + eps)
        # update the moving averages of the mean and variance
        moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
        moving_var = momentum * moving_var + (1.0 - momentum) * var
    Y = gamma * X_hat + beta  # scale and shift
    return Y, moving_mean, moving_var
class BatchNorm(nn.Module):
    def __init__(self, num_features, num_dims):
        super(BatchNorm, self).__init__()
        if num_dims == 2:
            shape = (1, num_features)  # number of outputs of a fully connected layer
        else:
            shape = (1, num_features, 1, 1)  # number of channels
        # scale and shift parameters involved in gradient computation, initialized to 1 and 0
        self.gamma = nn.Parameter(torch.ones(shape))
        self.beta = nn.Parameter(torch.zeros(shape))
        # variables not involved in gradient computation, initialized to 0 on the CPU
        self.moving_mean = torch.zeros(shape)
        self.moving_var = torch.zeros(shape)

    def forward(self, X):
        # if X is not on main memory, copy moving_mean and moving_var to the device X lives on
        if self.moving_mean.device != X.device:
            self.moving_mean = self.moving_mean.to(X.device)
            self.moving_var = self.moving_var.to(X.device)
        # keep the updated moving_mean and moving_var; a Module's training attribute is True by
        # default and is set to False by calling .eval()
        Y, self.moving_mean, self.moving_var = batch_norm(self.training,
            X, self.gamma, self.beta, self.moving_mean,
            self.moving_var, eps=1e-5, momentum=0.9)
        return Y
Plug it into a LeNet:
net = nn.Sequential(
    nn.Conv2d(1, 6, 5),  # in_channels, out_channels, kernel_size
    BatchNorm(6, num_dims=4),
    nn.Sigmoid(),
    nn.MaxPool2d(2, 2),  # kernel_size, stride
    nn.Conv2d(6, 16, 5),
    BatchNorm(16, num_dims=4),
    nn.Sigmoid(),
    nn.MaxPool2d(2, 2),
    d2l.FlattenLayer(),
    nn.Linear(16*4*4, 120),
    BatchNorm(120, num_dims=2),
    nn.Sigmoid(),
    nn.Linear(120, 84),
    BatchNorm(84, num_dims=2),
    nn.Sigmoid(),
    nn.Linear(84, 10)
)
print(net)
There is also a concise implementation:
The nn module ships with built-in batch normalization layers, BatchNorm2d and BatchNorm1d.
net = nn.Sequential(
    nn.Conv2d(1, 6, 5),  # in_channels, out_channels, kernel_size
    nn.BatchNorm2d(6),
    nn.Sigmoid(),
    nn.MaxPool2d(2, 2),  # kernel_size, stride
    nn.Conv2d(6, 16, 5),
    nn.BatchNorm2d(16),
    nn.Sigmoid(),
    nn.MaxPool2d(2, 2),
    d2l.FlattenLayer(),
    nn.Linear(16*4*4, 120),
    nn.BatchNorm1d(120),
    nn.Sigmoid(),
    nn.Linear(120, 84),
    nn.BatchNorm1d(84),
    nn.Sigmoid(),
    nn.Linear(84, 10)
)
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)
Residual networks (ResNet)
A weakness of plain CNNs is that when the network gets very deep, both convergence and accuracy degrade.
A residual block adds the input directly to the output of its stacked layers, which eases training and lets gradients propagate faster.
"Residual" refers to the fact that the stacked layers are trained to fit the residual mapping $f(\boldsymbol{x}) - \boldsymbol{x}$ rather than the full mapping $f(\boldsymbol{x})$ itself.
class Residual(nn.Module):  # this class is also saved in the d2lzh_pytorch package for later use
    # The output channel count, whether to use an extra 1x1 convolution to change the channel
    # count, and the stride of the convolution can all be configured.
    def __init__(self, in_channels, out_channels, use_1x1conv=False, stride=1):
        super(Residual, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, stride=stride)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
        if use_1x1conv:
            self.conv3 = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride)
        else:
            self.conv3 = None
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

    def forward(self, X):
        Y = F.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        if self.conv3:
            X = self.conv3(X)
        return F.relu(Y + X)
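A quick check of the block's input and output shapes (the shapes below are chosen for illustration):

blk = Residual(3, 3)
X = torch.rand((4, 3, 6, 6))
print(blk(X).shape)   # torch.Size([4, 3, 6, 6]) -- same shape, so the identity path works

blk2 = Residual(3, 6, use_1x1conv=True, stride=2)
print(blk2(X).shape)  # torch.Size([4, 6, 3, 3]) -- channels changed, height/width halved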
The full ResNet model is built from the following structure:
net = nn.Sequential(
    nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
    nn.BatchNorm2d(64),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1))

def resnet_block(in_channels, out_channels, num_residuals, first_block=False):
    if first_block:
        assert in_channels == out_channels  # the first block keeps the number of input channels
    blk = []
    for i in range(num_residuals):
        if i == 0 and not first_block:
            blk.append(Residual(in_channels, out_channels, use_1x1conv=True, stride=2))
        else:
            blk.append(Residual(out_channels, out_channels))
    return nn.Sequential(*blk)

net.add_module("resnet_block1", resnet_block(64, 64, 2, first_block=True))
net.add_module("resnet_block2", resnet_block(64, 128, 2))
net.add_module("resnet_block3", resnet_block(128, 256, 2))
net.add_module("resnet_block4", resnet_block(256, 512, 2))
net.add_module("global_avg_pool", d2l.GlobalAvgPool2d())  # output of GlobalAvgPool2d: (Batch, 512, 1, 1)
net.add_module("fc", nn.Sequential(d2l.FlattenLayer(), nn.Linear(512, 10)))
lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)
Densely connected networks (DenseNet)
DenseNet is a variant of ResNet: instead of adding the input to the output, it concatenates them along the channel dimension.
It is built mainly from dense blocks and transition layers. Dense blocks define how inputs and outputs are concatenated, while transition layers keep the number of channels under control.
Python implementation of a dense block:
def conv_block(in_channels, out_channels):
    blk = nn.Sequential(nn.BatchNorm2d(in_channels),
                        nn.ReLU(),
                        nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1))
    return blk

class DenseBlock(nn.Module):
    def __init__(self, num_convs, in_channels, out_channels):
        super(DenseBlock, self).__init__()
        net = []
        for i in range(num_convs):
            in_c = in_channels + i * out_channels
            net.append(conv_block(in_c, out_channels))
        self.net = nn.ModuleList(net)
        self.out_channels = in_channels + num_convs * out_channels  # number of output channels

    def forward(self, X):
        for blk in self.net:
            Y = blk(X)
            X = torch.cat((X, Y), dim=1)  # concatenate input and output along the channel dimension
        return X
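A small shape check (2 conv blocks, 3 input channels, and a growth of 10 channels per block, chosen for illustration):

blk = DenseBlock(2, 3, 10)
X = torch.rand(4, 3, 8, 8)
Y = blk(X)
print(Y.shape)  # torch.Size([4, 23, 8, 8]) -- 3 input channels + 2 * 10 added by the conv blocks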
The transition layer uses a 1x1 convolution to reduce the number of channels and an average pooling layer with stride 2 to halve the height and width.
def transition_block(in_channels, out_channels):
    blk = nn.Sequential(
        nn.BatchNorm2d(in_channels),
        nn.ReLU(),
        nn.Conv2d(in_channels, out_channels, kernel_size=1),
        nn.AvgPool2d(kernel_size=2, stride=2))
    return blk
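Continuing the sketch above, a transition layer cuts the 23 channels down and halves the spatial size:

blk = transition_block(23, 10)
print(blk(Y).shape)  # torch.Size([4, 10, 4, 4])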
The DenseNet model is then assembled as follows:
net = nn.Sequential(
    nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
    nn.BatchNorm2d(64),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1))

num_channels, growth_rate = 64, 32  # num_channels is the current number of channels
num_convs_in_dense_blocks = [4, 4, 4, 4]

for i, num_convs in enumerate(num_convs_in_dense_blocks):
    DB = DenseBlock(num_convs, num_channels, growth_rate)
    net.add_module("DenseBlock_%d" % i, DB)
    # number of output channels of the previous dense block
    num_channels = DB.out_channels
    # add a transition layer that halves the number of channels between dense blocks
    if i != len(num_convs_in_dense_blocks) - 1:
        net.add_module("transition_block_%d" % i, transition_block(num_channels, num_channels // 2))
        num_channels = num_channels // 2

net.add_module("BN", nn.BatchNorm2d(num_channels))
net.add_module("relu", nn.ReLU())
net.add_module("global_avg_pool", d2l.GlobalAvgPool2d())  # output of GlobalAvgPool2d: (Batch, num_channels, 1, 1)
net.add_module("fc", nn.Sequential(d2l.FlattenLayer(), nn.Linear(num_channels, 10)))
X = torch.rand((1, 1, 96, 96))
for name, layer in net.named_children():
    X = layer(X)
    print(name, ' output shape:\t', X.shape)

batch_size = 16
# if an "out of memory" error appears, reduce batch_size or the resize value
train_iter, test_iter = load_data_fashion_mnist(batch_size, resize=96)
lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)
4. GAN (Generative Adversarial Networks)
A generative adversarial network contains two neural networks: a discriminator D and a generator G. The generator is trained to produce data with the same distribution as the real data in order to fool the discriminator, while the discriminator is trained to tell real data apart from generated data.
The discriminator's loss is the cross-entropy
$$\min_D \; \{-y \log D(\mathbf{x}) - (1 - y)\log(1 - D(\mathbf{x}))\},$$
where $y = 1$ for real samples $\mathbf{x}$ and $y = 0$ for generated samples.
The generator's loss (in the commonly used non-saturating form, which is what the code below implements) is
$$\min_G \; \{-\log D(G(\mathbf{z}))\},$$
i.e. the generator tries to make the discriminator label its samples $G(\mathbf{z})$ as real.
Import the required packages:
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from torch import nn
import numpy as np
from torch.autograd import Variable
import torch
Implementation of the generator:
class net_G(nn.Module):
    def __init__(self):
        super(net_G, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(2, 2),
        )
        self._initialize_weights()

    def forward(self, x):
        x = self.model(x)
        return x

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                m.weight.data.normal_(0, 0.02)
                m.bias.data.zero_()
Implementation of the discriminator:
class net_D(nn.Module):
    def __init__(self):
        super(net_D, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(2, 5),
            nn.Tanh(),
            nn.Linear(5, 3),
            nn.Tanh(),
            nn.Linear(3, 1),
            nn.Sigmoid()
        )
        self._initialize_weights()

    def forward(self, x):
        x = self.model(x)
        return x

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                m.weight.data.normal_(0, 0.02)
                m.bias.data.zero_()

def update_D(X, Z, net_D, net_G, loss, trainer_D):
    # one discriminator step: real samples are labeled 1, generated samples 0
    batch_size = X.shape[0]
    Tensor = torch.FloatTensor
    ones = Variable(Tensor(np.ones(batch_size))).view(batch_size, 1)
    zeros = Variable(Tensor(np.zeros(batch_size))).view(batch_size, 1)
    real_Y = net_D(X.float())
    fake_X = net_G(Z)
    fake_Y = net_D(fake_X)
    loss_D = (loss(real_Y, ones) + loss(fake_Y, zeros)) / 2
    loss_D.backward()
    trainer_D.step()
    return float(loss_D.sum())

def update_G(Z, net_D, net_G, loss, trainer_G):
    # one generator step: try to make the discriminator output 1 on generated samples
    batch_size = Z.shape[0]
    Tensor = torch.FloatTensor
    ones = Variable(Tensor(np.ones((batch_size,)))).view(batch_size, 1)
    fake_X = net_G(Z)
    fake_Y = net_D(fake_X)
    loss_G = loss(fake_Y, ones)
    loss_G.backward()
    trainer_G.step()
    return float(loss_G.sum())
Finally, the training loop:
def train(net_D, net_G, data_iter, num_epochs, lr_D, lr_G, latent_dim, data):
    loss = nn.BCELoss()
    Tensor = torch.FloatTensor
    trainer_D = torch.optim.Adam(net_D.parameters(), lr=lr_D)
    trainer_G = torch.optim.Adam(net_G.parameters(), lr=lr_G)
    plt.figure(figsize=(7, 4))
    d_loss_point = []
    g_loss_point = []
    d_loss = 0
    g_loss = 0
    for epoch in range(1, num_epochs + 1):
        d_loss_sum = 0
        g_loss_sum = 0
        batch = 0
        for X in data_iter:
            batch += 1
            X = Variable(X)
            batch_size = X.shape[0]
            # sample the latent noise fed to the generator
            Z = Variable(Tensor(np.random.normal(0, 1, (batch_size, latent_dim))))

            trainer_D.zero_grad()
            d_loss = update_D(X, Z, net_D, net_G, loss, trainer_D)
            d_loss_sum += d_loss

            trainer_G.zero_grad()
            g_loss = update_G(Z, net_D, net_G, loss, trainer_G)
            g_loss_sum += g_loss

        d_loss_point.append(d_loss_sum / batch)
        g_loss_point.append(g_loss_sum / batch)

    # plot the loss curves of the discriminator and the generator
    plt.ylabel('Loss', fontdict={'size': 14})
    plt.xlabel('epoch', fontdict={'size': 14})
    plt.xticks(range(0, num_epochs + 1, 3))
    plt.plot(range(1, num_epochs + 1), d_loss_point, color='orange', label='discriminator')
    plt.plot(range(1, num_epochs + 1), g_loss_point, color='blue', label='generator')
    plt.legend()
    plt.show()
    print(d_loss, g_loss)

    # visualize real samples against samples drawn from the trained generator
    Z = Variable(Tensor(np.random.normal(0, 1, size=(100, latent_dim))))
    fake_X = net_G(Z).detach().numpy()
    plt.figure(figsize=(3.5, 2.5))
    plt.scatter(data[:, 0], data[:, 1], color='blue', label='real')
    plt.scatter(fake_X[:, 0], fake_X[:, 1], color='orange', label='generated')
    plt.legend()
    plt.show()
Run the training:
lr_D, lr_G, latent_dim, num_epochs = 0.05, 0.005, 2, 20
generator = net_G()
discriminator = net_D()
train(discriminator, generator, data_iter, num_epochs, lr_D, lr_G, latent_dim, data)
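data and data_iter are not defined in this excerpt. A minimal sketch of the kind of 2-D toy dataset usually used with such a tiny GAN (all names and values here are assumptions):

X = np.random.normal(size=(1000, 2))
A = np.array([[1, 2], [-0.1, 0.5]])
b = np.array([1, 2])
data = torch.tensor(X.dot(A) + b, dtype=torch.float32)  # real samples: a sheared Gaussian blob
data_iter = DataLoader(data, batch_size=8)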
5. Data Augmentation
For images, common augmentations include flipping, cropping, and color changes such as adjusting hue, contrast, and brightness. Augmenting the training images reduces the model's reliance on specific attributes and improves its ability to generalize; a small sketch follows below.
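A brief sketch of such augmentations with torchvision (the specific parameter values are illustrative, not from the original notes):

import torchvision

aug = torchvision.transforms.Compose([
    torchvision.transforms.RandomHorizontalFlip(),  # flip left-right with probability 0.5
    torchvision.transforms.RandomResizedCrop(200, scale=(0.1, 1), ratio=(0.5, 2)),  # random crop, then resize to 200x200
    torchvision.transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.5),  # random color changes
    torchvision.transforms.ToTensor()])
# applying aug to a PIL image produces a randomly flipped, cropped, and color-jittered tensor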