CNN (Convolutional Neural Network), as the name suggests, builds convolution into the network structure; at its core it uses convolution kernels to extract features from two-dimensional arrays (images). In the usual illustration of the operation, the blue region is the input image, the green region is the feature map produced by the convolution, and the dashed blue border marks the padding, which keeps the output feature map the same size as the input.
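As a minimal illustration of this size-preserving padding (the single-channel 28×28 input below is an arbitrary example), a 3×3 convolution with padding=1 keeps the height and width of the input unchanged:

import torch
from torch import nn

# a 3x3 convolution with padding=1 preserves the spatial size of its input
conv = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=3, padding=1)
x = torch.rand(1, 1, 28, 28)   # a dummy single-channel 28x28 "image"
print(conv(x).shape)           # torch.Size([1, 1, 28, 28]), same height and width as the input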
The ancestor of CNNs is LeNet-5, first proposed by Yann LeCun in 1998. For roughly the next two decades there was no major breakthrough and CNNs were outperformed by traditional machine learning methods; around 2010 their value was rediscovered, and what followed was an era in which CNNs dominated computer vision. In the last few years, after residual networks (ResNet) and attention mechanisms, CNN development again seems to have hit a plateau, but the evolution of these baselines is a living history of CNNs and still instructive, and the networks below continue to serve as backbones throughout computer vision. This post therefore records PyTorch implementations of the common classic architectures.
LeNet, the pioneering CNN. Key feature: the first use of convolutional layers.
Structure: convolutional layers, pooling layers, fully connected (FC) layers, sigmoid activations.
The convolution kernels are 5×5 with no padding, so each convolution shrinks the feature map by 4 pixels relative to its input; a sigmoid is applied after each convolution to add non-linear fitting capacity. The output of the last convolutional layer has shape [batch_size, channels, h, w]; the fully connected layers take a two-dimensional input [batch_size, channels×h×w], and after three FC layers the output has 10 units, one per handwritten-digit class.
from torch import nn

class LeNet(nn.Module):  # inherit from nn.Module, which provides the common model machinery
    def __init__(self):
        super(LeNet, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(1, 6, 5),   # in_channels, out_channels, kernel_size
            nn.Sigmoid(),
            nn.MaxPool2d(2, 2),   # kernel_size, stride
            nn.Conv2d(6, 16, 5),
            nn.Sigmoid(),
            nn.MaxPool2d(2, 2)
        )
        self.fc = nn.Sequential(
            nn.Linear(256, 120),  # 16*4*4 = 256 for a 28x28 input
            nn.Sigmoid(),
            nn.Linear(120, 84),
            nn.Sigmoid(),
            nn.Linear(84, 10)
        )
    def forward(self, img):
        feature = self.conv(img)
        feature = feature.view(img.shape[0], -1)  # flatten the feature map before the linear layers
        output = self.fc(feature)
        return output

net = LeNet()
print(net)  # inspect the layers and their parameter shapes
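As a quick sanity check, and assuming a single-channel 28×28 input as in MNIST (which is what the Linear(256, 120) layer above is sized for), a dummy forward pass produces one score per digit class:

import torch
x = torch.rand(1, 1, 28, 28)  # a batch with one 28x28 grayscale image
print(net(x).shape)           # torch.Size([1, 10])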
AlexNet. Key feature: a deeper convolutional network.
Structure: convolutional layers, pooling layers, fully connected (dense) layers, ReLU activations, dropout.
AlexNet was proposed by Alex Krizhevsky et al. in 2012; it set a new record on ImageNet 2012 and brought CNN research back into the spotlight. Its overall design follows LeNet fairly closely, but many details were improved, which greatly raised classification accuracy on large datasets.
from torch import nn

class AlexNet(nn.Module):
    def __init__(self):
        super(AlexNet, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(3, 48, 11, 4),      # in_channels, out_channels, kernel_size, stride
            nn.ReLU(),
            nn.MaxPool2d(3, 2),
            nn.Conv2d(48, 128, 5, 1, 2),  # stride=1, padding=2
            nn.ReLU(),
            nn.MaxPool2d(3, 2),
            nn.Conv2d(128, 192, 3, 1, 1), # padding=1 keeps the spatial size for the 3x3 convs
            nn.ReLU(),
            nn.Conv2d(192, 192, 3, 1, 1),
            nn.ReLU(),
            nn.Conv2d(192, 128, 3, 1, 1),
            nn.ReLU(),
            nn.MaxPool2d(3, 2)
        )
        self.fc = nn.Sequential(
            nn.Linear(128 * 5 * 5, 2048), # the conv stack yields a 128x5x5 map for a 3x224x224 input
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(2048, 2048),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(2048, 1000)
        )
    def forward(self, img):
        feature = self.conv(img)
        feature = feature.view(img.shape[0], -1)
        output = self.fc(feature)
        return output

net = AlexNet()
print(net)
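A sketch of a sanity check, assuming a 3×224×224 input (the size the Linear(128*5*5, 2048) layer above is written for):

import torch
x = torch.rand(1, 3, 224, 224)  # a batch with one 224x224 RGB image
print(net(x).shape)             # torch.Size([1, 1000])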
VGG. Key feature: repeated, reusable blocks; small kernels replacing large ones.
In 2014 the Visual Geometry Group proposed the VGG architecture, which introduced a new way to improve network structure: package identical operations into a block and reuse it. It also showed that, for the same receptive field, small kernels are superior both in feature extraction and in parameter count, so every convolution kernel in VGG is 3×3.
Taking the common VGG-16 as an example:
from torch import nn

def vgg_block(num_convs, in_channels, out_channels):
    block = []  # list holding the layers of one block
    for i in range(num_convs):
        if i == 0:  # only the first conv of a block changes the channel count
            block.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1))
        else:
            block.append(nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1))
        block.append(nn.ReLU())
    block.append(nn.MaxPool2d(2, 2))  # pooling after each block halves the feature map
    return nn.Sequential(*block)      # unpack the list into a Sequential

def vgg(conv_arch):
    net = nn.Sequential()
    for i, (num_convs, in_channels, out_channels) in enumerate(conv_arch):
        net.add_module("block" + str(i), vgg_block(num_convs, in_channels, out_channels))
    net.add_module("fc", nn.Sequential(
        nn.Flatten(),  # flatten the 512x7x7 feature map before the linear layers
        nn.Linear(512 * 7 * 7, 4096),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(4096, 4096),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(4096, 10)
    ))
    return net

conv_arch = ((2, 3, 64), (2, 64, 128), (3, 128, 256), (3, 256, 512), (3, 512, 512))
vgg16 = vgg(conv_arch)
print(vgg16)
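A quick shape check, assuming a 3×224×224 input (five halvings bring 224 down to the 7×7 map expected by the first Linear layer):

import torch
x = torch.rand(1, 3, 224, 224)
print(vgg16(x).shape)  # torch.Size([1, 10])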
NiN. Key feature: a large network built by chaining many sub-networks; 1×1 convolutions replace the FC layers.
Whereas AlexNet and VGG widen and deepen the network and finish with FC layers for classification, NiN proposes a different idea: chain many identical sub-networks together to form the deep network. This means the output of each sub-network can no longer go through an FC layer, or rather that output would first have to be converted back into a multi-channel array before the next convolution. In fact a 1×1 convolution also operates across channels, and used together with flattening it is equivalent to an FC layer. In the original diagram, the red box marks the two 1×1 convolutional layers that replace the FC layers.
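A minimal sketch of that equivalence (the 64-channel 7×7 feature map below is an arbitrary example): a 1×1 convolution maps the channels to one channel per class, and global average pooling then collapses each class channel to a single score, just like an FC classification head would:

import torch
from torch import nn

x = torch.rand(2, 64, 7, 7)                 # dummy feature map: batch 2, 64 channels, 7x7
conv1x1 = nn.Conv2d(64, 10, kernel_size=1)  # 1x1 conv mixes information across channels at each position
scores = conv1x1(x).mean(dim=(2, 3))        # global average pooling over height and width
print(scores.shape)                         # torch.Size([2, 10]), one score per class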
from torch import nn
import torch.nn.functional as F

class NinBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding):
        super(NinBlock, self).__init__()
        self.block = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding),
            nn.ReLU(),
            nn.Conv2d(out_channels, out_channels, kernel_size=1),
            nn.ReLU(),
            nn.Conv2d(out_channels, out_channels, kernel_size=1),
            nn.ReLU()
        )
    def forward(self, x):
        output = self.block(x)
        return output

#########################################
class GlobalAvgPool2d(nn.Module):
    def __init__(self):
        super(GlobalAvgPool2d, self).__init__()
    def forward(self, x):
        return F.avg_pool2d(x, kernel_size=x.size()[2:])

##############################
NinNet = nn.Sequential(
    NinBlock(1, 96, kernel_size=11, stride=4, padding=0),
    nn.MaxPool2d(kernel_size=3, stride=2),
    NinBlock(96, 256, kernel_size=5, stride=1, padding=2),
    nn.MaxPool2d(kernel_size=3, stride=2),
    NinBlock(256, 384, kernel_size=3, stride=1, padding=1),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.Dropout(0.5),
    NinBlock(384, 10, kernel_size=3, stride=1, padding=1),
    GlobalAvgPool2d()
)
print(NinNet)
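A quick shape check, assuming a single-channel 224×224 input (which is what the stride-4 11×11 first block is sized for here); note the output keeps a trailing 1×1 spatial dimension that would be flattened before computing a loss:

import torch
x = torch.rand(1, 1, 224, 224)
print(NinNet(x).shape)  # torch.Size([1, 10, 1, 1])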
GoogLeNet. Key feature: parallel branches concatenated within each block.
The earlier networks mainly grow in width (number of channels) or depth (number of layers chained into a deeper network). GoogLeNet instead makes each sub-module parallel: every parallel path applies a different feature-extraction operation, yielding features at different scales. GoogLeNet is still built from many repetitions of one sub-module, the Inception block:
As the Inception diagram shows, the block extracts features along four parallel paths. The output has the same height and width as the input, i.e. the operations on the four paths use just enough padding that all paths agree in spatial size when their channels are concatenated. The 1×1 convolutions inside the block exist to change the channel counts and thereby control the block's complexity.
The Inception block:
import torch
from torch import nn

class Inception(nn.Module):
    # in_c: input channels of the block; c1-c4: output channels of the four paths
    # (paths 2 and 3 each take a pair: the 1x1 reduction channels, then the 3x3/5x5 output channels)
    def __init__(self, in_c, c1, c2, c3, c4):
        super(Inception, self).__init__()
        self.path1 = nn.Sequential(
            nn.Conv2d(in_c, c1, kernel_size=1),
            nn.ReLU()
        )
        self.path2 = nn.Sequential(
            nn.Conv2d(in_c, c2[0], kernel_size=1),
            nn.Conv2d(c2[0], c2[1], kernel_size=3, padding=1),
            nn.ReLU()
        )
        self.path3 = nn.Sequential(
            nn.Conv2d(in_c, c3[0], kernel_size=1),
            nn.Conv2d(c3[0], c3[1], kernel_size=5, padding=2),
            nn.ReLU()
        )
        self.path4 = nn.Sequential(
            nn.MaxPool2d(kernel_size=3, stride=1, padding=1),  # padding=1 keeps the spatial size
            nn.Conv2d(in_c, c4, kernel_size=1),
            nn.ReLU()
        )
    def forward(self, x):
        # concatenate the four paths along the channel dimension
        return torch.cat([self.path1(x), self.path2(x), self.path3(x), self.path4(x)], dim=1)

net = Inception(3, 10, [10, 20], [20, 10], 30)
print(net)
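A quick shape check with an arbitrary 3×32×32 input: the spatial size is preserved and the channel counts of the four paths add up (10 + 20 + 10 + 30 = 70):

x = torch.rand(1, 3, 32, 32)
print(net(x).shape)  # torch.Size([1, 70, 32, 32])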
GoogLeNet:
GoogLeNet consists of five stages:
from torch import nn

class GoogleNet(nn.Module):
    def __init__(self):
        super(GoogleNet, self).__init__()
        self.block1 = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        )
        self.block2 = nn.Sequential(
            nn.Conv2d(64, 64, kernel_size=1),
            nn.ReLU(),
            nn.Conv2d(64, 192, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        )
        self.block3 = nn.Sequential(
            Inception(192, 64, (96, 128), (16, 32), 32),
            Inception(256, 128, (128, 192), (32, 96), 64),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        )
        self.block4 = nn.Sequential(
            Inception(480, 192, (96, 208), (16, 48), 64),
            Inception(512, 160, (112, 224), (24, 64), 64),
            Inception(512, 128, (128, 256), (24, 64), 64),
            Inception(512, 112, (144, 288), (32, 64), 64),
            Inception(528, 256, (160, 320), (32, 128), 128),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        )
        self.block5 = nn.Sequential(
            Inception(832, 256, (160, 320), (32, 128), 128),
            Inception(832, 384, (192, 384), (48, 128), 128),
            nn.AvgPool2d(kernel_size=7),
            nn.Flatten(),              # flatten [N, 1024, 1, 1] before the classifier
            nn.Linear(1024, 10)
        )
    def forward(self, x):
        x = self.block1(x)
        x = self.block2(x)
        x = self.block3(x)
        x = self.block4(x)
        x = self.block5(x)
        return x

net = GoogleNet()
print(net)
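A quick shape check, assuming a single-channel 224×224 input (the 7×7 average pooling in block5 is sized for that resolution):

import torch
x = torch.rand(1, 1, 224, 224)
print(net(x).shape)  # torch.Size([1, 10])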
ResNet. Key feature: residual connections, which address the network degradation problem.
ResNet is a milestone in CNN architecture. Its main goal is to solve the degradation problem caused by making networks deeper. Unlike the earlier blocks, which output a transformation directly, each ResNet block learns a residual F(x) and outputs F(x) + x, as shown below:
As the architecture table shows, once ResNet has 50 or more layers it uses the "bottleneck" block, whose 1×1 convolutions reduce the model's complexity.
import torch
from torch import nn

class BasicBlock(nn.Module):
    # BasicBlock is the building block of ResNet-18/34
    expansion = 1  # factor between the block's inner and output channel counts
    def __init__(self, in_channels, out_channels, stride=1):
        super(BasicBlock, self).__init__()
        self.residual = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.Conv2d(out_channels, out_channels * BasicBlock.expansion, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(out_channels * BasicBlock.expansion)
        )
        # identity shortcut when x and F(x) have the same shape
        self.shortcut = nn.Sequential()
        # otherwise a 1x1 convolution matches the dimensions
        if in_channels != out_channels * BasicBlock.expansion or stride != 1:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels * BasicBlock.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels * BasicBlock.expansion)
            )
    def forward(self, x):
        return nn.ReLU(inplace=True)(self.residual(x) + self.shortcut(x))

##################################################
class Bottleneck(nn.Module):
    # Bottleneck is the building block of ResNet-50/101/152
    expansion = 4
    def __init__(self, in_channels, out_channels, stride=1):
        super(Bottleneck, self).__init__()
        self.residual = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, stride=stride, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels * Bottleneck.expansion, kernel_size=1, bias=False),
            nn.BatchNorm2d(out_channels * Bottleneck.expansion)
        )
        # identity shortcut when x and F(x) have the same shape
        self.shortcut = nn.Sequential()
        # otherwise a 1x1 convolution matches the dimensions
        if in_channels != out_channels * Bottleneck.expansion or stride != 1:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels * Bottleneck.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels * Bottleneck.expansion)
            )
    def forward(self, x):
        return nn.ReLU(inplace=True)(self.residual(x) + self.shortcut(x))

class ResNet(nn.Module):
    def __init__(self, block, num_block, num_classes=10):
        super(ResNet, self).__init__()
        self.in_channels = 64
        # stem; note the original ImageNet ResNet uses stride 2 plus a max-pool here
        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, padding=3, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True))
        self.conv2_x = self._make_layer(block, 64, num_block[0], 1)
        self.conv3_x = self._make_layer(block, 128, num_block[1], 2)
        self.conv4_x = self._make_layer(block, 256, num_block[2], 2)
        self.conv5_x = self._make_layer(block, 512, num_block[3], 2)
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)
    def _make_layer(self, block, out_channels, num_blocks, stride):
        # only the first block of a stage uses the given stride; the rest use stride 1
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_channels, out_channels, stride))
            self.in_channels = out_channels * block.expansion
        return nn.Sequential(*layers)
    def forward(self, x):
        output = self.conv1(x)
        output = self.conv2_x(output)
        output = self.conv3_x(output)
        output = self.conv4_x(output)
        output = self.conv5_x(output)
        output = self.avg_pool(output)
        output = output.view(output.size(0), -1)
        output = self.fc(output)
        return output

def resnet18():
    return ResNet(BasicBlock, [2, 2, 2, 2])
def resnet34():
    return ResNet(BasicBlock, [3, 4, 6, 3])
def resnet50():
    return ResNet(Bottleneck, [3, 4, 6, 3])
def resnet101():
    return ResNet(Bottleneck, [3, 4, 23, 3])
def resnet152():
    return ResNet(Bottleneck, [3, 8, 36, 3])
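A quick shape check for one of the factory functions, using an arbitrary 3×32×32 input (the adaptive average pooling makes the classifier independent of the input resolution):

net = resnet18()
x = torch.rand(1, 3, 32, 32)
print(net(x).shape)  # torch.Size([1, 10])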
DenseNet. Key feature: channel concatenation, plus transition layers to control the channel count.
DenseNet builds on ResNet; the main difference is that after each convolution DenseNet concatenates the input with the output (torch.cat). To keep the number of channels from exploding, a transition layer is inserted after each dense block, so the parameter count stays under control.
A transition layer is simply a 1×1 convolution that changes the channel count, followed by a pooling layer that halves the feature map.
import torch
from torch import nn

# dense block
class DenseBlock(nn.Module):
    def __init__(self, num_conv, in_channels, out_channels):
        super(DenseBlock, self).__init__()
        net = []
        for i in range(num_conv):
            # the i-th conv sees the original input plus the i previous outputs
            conv_in_channels = in_channels + i * out_channels
            net.append(self._conv_block(conv_in_channels, out_channels))
        self.net = nn.ModuleList(net)
        # channel count of the final concatenated output
        self.out_channels = in_channels + num_conv * out_channels
    # wrap the BN-ReLU-Conv trio into one block
    def _conv_block(self, in_channels, out_channels):
        conv_block = nn.Sequential(
            nn.BatchNorm2d(in_channels),
            nn.ReLU(),
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)
        )
        return conv_block
    def forward(self, x):
        # after each conv, concatenate input and output and feed them to the next conv
        for block in self.net:
            y = block(x)
            x = torch.cat((x, y), dim=1)
        return x

# transition layer
def transitionLayer(in_channels, out_channels):
    net = nn.Sequential(
        nn.BatchNorm2d(in_channels),
        nn.ReLU(),
        nn.Conv2d(in_channels, out_channels, kernel_size=1),
        nn.AvgPool2d(kernel_size=2, stride=2)
    )
    return net

########################################
class DenseNet(nn.Module):
    # in_channels: channels entering the first dense block, after a ResNet-style 7x7 conv + max-pool stem
    # growth_rate: channels added by each conv (the out_channels of DenseBlock)
    # num_convs: number of conv layers in each dense block
    def __init__(self, in_channels, growth_rate, num_convs):
        super(DenseNet, self).__init__()
        # e.g. in_channels, growth_rate = 64, 32
        #      num_convs = [4, 4, 4, 4]
        self.net = nn.Sequential()
        conv1 = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        )
        self.net.add_module("Block1", conv1)
        for i, num_conv in enumerate(num_convs):
            block = DenseBlock(num_conv, in_channels, growth_rate)
            self.net.add_module("Block%d" % (i + 2), block)
            in_channels = block.out_channels
            # insert a transition layer after every dense block except the last
            if i != len(num_convs) - 1:
                self.net.add_module("TransitionLayer%d" % i, transitionLayer(in_channels, in_channels // 2))
                in_channels = in_channels // 2
        # finally, as in ResNet, global average pooling followed by an FC classifier
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(in_channels, 10)
    def forward(self, x):
        x = self.net(x)
        output = self.avg_pool(x)
        output = output.view(output.size(0), -1)
        output = self.fc(output)
        return output

in_channels, growth_rate = 64, 32
num_convs = [4, 4, 4, 4]
net = DenseNet(in_channels, growth_rate, num_convs)
print(net)
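A quick shape check, using an arbitrary single-channel 96×96 input (the stem above expects one input channel):

x = torch.rand(1, 1, 96, 96)
print(net(x).shape)  # torch.Size([1, 10])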
These classic CNN architectures are rarely used on their own any more; mostly they serve as backbones for newer algorithms and structures, though they are still common in basic image classification. The open-source classification project below contains PyTorch implementations of many more CNN architectures for reference:
GitHub: weiaicunzai/pytorch-cifar100