【Part 1】 Summary of questions
1. Why does each training run end with a slightly different result? The differences are small, but the numbers are not stable. (See the seeding sketch after this list.)
2. How can we tell whether a modified network actually brings an improvement?
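On question 1: the run-to-run variation usually comes from unfixed sources of randomness (weight initialization, data shuffling, dropout, cuDNN's non-deterministic kernels). A minimal sketch of pinning the seeds in PyTorch (my own addition, not part of the original notebooks):

import random
import numpy as np
import torch

def set_seed(seed=0):
    # fix every source of randomness we control
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # ask cuDNN for deterministic kernels (may slow training down)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(0)

On question 2: with the seeds, data split and training schedule held fixed, comparing the test accuracy of the old and the modified network over a few repeated runs is the usual way to tell whether a change is a real improvement rather than noise.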
【Part 2】 Paper and code exercises
MobileNetV1 network
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
import torch.optim as optim

class Block(nn.Module):
    '''Depthwise conv + Pointwise conv'''
    def __init__(self, in_planes, out_planes, stride=1):
        super(Block, self).__init__()
        # Depthwise convolution: 3x3 kernel with groups=in_planes, so each channel is convolved separately
        self.conv1 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride,
                               padding=1, groups=in_planes, bias=False)
        self.bn1 = nn.BatchNorm2d(in_planes)
        # Pointwise convolution: 1x1 kernel
        self.conv2 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn2 = nn.BatchNorm2d(out_planes)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = F.relu(self.bn2(self.conv2(out)))
        return out
class MobileNetV1(nn.Module):
    # (128,2) means conv planes=128, stride=2
    cfg = [(64,1), (128,2), (128,1), (256,2), (256,1),
           (512,2), (512,1), (1024,2), (1024,1)]

    def __init__(self, num_classes=10):
        super(MobileNetV1, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.layers = self._make_layers(in_planes=32)
        self.linear = nn.Linear(1024, num_classes)

    def _make_layers(self, in_planes):
        layers = []
        for x in self.cfg:
            out_planes = x[0]
            stride = x[1]
            layers.append(Block(in_planes, out_planes, stride))
            in_planes = out_planes
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layers(out)
        out = F.avg_pool2d(out, 2)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out
for epoch in range(10):  # loop over the training set multiple times
    for i, (inputs, labels) in enumerate(trainloader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        if i % 100 == 0:
            print('Epoch: %d Minibatch: %5d loss: %.3f' % (epoch + 1, i + 1, loss.item()))

print('Finished Training')
correct = 0
total = 0
for data in testloader:
    images, labels = data
    images, labels = images.to(device), labels.to(device)
    outputs = net(images)
    _, predicted = torch.max(outputs.data, 1)
    total += labels.size(0)
    correct += (predicted == labels).sum().item()

print('Accuracy of the network on the 10000 test images: %.2f %%' % (100 * correct / total))
Core idea of the paper:
The paper reworks the standard convolution operation so that the amount of computation is reduced.
Suppose that in a standard convolution the kernel tensor G has size Dk * Dk * M * N, where Dk is the kernel width and height, M is the number of input channels, and N is the number of kernels (output channels); let Df * Df be the width and height of the output feature map.
A standard convolution then requires:
Multiplications: Dk * Dk * M * N * Df * Df; additions: (Dk * Dk * M - 1) * N * Df * Df, since each output value sums Dk * Dk * M products.
Because the multiplications dominate the cost, the cost of a standard convolution is taken as Cost A = Dk * Dk * M * N * Df * Df.
Here Dk * Dk * M is the number of multiplications a kernel performs at each position it slides to (i.e. the size of one kernel), and Df * Df is the number of positions each kernel is applied at, i.e. the width * height of the output feature map.
The depthwise separable convolution proposed in the MobileNets paper consists of two parts:
Part 1: depthwise convolution, i.e. a convolution applied to the 2-D plane of each channel separately.
The kernels in this part are 2-D (their depth is 1), and there are as many of them as the input has channels.
Part 2: pointwise convolution, i.e. a convolution over all channels at each spatial position.
The kernels in this part are 3-D, but their width and height are both 1, so each kernel has size 1 * 1 * M; there are N of them, one per output channel, and each one is applied at every spatial position (width * height positions) of the feature map.
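Putting the two parts together, the depthwise separable convolution costs Dk * Dk * M * Df * Df (depthwise) + M * N * Df * Df (pointwise) multiplications, i.e. a fraction 1/N + 1/(Dk * Dk) of the standard convolution. A quick numerical check of this ratio with made-up layer sizes (my own sketch, not part of the original notebook):

Df, Dk, M, N = 14, 3, 256, 256   # hypothetical layer: 14x14 output, 3x3 kernels, 256 in/out channels

standard  = Dk * Dk * M * N * Df * Df                  # standard convolution multiplications
separable = Dk * Dk * M * Df * Df + M * N * Df * Df    # depthwise + pointwise multiplications

print(standard)              # 115605504
print(separable)             # 13296640
print(separable / standard)  # ~0.115 = 1/N + 1/Dk**2

With 3x3 kernels the separable version needs roughly 8 to 9 times fewer multiplications, which matches the paper's claim.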
Problems left over from MobileNetV1
1. Structural problems:
The structure of MobileNet V1 is actually very simple: the paper uses a rather old-fashioned straight pipeline without branches, similar to VGG. The cost-effectiveness of this structure is not high.
Depthwise convolution does drastically reduce computation, and the NxN depthwise + 1x1 pointwise structure can come close to an NxN standard convolution in accuracy. In practice, however, the depthwise part is easy to "train dead": after training, quite a few of the learned depthwise kernels turn out to be empty (all zeros). The usual explanation is that each depthwise kernel has far fewer weights than a regular convolution kernel; with such a small kernel dimension, plus the effect of ReLU, the neuron output easily becomes 0. ReLU has zero gradient at zero output, so once a kernel falls into the all-zero regime it cannot recover. This problem is further amplified in fixed-point, low-precision training.
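A small diagnostic one could run on the trained MobileNetV1 above to check this claim (my own sketch; the helper name and the eps threshold are made up). It walks the model, finds the depthwise layers by groups == in_channels, and counts 3x3 kernels whose weights are essentially all zero:

import torch
import torch.nn as nn

def count_dead_depthwise_kernels(model, eps=1e-8):
    dead, total = 0, 0
    for m in model.modules():
        # depthwise conv layers are the ones with groups == in_channels
        if isinstance(m, nn.Conv2d) and m.groups > 1 and m.groups == m.in_channels:
            w = m.weight.detach()               # shape: (channels, 1, k, k)
            norms = w.abs().sum(dim=(1, 2, 3))  # one L1 norm per depthwise kernel
            dead += (norms < eps).sum().item()
            total += w.size(0)
    return dead, total

# dead, total = count_dead_depthwise_kernels(net)
# print('dead depthwise kernels: %d / %d' % (dead, total))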
MobileNetV2
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
import torch.optim as optim

class Block(nn.Module):
    '''expand + depthwise + pointwise'''
    def __init__(self, in_planes, out_planes, expansion, stride):
        super(Block, self).__init__()
        self.stride = stride

        # the expansion factor increases the number of feature maps
        planes = expansion * in_planes
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1,
                               groups=planes, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)

        # stride 1, different in/out channel counts: use a 1x1 conv on the shortcut to match channels
        if stride == 1 and in_planes != out_planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False),
                nn.BatchNorm2d(out_planes))
        # stride 1, same in/out channel counts: pass the input through unchanged
        if stride == 1 and in_planes == out_planes:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = F.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        # stride 1: add the shortcut connection
        if self.stride == 1:
            return out + self.shortcut(x)
        # stride 2: just return the output
        else:
            return out
class MobileNetV2(nn.Module):
    # (expansion, out_planes, num_blocks, stride)
    cfg = [(1,  16, 1, 1),
           (6,  24, 2, 1),
           (6,  32, 3, 2),
           (6,  64, 4, 2),
           (6,  96, 3, 1),
           (6, 160, 3, 2),
           (6, 320, 1, 1)]

    def __init__(self, num_classes=10):
        super(MobileNetV2, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.layers = self._make_layers(in_planes=32)
        self.conv2 = nn.Conv2d(320, 1280, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn2 = nn.BatchNorm2d(1280)
        self.linear = nn.Linear(1280, num_classes)

    def _make_layers(self, in_planes):
        layers = []
        for expansion, out_planes, num_blocks, stride in self.cfg:
            strides = [stride] + [1] * (num_blocks - 1)
            for stride in strides:
                layers.append(Block(in_planes, out_planes, expansion, stride))
                in_planes = out_planes
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layers(out)
        out = F.relu(self.bn2(self.conv2(out)))
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out
correct = 0
total = 0
for data in testloader:
    images, labels = data
    images, labels = images.to(device), labels.to(device)
    outputs = net(images)
    _, predicted = torch.max(outputs.data, 1)
    total += labels.size(0)
    correct += (predicted == labels).sum().item()

print('Accuracy of the network on the 10000 test images: %.2f %%' % (100 * correct / total))
MobileNet V1 does not make use of residual connections, and residual connections are almost always beneficial, so MobileNet V2 adds them.
The new ideas in V2 are the Linear Bottleneck and Inverted Residuals.
In a classical residual bottleneck, a 1x1 convolution first reduces the channel count (followed by ReLU), then a 3x3 convolution does the spatial filtering (followed by ReLU), and another 1x1 convolution restores the channel count before adding the result to the input. The 1x1 channel reduction exists to save computation; otherwise the 3x3 spatial convolution in the middle would be too expensive. The residual block is therefore shaped like an hourglass: wide at both ends, narrow in the middle. In MobileNetV2 the middle 3x3 convolution becomes depthwise, so its cost is very small and the middle can afford more channels, which works better. The block therefore first raises the channel count with a 1x1 convolution, then applies a depthwise 3x3 spatial convolution, and finally uses a 1x1 convolution to bring the dimension back down. Both ends have few channels, so the 1x1 convolutions that expand or reduce channels are cheap, and although the middle has many channels, the depthwise convolution there is also cheap. Narrow at both ends and wide in the middle: good performance at a small computational cost.
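A quick shape walk-through of the Block defined above with expansion 6 illustrates this narrow - wide - narrow layout (my own sanity check; the channel count 24 and the 32x32 input are just an example):

block = Block(in_planes=24, out_planes=24, expansion=6, stride=1)
x = torch.randn(1, 24, 32, 32)      # narrow input: 24 channels
print(block.conv1.out_channels)     # 144: the 1x1 expansion makes the middle wide
print(block.conv2.groups)           # 144: the 3x3 conv is depthwise, one filter per channel
print(block(x).shape)               # torch.Size([1, 24, 32, 32]): narrow again, plus the shortcut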
HybridSN
class_num = 16

class HybridSN(nn.Module):
    def __init__(self):
        super(HybridSN, self).__init__()
        self.L = 30
        self.S = 25
        self.conv1 = nn.Conv3d(1, 8, kernel_size=(7, 3, 3), stride=1, padding=0)
        self.conv2 = nn.Conv3d(8, 16, kernel_size=(5, 3, 3), stride=1, padding=0)
        self.conv3 = nn.Conv3d(16, 32, kernel_size=(3, 3, 3), stride=1, padding=0)

        inputX = self.get2Dinput()
        inputConv4 = inputX.shape[1] * inputX.shape[2]
        self.conv4 = nn.Conv2d(inputConv4, 64, kernel_size=(3, 3), stride=1, padding=0)

        num = inputX.shape[3] - 2  # after the 2-D conv the feature map is (64, 17, 17) --> num = 17
        inputFc1 = 64 * num * num
        self.fc1 = nn.Linear(inputFc1, 256)  # 64 * 17 * 17 = 18496
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, class_num)
        self.dropout = nn.Dropout(0.4)

    def get2Dinput(self):
        # dummy forward pass through the 3-D convs to get the shape fed into conv4
        with torch.no_grad():
            x = torch.zeros((1, 1, self.L, self.S, self.S))
            x = self.conv1(x)
            x = self.conv2(x)
            x = self.conv3(x)
        return x

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        # merge the spectral dimension into the channel dimension before the 2-D conv
        x = x.view(x.shape[0], -1, x.shape[3], x.shape[4])
        x = F.relu(self.conv4(x))
        x = x.view(-1, x.shape[1] * x.shape[2] * x.shape[3])
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

# random input to check that the network structure runs
# x = torch.randn(1, 1, 30, 25, 25)
# net = HybridSN()
# y = net(x)
import numpy as np
import scipy.io as sio
import spectral
import torch
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

# apply a PCA transform to the hyperspectral data X
def applyPCA(X, numComponents):
    newX = np.reshape(X, (-1, X.shape[2]))
    pca = PCA(n_components=numComponents, whiten=True)
    newX = pca.fit_transform(newX)
    newX = np.reshape(newX, (X.shape[0], X.shape[1], numComponents))
    return newX

# when extracting a patch around a pixel, border pixels cannot be used directly,
# so pad the image with zeros first
def padWithZeros(X, margin=2):
    newX = np.zeros((X.shape[0] + 2 * margin, X.shape[1] + 2 * margin, X.shape[2]))
    x_offset = margin
    y_offset = margin
    newX[x_offset:X.shape[0] + x_offset, y_offset:X.shape[1] + y_offset, :] = X
    return newX

# extract a patch around every pixel and assemble the cubes (keras-style channels-last format)
def createImageCubes(X, y, windowSize=5, removeZeroLabels=True):
    # pad X
    margin = int((windowSize - 1) / 2)
    zeroPaddedX = padWithZeros(X, margin=margin)
    # split patches
    patchesData = np.zeros((X.shape[0] * X.shape[1], windowSize, windowSize, X.shape[2]))
    patchesLabels = np.zeros((X.shape[0] * X.shape[1]))
    patchIndex = 0
    for r in range(margin, zeroPaddedX.shape[0] - margin):
        for c in range(margin, zeroPaddedX.shape[1] - margin):
            patch = zeroPaddedX[r - margin:r + margin + 1, c - margin:c + margin + 1]
            patchesData[patchIndex, :, :, :] = patch
            patchesLabels[patchIndex] = y[r - margin, c - margin]
            patchIndex = patchIndex + 1
    if removeZeroLabels:
        patchesData = patchesData[patchesLabels > 0, :, :, :]
        patchesLabels = patchesLabels[patchesLabels > 0]
        patchesLabels -= 1
    return patchesData, patchesLabels

def splitTrainTestSet(X, y, testRatio, randomState=345):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=testRatio,
                                                        random_state=randomState, stratify=y)
    return X_train, X_test, y_train, y_test
# number of land-cover classes
class_num = 16
X = sio.loadmat('Indian_pines_corrected.mat')['indian_pines_corrected']
y = sio.loadmat('Indian_pines_gt.mat')['indian_pines_gt']

# fraction of samples used for testing
test_ratio = 0.90
# size of the patch extracted around each pixel
patch_size = 25
# number of principal components kept by PCA
pca_components = 30

print('Hyperspectral data shape: ', X.shape)
print('Label shape: ', y.shape)

print('\n... ... PCA tranformation ... ...')
X_pca = applyPCA(X, numComponents=pca_components)
print('Data shape after PCA: ', X_pca.shape)

print('\n... ... create data cubes ... ...')
X_pca, y = createImageCubes(X_pca, y, windowSize=patch_size)
print('Data cube X shape: ', X_pca.shape)
print('Data cube y shape: ', y.shape)

print('\n... ... create train & test data ... ...')
Xtrain, Xtest, ytrain, ytest = splitTrainTestSet(X_pca, y, test_ratio)
print('Xtrain shape: ', Xtrain.shape)
print('Xtest  shape: ', Xtest.shape)

# reshape Xtrain, Xtest into the (keras-style) channels-last cube format
Xtrain = Xtrain.reshape(-1, patch_size, patch_size, pca_components, 1)
Xtest = Xtest.reshape(-1, patch_size, patch_size, pca_components, 1)
print('before transpose: Xtrain shape: ', Xtrain.shape)
print('before transpose: Xtest  shape: ', Xtest.shape)

# transpose so the data fit the pytorch layout: (batch, channel, depth, height, width)
Xtrain = Xtrain.transpose(0, 4, 3, 1, 2)
Xtest = Xtest.transpose(0, 4, 3, 1, 2)
print('after transpose: Xtrain shape: ', Xtrain.shape)
print('after transpose: Xtest  shape: ', Xtest.shape)

""" Training dataset"""
class TrainDS(torch.utils.data.Dataset):
    def __init__(self):
        self.len = Xtrain.shape[0]
        self.x_data = torch.FloatTensor(Xtrain)
        self.y_data = torch.LongTensor(ytrain)
    def __getitem__(self, index):
        # return the sample and its label for the given index
        return self.x_data[index], self.y_data[index]
    def __len__(self):
        # number of samples
        return self.len

""" Testing dataset"""
class TestDS(torch.utils.data.Dataset):
    def __init__(self):
        self.len = Xtest.shape[0]
        self.x_data = torch.FloatTensor(Xtest)
        self.y_data = torch.LongTensor(ytest)
    def __getitem__(self, index):
        # return the sample and its label for the given index
        return self.x_data[index], self.y_data[index]
    def __len__(self):
        # number of samples
        return self.len

# create trainloader and testloader
trainset = TrainDS()
testset = TestDS()
train_loader = torch.utils.data.DataLoader(dataset=trainset, batch_size=128, shuffle=True, num_workers=2)
test_loader = torch.utils.data.DataLoader(dataset=testset, batch_size=128, shuffle=False, num_workers=2)
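The training cell for HybridSN is not reproduced above, but the prediction code below assumes a trained net on device. A minimal sketch in the spirit of the MobileNet training loop, reusing the imports from the earlier cells (the optimizer choice, learning rate and epoch count are my assumptions):

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
net = HybridSN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)

for epoch in range(100):
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print('Epoch: %d  loss: %.4f' % (epoch + 1, loss.item()))
print('Finished Training')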
# load the original image
X = sio.loadmat('Indian_pines_corrected.mat')['indian_pines_corrected']
y = sio.loadmat('Indian_pines_gt.mat')['indian_pines_gt']

height = y.shape[0]
width = y.shape[1]

X = applyPCA(X, numComponents=pca_components)
X = padWithZeros(X, patch_size // 2)

# predict the class of every labelled pixel
net.eval()  # switch off dropout for inference, otherwise the predictions vary between runs
outputs = np.zeros((height, width))
for i in range(height):
    for j in range(width):
        if int(y[i, j]) == 0:
            continue
        else:
            image_patch = X[i:i + patch_size, j:j + patch_size, :]
            image_patch = image_patch.reshape(1, image_patch.shape[0], image_patch.shape[1],
                                              image_patch.shape[2], 1)
            X_test_image = torch.FloatTensor(image_patch.transpose(0, 4, 3, 1, 2)).to(device)
            prediction = net(X_test_image)
            prediction = np.argmax(prediction.detach().cpu().numpy(), axis=1)
            outputs[i][j] = prediction + 1
    if i % 20 == 0:
        print('... ... row ', i, ' handling ... ...')
predict_image = spectral.imshow(classes=outputs.astype(int), figsize=(5, 5))
Background: land-cover classification of hyperspectral remote-sensing images (HSI, HyperSpectral Image) is an important problem. The earliest methods took all the channels of each pixel, i.e. its spectrum, as the input feature and classified it with methods such as SVM. The results were unsatisfactory, because in practice the same class can show different spectra and different classes can share the same spectrum. Later, spatial information was brought in and spectral-spatial features were extracted, which gave a large improvement. There are also methods that use morphological operators to extract texture features.
In recent years the rise of deep learning has pushed the field towards deep networks, mostly based on local patch-based samples: the patch around a pixel is used to extract that pixel's features, mapping points in HSI space into a feature space that is easier to classify. The resulting features therefore carry both spectral and spatial information.
Using only a 2D-CNN or only a 3D-CNN has drawbacks: the former loses the relationships between spectral channels, while the latter makes the model very complex, and both prevent higher accuracy on hyperspectral images. The main reason is that hyperspectral images are volumetric data with a spectral dimension: a 2D-CNN alone cannot extract discriminative feature maps from the spectral dimension, while a deep 3D-CNN is computationally much more complex and seems to perform worse when classes share similar textures across many spectral bands. This motivates the hybrid CNN model: 3D-CNN and 2D-CNN layers are combined so that the spectral and spatial feature maps are fully exploited and the highest possible accuracy can be reached.