第二部分 代码练习
-
MobileNetV1网络:
import torch import torch.nn as nn import torch.nn.functional as F import torchvision import torchvision.transforms as transforms import matplotlib.pyplot as plt import numpy as np import torch.optim as optim class Block(nn.Module): '''Depthwise conv + Pointwise conv''' def __init__(self, in_planes, out_planes, stride=1): super(Block, self).__init__() # Depthwise 卷积,3*3 的卷积核,分为 in_planes,即各层单独进行卷积 self.conv1 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride, padding=1, groups=in_planes, bias=False) self.bn1 = nn.BatchNorm2d(in_planes) # Pointwise 卷积,1*1 的卷积核 self.conv2 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False) self.bn2 = nn.BatchNorm2d(out_planes) def forward(self, x): out = F.relu(self.bn1(self.conv1(x))) out = F.relu(self.bn2(self.conv2(out))) return out device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") transform_train = transforms.Compose([ transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))]) transform_test = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))]) trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train) testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test) trainloader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True, num_workers=2) testloader = torch.utils.data.DataLoader(testset, batch_size=128, shuffle=False, num_workers=2) class MobileNetV1(nn.Module): # (128,2) means conv planes=128, stride=2 cfg = [(64,1), (128,2), (128,1), (256,2), (256,1), (512,2), (512,1), (1024,2), (1024,1)] def __init__(self, num_classes=10): super(MobileNetV1, self).__init__() self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False) self.bn1 = nn.BatchNorm2d(32) self.layers = self._make_layers(in_planes=32) self.linear = nn.Linear(1024, num_classes) def _make_layers(self, in_planes): layers = [] for x in self.cfg: out_planes = x[0] stride = x[1] layers.append(Block(in_planes, out_planes, stride)) in_planes = out_planes return nn.Sequential(*layers) def forward(self, x): out = F.relu(self.bn1(self.conv1(x))) out = self.layers(out) out = F.avg_pool2d(out, 2) out = out.view(out.size(0), -1) out = self.linear(out) return out # 网络放到GPU上 net = MobileNetV1().to(device) criterion = nn.CrossEntropyLoss() optimizer = optim.Adam(net.parameters(), lr=0.001) for epoch in range(10): # 重复多轮训练 for i, (inputs, labels) in enumerate(trainloader): inputs = inputs.to(device) labels = labels.to(device) # 优化器梯度归零 optimizer.zero_grad() # 正向传播 + 反向传播 + 优化 outputs = net(inputs) loss = criterion(outputs, labels) loss.backward() optimizer.step() # 输出统计信息 if i % 100 == 0: print('Epoch: %d Minibatch: %5d loss: %.3f' %(epoch + 1, i + 1, loss.item())) print('Finished Training')
训练结果:
模型测试
correct = 0
total = 0
for data in testloader:
images, labels = data
images, labels = images.to(device), labels.to(device)
outputs = net(images)
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
print('Accuracy of the network on the 10000 test images: %.2f %%' % (
100 * correct / total))
Accuracy of the network on the 10000 test images: 79.04 %
-
MobileNetV2网络:
import torch import torch.nn as nn import torch.nn.functional as F import torchvision import torchvision.transforms as transforms import matplotlib.pyplot as plt import numpy as np import torch.optim as optim class Block(nn.Module): '''expand + depthwise + pointwise''' def __init__(self, in_planes, out_planes, expansion, stride): super(Block, self).__init__() self.stride = stride # 通过 expansion 增大 feature map 的数量 planes = expansion * in_planes self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, stride=1, padding=0, bias=False) self.bn1 = nn.BatchNorm2d(planes) self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, groups=planes, bias=False) self.bn2 = nn.BatchNorm2d(planes) self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False) self.bn3 = nn.BatchNorm2d(out_planes) # 步长为 1 时,如果 in 和 out 的 feature map 通道不同,用一个卷积改变通道数 if stride == 1 and in_planes != out_planes: self.shortcut = nn.Sequential( nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False), nn.BatchNorm2d(out_planes)) # 步长为 1 时,如果 in 和 out 的 feature map 通道相同,直接返回输入 if stride == 1 and in_planes == out_planes: self.shortcut = nn.Sequential() def forward(self, x): out = F.relu(self.bn1(self.conv1(x))) out = F.relu(self.bn2(self.conv2(out))) out = self.bn3(self.conv3(out)) # 步长为1,加 shortcut 操作 if self.stride == 1: return out + self.shortcut(x) # 步长为2,直接输出 else: return out class MobileNetV2(nn.Module): # (expansion, out_planes, num_blocks, stride) cfg = [(1, 16, 1, 1), (6, 24, 2, 1), (6, 32, 3, 2), (6, 64, 4, 2), (6, 96, 3, 1), (6, 160, 3, 2), (6, 320, 1, 1)] def __init__(self, num_classes=10): super(MobileNetV2, self).__init__() self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False) self.bn1 = nn.BatchNorm2d(32) self.layers = self._make_layers(in_planes=32) self.conv2 = nn.Conv2d(320, 1280, kernel_size=1, stride=1, padding=0, bias=False) self.bn2 = nn.BatchNorm2d(1280) self.linear = nn.Linear(1280, num_classes) def _make_layers(self, in_planes): layers = [] for expansion, out_planes, num_blocks, stride in self.cfg: strides = [stride] + [1]*(num_blocks-1) for stride in strides: layers.append(Block(in_planes, out_planes, expansion, stride)) in_planes = out_planes return nn.Sequential(*layers) def forward(self, x): out = F.relu(self.bn1(self.conv1(x))) out = self.layers(out) out = F.relu(self.bn2(self.conv2(out))) out = F.avg_pool2d(out, 4) out = out.view(out.size(0), -1) out = self.linear(out) return out device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") transform_train = transforms.Compose([ transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))]) transform_test = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))]) trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train) testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test) trainloader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True, num_workers=2) testloader = torch.utils.data.DataLoader(testset, batch_size=128, shuffle=False, num_workers=2) # 网络放到GPU上,模型训练 net = MobileNetV2().to(device) criterion = nn.CrossEntropyLoss() optimizer = optim.Adam(net.parameters(), lr=0.001) for epoch in range(10): # 重复多轮训练 for i, (inputs, labels) in enumerate(trainloader): inputs = inputs.to(device) labels = labels.to(device) # 优化器梯度归零 optimizer.zero_grad() # 正向传播 + 反向传播 + 优化 outputs = net(inputs) loss = criterion(outputs, labels) loss.backward() optimizer.step() # 输出统计信息 if i % 100 == 0: print('Epoch: %d Minibatch: %5d loss: %.3f' %(epoch + 1, i + 1, loss.item())) print('Finished Training')
模型测试
correct = 0
total = 0
for data in testloader:
images, labels = data
images, labels = images.to(device), labels.to(device)
outputs = net(images)
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
print('Accuracy of the network on the 10000 test images: %.2f %%' % (
100 * correct / total))
Accuracy of the network on the 10000 test images: 81.55 %
-
HybridSN高光谱分类网络:
! wget http://www.ehu.eus/ccwintco/uploads/6/67/Indian_pines_corrected.mat ! wget http://www.ehu.eus/ccwintco/uploads/c/c4/Indian_pines_gt.mat ! pip install spectral
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, cohen_kappa_score
import spectral
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
class_num = 16
class HybridSN(nn.Module):
def __init__(self, stride=1):
super(HybridSN, self).__init__()
#conv1:(1, 30, 25, 25), 8个 7x3x3 的卷积核 ==>(8, 24, 23, 23)
self.conv1 = nn.Conv3d(1, 8, kernel_size=(7,3,3), stride=stride, padding=0)
self.relu1 = nn.ReLU()
#conv2:(8, 24, 23, 23), 16个 5x3x3 的卷积核 ==>(16, 20, 21, 21)
self.conv2 = nn.Conv3d(8, 16, kernel_size=(5,3,3), stride=stride, padding=0)
self.relu2 = nn.ReLU()
#conv3:(16, 20, 21, 21),32个 3x3x3 的卷积核 ==>(32, 18, 19, 19)
self.conv3 = nn.Conv3d(16, 32, kernel_size=(3,3,3), stride=stride, padding=0)
self.relu3 = nn.ReLU()
#二维卷积:(576, 19, 19) 64个 3x3 的卷积核,得到 (64, 17, 17)
self.conv4 = nn.Conv2d(576, 64, kernel_size=(3,3), stride=stride, padding=0)
self.relu4 = nn.ReLU()
#接下来是一个 flatten 操作,变为 18496 维的向量,
#接下来依次为256,128节点的全连接层,都使用比例为0.4的 Dropout,
#最后输出为 16 个节点,是最终的分类类别数
self.fn1 = nn.Linear(18496,256)
self.fn2 = nn.Linear(256,128)
self.fn3 = nn.Linear(128,16)
self.drop = nn.Dropout(0.4)
def forward(self, x):
out = self.relu1(self.conv1(x))
out = self.relu2(self.conv2(out))
out = self.relu3(self.conv3(out))
out = out.view(out.shape[0], out.shape[1] * out.shape[2], x.shape[3], x.shape[4])
out = self.relu4(self.conv4(out))
out = out.reshape(out.shape[0],-1)
out = self.drop(self.fn1(out))
out = self.drop(self.fn2(out))
out = self.fn3(out)
return out
# 随机输入,测试网络结构是否通
# x = torch.randn(1, 1, 30, 25, 25)
# net = HybridSN()
# y = net(x)
# print(y.shape)
torch.Size([1, 16])
创建数据集
# 对高光谱数据 X 应用 PCA 变换
def applyPCA(X, numComponents):
newX = np.reshape(X, (-1, X.shape[2]))
pca = PCA(n_components=numComponents, whiten=True)
newX = pca.fit_transform(newX)
newX = np.reshape(newX, (X.shape[0], X.shape[1], numComponents))
return newX
# 对单个像素周围提取 patch 时,边缘像素就无法取了,因此,给这部分像素进行 padding 操作
def padWithZeros(X, margin=2):
newX = np.zeros((X.shape[0] + 2 * margin, X.shape[1] + 2* margin, X.shape[2]))
x_offset = margin
y_offset = margin
newX[x_offset:X.shape[0] + x_offset, y_offset:X.shape[1] + y_offset, :] = X
return newX
# 在每个像素周围提取 patch ,然后创建成符合 keras 处理的格式
def createImageCubes(X, y, windowSize=5, removeZeroLabels = True):
# 给 X 做 padding
margin = int((windowSize - 1) / 2)
zeroPaddedX = padWithZeros(X, margin=margin)
# split patches
patchesData = np.zeros((X.shape[0] * X.shape[1], windowSize, windowSize, X.shape[2]))
patchesLabels = np.zeros((X.shape[0] * X.shape[1]))
patchIndex = 0
for r in range(margin, zeroPaddedX.shape[0] - margin):
for c in range(margin, zeroPaddedX.shape[1] - margin):
patch = zeroPaddedX[r - margin:r + margin + 1, c - margin:c + margin + 1]
patchesData[patchIndex, :, :, :] = patch
patchesLabels[patchIndex] = y[r-margin, c-margin]
patchIndex = patchIndex + 1
if removeZeroLabels:
patchesData = patchesData[patchesLabels>0,:,:,:]
patchesLabels = patchesLabels[patchesLabels>0]
patchesLabels -= 1
return patchesData, patchesLabels
def splitTrainTestSet(X, y, testRatio, randomState=345):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=testRatio, random_state=randomState, stratify=y)
return X_train, X_test, y_train, y_test
# 地物类别
class_num = 16
X = sio.loadmat('Indian_pines_corrected.mat')['indian_pines_corrected']
y = sio.loadmat('Indian_pines_gt.mat')['indian_pines_gt']
# 用于测试样本的比例
test_ratio = 0.90
# 每个像素周围提取 patch 的尺寸
patch_size = 25
# 使用 PCA 降维,得到主成分的数量
pca_components = 30
print('Hyperspectral data shape: ', X.shape)
print('Label shape: ', y.shape)
print('\n... ... PCA tranformation ... ...')
X_pca = applyPCA(X, numComponents=pca_components)
print('Data shape after PCA: ', X_pca.shape)
print('\n... ... create data cubes ... ...')
X_pca, y = createImageCubes(X_pca, y, windowSize=patch_size)
print('Data cube X shape: ', X_pca.shape)
print('Data cube y shape: ', y.shape)
print('\n... ... create train & test data ... ...')
Xtrain, Xtest, ytrain, ytest = splitTrainTestSet(X_pca, y, test_ratio)
print('Xtrain shape: ', Xtrain.shape)
print('Xtest shape: ', Xtest.shape)
# 改变 Xtrain, Ytrain 的形状,以符合 keras 的要求
Xtrain = Xtrain.reshape(-1, patch_size, patch_size, pca_components, 1)
Xtest = Xtest.reshape(-1, patch_size, patch_size, pca_components, 1)
print('before transpose: Xtrain shape: ', Xtrain.shape)
print('before transpose: Xtest shape: ', Xtest.shape)
# 为了适应 pytorch 结构,数据要做 transpose
Xtrain = Xtrain.transpose(0, 4, 3, 1, 2)
Xtest = Xtest.transpose(0, 4, 3, 1, 2)
print('after transpose: Xtrain shape: ', Xtrain.shape)
print('after transpose: Xtest shape: ', Xtest.shape)
""" Training dataset"""
class TrainDS(torch.utils.data.Dataset):
def __init__(self):
self.len = Xtrain.shape[0]
self.x_data = torch.FloatTensor(Xtrain)
self.y_data = torch.LongTensor(ytrain)
def __getitem__(self, index):
# 根据索引返回数据和对应的标签
return self.x_data[index], self.y_data[index]
def __len__(self):
# 返回文件数据的数目
return self.len
""" Testing dataset"""
class TestDS(torch.utils.data.Dataset):
def __init__(self):
self.len = Xtest.shape[0]
self.x_data = torch.FloatTensor(Xtest)
self.y_data = torch.LongTensor(ytest)
def __getitem__(self, index):
# 根据索引返回数据和对应的标签
return self.x_data[index], self.y_data[index]
def __len__(self):
# 返回文件数据的数目
return self.len
# 创建 trainloader 和 testloader
trainset = TrainDS()
testset = TestDS()
train_loader = torch.utils.data.DataLoader(dataset=trainset, batch_size=128, shuffle=True, num_workers=2)
test_loader = torch.utils.data.DataLoader(dataset=testset, batch_size=128, shuffle=False, num_workers=2)
Hyperspectral data shape: (145, 145, 200)
Label shape: (145, 145)
... ... PCA tranformation ... ...
Data shape after PCA: (145, 145, 30)
... ... create data cubes ... ...
Data cube X shape: (10249, 25, 25, 30)
Data cube y shape: (10249,)
... ... create train & test data ... ...
Xtrain shape: (1024, 25, 25, 30)
Xtest shape: (9225, 25, 25, 30)
before transpose: Xtrain shape: (1024, 25, 25, 30, 1)
before transpose: Xtest shape: (9225, 25, 25, 30, 1)
after transpose: Xtrain shape: (1024, 1, 30, 25, 25)
after transpose: Xtest shape: (9225, 1, 30, 25, 25)
开始训练
# 使用GPU训练,可以在菜单 "代码执行工具" -> "更改运行时类型" 里进行设置
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# 网络放到GPU上
net = HybridSN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)
# 开始训练
total_loss = 0
for epoch in range(100):
for i, (inputs, labels) in enumerate(train_loader):
inputs = inputs.to(device)
labels = labels.to(device)
# 优化器梯度归零
optimizer.zero_grad()
# 正向传播 + 反向传播 + 优化
outputs = net(inputs)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
total_loss += loss.item()
print('[Epoch: %d] [loss avg: %.4f] [current loss: %.4f]' %(epoch + 1, total_loss/(epoch+1), loss.item()))
print('Finished Training')
模型测试:
count = 0
# 模型测试
for inputs, _ in test_loader:
inputs = inputs.to(device)
outputs = net(inputs)
outputs = np.argmax(outputs.detach().cpu().numpy(), axis=1)
if count == 0:
y_pred_test = outputs
count = 1
else:
y_pred_test = np.concatenate( (y_pred_test, outputs) )
# 生成分类报告
classification = classification_report(ytest, y_pred_test, digits=4)
print(classification)
多次测试结果分别为:
1. accuracy 0.9405 9225
2. accuracy 0.9377 9225
3. accuracy 0.9376 9225
4. accuracy 0.9377 9225
5. accuracy 0.9390 9225
尝试先使用二维卷积在进行三维卷积,发现输入输出不匹配还没搞懂在哪进行修改。
第三部分 论文阅读心得
MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications
这篇论文描述了一个有效的网络架构和两个超参数来建立非常小的、低延迟的模型,以便轻松匹配移动和嵌入式视觉应用的设计要求。MobileNets是基于一个流线型的架构,它使用深度可分离的卷积来构建轻量级的深层神经网络,两个超参数允许模型构建者根据问题的约束条件,为其应用选择合适大小的模型。
MobileNet的主要工作是用depthwise sparable convolutions(深度级可分离卷积)替代过去的standard convolutions(标准卷积)来解决卷积网络的计算效率和参数量的问题。MobileNets模型基于是depthwise sparable convolutions(深度级可分离卷积),它可以将标准卷积分解成一个深度卷积和一个点卷积(1 × 1卷积核)。深度卷积将每个卷积核应用到每一个通道,而1 × 1卷积用来组合通道卷积的输出。
standard convolutions(标准卷积)分解为depthwise convolution(深度卷积)和pointwise convolution(逐点卷积)
depthwise separable convolution结构:
MobileNets网络结构(由多个depthwise separable convolution组合而成):
MobileNetV2: Inverted Residuals and Linear Bottlenecks
MobileNetV2结构基于inverted residual(本质是一个残差网络设计,传统Residual block是block的两端channel通道数多,中间少,而本文设计的inverted residual是block的两端channel通道数少,block内channel多,类似于沙漏和梭子形态的区别),另外保留Depthwise Separable Convolutions。
Block基本结构:
输入:一个低维 k(通道)的、经压缩的数据
经过:
step 1, point wise卷积扩展维度(通道),扩展因子为t;
step 2, depthwise separable 卷积,stride为 s;
step 3, linear conv把特征再映射的低维,输出维度为 k’;
输出作为下一个block的输入,堆叠block。
V1和V2有两个区别。
v2在原有的dw之前加了一个pw专门用来升维。这么做是因为dw给多少通道就输出多少通道,本身没法改变通道数,先加pw升维后,dw就能在高维提特征了。
v2把原本dw之后用来降维的pw后的激活函数给去掉了。这么做据作者说是因为他认为非线性在高维有益处,但在低维(例如pw降维后的空间)不如线性好。
可以看到MobileNetV2 和ResNet基本结构很相似。不过ResNet是先降维(0.25倍)、提特征、再升维。而MobileNetV2 则是先升维(6倍)、提特征、再降维。
HybridSN: Exploring 3-D–2-DCNN Feature Hierarchy for Hyperspectral Image Classification
网络结构:
Beyond a Gaussian Denoiser: Residual Learning of Deep CNN for Image Denoising
通过进一步研究前馈去噪卷积神经网络(DnCNN)的结构,使用了更深的结构、残差学习算法、正则化和批量归一化等方法提高去噪性能。能够处理具有未知噪声水平的高斯去噪(即,盲高斯去噪)。
该文将图像去噪视为一种简单的判别学习问题,即通过前馈卷积神经网络(CNN)将噪声与图像分离。去噪卷积神经网络称为DnCNN。不是直接输出去噪图像,而是将所提出的DnCNN设计成预测残差图像,即噪声观察和潜在干净图像之间的差异。
对于网络结构设计,修改VGG网络以使其适用于图像去噪,并根据最先进的去噪方法中使用的有效色块大小设置网络的深度。对于模型学习,采用残差学习公式,并将其与批量标准化相结合,以实现快速训练和提高去噪性能。
DnCNN的卷积滤波器的大小设置为3x3,但是去除了所有的池化层,感受野大小设置为35x35,相应的深度为17。对于其他一般图像的去噪任务,将采用更大的感受野并将深度设置为20。
网络结构:
- Conv+ReLU:64个大小为3x3xc的滤波器被用于生成64个特征图。然后将整流的线性单元用于非线性
- Conv+BN+ReLU:64个大小3x3x64的过滤器,并且将批量归一化加在卷积和ReLU之间
- Conv:c个大小为3x3x64的滤波器被用于重建输出。
特征:采用残差学习公式来学习R(y),并结合批量归一化来加速训练以及提高去噪性能
边界伪影:采用0填充的方法解决了边界伪影
Squeeze-and-Excitation Networks
将重点放在特征通道关系上,并提出一个新颖的体系结构单元,称之为“挤压(Squeeze)和激发(Excitation)”(SE)块,具体来说,就是通过学习的方式来自动获取到每个特征通道的重要程度,然后依照这个重要程度去提升有用的特征并抑制对当前任务用处不大的特征。
首先是 Squeeze 操作,我们顺着空间维度来进行特征压缩,将每个二维的特征通道变成一个实数,这个实数某种程度上具有全局的感受野,并且输出的维度和输入的特征通道数相匹配。它表征着在特征通道上响应的全局分布,而且使得靠近输入的层也可以获得全局的感受野,这一点在很多任务中都是非常有用的。
其次是 Excitation 操作,它是一个类似于循环神经网络中门的机制。通过参数 w 来为每个特征通道生成权重,其中参数 w 被学习用来显式地建模特征通道间的相关性。
最后是一个 Reweight 的操作,我们将 Excitation 的输出的权重看做是进过特征选择后的每个特征通道的重要性,然后通过乘法逐通道加权到先前的特征上,完成在通道维度上的对原始特征的重标定。
使用 global average pooling 作为 Squeeze 操作。紧接着两个 Fully Connected 层组成一个 Bottleneck 结构去建模通道间的相关性,并输出和输入特征同样数目的权重。首先将特征维度降低到输入的 1/16,然后经过 ReLu 激活后再通过一个 Fully Connected 层升回到原来的维度。
SENet的核心思想在于通过网络根据loss去学习特征权重,使得有效的feature map权重大,无效或效果小的feature map权重小的方式训练模型达到更好的结果。当然,SE block嵌在原有的一些分类网络中不可避免地增加了一些参数和计算量,但是在效果面前还是可以接受的。
Deep Supervised Cross-modal Retrieval
1.提出了一种基于深度监督的多模态学习结构,以弥补多模态间的异构性。通过端到端同时保持语义区分和模态不变性,可以有效地学习异构数据的公共表示。
2.提出了两种具有权值共享约束的子网络来学习图像和文本模态之间的交叉模态相关。另外,将模态不变性损失直接转化为目标函数,以消除模态间的差异。
3.利用线性分类器对公共表示空间中的样本进行分类。这样,DSCMR最大限度地减少了标签空间和公共表示空间的辨别损失,使得学习的公共表示具有显著的可区分性。
该方法包含两个子网络,一个用于图像模态,另一个用于文本模态,并且它们是以端到端的方式进行训练的。将图像和文本分别输入到两个子网络中,得到原始的高级语义表示。然后,在它们的顶部分别添加一些全连接层,将来自不同模式的样本映射到一个公共表示空间。最后,利用参数为P的线性分类器预测每个样本的类别
- 包括两个子网络——一个是图像模态,另一个是文本模态,端到端训练
- 对于图像:利用预训练在 ImageNet 的网络提取出图像的 4096 维的特征作为原始的图像高级语义表达。然后后续是几个全连接层,来得到图像在公共空间中的表达。
- 对于文本:利用预训练在 Google News 上的 Word2Vec 模型,来得到 k 维的特征向量。一个句子可以表示为一个矩阵,然后使用一个Text CNN来得到原始的句子高级语义表达。之后也是同样的形式,后面是几个全连接层来得到句子在公共空间中的表达。
- 为了确保两个子网络能够为图像和文本学到公共的表达,我们使这两个子网络的最后几层共享权重。直觉上这样可以使得同一类的图片和文本生成尽可能相似的表达
- 最后面是一层全连接层来进行分类