本次实验中分别使用了DNN、CNN、RNN三种模型进行图像识别,具体代码如下:
DNN模型:
全连接层 神经元个数
FC1 512
FC2 128
FC3 2
class DNN(nn.Module):
    """Fully connected classifier.

    Flattens each input sample to a 150528-dim vector (presumably
    3*224*224 RGB frames -- TODO confirm) and classifies it through
    FC1(512) -> FC2(128) -> FC3(config.classNum).
    """

    def __init__(self):
        super(DNN, self).__init__()
        self.linear_classification = nn.Sequential(
            nn.Linear(150528, 512),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, config.classNum),
            # Explicit dim: implicit-dim Softmax is deprecated and ambiguous.
            nn.Softmax(dim=1),
        )

    def forward(self, video_clip):
        """Return class probabilities of shape (batch, config.classNum)."""
        # BUG FIX: the original reshaped with config.TRAIN_BATCH_SIZE, which
        # breaks on the last (smaller) batch and on any eval batch size.
        # Use the tensor's actual batch dimension instead.
        batch_size = video_clip.size(0)
        video_clip = video_clip.type(torch.FloatTensor)
        video_clip = video_clip.cuda()
        video_clip = video_clip.view(batch_size, 150528)
        output = self.linear_classification(video_clip)
        return output
CNN模型:
模型采用了经典的VGG16
class VGGNet(nn.Module):
    """VGG16-style classifier.

    Five convolutional stages (each ending in 2x2 max-pool + BatchNorm)
    followed by three fully connected layers. The 512*7*7 flatten size
    assumes 224x224 inputs (224 / 2^5 = 7) -- TODO confirm against loader.
    """

    def __init__(self):
        super(VGGNet, self).__init__()
        # Stage 1 kept as individual layers, mirroring the original layout.
        self.conv1 = torch.nn.Conv2d(3, 64, kernel_size=3, padding='same')
        self.conv2 = torch.nn.Conv2d(64, 64, kernel_size=3, padding='same')
        self.pooling1 = torch.nn.MaxPool2d(kernel_size=2, stride=2)
        # BUG FIX: the original line ended with a trailing comma, which made
        # self.BN1 a tuple, and forward() then built a brand-new
        # BatchNorm2d(64) on every call -- its affine parameters were never
        # optimized and its running statistics never accumulated.
        self.BN1 = torch.nn.BatchNorm2d(64)
        # Stage 2: 64 -> 128 channels.
        self.feature_extractor2 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=3, padding='same'),
            nn.ReLU(),
            nn.Conv2d(128, 128, kernel_size=3, padding='same'),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.ReLU(),
            nn.BatchNorm2d(128),
        )
        # Stage 3: 128 -> 256 channels, three convolutions.
        self.feature_extractor3 = nn.Sequential(
            nn.Conv2d(128, 256, kernel_size=3, padding='same'),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, padding='same'),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, padding='same'),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.ReLU(),
            nn.BatchNorm2d(256),
        )
        # Stage 4: 256 -> 512 channels.
        self.feature_extractor4 = nn.Sequential(
            nn.Conv2d(256, 512, kernel_size=3, padding='same'),
            nn.ReLU(),
            nn.Conv2d(512, 512, kernel_size=3, padding='same'),
            nn.ReLU(),
            nn.Conv2d(512, 512, kernel_size=3, padding='same'),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.ReLU(),
            nn.BatchNorm2d(512),
        )
        # Stage 5: 512 -> 512 channels.
        self.feature_extractor5 = nn.Sequential(
            nn.Conv2d(512, 512, kernel_size=3, padding='same'),
            nn.ReLU(),
            nn.Conv2d(512, 512, kernel_size=3, padding='same'),
            nn.ReLU(),
            nn.Conv2d(512, 512, kernel_size=3, padding='same'),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.ReLU(),
            nn.BatchNorm2d(512),
        )
        # Classifier head: 512*7*7 -> 4096 -> 4096 -> classNum.
        self.linear1 = nn.Sequential(
            nn.Flatten(),
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(),
            nn.Dropout(p=0.5),
        )
        self.linear2 = nn.Sequential(
            nn.Linear(4096, 4096),
            nn.ReLU(),
            nn.Dropout(p=0.5),
        )
        self.linear3 = nn.Sequential(
            nn.Linear(4096, config.classNum),
            # Explicit dim: implicit-dim Softmax is deprecated and ambiguous.
            nn.Softmax(dim=1),
        )

    def forward(self, video_clip):
        """Return class probabilities of shape (batch, config.classNum)."""
        video_clip = video_clip.type(torch.FloatTensor)
        video_clip = video_clip.cuda()
        features = self.conv1(video_clip)
        features = F.relu(features)
        features = self.conv2(features)
        features = self.pooling1(features)
        features = F.relu(features)
        # Use the registered (trainable) BatchNorm from __init__ rather than
        # instantiating a throwaway one per forward pass.
        features = self.BN1(features)
        features = self.feature_extractor2(features)
        features = self.feature_extractor3(features)
        features = self.feature_extractor4(features)
        features = self.feature_extractor5(features)
        output = self.linear1(features)
        output = self.linear2(output)
        output = self.linear3(output)
        return output
RNN模型:
使用LSTM
class RNN_LSTM(nn.Module):
    """Unidirectional LSTM classifier.

    Treats each flattened image as a sequence of config.seq_length rows of
    config.input_size features, concatenates the hidden state of every
    timestep, and maps it to config.classNum sigmoid outputs.
    """

    def __init__(self):
        super(RNN_LSTM, self).__init__()
        # NOTE(review): dropout=0.9 only applies BETWEEN stacked layers;
        # it is a no-op (with a warning) when config.num_of_layers == 1.
        self.lstm = nn.LSTM(
            input_size=config.input_size,
            hidden_size=config.hidden_size,
            num_layers=config.num_of_layers,
            dropout=0.9,
            batch_first=True,
            bidirectional=False,
        )
        # One logit per class from the concatenation of all timestep outputs.
        self.linearLayer = nn.Linear(
            config.hidden_size * config.seq_length, config.classNum
        )

    def forward(self, video_clip):
        """Return per-class sigmoid scores of shape (batch, classNum)."""
        # BUG FIX: the original used config.TRAIN_BATCH_SIZE in both view()
        # and reshape(), which breaks on the last (smaller) batch and on any
        # evaluation batch size. Use the tensor's actual batch dimension.
        batch_size = video_clip.size(0)
        video_clip = video_clip.type(torch.FloatTensor)
        video_clip = video_clip.cuda()
        x = video_clip.view(batch_size, config.seq_length, config.input_size)
        y, _ = self.lstm(x)
        y = y.reshape(batch_size, config.hidden_size * config.seq_length)
        y = self.linearLayer(y)
        y = torch.sigmoid(y)
        return y
class RNN_BiLSTM(nn.Module):
    """Bidirectional LSTM classifier.

    Same sequence layout as RNN_LSTM, but bidirectional; classification
    uses only the output at the LAST timestep (forward and backward hidden
    states concatenated, hence hidden_size * 2 input features).
    """

    def __init__(self):
        super(RNN_BiLSTM, self).__init__()
        # NOTE(review): dropout=0.9 only applies BETWEEN stacked layers;
        # it is a no-op (with a warning) when config.num_of_layers == 1.
        self.Bilstm = nn.LSTM(
            input_size=config.input_size,
            hidden_size=config.hidden_size,
            num_layers=config.num_of_layers,
            dropout=0.9,
            batch_first=True,
            bidirectional=True,
        )
        self.linearLayer = nn.Linear(config.hidden_size * 2, config.classNum)

    def forward(self, video_clip):
        """Return per-class sigmoid scores of shape (batch, classNum)."""
        # BUG FIX: reshape with the actual batch size instead of
        # config.TRAIN_BATCH_SIZE so partial/eval batches work.
        batch_size = video_clip.size(0)
        video_clip = video_clip.type(torch.FloatTensor)
        video_clip = video_clip.cuda()
        # Input to the LSTM: (batch, seq_length, input_size).
        x = video_clip.view(batch_size, config.seq_length, config.input_size)
        # Output: (batch, seq_length, hidden_size * 2).
        y, _ = self.Bilstm(x)
        # Keep only the final timestep for classification.
        y = y[:, -1, :]
        y = self.linearLayer(y)
        y = torch.sigmoid(y)
        return y
加载数据集:
模型训练(以DNN为例):
def train(epoch, log_every=1):
    """Train the global `model` for `epoch` epochs and plot the loss curve.

    Relies on module-level `model`, `optimizer`, `loss_f`, and
    `trainset_loader`. Prints/logs loss and running accuracy every
    `log_every` iterations (default 1, matching the original behavior),
    and appends each iteration's loss to log.txt.
    """
    iteration = 0
    loss_all = []  # one averaged loss value per EPOCH (plot x-axis)
    for i in range(epoch):
        model.train()
        total = 0
        correct = 0
        loss_plt = []  # per-iteration losses within this epoch
        for index, data in enumerate(trainset_loader):
            video_clips, label = data
            video_clips = video_clips.cuda()
            label = label.cuda()
            optimizer.zero_grad()
            output = model(video_clips)
            # Running accuracy over the epoch so far.
            prediect = output.argmax(dim=1)
            total += label.size(0)
            correct += (prediect == label).sum().item()
            output = output.float()
            output = output.cuda()
            loss = loss_f(output, label)
            loss_plt.append(loss.item())
            loss.backward()
            optimizer.step()
            iteration += 1
            if index % log_every == 0:
                # .item() detaches and converts to a plain float for display.
                print("Epoch:", i, "/", epoch - 1,
                      "\tIteration:", index, "/", len(trainset_loader) - 1,
                      "\tLoss: " + str(loss.item()),
                      "\tAccuracy:", 100 * (correct / total), "%")
                with open('log.txt', 'a') as f:
                    f.write("Epoch: " + str(i) + "/" + str(epoch - 1)
                            + "\tIteration:" + str(index) + "/"
                            + str(len(trainset_loader) - 1)
                            + "\tLoss: " + str(loss.item()) + "\n")
        # BUG FIX: the original appended the running average on EVERY
        # iteration while labeling the x-axis "Epoch"; record one mean
        # loss per epoch so the axis label is truthful.
        loss_all.append(sum(loss_plt) / len(loss_plt))
    plt.figure()
    plt.plot(loss_all)
    plt.title('Loss')
    plt.xlabel('Epoch')
    plt.ylabel('loss')
    # Hard-coded xlim(0,100)/ylim(0,1) removed: they clipped the curve
    # whenever epoch != 100 or the loss exceeded 1.
    plt.show()
在训练RNN模型的时候会因为图片的通道数导致输入维度不一致,所以我对3通道数据进行了预处理,将RGB读取的图片格式改为了L读取的图片格式。公式为:Y’ = 0.299 R + 0.587 G + 0.114 B。
实验结果:
其中CNN的准确率最高,达到80+%,DNN和LSTM均为60+%