猫狗大战(kaggle竞赛-猫狗图像分类)

本实验使用Kaggle猫狗大战竞赛的部分数据集(2000张训练图片+500张测试图片)

本次实验分别使用DNN、CNN、RNN三种模型进行图像识别,具体代码如下:
DNN模型:
全连接层 神经元个数
FC1 512
FC2 128
FC3 2

class DNN(nn.Module):
    """Fully connected classifier over flattened inputs.

    Layer widths are 150528 -> 512 -> 128 -> ``config.classNum`` with ReLU
    activations, dropout (p=0.2) after the first hidden layer and a softmax
    over the class dimension at the end.
    """

    # Number of input features expected by the first linear layer.
    # Presumably 3 * 224 * 224 (RGB images resized to 224x224) -- TODO confirm
    # against the data-loading transforms.
    _IN_FEATURES = 150528

    def __init__(self):
        super(DNN, self).__init__()
        self.linear_classification = nn.Sequential(
            nn.Linear(self._IN_FEATURES, 512),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, config.classNum),
            # dim=1 is what the implicit form resolved to for 2-D input; being
            # explicit avoids the deprecation warning.
            # NOTE(review): if the training loss is nn.CrossEntropyLoss, it
            # already applies log-softmax internally, so this layer would
            # normalise twice -- confirm which loss is used.
            nn.Softmax(dim=1),
        )

    def forward(self, video_clip):
        """Return class probabilities for a batch.

        Args:
            video_clip: Tensor whose first dimension is the batch and whose
                remaining dimensions hold 150528 elements per sample.

        Returns:
            Tensor of shape ``(batch, config.classNum)``; each row sums to 1.
        """
        # Use the real batch size, not config.TRAIN_BATCH_SIZE, so that a
        # smaller final batch (or an inference batch) does not break the
        # reshape.
        batch_size = video_clip.size(0)
        # Follow the device the weights live on instead of hard-coding CUDA,
        # and cast in the same call to avoid a CPU round-trip.
        device = self.linear_classification[0].weight.device
        video_clip = video_clip.to(device=device, dtype=torch.float32)
        video_clip = video_clip.reshape(batch_size, self._IN_FEATURES)
        return self.linear_classification(video_clip)

CNN模型:
模型采用了经典的VGG16

class VGGNet(nn.Module):
    """VGG16-style CNN (13 conv + 3 linear layers) with batch norm per stage.

    Each of the five convolutional stages ends with
    max-pool -> ReLU -> BatchNorm2d. The first linear layer expects
    512 * 7 * 7 features, which after five stride-2 poolings corresponds to
    224x224 inputs.
    """

    def __init__(self):
        super(VGGNet, self).__init__()

        # Stage 1 is kept as individual attributes (not a Sequential) so the
        # existing parameter names conv1/conv2 stay stable.
        self.conv1 = torch.nn.Conv2d(3, 64, kernel_size=3, padding='same')
        self.conv2 = torch.nn.Conv2d(64, 64, kernel_size=3, padding='same')
        self.pooling1 = torch.nn.MaxPool2d(kernel_size=2, stride=2)
        # A trailing comma previously turned this into a tuple, so the layer
        # was never registered as a submodule. It is now a real submodule:
        # trained, moved by .to()/.cuda() and stored in state_dict.
        # Checkpoints saved before this fix lack the BN1.* keys and must be
        # loaded with strict=False.
        self.BN1 = torch.nn.BatchNorm2d(64)

        self.feature_extractor2 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=3, padding='same'),
            nn.ReLU(),
            nn.Conv2d(128, 128, kernel_size=3, padding='same'),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.ReLU(),
            nn.BatchNorm2d(128),
        )

        self.feature_extractor3 = nn.Sequential(
            nn.Conv2d(128, 256, kernel_size=3, padding='same'),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, padding='same'),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, padding='same'),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.ReLU(),
            nn.BatchNorm2d(256),
        )

        self.feature_extractor4 = nn.Sequential(
            nn.Conv2d(256, 512, kernel_size=3, padding='same'),
            nn.ReLU(),
            nn.Conv2d(512, 512, kernel_size=3, padding='same'),
            nn.ReLU(),
            nn.Conv2d(512, 512, kernel_size=3, padding='same'),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.ReLU(),
            nn.BatchNorm2d(512),
        )

        self.feature_extractor5 = nn.Sequential(
            nn.Conv2d(512, 512, kernel_size=3, padding='same'),
            nn.ReLU(),
            nn.Conv2d(512, 512, kernel_size=3, padding='same'),
            nn.ReLU(),
            nn.Conv2d(512, 512, kernel_size=3, padding='same'),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.ReLU(),
            nn.BatchNorm2d(512),
        )

        self.linear1 = nn.Sequential(
            nn.Flatten(),
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(),
            nn.Dropout(p=0.5),
        )
        self.linear2 = nn.Sequential(
            nn.Linear(4096, 4096),
            nn.ReLU(),
            nn.Dropout(p=0.5),
        )
        self.linear3 = nn.Sequential(
            nn.Linear(4096, config.classNum),
            # dim=1 matches what the implicit form chose for 2-D input and
            # silences the deprecation warning.
            # NOTE(review): redundant if the loss is nn.CrossEntropyLoss,
            # which applies log-softmax itself -- confirm which loss is used.
            nn.Softmax(dim=1),
        )

    def forward(self, video_clip):
        """Return class probabilities for a batch of images.

        Args:
            video_clip: Tensor of shape ``(batch, 3, H, W)``; H = W = 224 is
                required for the flattened features to match ``linear1``.

        Returns:
            Tensor of shape ``(batch, config.classNum)``; each row sums to 1.
        """
        # Follow the device of the weights instead of hard-coding CUDA, and
        # cast in the same call to avoid a CPU round-trip.
        device = self.conv1.weight.device
        video_clip = video_clip.to(device=device, dtype=torch.float32)

        features = F.relu(self.conv1(video_clip))
        features = self.pooling1(self.conv2(features))
        # Use the registered layer. A BatchNorm2d created here on every call
        # would never be trained and would lose its running statistics.
        features = self.BN1(F.relu(features))

        features = self.feature_extractor2(features)
        features = self.feature_extractor3(features)
        features = self.feature_extractor4(features)
        features = self.feature_extractor5(features)

        output = self.linear1(features)
        output = self.linear2(output)
        return self.linear3(output)

RNN模型:
使用LSTM

class RNN_LSTM(nn.Module):
    """Unidirectional LSTM classifier using the outputs of every time step.

    The input is reshaped to ``(batch, config.seq_length, config.input_size)``,
    run through the LSTM, and the outputs of all time steps are concatenated
    and mapped to ``config.classNum`` sigmoid-activated scores.
    """

    def __init__(self):
        super(RNN_LSTM, self).__init__()
        # NOTE(review): dropout=0.9 drops 90% of inter-layer activations and
        # has no effect (PyTorch only warns) when config.num_of_layers == 1 --
        # confirm this value is intentional.
        self.lstm = nn.LSTM(
            input_size=config.input_size,
            hidden_size=config.hidden_size,
            num_layers=config.num_of_layers,
            dropout=0.9,
            batch_first=True,
            bidirectional=False,
        )
        self.linearLayer = nn.Linear(
            config.hidden_size * config.seq_length, config.classNum
        )

    def forward(self, video_clip):
        """Return per-class sigmoid scores for a batch.

        Args:
            video_clip: Tensor whose first dimension is the batch and whose
                remaining dimensions hold
                ``config.seq_length * config.input_size`` elements per sample.

        Returns:
            Tensor of shape ``(batch, config.classNum)`` with values in (0, 1).
        """
        # Use the real batch size, not config.TRAIN_BATCH_SIZE, so a smaller
        # final batch or an inference batch reshapes correctly.
        batch_size = video_clip.size(0)
        # Follow the device of the weights instead of hard-coding CUDA.
        device = self.linearLayer.weight.device
        video_clip = video_clip.to(device=device, dtype=torch.float32)

        x = video_clip.reshape(batch_size, config.seq_length, config.input_size)
        y, _ = self.lstm(x)
        # Concatenate the hidden outputs of all time steps per sample.
        y = y.reshape(batch_size, config.hidden_size * config.seq_length)
        y = self.linearLayer(y)
        return torch.sigmoid(y)



class RNN_BiLSTM(nn.Module):
    """Bidirectional LSTM classifier using the output at the last time step.

    The input is reshaped to ``(batch, config.seq_length, config.input_size)``,
    run through a bidirectional LSTM, and the output at the final time step
    (``2 * config.hidden_size`` features) is mapped to ``config.classNum``
    sigmoid-activated scores.
    """

    def __init__(self):
        super(RNN_BiLSTM, self).__init__()
        # NOTE(review): dropout=0.9 drops 90% of inter-layer activations and
        # has no effect (PyTorch only warns) when config.num_of_layers == 1 --
        # confirm this value is intentional.
        self.Bilstm = nn.LSTM(
            input_size=config.input_size,
            hidden_size=config.hidden_size,
            num_layers=config.num_of_layers,
            dropout=0.9,
            batch_first=True,
            bidirectional=True,
        )
        # The factor 2 accounts for the concatenated forward and backward
        # directions.
        self.linearLayer = nn.Linear(config.hidden_size * 2, config.classNum)

    def forward(self, video_clip):
        """Return per-class sigmoid scores for a batch.

        Args:
            video_clip: Tensor whose first dimension is the batch and whose
                remaining dimensions hold
                ``config.seq_length * config.input_size`` elements per sample.

        Returns:
            Tensor of shape ``(batch, config.classNum)`` with values in (0, 1).
        """
        # Use the real batch size, not config.TRAIN_BATCH_SIZE, so a smaller
        # final batch or an inference batch reshapes correctly.
        batch_size = video_clip.size(0)
        # Follow the device of the weights instead of hard-coding CUDA.
        device = self.linearLayer.weight.device
        video_clip = video_clip.to(device=device, dtype=torch.float32)

        # LSTM input layout: (batch, seq_len, input_size).
        x = video_clip.reshape(batch_size, config.seq_length, config.input_size)
        # LSTM output layout: (batch, seq_len, 2 * hidden_size).
        y, _ = self.Bilstm(x)
        # NOTE(review): at the last time step the backward direction has only
        # processed a single element; concatenating y[:, -1, :hidden] with
        # y[:, 0, hidden:] would summarise the whole sequence in both
        # directions. Left unchanged to keep trained weights meaningful.
        y = y[:, -1, :]
        y = self.linearLayer(y)
        return torch.sigmoid(y)

加载数据集:

  1. 分离照片,将猫狗分离到不同的文件夹
  2. 使用ImageFolder和DataLoader函数加载数据集

模型训练(以DNN为例):

def train(epoch):
    """Train the module-level ``model`` and plot the running-average loss.

    Relies on names defined elsewhere in the file: ``model``, ``optimizer``,
    ``loss_f``, ``trainset_loader`` and ``plt``. For every batch it prints the
    loss and the accuracy accumulated over the current epoch, and appends a
    line to ``log.txt``.

    Args:
        epoch: Number of passes to make over ``trainset_loader``.
    """
    # Running-average loss, one entry per iteration across all epochs.
    loss_all = []
    last_batch_index = len(trainset_loader) - 1

    for i in range(epoch):
        model.train()
        total = 0
        correct = 0
        # Running total instead of re-summing a growing list every iteration.
        epoch_loss_sum = 0.0

        for index, (video_clips, label) in enumerate(trainset_loader):
            video_clips = video_clips.cuda()
            label = label.cuda()
            optimizer.zero_grad()

            output = model(video_clips)
            prediect = output.argmax(dim=1)
            total += label.size(0)
            correct += (prediect == label).sum().item()

            loss = loss_f(output.float(), label)
            loss.backward()
            optimizer.step()

            loss_value = loss.item()
            epoch_loss_sum += loss_value
            loss_all.append(epoch_loss_sum / (index + 1))

            print("Epoch:", i, "/", epoch-1, "\tIteration:", index, "/", last_batch_index, "\tLoss: " + str(loss.detach()), "\tAccuracy:",                       100*(correct/total),"%")
            # Re-opened in append mode on every iteration so each line is on
            # disk even if training is interrupted.
            with open('log.txt', 'a') as f:
                f.write("Epoch: " + str(i) + "/" + str(epoch-1) + "\tIteration:" + str(index) + "/" + str(last_batch_index) + "\tLoss: " + str(loss_value) + "\n")

    plt.figure()
    plt.plot(loss_all)
    plt.title('Loss')
    # loss_all holds one point per iteration, so the axis is labelled
    # accordingly (it previously read 'Epoch').
    plt.xlabel('Iteration')
    plt.ylabel('loss')
    # NOTE(review): these fixed limits hide everything after the first 100
    # iterations and any loss above 1 -- confirm they are intended.
    plt.xlim(0, 100)
    plt.ylim(0, 1)
    plt.show()

在训练RNN模型时,3通道的RGB图片会导致输入维度与模型设定不一致,所以我对数据进行了预处理:将图片从RGB模式转换为单通道的灰度(L)模式。转换公式为:$Y' = 0.299R + 0.587G + 0.114B$。

实验结果:
其中CNN的准确率最高,达到80%以上;DNN和LSTM的准确率均在60%以上。

你可能感兴趣的:(猫狗大战(kaggle竞赛-猫狗图像分类))