Dive into Deep Learning (D2L) V2 by Mu Li - Kaggle Competition in Practice: Classify Leaves, with Code

I. Leaf Classification

  1. The task is to predict the category of leaf images. The dataset contains 176 classes, 18,353 training images, and 8,800 test images. Each class has at least 50 training images, and the test set is split evenly between the public and private leaderboards. Competition page: https://www.kaggle.com/competitions/classify-leaves/code
  2. Since the images folder mixes the training and test images, we first have to separate them manually. The full code is below; it runs on a single GPU (lr, weight_decay, epochs = 2e-4, 5e-4, 100):
import collections
import math
import os.path
import shutil

import d2l.torch
import torch
import torchvision.transforms
import torch.utils.data
from torch import nn
import pandas as pd

data_dir = '../data/classify-leaves'

def read_csv_data(fname):
    """Read train.csv and return a dict mapping image index (file stem) to label."""
    with open(fname, 'r') as f:
        lines = f.readlines()[1:]  # skip the header row
    tokens = [line.rstrip().split(',') for line in lines]
    # 'images/0.jpg,label' -> key '0', value 'label'
    return dict((name.split('.')[0].split('/')[1], label) for name, label in tokens)

labels = read_csv_data(os.path.join(data_dir, 'train.csv'))
print('number of samples:', len(labels))
print('number of classes:', len(set(labels.values())))

def copy_file(fname, target_dir):
    # Create the target folder; do nothing if it already exists
    os.makedirs(name=target_dir, exist_ok=True)
    # Copy the source image into the target folder
    shutil.copy(fname, target_dir)


# Split part of the training set off as a validation set, copying each image
# into the corresponding folder
def split_copy_train_valid_test(data_dir, labels, split_to_valid_ratio):
    # Count the examples per class with collections.Counter, sort from most
    # to fewest, and take the size of the smallest class
    split_num = collections.Counter(labels.values()).most_common()[-1][1]
    # Number of examples per class to move into the validation set
    num_valid_per_label = max(1, math.floor(split_num * split_to_valid_ratio))
    valid_label_count = {}
    absolute_path = os.path.join(data_dir, 'train_valid_test')
    for image_file in os.listdir(os.path.join(data_dir, 'images')):
        # Look up the label of the current image from its file stem
        key = image_file.split('.')[0]
        if key in labels:
            label = labels[key]
            train_file_path = os.path.join(data_dir, 'images', image_file)
            # Copy every labeled image into 'train_valid'
            copy_file(train_file_path, os.path.join(absolute_path, 'train_valid', label))
            if label not in valid_label_count or valid_label_count[label] < num_valid_per_label:
                # Copy the first num_valid_per_label images of each class into 'valid'
                copy_file(train_file_path, os.path.join(absolute_path, 'valid', label))
                valid_label_count[label] = valid_label_count.get(label, 0) + 1
            else:
                # Copy the remaining labeled images into 'train'
                copy_file(train_file_path, os.path.join(absolute_path, 'train', label))
        else:
            # Images without an entry in train.csv belong to the test set
            copy_file(os.path.join(data_dir, 'images', image_file),
                      os.path.join(data_dir, 'train_valid_test', 'test', 'unknown'))
    return num_valid_per_label


def copy_classify_leaves_data(data_dir, split_to_valid_ratio):
    labels = read_csv_data(fname=os.path.join(data_dir, 'train.csv'))
    split_copy_train_valid_test(data_dir, labels, split_to_valid_ratio)

batch_size = 32
split_to_valid_ratio = 0.1
copy_classify_leaves_data(data_dir,split_to_valid_ratio)
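
Before building the datasets, it can help to sanity-check the split. Here is a minimal sketch, not part of the original code; count_images is a hypothetical helper that assumes the train_valid_test layout created above:

# Sanity check: count the images in each split folder (hypothetical helper)
def count_images(split):
    root = os.path.join(data_dir, 'train_valid_test', split)
    return sum(len(files) for _, _, files in os.walk(root))

for split in ['train', 'valid', 'train_valid', 'test']:
    print(split, count_images(split))
# Expect train + valid == train_valid == 18353 and test == 8800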

transform_train = torchvision.transforms.Compose([
    torchvision.transforms.Resize(256),
    torchvision.transforms.RandomResizedCrop(96,scale=(0.64,1.0),ratio=(1.0,1.0)),
    torchvision.transforms.RandomHorizontalFlip(),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize([0.485, 0.456, 0.406],[0.229, 0.224, 0.225])
])
transform_test = torchvision.transforms.Compose([
    torchvision.transforms.Resize(96),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize([0.485, 0.456, 0.406],[0.229, 0.224, 0.225])
])

# Reorganize the copied folders into datasets with ImageFolder
train_datasets, train_valid_datasets = [torchvision.datasets.ImageFolder(
    root=os.path.join(data_dir, 'train_valid_test', folder), transform=transform_train)
    for folder in ['train', 'train_valid']]
test_datasets, valid_datasets = [torchvision.datasets.ImageFolder(
    root=os.path.join(data_dir, 'train_valid_test', folder), transform=transform_test)
    for folder in ['test', 'valid']]
# Build the data iterators
train_iter, train_valid_iter = [torch.utils.data.DataLoader(
    dataset=ds, batch_size=batch_size, shuffle=True, drop_last=True)
    for ds in [train_datasets, train_valid_datasets]]
test_iter, valid_iter = [torch.utils.data.DataLoader(
    dataset=ds, batch_size=batch_size, shuffle=False, drop_last=False)
    for ds in [test_datasets, valid_datasets]]
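
As a quick check that the pipeline works, one can pull a single batch and inspect its shape (a sketch, not in the original code):

# One batch of images should be (batch_size, 3, 96, 96) after RandomResizedCrop(96)
X, y = next(iter(train_iter))
print(X.shape, y.shape)  # e.g. torch.Size([32, 3, 96, 96]) torch.Size([32])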

def get_net():
    num_classes = 176
    net = d2l.torch.resnet18(num_classes=num_classes,in_channels=3)
    return net
loss = nn.CrossEntropyLoss(reduction='none')


def train(net, train_iter, valid_iter, num_epochs, lr, weight_decay, lr_period, lr_decay, devices):
    # Optimizer: SGD with momentum
    optim = torch.optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=weight_decay)
    # Every lr_period epochs, decay the learning rate to lr * lr_decay
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optim, step_size=lr_period, gamma=lr_decay)
    timer, num_batches = d2l.torch.Timer(), len(train_iter)
    legend = ['train loss', 'train acc']
    if valid_iter is not None:
        legend.append('valid acc')
    animator = d2l.torch.Animator(xlabel='epoch', xlim=[1, num_epochs], legend=legend)
    # Train on all available GPUs
    net = nn.DataParallel(module=net, device_ids=devices).to(devices[0])
    timer_epoch = d2l.torch.Timer()
    for epoch in range(num_epochs):
        timer_epoch.start()
        accumulator = d2l.torch.Accumulator(3)
        net.train()  # switch to training mode
        for i, (X, y) in enumerate(train_iter):
            timer.start()
            # Loss and accuracy of one training batch
            batch_loss, batch_acc = d2l.torch.train_batch_ch13(net, X, y, loss, optim, devices)
            accumulator.add(batch_loss, batch_acc, y.shape[0])
            timer.stop()
            if i % (num_batches // 5) == 0 or i == num_batches - 1:
                animator.add(epoch + (i + 1) / num_batches,
                             (accumulator[0] / accumulator[2], accumulator[1] / accumulator[2], None))
        timer_epoch.stop()
        net.eval()  # switch to evaluation mode before validating
        measures = f'train loss {accumulator[0]/accumulator[2]}, train acc {accumulator[1]/accumulator[2]},\n'
        if valid_iter is not None:
            valid_acc = d2l.torch.evaluate_accuracy_gpu(net, valid_iter, devices[0])
            animator.add(epoch + 1, (None, None, valid_acc))
            measures += f'valid acc {valid_acc},'
        if (epoch + 1) % 4 == 0:
            # Save a checkpoint every four epochs
            torch.save(net.state_dict(), 'model' + str(epoch + 1) + '.pth')
        lr_scheduler.step()  # apply the learning-rate schedule
    print(measures + f'\n{num_epochs*accumulator[2]/timer.sum():.1f} examples/sec, '
          f'{timer_epoch.avg():.1f} sec/epoch on {str(devices[0])}')

lr, weight_decay, epochs = 2e-4, 5e-4, 100
lr_decay, lr_period, net = 0.9, 4, get_net()
devices = d2l.torch.try_all_gpus()
train(net, train_iter, valid_iter, epochs, lr, weight_decay, lr_period, lr_decay, devices)
  3. After training, we retrain once more on the full training set (training + validation) with the chosen hyperparameters, and use the resulting model on the test set:
net, preds = get_net(), []
# With the hyperparameters fixed, retrain on the combined training + validation
# set and use the resulting network for prediction
train(net, train_valid_iter, None, epochs, lr, weight_decay, lr_period, lr_decay, devices)
net.eval()
with torch.no_grad():
    for X, _ in test_iter:
        # Move the test batch to the GPU
        y_hat = net(X.to(devices[0]))
        # list.extend() appends many items at once (list.append() adds one);
        # y_hat.argmax(dim=1) gives the index of the largest value per row,
        # cast to int, moved to the CPU (for writing to csv), then to numpy
        preds.extend(y_hat.argmax(dim=1).type(torch.int32).cpu().numpy())
indexs = list(range(18353, len(test_datasets) + 18353))  # test image numbers start at 18353
# Sort the numbers as strings so the order matches ImageFolder's
# lexicographic file ordering
indexs.sort(key=lambda x: str(x))
indexs = ['images/' + str(index) + '.jpg' for index in indexs]
# Organize as a pandas.DataFrame with two columns: image, label
submission = pd.DataFrame({'image': indexs, 'label': preds})
# Reference: https://blog.csdn.net/weixin_48249563/article/details/114318425
# Map class indices back to label names; ImageFolder derives a class name from
# each folder name (e.g. folder 'cat' -> class 'cat')
submission['label'] = submission['label'].apply(lambda x: train_valid_datasets.classes[x])
# Write the submission csv
submission.to_csv('submission.csv', index=False)
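
A note on the sort(key=str) above: ImageFolder lists files in lexicographic order, where for example '10.jpg' comes before '2.jpg'. Here every test number has five digits (18353 to 27152), so string order and numeric order happen to coincide, but the string sort keeps the index list aligned with ImageFolder's ordering in general. A tiny illustration:

# Lexicographic vs. numeric ordering of file names
print(sorted(['2.jpg', '10.jpg']))   # ['10.jpg', '2.jpg'] -- string order
print(sorted([2, 10], key=str))      # [10, 2] -- same effect on integers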

The training and validation results:
[Figure: training loss, training accuracy, and validation accuracy curves]

  4. Since a checkpoint was saved every few epochs during training, we can simply pick the one with the highest validation accuracy to predict on the test set. Below, the checkpoint from epoch 96 is used; on Kaggle, both the public and the private leaderboard scores are above 95%. Note: call net.train() before training and net.eval() before testing or validating. This matters a lot for networks with batch-norm and dropout layers; without net.eval() at test time, the test accuracy drops by at least one percentage point (a minimal demonstration follows the script below). The code:
import os
import d2l.torch
import pandas as pd
import torch
import torchvision
from torch import nn
def get_net():
    num_classes = 176
    # net = d2l.torch.resnet18(num_classes=num_classes, in_channels=3)
    net = torchvision.models.resnet50(pretrained=False)
    net.fc = nn.Linear(in_features=net.fc.in_features, out_features=num_classes)
    nn.init.xavier_uniform_(net.fc.weight)
    return net
loss = nn.CrossEntropyLoss(reduction='none')
net = get_net()
checkpoints = torch.load('model_resnet96.pth')
# The model was saved from an nn.DataParallel wrapper, so every parameter key
# carries a 'module.' prefix; strip it before loading into the bare network
for key in list(checkpoints.keys()):
    if 'module.' in key:
        checkpoints[key[7:]] = checkpoints[key]
        del checkpoints[key]
net.load_state_dict(checkpoints)
data_dir = '../data/classify-leaves'
transform_test = torchvision.transforms.Compose([
    torchvision.transforms.Resize(96),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize([0.485, 0.456, 0.406],[0.229, 0.224, 0.225])
])
test_datasets = torchvision.datasets.ImageFolder(root=os.path.join(data_dir,'train_valid_test','test'),transform=transform_test)
test_iter = torch.utils.data.DataLoader(dataset=test_datasets,batch_size=64,shuffle=False,drop_last=False)
net = net.to(torch.device('cuda:0'))
preds = []
net.eval()
with torch.no_grad():
    for X, _ in test_iter:
        # Move the test batch to the GPU
        y_hat = net(X.to(torch.device('cuda:0')))
        # list.extend() appends many items at once (list.append() adds one);
        # y_hat.argmax(dim=1) gives the index of the largest value per row,
        # cast to int, moved to the CPU (for writing to csv), then to numpy
        preds.extend(y_hat.argmax(dim=1).type(torch.int32).cpu().numpy())
indexs = list(range(18353, len(test_datasets) + 18353))  # test image numbers start at 18353
# Sort as strings to match ImageFolder's lexicographic file ordering
indexs.sort(key=lambda x: str(x))
indexs = ['images/' + str(index) + '.jpg' for index in indexs]
# Rebuild the train_valid ImageFolder only to recover the class-name list
# (ImageFolder sorts class folders, so the index -> name mapping is stable)
transform_train = torchvision.transforms.Compose([
    torchvision.transforms.RandomResizedCrop(96, scale=(0.64, 1.0), ratio=(1.0, 1.0)),
    torchvision.transforms.RandomHorizontalFlip(),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])
train_valid_datasets = torchvision.datasets.ImageFolder(
    root=os.path.join(data_dir, 'train_valid_test', 'train_valid'), transform=transform_train)
# Organize as a pandas.DataFrame with two columns: image, label
submission = pd.DataFrame({'image': indexs, 'label': preds})
# Reference: https://blog.csdn.net/weixin_48249563/article/details/114318425
# Map class indices back to label names; ImageFolder derives a class name from
# each folder name (e.g. folder 'cat' -> class 'cat')
submission['label'] = submission['label'].apply(lambda x: train_valid_datasets.classes[x])
# Write the submission csv
submission.to_csv('submission5.csv', index=False)
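
To see why net.eval() matters, here is a minimal demonstration (not part of the original code) of a batch-norm layer behaving differently in the two modes:

# BatchNorm uses per-batch statistics in train mode but running statistics in
# eval mode, so the same input produces different outputs
bn = nn.BatchNorm1d(2)
x = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
bn.train()
print(bn(x))  # normalized with the statistics of this batch
bn.eval()
print(bn(x))  # normalized with the running mean/var tracked so far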
  5. Notes: call net.eval() before prediction or validation and net.train() before training. Wrap prediction in with torch.no_grad(), since no parameter gradients are needed there; this saves GPU memory and speeds things up. Also call optim.zero_grad() at the start of every batch to clear the gradients from the previous batch, because PyTorch accumulates gradients by default; without it, the network will not train successfully. For example (a full training-step sketch follows the snippet):
# net.eval()
# with torch.no_grad():
#     for X, _ in test_iter:
#         # Move the test batch to the GPU
#         y_hat = net(X.to(torch.device('cuda:0')))
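
And a minimal sketch of where optim.zero_grad() sits in a training step, with net, loss, and train_iter as defined in the training script above (this is what d2l.torch.train_batch_ch13 does internally):

device = torch.device('cuda:0')
optim = torch.optim.SGD(net.parameters(), lr=2e-4, momentum=0.9, weight_decay=5e-4)
for X, y in train_iter:
    optim.zero_grad()          # clear the gradients left over from the last batch
    X, y = X.to(device), y.to(device)
    l = loss(net(X), y).sum()  # loss was built with reduction='none'
    l.backward()               # accumulate fresh gradients for this batch
    optim.step()               # update the parameters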
