[李宏毅深度学习作业笔记]HW1 Covid-19(Regression)

作业代码展示1

Import Some Package

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
# For data preprocess
import numpy as np
import csv
import os
# For plotting
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

# Fix every source of randomness used in this notebook so runs are repeatable.
myseed = 2020815  # set a random seed for reproducibility
torch.backends.cudnn.deterministic = True  # restrict cuDNN to deterministic algorithms
torch.backends.cudnn.benchmark = False     # turn off cuDNN's algorithm auto-tuning
np.random.seed(myseed)                     # seed NumPy's global RNG
torch.manual_seed(myseed)                  # seed PyTorch's default RNG
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(myseed)     # seed the RNG of every visible CUDA device

对 myseed、torch.backends.cudnn.deterministic、torch.backends.cudnn.benchmark 的设置,是为了固定深度学习训练过程中产生的随机数,包括 cuDNN 在为卷积操作选择实现算法时带来的不确定性。这样做是为了尽量保证代码的结果能够被复现。相关资料可以参考如下链接:torch.backends.cudnn.benchmark ? 2

Some Utilities

def get_device():
    ''' Name the device to run on: 'cuda' when a GPU is usable, otherwise 'cpu' '''
    if torch.cuda.is_available():
        return 'cuda'
    return 'cpu'

def plot_learning_curve(loss_record, title=''):
    ''' Plot learning curve of your DNN (train & dev loss)

    Args:
        loss_record: dict with two lists of floats, 'train' (one loss per
            recorded training step) and 'dev' (one loss per evaluation).
        title: text inserted into the figure title.
    '''
    train_loss = loss_record['train']
    dev_loss = loss_record['dev']
    x_1 = range(len(train_loss))

    figure(figsize=(6, 4))
    plt.plot(x_1, train_loss, c='tab:red', label='train')
    if dev_loss:
        # Spread the dev points evenly over the training-step axis.
        # The stride is clamped to >= 1 (a zero slice step is an error) and
        # the positions are cut to len(dev_loss): slicing with the floored
        # stride can yield extra positions, which would make plt.plot fail
        # on mismatched x/y lengths.
        stride = max(len(train_loss) // len(dev_loss), 1)
        x_2 = x_1[::stride][:len(dev_loss)]
        plt.plot(x_2, dev_loss[:len(x_2)], c='tab:cyan', label='dev')
    plt.ylim(0.0, 5.)
    plt.xlabel('Training steps')
    plt.ylabel('MSE loss')
    plt.title('Learning curve of {}'.format(title))
    plt.legend()
    plt.show()


def plot_pred(dv_set, model, device, lim=35., preds=None, targets=None):
    ''' Scatter-plot model predictions against the ground-truth values.

    When either `preds` or `targets` is missing, both are recomputed by
    running `model` over every batch of `dv_set`.
    '''
    have_both = preds is not None and targets is not None
    if not have_both:
        model.eval()
        pred_chunks, target_chunks = [], []
        with torch.no_grad():
            for x, y in dv_set:
                x, y = x.to(device), y.to(device)
                pred_chunks.append(model(x).detach().cpu())
                target_chunks.append(y.detach().cpu())
        preds = torch.cat(pred_chunks, dim=0).numpy()
        targets = torch.cat(target_chunks, dim=0).numpy()

    # The blue diagonal marks a perfect prediction (pred == target)
    diagonal = [-0.2, lim]
    figure(figsize=(5, 5))
    plt.scatter(targets, preds, c='r', alpha=0.5)
    plt.plot(diagonal, diagonal, c='b')
    plt.xlim(-0.2, lim)
    plt.ylim(-0.2, lim)
    plt.xlabel('ground truth value')
    plt.ylabel('predicted value')
    plt.title('Ground Truth v.s. Prediction')
    plt.show()

Preprocess

对于表格数据的回归,我们很有可能会遇到特征比较多的情况,这样往往会导致模型的过拟合,因此我们需要事先做一些特征选择的工作。

# Univariate feature scoring: rank every input column by how strongly it
# relates to the regression target.
import pandas as pd
import numpy as np

# Columns 1..93 are used as features and column 94 as the target;
# column 0 is skipped (presumably the row id — verify against the CSV).
data = pd.read_csv('/kaggle/input/ml2021spring-hw1/covid.train.csv')
x = data[data.columns[1:94]]
y = data[data.columns[94]]

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

# NOTE(review): `preprocessing` is imported but not used in this cell.
from sklearn import preprocessing
# Min-max scale each feature column to the range [0, 1].
# NOTE(review): a constant column would divide by zero here — confirm none exist.
x = (x - x.min()) / (x.max() - x.min())

# Score each feature against the target with f_regression.
# k=5 only affects what a later transform() would keep; the scores below
# are computed for every column.
bestfeatures = SelectKBest(score_func=f_regression, k=5)
fit = bestfeatures.fit(x,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(x.columns)
# Put column names and scores side by side for easier reading
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  # name the two dataframe columns
print(featureScores.nlargest(15,'Score'))  # print the 15 highest-scoring features

其中的特征选择使用了scikit-learn中提供的特征选择方法。特征选择可以说是特征工程(数据预处理的一部分)中的一个分支,另外还包括特征提取、降维。虽然由于深度学习的发展,目前图像等数据的特征提取工作都可以直接用深度网络进行学习,但是网络会考虑到你输入的所有数据,这样训练出来的网络并不能知道从这些数据中学出来的特征是否合适,因此对于数据还是需要事先进行预处理。这也是数据科学家的主要工作之一。相关的特征工程的博客。这一部分是一个很值得研究的问题,也是这个作业想要提升准确率的关键所在。但是由于篇幅原因,在此只给出相应的链接,感兴趣的可以去相关链接学习。

DataSet

  • 读取 .csv 文件
  • 依据特征工程所选取的特征来提取相应的数据进行训练
  • 将covid.train.csv分成训练数据与验证数据集
  • 对数据进行normalization,注意对数据做normalization有利于后面的训练,其思想与batch normalization类似
class COVID19Dataset(Dataset):
    ''' Dataset for loading and preprocessing the COVID19 dataset

    Args:
        path: CSV file with one header row and a leading id column.
        mode: 'train', 'dev' or 'test'. For 'train'/'dev' the last column of
            the file is taken as the regression target; 'test' has no target.
        target_only: if True keep only the hand-picked feature columns,
            otherwise keep all 93 feature columns.

    Raises:
        ValueError: if `mode` is not 'train', 'dev' or 'test'.
    '''
    def __init__(self,
                 path,
                 mode='train',
                 target_only=True):
        if mode not in ('train', 'dev', 'test'):
            raise ValueError(
                "mode must be 'train', 'dev' or 'test', got {!r}".format(mode))
        self.mode = mode

        # Read data into numpy arrays: data[1:] drops the header row and
        # [:, 1:] drops the first (id) column.
        with open(path, 'r') as fp:
            data = list(csv.reader(fp))
            data = np.array(data[1:])[:, 1:].astype(float)

        if not target_only:
            feats = list(range(93))
        else:
            # Columns chosen with the feature-selection step.
            # Other sets that were tried:
            #   list(range(1, 41)) + [57, 75]
            #   [75, 57, 42, 60, 78, 43, 61, 79, 40, 58, 76, 41, 59, 77, 45, 85]
            feats = [75, 57, 42, 60, 78, 43, 61, 79, 40, 58, 76, 41, 59, 77]

        if mode == 'test':
            # Testing data
            # data: 893 x 93 (40 states + day 1 (18) + day 2 (18) + day 3 (17))
            data = data[:, feats]
            # Convert to a float tensor so it can be fed to the network
            self.data = torch.FloatTensor(data)
        else:
            # Training data (train/dev sets)
            # data: 2700 x 94 (40 states + day 1 (18) + day 2 (18) + day 3 (18))
            target = data[:, -1]
            data = data[:, feats]

            # NOTE: 'train' and 'dev' both use every row, so the dev loss is
            # not a held-out estimate. For a real split use
            # `i % 10 != 0` for train and `i % 10 == 0` for dev.
            indices = list(range(len(data)))

            # Convert data into PyTorch tensors
            self.data = torch.FloatTensor(data[indices])
            self.target = torch.FloatTensor(target[indices])

        # Normalize features (you may remove this part to see what will happen).
        # Only the columns whose ORIGINAL index is >= 40 are standardized.
        # Their positions are looked up in `feats`, because after feature
        # selection a plain `[:, 40:]` slice may select nothing at all.
        norm_cols = [pos for pos, feat in enumerate(feats) if feat >= 40]
        if norm_cols:
            block = self.data[:, norm_cols]
            self.data[:, norm_cols] = \
                (block - block.mean(dim=0, keepdim=True)) \
                / block.std(dim=0, keepdim=True)

        self.dim = self.data.shape[1]

        print('Finished reading the {} set of COVID19 Dataset ({} samples found, each dim = {})'
              .format(mode, len(self.data), self.dim))

    def __getitem__(self, index):
        # Returns one sample at a time
        if self.mode in ['train', 'dev']:
            # For training
            return self.data[index], self.target[index]
        else:
            # For testing (no target)
            return self.data[index]

    def __len__(self):
        # Returns the size of the dataset
        return len(self.data)

DataLoader

将输入的dataset进行切分,方便后期的训练

def prep_dataloader(path, mode, batch_size, n_jobs=0, target_only=False):
    ''' Build a COVID19Dataset for `mode` and wrap it in a DataLoader. '''
    covid_data = COVID19Dataset(path, mode=mode, target_only=target_only)
    # Only the training split is shuffled; dev/test keep the file order
    is_training = (mode == 'train')
    return DataLoader(
        covid_data,
        batch_size=batch_size,
        shuffle=is_training,
        drop_last=False,
        num_workers=n_jobs,
        pin_memory=True)

Deep Neural Network

首先对于模型效果的提升,模型的训练都可以遵循如下的方法:

1. 模型复杂度的选择

方法

  • 方法一:从特征的多少来选择即(通过前文的特征工程的方法来对模型的复杂度进行控制,注意此处的主要是为了防止模型出现过拟合的问题)
  • 方法二:对FC的层数以及每一层的宽度等超参数进行选择
  • 方法三:正则化
    对于方法一中直接对特征进行删减,这是一种比较粗粒度的方法来降低模型复杂度,而我们需要使用一种精细一些的方法来降低模型复杂度。为什么这么做有效在李沐的动手学习深度学习3一书中有简单的解释
    加入L1或者L2正则化能够有效地防止模型过拟合,其中L2正则化通常也被我们称之为权重衰减。然而需要注意的是,L2正则化与Adam优化器搭配使用时存在问题:因为Adam在优化过程中会用自适应学习率对梯度(包括正则项的梯度)进行缩放,使得L2正则化不再等价于权重衰减(AdamW 正是为了解决这个问题而提出的)。
  • 方法四: dropout
    dropout也是正则化的一种,也可以将dropout看成一种ensemble的方法。

思想

  • 对于模型的bias和overfitting的权衡:对于如何去判断模型是否出现了overfitting还是出现了模型bias较大导致模型的整体效果不能提升,就需要使用训练集与验证集来进行判断,K折验证是一个比较好的验证方法。

2. 激活函数的选择

激活函数通常选择ReLU,它能够缓解梯度消失的问题。

3. dropout

防止模型的过拟合

4. Batch normalization

思想

feature normalization对于批量数据进行归一化,能够使梯度下降法收敛的更快

方法

注意batch normalization需要获取批量数据的均值和方差,因此需要区别对待模型处于训练过程还是测试过程;在PyTorch中通过 model.train() / model.eval() 切换后,框架会自动采用对应的统计量。

5.模型优化器的选择

思想

  • 当确定好模型后,训练过程中如果发现loss比较大,还需要确定是模型本身bias比较大,还是optimization的方法导致的。这一块的思考可以参考Residual Network的思考方式:简单模型能够train下去,但是复杂模型train不下去,那么说明肯定不是模型本身的问题,而是模型的训练存在问题,不能很好地收敛。
  • 对于训练过程中优化问题的解决:根本原因是梯度非常小,无法进行有效的参数更新。而这其中的情况有梯度消失、到达模型的鞍点或局部最小值(critical point),这就说到对优化器的选择,即使用更好的优化器。而residual的引入则是从另一方面解决训练不下去的问题。
  • Critical Point:saddle point 容易避免;local minima 的情况在高维空间中其实并不常见,因此我们首先需要解决 saddle point。解决 saddle point 的方法一个是使用batch,另一个是使用momentum的方法。
  • learning rate: 有时训练不下去可能是由于learning rate 的选择存在问题。判断方法可以求取norm of gradient来进行判断。同时对norm of gradient与Hessian矩阵进行分析能够获取训练情况处于critical的情况(local minima or saddle point)。因此采用变化的learning rate
  • 总结:对于优化器的选择,选择较小的batch引入噪声和使用momentum能避免陷入saddle point,使用变化的learning rate能够让训练继续下去。Adam就能实现上述的要求,因此一般用Adam硬train一发。然而凡事没有绝对,随着研究的深入,对于优化器的选择以及调优过程也在不断地更新。“adam的问题”这篇文章对于调优有非常深刻的理解,非常值得参考。
    [李宏毅深度学习作业笔记]HW1 Covid-19(Regression))_第1张图片
class NeuralNet(nn.Module):
    ''' A simple fully-connected deep neural network

    Args:
        input_dim: number of input features per sample.
    '''
    def __init__(self, input_dim):
        super(NeuralNet, self).__init__()

        # Define your neural network here
        # TODO: How to modify this model to achieve better performance?
        # One hidden layer of 32 units. A deeper input_dim -> 128 -> 64 -> 32
        # stack, each layer with the same BatchNorm/Dropout/LeakyReLU
        # pattern, was also tried.
        self.net = nn.Sequential(
            nn.Linear(input_dim, 32),
            nn.BatchNorm1d(32),
            nn.Dropout(p=0.2),
            nn.LeakyReLU(),
            nn.Linear(32, 1)
        )

        # Mean squared error loss (nn.SmoothL1Loss is a possible alternative)
        self.criterion = nn.MSELoss(reduction='mean')

    def forward(self, x):
        ''' Given input of size (batch_size x input_dim), compute output of the network '''
        # squeeze(1) turns the (batch_size x 1) output into (batch_size,)
        return self.net(x).squeeze(1)

    def cal_loss(self, pred, target):
        ''' Calculate loss: MSE plus an L2 penalty on this network's parameters

        The penalty is taken over self.parameters(), so the result does not
        depend on any global variable. For an L1 penalty, sum
        torch.sum(abs(param)) instead.
        '''
        l2_penalty = sum(torch.sum(param ** 2) for param in self.parameters())
        # NOTE: an L2 penalty added to the loss is not equivalent to weight
        # decay when it is optimized with Adam.
        return self.criterion(pred, target) + 0.00075 * l2_penalty

Train/Dev/Test

Training

def train(tr_set, dv_set, model, config, device):
    ''' DNN training

    Trains `model` on `tr_set` for at most config['n_epochs'] epochs. After
    every epoch the model is evaluated on `dv_set` with dev(); whenever the
    dev loss improves, the state_dict is saved to config['save_path'].
    Training stops early once more than config['early_stop'] consecutive
    epochs pass without improvement.

    Returns:
        (min_mse, loss_record): the best dev loss seen (inf if no epoch
        produced an improvement) and a dict holding the per-step training
        losses under 'train' and the per-epoch dev losses under 'dev'.
    '''

    n_epochs = config['n_epochs']  # Maximum number of epochs

    # Setup optimizer. Looking the class up by name lets the optimizer and
    # its hyper-parameters be swapped from the config alone.
    optimizer = getattr(torch.optim, config['optimizer'])(
        model.parameters(), **config['optim_hparas'])

    # Start from +inf so the first finite dev loss always counts as an
    # improvement and a checkpoint is written, however large that loss is.
    min_mse = float('inf')
    loss_record = {'train': [], 'dev': []}      # for recording training loss
    early_stop_cnt = 0                          # epochs since the last improvement;
                                                # early stopping also limits overfitting
    epoch = 0
    while epoch < n_epochs:
        # Set model to training mode. BatchNorm and Dropout behave
        # differently in training and evaluation, so this call matters.
        model.train()
        for x, y in tr_set:                     # iterate through the dataloader
            optimizer.zero_grad()               # set gradient to zero
            x, y = x.to(device), y.to(device)   # move data to device (cpu/cuda)
            pred = model(x)                     # forward pass (compute output)
            mse_loss = model.cal_loss(pred, y)  # compute loss
            mse_loss.backward()                 # compute gradient (backpropagation)
            optimizer.step()                    # update model with optimizer
            loss_record['train'].append(mse_loss.detach().cpu().item())

        # After each epoch, test your model on the validation (development) set.
        dev_mse = dev(dv_set, model, device)
        if dev_mse < min_mse:
            # Save model if your model improved
            min_mse = dev_mse
            print('Saving model (epoch = {:4d}, loss = {:.4f})'
                .format(epoch + 1, min_mse))
            torch.save(model.state_dict(), config['save_path'])  # Save model to specified path
            early_stop_cnt = 0
        else:
            early_stop_cnt += 1

        epoch += 1
        loss_record['dev'].append(dev_mse)
        if early_stop_cnt > config['early_stop']:
            # Stop training if your model stops improving for "config['early_stop']" epochs.
            break

    print('Finished training after {} epochs'.format(epoch))
    return min_mse, loss_record

Validation

def dev(dv_set, model, device):
    ''' Return the sample-weighted average of model.cal_loss over dv_set '''
    model.eval()                                # evaluation mode
    weighted_sum = 0
    with torch.no_grad():                       # no gradients needed for validation
        for x, y in dv_set:
            x, y = x.to(device), y.to(device)   # move the batch to device (cpu/cuda)
            batch_loss = model.cal_loss(model(x), y)
            # cal_loss is a per-batch mean, so weight it by the batch size
            weighted_sum += batch_loss.detach().cpu().item() * len(x)
    return weighted_sum / len(dv_set.dataset)   # average over all samples

Testing

def test(tt_set, model, device):
    model.eval()                                # set model to evalutation mode
    preds = []
    for x in tt_set:                            # iterate through the dataloader
        x = x.to(device)                        # move data to device (cpu/cuda)
        with torch.no_grad():                   # disable gradient calculation
            pred = model(x)                     # forward pass (compute output)
            preds.append(pred.detach().cpu())   # collect prediction
    preds = torch.cat(preds, dim=0).numpy()     # concatenate all predictions and convert to a numpy array
    return preds

Setup Hyper-Parameters

# Global run settings: device, checkpoint directory, feature choice and hyper-parameters.
device = get_device()                 # get the current available device ('cpu' or 'cuda')
os.makedirs('models', exist_ok=True)  # The trained model will be saved to ./models/
target_only = True                   # True -> COVID19Dataset keeps only its hand-picked feature columns

# TODO: How to tune these hyper-parameters to improve your model's performance?
config = {
    'n_epochs': 10000,                # maximum number of epochs
    'batch_size': 200,               # mini-batch size for dataloader
    'optimizer': 'Adam',              # optimization algorithm (name of a class in torch.optim)
    'optim_hparas': {                # keyword arguments passed to the optimizer; empty here,
                                     # so the optimizer runs with its default settings
        #'lr': 0.0001,                 # learning rate of SGD
        #'momentum': 0.9,              # momentum for SGD
        #'weight_decay': 5e-4,
    },
    'early_stop': 500,               # early stopping epochs (the number epochs since your model's last improvement)
    'save_path': 'models/model.pth'  # your model will be saved here
}

load data and model

# Dataset locations. The training path is the same file read in the
# feature-selection step.
# NOTE(review): the test path is assumed to sit beside the training CSV — confirm.
tr_path = '/kaggle/input/ml2021spring-hw1/covid.train.csv'
tt_path = '/kaggle/input/ml2021spring-hw1/covid.test.csv'

# Build one dataloader per split; train and dev are both read from tr_path
tr_set = prep_dataloader(tr_path, 'train', config['batch_size'], target_only=target_only)
dv_set = prep_dataloader(tr_path, 'dev', config['batch_size'], target_only=target_only)
tt_set = prep_dataloader(tt_path, 'test', config['batch_size'], target_only=target_only)
model = NeuralNet(tr_set.dataset.dim).to(device)  # Construct model and move to device


# Start training
model_loss, model_loss_record = train(tr_set, dv_set, model, config, device)

plot_learning_curve(model_loss_record, title='deep model')

# Drop the last-epoch weights and reload the best checkpoint saved by train()
del model
model = NeuralNet(tr_set.dataset.dim).to(device)
ckpt = torch.load(config['save_path'], map_location='cpu')  # Load your best model
model.load_state_dict(ckpt)
plot_pred(dv_set, model, device)  # Show prediction on the validation set

def save_pred(preds, file):
    ''' Save predictions to specified file

    Writes a CSV with the header row `id,tested_positive`, followed by one
    row per prediction holding its 0-based position and value.

    Args:
        preds: iterable of predicted values.
        file: path of the CSV file to create (overwritten if it exists).
    '''
    print('Saving results to {}'.format(file))
    # newline='' lets the csv writer control line endings itself; without
    # it, platforms that translate '\n' end up with an empty line after
    # every row.
    with open(file, 'w', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(['id', 'tested_positive'])
        writer.writerows(enumerate(preds))

# Run the current model on the test set and write the predictions to disk.
preds = test(tt_set, model, device)  # predict COVID-19 cases with your model
save_pred(preds, 'pred.csv')         # save prediction file to pred.csv

引用了如下的几篇文章与代码,如有侵删!!


  1. Heng-Jui Chang @ NTUEE (https://github.com/ga642381/ML2021-Spring/blob/main/HW01/HW01.ipynb) ↩︎

  2. torch.backends.cudnn.benchmark ? ↩︎

  3. 动手学习深度学习 ↩︎

你可能感兴趣的:(深度学习基础,pycharm,ubuntu,intellij-idea)