【机器学习】pytorch如何加载自定义数据集并进行数据集划分

文章目录

  • 举例数据集:基因疾病关联数据集
  • 1.通过torch.utils.data.random_split划分7:3
    • 继承torch.utils.data.Dataset类
    • 实例化Dataset类
    • 划分训练集和测试集
    • 生成数据迭代器data_iter
    • 利用iter进行训练
  • 2.通过random.sample直接划分五折,再加载
    • 划分五折并存储
    • 继承Dataset类
    • 实例化Dataset类并用DataLoader生成数据迭代器

举例数据集:基因疾病关联数据集

数据集包含:样本序号、样本标签、和基因疾病对的特征(由二者特征拼接而成 dim=256)

from torch.utils.data import Dataset, DataLoader

1.通过torch.utils.data.random_split划分7:3

继承torch.utils.data.Dataset类

class Mirna_die_Dataset(Dataset):
    """Gene-disease association dataset.

    Either wraps tensors passed in directly (x, y) or, by default,
    loads the project CSV where column 0 is the sample index, column 1
    the label, and columns 2..257 the 256-dim concatenated
    gene/disease feature vector.
    """

    def __init__(self, x=None, y=None):
        # Fix: the original accepted x/y but always ignored them and
        # re-read the CSV; honor them when both are supplied.
        if x is None or y is None:
            # Load the labeled gene-disease pairs from disk
            data_file = os.path.join('data', 'labelmirnadiease', 'wfy_label_mirna_die.csv')
            data_frame = pd.read_csv(data_file, header=None)
            print(len(data_frame))
            # column 1 = labels, columns 2:258 = 256-dim features
            outputs, inputs = data_frame.iloc[:, 1], data_frame.iloc[:, 2:258]
            x = torch.tensor(inputs.values, dtype=torch.float32)
            y = torch.tensor(outputs.values, dtype=torch.float32)
        self.x = x
        self.y = y

    def __len__(self):
        """Return the number of samples."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the (feature, label) tensor pair for sample ``idx``."""
        return self.x[idx], self.y[idx]

实例化Dataset类

md_dataset = Mirna_die_Dataset()

划分训练集和测试集

# 70/30 random train/test split of the full dataset
n_total = len(md_dataset)
train_size = int(n_total * 0.7)
test_size = n_total - train_size  # remainder goes to the test split
train_dataset, test_dataset = torch.utils.data.random_split(md_dataset, [train_size, test_size])

生成数据迭代器data_iter

# Mini-batch iterators over the two splits (batches of 5, reshuffled each epoch)
train_iter = DataLoader(train_dataset, shuffle=True, batch_size=5, num_workers=0)
test_iter = DataLoader(test_dataset, shuffle=True, batch_size=5, num_workers=0)

利用iter进行训练

def train_epoch_ch3(net, train_iter, loss, updater):  #@save
    """Train the model for one epoch (as defined in d2l chapter 3).

    Args:
        net: the model; put into training mode if it is a nn.Module.
        train_iter: iterable of (X, y) mini-batches.
        loss: loss function applied as ``loss(net(X), y)``.
        updater: either a ``torch.optim.Optimizer`` or a custom
            callable taking the batch size.

    Returns:
        Tuple of (average training loss, training accuracy).
    """
    # Switch the model to training mode
    if isinstance(net, torch.nn.Module):
        net.train()
    # Accumulates: total loss, total correct predictions, sample count
    metric = Accumulator(3)
    num = 0  # debug batch counter
    for X, y in train_iter:
        # Forward pass, then compute gradients and update parameters
        y_hat = net(X)
        l = loss(y_hat, y)
        num += 1
        print('batch num:', num, str(l) + '\n')
        if isinstance(updater, torch.optim.Optimizer):
            # PyTorch built-in optimizer: l is a scalar (mean) loss
            updater.zero_grad()
            l.backward()
            updater.step()
            # Fix: replaced redundant y.size().numel() with the
            # equivalent y.numel(), matching the branch below.
            metric.add(float(l) * len(y), accuracy(y_hat, y), y.numel())
        else:
            # Custom updater (comment fixed: this is NOT the built-in
            # path); l is per-sample here, so sum before backward.
            l.sum().backward()
            updater(X.shape[0])
            metric.add(float(l.sum()), accuracy(y_hat, y), y.numel())
    # Average loss and accuracy over all samples seen this epoch
    return metric[0] / metric[2], metric[1] / metric[2]

2.通过random.sample直接划分五折,再加载

划分五折并存储

import pandas as pd
import os
import random
#加载由不同的表示学习算法所得到的结点嵌入表示
# Load the node embeddings produced by the representation-learning algorithm
data_file = os.path.join('data', 'labelmirnadieasedeepwalksampling', 'wfy_label_mirna_die_hdmm_deepwalk_sampling.csv')
train = pd.read_csv(data_file, header=None)
train.info()

def k_fold_split(train_df, k, out_dir="data/node2vec_5_fold"):
    """Randomly split train_df into k folds and write train/val CSVs.

    For fold i, ``val_{i}.csv`` holds fold i and ``train_{i}.csv``
    holds the other k-1 folds, so each split is train:val = (k-1):1.

    Args:
        train_df: DataFrame of samples (addressed positionally).
        k: number of folds.
        out_dir: directory for the output CSVs (created if missing);
            the default matches the original hard-coded path.
    """
    # Bug fix: the original read the module-level global `train`
    # instead of its `train_df` parameter throughout.
    os.makedirs(out_dir, exist_ok=True)
    n = train_df.shape[0]
    k_fold = []
    index = set(range(n))
    for i in range(k):
        # n may not divide evenly by k: put all remaining rows in the last fold
        if i == k - 1:
            k_fold.append(list(index))
        else:
            tmp = random.sample(list(index), int(1.0 / k * n))
            k_fold.append(tmp)
            index -= set(tmp)
    # For each fold i: fold i is validation, the other k-1 folds are training
    for i in range(k):
        print("第{}折........".format(i + 1))
        tra = []
        dev = k_fold[i]
        for j in range(k):
            if i != j:
                tra += k_fold[j]  # concatenate every fold except fold i
        train_df.iloc[tra].to_csv(os.path.join(out_dir, "train_{}.csv".format(i)), index=False, header=None)
        train_df.iloc[dev].to_csv(os.path.join(out_dir, "val_{}.csv".format(i)), index=False, header=None)

    print("done!")


if __name__ == "__main__":
    # Script entry point: split the dataframe loaded above into 5 folds on disk
    k_fold_split(train, 5)

继承Dataset类

class Mirna_die_Dataset(Dataset):
    """Gene-disease association dataset for one pre-split fold CSV.

    Either wraps tensors passed in directly (x, y) or loads
    ``data_file``, where column 1 is the label and columns 2..257 the
    256-dim concatenated gene/disease feature vector.
    """

    def __init__(self, x=None, y=None, data_file=None):
        # Fix: the original accepted x/y but always ignored them and
        # re-read the CSV; honor them when both are supplied.
        if x is None or y is None:
            # NOTE(review): project-side hook kept from the original —
            # presumably records which file was used; confirm purpose.
            wfy_utils.save_result(data_file)
            data_frame = pd.read_csv(data_file, header=None)
            print(len(data_frame))
            outputs = data_frame.iloc[:, 1]
            inputs = data_frame.iloc[:, 2:258]
            x = torch.tensor(inputs.values, dtype=torch.float32)
            # labels reshaped to an (n, 1) column vector
            y = torch.tensor(outputs.values.reshape(-1, 1), dtype=torch.float32)
        self.x = x
        self.y = y

    def __len__(self):
        """Return the number of samples."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the (feature, label) tensor pair for sample ``idx``."""
        return self.x[idx], self.y[idx]

实例化Dataset类并用DataLoader生成数据迭代器

# Paths to the fold-0 train/validation CSVs produced by the k-fold split
train_data_file = os.path.join('data', 'hmdd20_deepwalk_5_fold', 'train_0.csv')
val_data_file = os.path.join('data', 'hmdd20_deepwalk_5_fold', 'val_0.csv')

# Build one dataset per pre-split file
train_dataset = Mirna_die_Dataset(data_file=train_data_file)
test_dataset = Mirna_die_Dataset(data_file=val_data_file)

# Training iterates in mini-batches; validation runs as one full batch
train_iter = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, num_workers=0)
test_iter = DataLoader(test_dataset, shuffle=True, batch_size=len(test_dataset), num_workers=0)

你可能感兴趣的:(机器学习,pytorch,机器学习,深度学习)