The dataset contains the sample index, the sample label, and the feature vector of each gene–disease pair (formed by concatenating the features of the two entities, dim=256), so each CSV row has the form: index, label, f_0, ..., f_255.
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

class Mirna_die_Dataset(Dataset):
    def __init__(self, x=None, y=None):
        # Load the labeled miRNA-disease pair features from CSV
        data_file = os.path.join('data', 'labelmirnadiease', 'wfy_label_mirna_die.csv')
        data_frame = pd.read_csv(data_file, header=None)
        print(len(data_frame))
        # Column 1 holds the label; columns 2..257 hold the 256-dim pair features
        outputs, inputs = data_frame.iloc[:, 1], data_frame.iloc[:, 2:258]
        x = torch.tensor(inputs.values, dtype=torch.float32)
        y = torch.tensor(outputs.values, dtype=torch.float32)
        self.x = x
        self.y = y

    def __len__(self):  # return the number of samples
        return len(self.x)

    def __getitem__(self, idx):  # return the sample and label (as tensors) for a given index
        return self.x[idx], self.y[idx]
md_dataset = Mirna_die_Dataset()
train_size = int(len(md_dataset) * 0.7)
test_size = len(md_dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(md_dataset, [train_size, test_size])
train_iter = DataLoader(train_dataset, batch_size=5, shuffle=True, num_workers=0)
test_iter = DataLoader(test_dataset, batch_size=5, shuffle=True, num_workers=0)
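As a quick sanity check (a minimal sketch, not part of the original pipeline), one batch drawn from train_iter should be a (5, 256) feature tensor paired with a (5,) label tensor:

# Illustrative check of the loader output shapes
X, y = next(iter(train_iter))
print(X.shape, y.shape)  # expected: torch.Size([5, 256]) torch.Size([5])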
def train_epoch_ch3(net, train_iter, loss, updater):  #@save
    """Train a model for one epoch (defined in Chapter 3 of Dive into Deep Learning)."""
    # Put the model in training mode
    if isinstance(net, torch.nn.Module):
        net.train()
    # Sum of training loss, sum of training accuracy, number of examples
    # (Accumulator and accuracy are the d2l Chapter 3 helpers; see the sketch below)
    metric = Accumulator(3)
    num = 0
    for X, y in train_iter:
        # Compute gradients and update parameters
        y_hat = net(X)
        l = loss(y_hat, y)
        # wfy: debug print of the loss for every batch
        num += 1
        print('batch num:', num, str(l) + '\n')
        if isinstance(updater, torch.optim.Optimizer):
            # Using PyTorch's built-in optimizer and loss function
            updater.zero_grad()
            l.backward()
            updater.step()
            metric.add(float(l) * len(y), accuracy(y_hat, y), y.size().numel())
        else:
            # Using a custom optimizer and loss function
            l.sum().backward()
            updater(X.shape[0])
            metric.add(float(l.sum()), accuracy(y_hat, y), y.numel())
    # Return the average training loss and training accuracy
    return metric[0] / metric[2], metric[1] / metric[2]
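The function above assumes the Accumulator and accuracy utilities from Chapter 3 of Dive into Deep Learning; minimal versions matching the d2l definitions are sketched here so the code is self-contained:

class Accumulator:
    """Accumulate sums over n variables (d2l Chapter 3 helper)."""
    def __init__(self, n):
        self.data = [0.0] * n

    def add(self, *args):
        self.data = [a + float(b) for a, b in zip(self.data, args)]

    def reset(self):
        self.data = [0.0] * len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

def accuracy(y_hat, y):
    """Number of correct predictions (d2l Chapter 3 helper)."""
    if len(y_hat.shape) > 1 and y_hat.shape[1] > 1:
        y_hat = y_hat.argmax(axis=1)  # class scores -> predicted class indices
    cmp = y_hat.type(y.dtype) == y
    return float(cmp.type(y.dtype).sum())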
import pandas as pd
import os
import random

# Load the node embeddings produced by the different representation-learning algorithms
data_file = os.path.join('data', 'labelmirnadieasedeepwalksampling', 'wfy_label_mirna_die_hdmm_deepwalk_sampling.csv')
train = pd.read_csv(data_file, header=None)
# train = train[:20]  # uncomment to debug on a small subset
train.info()
def k_fold_split(train_df, k):
    # Make sure the output directory exists
    os.makedirs('data/node2vec_5_fold', exist_ok=True)
    # Store the k folds separately
    k_fold = []
    index = set(range(train_df.shape[0]))
    for i in range(k):
        # The data may not divide evenly by k; put everything left over into the last fold
        if i == k - 1:
            k_fold.append(list(index))
        else:
            tmp = random.sample(list(index), int(1.0 / k * train_df.shape[0]))
            k_fold.append(tmp)
            index -= set(tmp)
    # Turn the original training set into k train/validation splits,
    # each with a train:validation ratio of (k-1):1
    for i in range(k):
        print('fold {}........'.format(i + 1))
        tra = []
        dev = k_fold[i]
        for j in range(k):
            if i != j:
                tra += k_fold[j]  # concatenate the other k-1 folds as the training set
        train_df.iloc[tra].to_csv('data/node2vec_5_fold/train_{}.csv'.format(i), index=False, header=None)
        train_df.iloc[dev].to_csv('data/node2vec_5_fold/val_{}.csv'.format(i), index=False, header=None)
    print('done!')

if __name__ == '__main__':
    k_fold_split(train, 5)
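A quick check on the files written by k_fold_split (illustrative only): each train/validation pair should partition the full dataset, so their sizes must sum to the total row count.

# Illustrative sanity check of the written fold files
for i in range(5):
    tr = pd.read_csv('data/node2vec_5_fold/train_{}.csv'.format(i), header=None)
    va = pd.read_csv('data/node2vec_5_fold/val_{}.csv'.format(i), header=None)
    assert len(tr) + len(va) == train.shape[0]
    print('fold', i, 'train:', len(tr), 'val:', len(va))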
import wfy_utils  # project-specific helper module (not shown here)

class Mirna_die_Dataset(Dataset):
    def __init__(self, x=None, y=None, data_file=None):
        # Record which fold file is being used (project-specific helper)
        wfy_utils.save_result(data_file)
        # Read the fold file directly
        data_frame = pd.read_csv(data_file, header=None)
        print(len(data_frame))
        outputs = data_frame.iloc[:, 1]
        inputs = data_frame.iloc[:, 2:258]
        x = torch.tensor(inputs.values, dtype=torch.float32)
        y = torch.tensor(outputs.values.reshape(-1, 1), dtype=torch.float32)
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):  # return the sample and label as tensors
        return self.x[idx], self.y[idx]
# Load fold 0 of the hmdd20 DeepWalk splits
train_data_file = os.path.join('data', 'hmdd20_deepwalk_5_fold', 'train_0.csv')
val_data_file = os.path.join('data', 'hmdd20_deepwalk_5_fold', 'val_0.csv')
train_dataset = Mirna_die_Dataset(data_file=train_data_file)
test_dataset = Mirna_die_Dataset(data_file=val_data_file)
# Iterate over the split datasets
batch_size = 5  # assumed here; the earlier loaders also used a batch size of 5
train_iter = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
# The validation set is evaluated in a single full batch
test_iter = DataLoader(test_dataset, batch_size=len(test_dataset), shuffle=True, num_workers=0)
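To run all five folds rather than just fold 0, the same construction can be wrapped in a loop; this is a minimal sketch, with the per-fold training call left as a placeholder:

# Illustrative 5-fold loop over the split files
for fold in range(5):
    tr_file = os.path.join('data', 'hmdd20_deepwalk_5_fold', 'train_{}.csv'.format(fold))
    va_file = os.path.join('data', 'hmdd20_deepwalk_5_fold', 'val_{}.csv'.format(fold))
    tr_dataset = Mirna_die_Dataset(data_file=tr_file)
    va_dataset = Mirna_die_Dataset(data_file=va_file)
    tr_iter = DataLoader(tr_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    va_iter = DataLoader(va_dataset, batch_size=len(va_dataset), shuffle=False, num_workers=0)
    # train on tr_iter and evaluate on va_iter here, e.g. via train_epoch_ch3(...)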