"""
如何从excel,csv文件中读取数据,转成pytorch tensor用于机器学习模型训练
# https://www.bilibili.com/video/BV1nR4y1N7Vj/?spm_id_from=333.880.my_history.page.click&vd_source=dc27cbd311ff3ca278726381630036f0
【学习总结】
1、class的__init__()是可以自定义入参的
2、
"""
# pip install openpyxl
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
"""
csv中内容
id feat1 feat2 label
1 0.796 0.7831 9
2 0.2183 0.2791 5
"""
class ExcelDataset(Dataset):
    """Dataset backed by a single sheet of an Excel workbook.

    The sheet is read with ``pandas.read_excel`` using the first row as the
    header and the first column (the auto-increment id) as the index. The
    remaining three columns are named ``feat1``, ``feat2`` and ``label``.

    Attributes:
        x: float32 tensor holding the two feature columns, one row per sample.
        y: int32 tensor holding the label column, one entry per sample.
    """

    # Names applied to the non-index columns, in file order.
    _COLUMNS = ("feat1", "feat2", "label")
    # Dtypes requested at read time. Each column needs its own key: repeating
    # one key in a dict literal silently keeps only the last value.
    _DTYPES = {"feat1": np.float32, "feat2": np.float32, "label": np.int32}

    def __init__(self, filepath="data.xlsx", sheet_name=0):
        """Load one sheet into feature and label tensors.

        Args:
            filepath: Path of the workbook to read.
            sheet_name: Sheet to load; ``0`` selects the first sheet. A sheet
                title (string) may be passed instead.

        Raises:
            TypeError: If ``sheet_name`` makes pandas return several sheets
                at once (e.g. ``None``), which yields a dict rather than a
                single DataFrame.
        """
        super().__init__()
        print(f"reading {filepath}, sheet={sheet_name}")
        df = pd.read_excel(
            filepath, header=0, index_col=0,
            names=list(self._COLUMNS),
            sheet_name=sheet_name,
            dtype=self._DTYPES,
        )
        if isinstance(df, dict):
            # Reading several sheets returns {sheet_name: DataFrame}; the
            # caller has to choose one sheet for a single dataset.
            raise TypeError(
                "sheet_name must select a single sheet, got "
                f"{sheet_name!r} which selects {len(df)} sheets"
            )
        print(f"the shape of dataframe is {df.shape}")
        # copy=True guarantees writable arrays that torch can wrap safely.
        feat = df.iloc[:, :2].to_numpy(dtype=np.float32, copy=True)
        label = df.iloc[:, 2].to_numpy(dtype=np.int32, copy=True)
        self.x = torch.from_numpy(feat)
        self.y = torch.from_numpy(label)

    def __len__(self):
        """Return the number of samples."""
        return len(self.y)

    def __getitem__(self, index):
        """Return the ``(features, label)`` tuple stored at ``index``."""
        return self.x[index], self.y[index]
class CsvDataset(Dataset):
    """Dataset backed by a CSV file, parsed with ``pandas.read_csv``.

    The first row is treated as a header and discarded in favour of the
    names ``feat1``, ``feat2`` and ``label``; the first column (the
    auto-increment id) becomes the index and is not part of the data.

    Attributes:
        x: float32 tensor holding the two feature columns, one row per sample.
        y: int32 tensor holding the label column, one entry per sample.
    """

    # Names applied to the non-index columns, in file order.
    _COLUMNS = ("feat1", "feat2", "label")
    # Dtypes requested at read time. Each column needs its own key: repeating
    # one key in a dict literal silently keeps only the last value.
    _DTYPES = {"feat1": np.float32, "feat2": np.float32, "label": np.int32}

    def __init__(self, filepath="data.csv"):
        """Load the CSV file into feature and label tensors.

        Args:
            filepath: Path of the UTF-8 encoded CSV file. CSV has no notion
                of sheets, so there is no ``sheet_name`` argument.
        """
        super().__init__()
        print(f"reading {filepath}")
        df = pd.read_csv(
            filepath, header=0, index_col=0,
            encoding='utf-8',
            names=list(self._COLUMNS),
            dtype=self._DTYPES,
            skip_blank_lines=True,
        )
        print(f"the shape if dataframe is {df.shape}")
        # copy=True guarantees writable arrays that torch can wrap safely.
        feat = df.iloc[:, :2].to_numpy(dtype=np.float32, copy=True)
        label = df.iloc[:, 2].to_numpy(dtype=np.int32, copy=True)
        self.x = torch.from_numpy(feat)
        self.y = torch.from_numpy(label)

    def __len__(self):
        """Return the number of samples."""
        return len(self.y)

    def __getitem__(self, index):
        """Return the ``(features, label)`` tuple stored at ``index``."""
        return self.x[index], self.y[index]
# Pandas-free variant: the CSV file is parsed directly with the stdlib.
class Csv2Dataset(Dataset):
    """Dataset backed by a CSV file, parsed without pandas.

    Each data row is expected to be ``id,feat1,feat2,label``. The first line
    of the file is skipped as a header and the id column is ignored.

    Attributes:
        x: float32 tensor of shape ``(n_samples, 2)`` with the features.
        y: float32 tensor of shape ``(n_samples,)`` with the labels.
    """

    def __init__(self, filepath="data.csv"):
        """Load the CSV file into feature and label tensors.

        Args:
            filepath: Path of the UTF-8 encoded CSV file.

        Raises:
            ValueError: If a feature or label field cannot be parsed as a
                number.
            IndexError: If a non-blank row has fewer than four fields.
        """
        import csv  # local import: only this class needs the csv module

        super().__init__()
        print(f"reading {filepath}")
        feat = []
        label = []
        # newline='' lets the csv module handle line endings itself.
        with open(filepath, encoding='utf-8', newline='') as f:
            reader = csv.reader(f)
            next(reader, None)  # drop the header line
            for values in reader:
                # Skip blank lines (empty or whitespace-only rows) instead
                # of failing on them.
                if not any(field.strip() for field in values):
                    continue
                feat.append([float(v) for v in values[1:3]])
                label.append(int(values[3]))
        # Lists must become numpy arrays before torch can wrap them; the
        # reshape keeps a (0, 2) feature shape when the file has no data rows.
        feat = np.array(feat, dtype=np.float32).reshape(-1, 2)
        # NOTE(review): labels are parsed as int but stored as float32, while
        # the pandas-based datasets in this file request an integer label
        # dtype. Kept as float32 for backward compatibility -- confirm intent.
        label = np.array(label, dtype=np.float32)
        self.x = torch.from_numpy(feat)
        self.y = torch.from_numpy(label)

    def __len__(self):
        """Return the number of samples."""
        return len(self.y)

    def __getitem__(self, index):
        """Return the ``(features, label)`` tuple stored at ``index``."""
        return self.x[index], self.y[index]
if __name__ == '__main__':
    # Smoke test: load each dataset flavour and print every mini-batch.
    print("test for exceldataset")
    # 1. Instantiate the custom Dataset from one sheet of the workbook.
    #    Pass sheet_name="corpus2" to load the other sheet instead.
    excel_dataset = ExcelDataset(sheet_name="corpus1")
    # 2. Wrap it in a DataLoader, which yields shuffled mini-batches.
    excel_dataloader = DataLoader(excel_dataset, batch_size=8, shuffle=True)
    # 3. Iterate over the DataLoader (not the raw dataset, which would yield
    #    single samples) so each step receives a batch.
    for idx, (batch_x, batch_y) in enumerate(excel_dataloader):
        print(f"batch_id:{idx}, {batch_x.shape}, {batch_y.shape}")
        # e.g. batch 9 -> torch.Size([8, 2]), torch.Size([8])
        print(batch_x, batch_y)
        # Pseudo-code for the training step that would go here:
        #   output = model(batch_x)
        #   loss = criterion(output, batch_y)
        #   optimizer.zero_grad()
        #   loss.backward()
        #   optimizer.step()

    print("test for csvdataset")
    csv_dataset = CsvDataset()
    csv_dataloader = DataLoader(csv_dataset, batch_size=8, shuffle=True)
    for idx, (batch_x, batch_y) in enumerate(csv_dataloader):
        print(f"batch_id:{idx}, {batch_x.shape}, {batch_y.shape}")
        # e.g. batch 9 -> torch.Size([8, 2]), torch.Size([8])
        print(batch_x, batch_y)

    print("test for csv2dataset")
    csv2_dataset = Csv2Dataset()
    csv2_dataloader = DataLoader(csv2_dataset, batch_size=8, shuffle=True)
    for idx, (batch_x, batch_y) in enumerate(csv2_dataloader):
        print(f"batch_id:{idx}, {batch_x.shape}, {batch_y.shape}")
        # e.g. batch 9 -> torch.Size([8, 2]), torch.Size([8])
        print(batch_x, batch_y)