This is not particularly difficult; it only takes a few changes on top of the handwritten-character example.
The focus here is on learning PyTorch, and the results are not especially good: the loss on the test set is about 16.
The PyTorch implementation is below:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from tqdm import tqdm

# Use the GPU if one is available, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Convert the string class labels to integer ids so cross-entropy can be computed later
def label2id(labels):
    ids = []
    for label in labels:
        ids.append(int(label.split("_")[-1]) - 1)
    return ids
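# Quick sanity check of the mapping (illustrative, not required by the script):
# "Class_k" becomes the id k-1.
assert label2id(['Class_1', 'Class_9']) == [0, 8]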
# prepare dataset
# Training set
class Train_OttoDataset(Dataset):
    def __init__(self, filepath):
        # Load the data
        data = pd.read_csv(filepath)
        labels = data['target']
        labels = label2id(labels)  # convert the string labels to ids
        self.len = data.shape[0]
        train_num = int(self.len * 0.8)
        # Use the first 80% of the rows as the training split
        self.train_data = data[:train_num]
        self.train_label = labels[:train_num]
        # Drop the id column (first) and the target column (last); keep the 93 feature columns
        self.x_data = torch.from_numpy(np.array(self.train_data)[:, 1:-1].astype(np.float32))
        self.y_data = torch.from_numpy(np.array(self.train_label).astype(np.float32))
        self.train_len = self.train_data.shape[0]

    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        return self.train_len

train_dataset = Train_OttoDataset('train.csv')
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True, num_workers=2)
# Validation (dev) set
class Dev_OttoDataset(Dataset):
    def __init__(self, filepath):
        # Load the data
        data = pd.read_csv(filepath)
        labels = data['target']
        labels = label2id(labels)  # convert the string labels to ids
        self.len = data.shape[0]
        train_num = int(self.len * 0.8)
        # Use the remaining 20% of the rows as the dev split
        self.dev_data = data[train_num:]
        self.dev_label = labels[train_num:]
        self.x_data = torch.from_numpy(np.array(self.dev_data)[:, 1:-1].astype(np.float32))
        self.y_data = torch.from_numpy(np.array(self.dev_label).astype(np.float32))
        self.dev_len = self.dev_data.shape[0]

    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        return self.dev_len

dev_dataset = Dev_OttoDataset('train.csv')
dev_loader = DataLoader(dataset=dev_dataset, batch_size=8, shuffle=False, num_workers=2)
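# Note: both datasets read the same train.csv; the first 80% of rows feed train_loader
# and the remaining 20% feed dev_loader, so the two splits do not overlap.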
# design model using class
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.linear1 = torch.nn.Linear(93, 64)
        self.bn = torch.nn.BatchNorm1d(num_features=64)
        self.linear2 = torch.nn.Linear(64, 32)
        self.dropout = torch.nn.Dropout(p=0.1)
        self.linear3 = torch.nn.Linear(32, 16)
        self.linear4 = torch.nn.Linear(16, 9)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        # first layer: linear -> batch norm -> ReLU
        x = self.relu(self.bn(self.linear1(x)))
        # second layer: linear -> dropout -> ReLU
        x = self.relu(self.dropout(self.linear2(x)))
        x = self.relu(self.linear3(x))
        # No activation on the last layer: CrossEntropyLoss applies log-softmax internally
        return self.linear4(x)

model = Net()
model.to(device)
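# Shape sketch (illustrative): a batch of 93-dimensional feature vectors yields one logit
# per Otto class, e.g.
#   model(torch.randn(32, 93).to(device)).shape  # -> torch.Size([32, 9])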
# construct loss and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

def train(epoch):
    model.train()  # enable dropout and batch-norm updates
    running_loss = 0.0
    for batch_idx, data in enumerate(tqdm(train_loader), 0):
        inputs, labels = data[0].to(device), data[1].to(device)  # move the batch to the chosen device
        optimizer.zero_grad()
        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels.long())
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if batch_idx % 300 == 299:  # print the average loss every 300 batches
            print('\n [%d, %5d] loss: %.3f' % (epoch + 1, batch_idx + 1, running_loss / 300))
            running_loss = 0.0
# Validation
def dev():
    model.eval()  # switch dropout/batch norm to inference mode
    with torch.no_grad():
        correct = 0.0
        total = 0.0
        dev_mean_loss = 0.0
        for batch_idx, data in enumerate(tqdm(dev_loader), 0):
            inputs, labels = data[0].to(device), data[1].to(device)
            outputs = model(inputs)
            dev_loss = criterion(outputs, labels.long())
            dev_mean_loss += dev_loss.item()
            total += labels.size(0)
            _, predicted = torch.max(outputs.data, dim=1)
            correct += (predicted == labels).sum().item()
        acc = correct / total
        count = batch_idx + 1  # number of dev batches
        print('dev loss:', dev_mean_loss / count, 'Accuracy on dev set:', acc)
# Predict on the test set and save the results for submission
def predict_save():
    test_data = pd.read_csv('test.csv')
    # The first column is the id; the remaining 93 columns are the features
    test_input = torch.from_numpy(np.array(test_data)[:, 1:].astype(np.float32)).to(device)
    model.eval()
    with torch.no_grad():
        test_out = model(test_input)
        # dim=1: take the maximum along each row, i.e. the predicted class per sample
        _, predicted = torch.max(test_out, dim=1)
    # One-hot encode the predicted class ids
    # (assumes every class appears at least once in the predictions, so that 9 columns are produced)
    test_out = pd.get_dummies(predicted.cpu().numpy())
    print(test_out)
    labels = ['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5',
              'Class_6', 'Class_7', 'Class_8', 'Class_9']
    # Add the column labels
    test_out.columns = labels
    # Insert the id column
    test_out.insert(0, 'id', test_data['id'])
    output = pd.DataFrame(test_out)
    output.to_csv('my_predict.csv', index=False)
if __name__ == '__main__':
    for epoch in range(100):
        train(epoch)
        dev()
    predict_save()
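The submission above stores a hard one-hot prediction per row. Since the network outputs raw logits (CrossEntropyLoss applies log-softmax internally), a softmax over dim=1 turns them into per-class probabilities, which is usually what a multi-class log-loss evaluation expects. Below is a minimal sketch of that variant, assuming the same trained model, test.csv layout, and class names as above; the function name and output filename are illustrative.

def predict_save_proba():
    # Minimal sketch: probability submission instead of one-hot (names here are illustrative)
    test_data = pd.read_csv('test.csv')
    test_input = torch.from_numpy(np.array(test_data)[:, 1:].astype(np.float32)).to(device)
    model.eval()
    with torch.no_grad():
        logits = model(test_input)
        probs = torch.softmax(logits, dim=1).cpu().numpy()  # per-class probabilities
    out = pd.DataFrame(probs, columns=['Class_%d' % (i + 1) for i in range(9)])
    out.insert(0, 'id', test_data['id'])
    out.to_csv('my_predict_proba.csv', index=False)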