With a few improvements over the previous house-price example, we now make predictions on the Iris dataset.
Required imports:
from sklearn.datasets import load_iris  # load the Iris dataset
from sklearn.linear_model import LogisticRegression  # sklearn's logistic regression model
from sklearn.model_selection import train_test_split, cross_val_score  # dataset splitting and cross-validation helpers
import matplotlib.pyplot as plt
import paddle
import numpy as np
import paddle.nn as nn
Load the Iris dataset and split it into a training set and a test set.
# load the data
iris = load_iris()
iris_x = iris.data
iris_y = iris.target
train_x, test_x, train_y, test_y = train_test_split(iris_x, iris_y, test_size=0.3)  # split into training and test sets
train_data = np.insert(train_x, 4, train_y, 1)  # append the training labels as a 5th column
test_data = np.insert(test_x, 4, test_y, 1)     # append the test labels as a 5th column
The train_test_split() function splits the dataset, with test_size=0.3 reserving 30% of the samples for the test set.
np.insert is then used to insert the labels into the feature matrix:
a = np.insert(arr, obj, values, axis)
# arr: the original array; obj: the position at which to insert; values: the content to insert; axis: whether to insert along rows or columns (0: rows, 1: columns).
This merges data and target into a single array.
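For example, a quick check on a toy array (hypothetical values, only to illustrate the call used above) confirms that inserting the labels at column index 4 along axis=1 adds exactly one column:

import numpy as np

# toy data: 3 samples with 4 features each, plus 3 labels (hypothetical values)
x = np.arange(12).reshape(3, 4).astype("float64")
y = np.array([0, 1, 2])

merged = np.insert(x, 4, y, axis=1)   # labels become the 5th column
print(merged.shape)                   # (3, 5)
print(merged[:, -1])                  # [0. 1. 2.]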
Build the Dataset class, exactly the same as in the house-price prediction example (a quick sanity check follows the class definition below).
# create the Dataset class
# Three required methods: 1. the constructor (initialisation) 2. __getitem__ (given an index, find and return that row of data) 3. __len__ (return the dataset length)
# All three are mandatory; defining them incorrectly will break the DataLoader later.
class MyDataset(paddle.io.Dataset):
"""
继承paddle.io.Dataset类
"""
def __init__(self, data):
"""
实现构造函数(初始化这个class)
"""
super(MyDataset, self).__init__()
self.data = data
def __getitem__(self, index):
"""
步骤三:实现__getitem__方法,指定index-->返回数据
"""
data = self.data[index]
x_data = data[:-1]
label = data[-1]
return x_data, label
def __len__(self):
"""
步骤四:实现__len__方法,返回数据集长度
"""
return self.data.shape[0]
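A minimal sanity check (a sketch, assuming train_data has been built as above; ds is just a throwaway name) that the three methods behave as the DataLoader expects:

ds = MyDataset(train_data)
print(len(ds))        # number of training samples (105 for a 70/30 split of 150)
x0, y0 = ds[0]        # __getitem__ returns (features, label)
print(x0.shape, y0)   # (4,) and the class label of the first row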
Build the network structure.
Note: there are 4 input units (one per feature) and 3 output units (one per class); a dummy forward pass after the class definition checks the output shape.
## create model structure
# Two required methods: 1. the constructor (initialise the network structure) 2. forward (define the forward pass)
class Mymodel(paddle.nn.Layer):
def __init__(self):
super(Mymodel, self).__init__()
        self.linear1 = nn.Linear(4, 3)  # fully connected layer of shape (4, 3)
def forward(self, inputs):
y = self.linear1(inputs)
return y
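Since the single linear layer maps 4 features to 3 class scores, a dummy forward pass (a sketch with a hypothetical batch of 5 random samples) should produce an output of shape [5, 3]:

model = Mymodel()
dummy = paddle.randn([5, 4], dtype="float32")  # hypothetical batch: 5 samples, 4 features
logits = model(dummy)
print(logits.shape)  # [5, 3] -- one raw score per class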
custom_dataset_train = MyDataset(train_data)
train_loader = paddle.io.DataLoader(custom_dataset_train, batch_size=50, shuffle=True, drop_last=True)
custom_dataset_test = MyDataset(test_data)
test_loader = paddle.io.DataLoader(custom_dataset_test, batch_size=len(test_data), shuffle=False)
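To confirm the loaders yield batches of the expected shape (a sketch; shapes assume the batch size and 70/30 split above), one batch can be pulled from train_loader:

for x_batch, y_batch in train_loader():
    print(x_batch.shape, y_batch.shape)  # [50, 4] features, [50] labels per batch
    break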
lr_model = Mymodel()
optim = paddle.optimizer.Adam(parameters=lr_model.parameters(), learning_rate=0.1)
# define the loss function
loss_fn = paddle.nn.CrossEntropyLoss()
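Note that paddle.nn.CrossEntropyLoss applies softmax internally by default, so the model only needs to output raw logits, and the labels must be integer class indices. A minimal sketch with made-up values:

logits = paddle.to_tensor([[2.0, 0.5, 0.1],
                           [0.2, 0.1, 3.0]], dtype="float32")  # raw scores, no softmax needed
labels = paddle.to_tensor([0, 2], dtype="int64")               # integer class indices
print(loss_fn(logits, labels))                                 # scalar mean cross-entropy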
Training loop:
max_epoch = 200
for epoch in range(max_epoch):
lr_model.train()
train_loss = []
for batch_id, (x_data,y_data) in enumerate(train_loader()):
        # note the dtypes: float32 features, int64 class labels (required by CrossEntropyLoss)
        x_data = paddle.to_tensor(x_data, dtype="float32")
        y_data = paddle.to_tensor(y_data, dtype="int64")
optim.clear_grad()
y_hat = lr_model(x_data)
loss = loss_fn(y_hat,y_data)
loss.backward()
optim.step()
train_loss.append(loss.item())
    lr_model.eval()
    for batch_id, (x_data, y_data) in enumerate(test_loader()):  # unpack the test batch (not the last training batch)
        x_data = paddle.to_tensor(x_data, dtype="float32")
        y_data = paddle.to_tensor(y_data, dtype="int64")
        y_hat = lr_model(x_data)
        loss_test = loss_fn(y_hat, y_data)
train_loss = np.mean(train_loss)
print("epoch:"+str(epoch)+"\t train loss:" + str(round(train_loss,4)) + "\t test loss:" + str(round(loss_test.item(),4)))