课程链接:https://aistudio.baidu.com/aistudio/course/introduce/1978
机器学习的过程可以简化为三个步骤:1、定义function集合,也就是model集;2、使用训练数据评估每个function的好坏;3、用一个方法挑选出表现最好的function。
根据上面展示的三个步骤,可以利用百度Paddle框架快速实现机器学习过程。
模型可以自己搭建,也可以使用飞桨内置的模型
import paddle
# Sequential-style network definition: flatten -> fc(784, 512) -> ReLU
# -> dropout(0.2) -> fc(512, 10), producing 10-way logits for MNIST.
_mnist_layers = [
    paddle.nn.Flatten(),
    paddle.nn.Linear(784, 512),
    paddle.nn.ReLU(),
    paddle.nn.Dropout(0.2),
    paddle.nn.Linear(512, 10),
]
mnist = paddle.nn.Sequential(*_mnist_layers)
# Layer-subclass style network definition (same topology as the
# Sequential model above).
class Mnist(paddle.nn.Layer):
    """MNIST classifier: flatten -> fc(784, 512) -> ReLU -> dropout(0.2) -> fc(512, 10)."""

    def __init__(self):
        super(Mnist, self).__init__()
        # Sub-layers are created once here and reused on every forward pass.
        self.flatten = paddle.nn.Flatten()
        self.linear_1 = paddle.nn.Linear(784, 512)
        self.linear_2 = paddle.nn.Linear(512, 10)
        self.relu = paddle.nn.ReLU()
        self.dropout = paddle.nn.Dropout(0.2)

    def forward(self, inputs):
        """Run one forward pass and return the 10-way logits."""
        hidden = self.relu(self.linear_1(self.flatten(inputs)))
        return self.linear_2(self.dropout(hidden))
# Models shipped with the Paddle framework: instantiate the built-in
# LeNet, and list everything paddle.vision.models exports.
lenet = paddle.vision.models.LeNet()
print('飞桨框架内置模型:', paddle.vision.models.__all__)
对于自定义的数据集,你可以在数据集的构造函数中定义数据增强方法,之后在 __getitem__
中对返回的数据应用这些方法,即可完成自定义数据增强。
import numpy as np
import pandas as pd
import paddle
from paddle.io import Dataset
from PIL import Image
import glob,os
class MyDataset(Dataset):
    """Custom dataset for the real-or-drawing task.

    Step 1: inherit from paddle.io.Dataset.
    """

    def __init__(self, mode='train'):
        """Step 2: implement the constructor — read the data and select the
        train or test split.

        Args:
            mode: 'train' loads labelled images from the class sub-folders
                under train_data; any other value loads the test images,
                with every label defaulting to '0' (ground truth unknown).
        """
        super(MyDataset, self).__init__()
        self.data = []
        if mode == 'train':
            path = 'work/real_or_drawing/train_data'
            for root, dirs, files in os.walk(path):
                for file in files:
                    file_path = os.path.join(root, file)
                    # The parent folder name serves as the class label.
                    folder_name = os.path.basename(os.path.dirname(file_path))
                    # Load the pixels eagerly and close the file right away.
                    # The original kept a lazily-opened PIL image per file,
                    # which leaks one file descriptor per training sample.
                    with Image.open(file_path) as img:
                        self.data.append([img.copy(), folder_name])
        else:
            path = 'work/real_or_drawing/test_data/0'
            for f in glob.glob(os.path.join(path, "*.bmp")):
                # Pad by 2 pixels on each side (e.g. 28x28 -> 32x32 — TODO
                # confirm input size); note this is padding, not a resize,
                # despite the original comment. Labels default to '0'.
                with Image.open(f) as img:
                    padded = paddle.vision.transforms.pad(img.copy(), padding=2)
                self.data.append([padded, '0'])

    def __getitem__(self, index, mode='train'):
        """Step 3: return the single sample (image_tensor, label) at `index`.

        Note: `mode` is never read here; it is kept only so any existing
        caller passing it keeps working.
        """
        img = self.data[index][0].convert('L')  # convert to grayscale
        img_t = paddle.vision.transforms.to_tensor(img)
        label = int(self.data[index][1])
        return img_t, label

    def __len__(self):
        """Step 4: return the total number of samples."""
        return len(self.data)
# Build the two dataset splits used by the training loop below.
train_data = MyDataset(mode='train')
test_data = MyDataset(mode='test')
飞桨框架通过基础API,利用反向传播与梯度下降对模型进行训练与预测。
"""
定义模型训练函数
"""
val_acc_history = []
val_loss_history = []
def training_loop(train_data, validation_data, model, optimizer, loss_fn, n_epochs, batch_size):
    """Train `model` with plain forward/backward passes, validating each epoch.

    Args:
        train_data / validation_data: paddle.io.Dataset instances
            (DataLoader does not accept str paths).
        model: the network to optimise; trained in place.
        optimizer: paddle optimizer already bound to model.parameters().
        loss_fn: loss callable taking (logits, labels).
        n_epochs: number of full passes over the training data.
        batch_size: mini-batch size for both loaders.

    Side effects: appends per-epoch metrics to the module-level
    val_acc_history / val_loss_history lists, and saves the final model
    and optimizer state to 'text.pdparams' / 'text.pdopt'.
    """
    model.train()
    train_loader = paddle.io.DataLoader(train_data, batch_size=batch_size, shuffle=True)
    valid_loader = paddle.io.DataLoader(validation_data, batch_size=batch_size, shuffle=True)
    for epoch in range(1, n_epochs + 1):
        for batch_id, data in enumerate(train_loader()):
            x_data = paddle.to_tensor(data[0])
            # Labels arrive as shape (batch,); the loss expects (batch, 1).
            y_data = paddle.unsqueeze(paddle.to_tensor(data[1]), 1)
            # Forward pass, then loss.
            y_pred = model(x_data)
            loss = loss_fn(y_pred, y_data)
            if batch_id % 1000 == 0:
                print("epoch: {}, batch_id: {}, loss is: {}".format(epoch, batch_id, loss.numpy()))
            loss.backward()
            optimizer.step()
            optimizer.clear_grad()
        # Evaluate the model after each epoch, then switch back to train mode.
        model.eval()
        avg_acc, avg_loss = _validate(model, valid_loader, loss_fn)
        print("[validation] accuracy/loss: {}/{}".format(avg_acc, avg_loss))
        val_acc_history.append(avg_acc)
        val_loss_history.append(avg_loss)
        model.train()
    paddle.save(model.state_dict(), 'text.pdparams')
    paddle.save(optimizer.state_dict(), 'text.pdopt')


def _validate(model, valid_loader, loss_fn):
    """Run one pass over `valid_loader`; return (mean accuracy, mean loss)."""
    accuracy = []
    losses = []
    for data in valid_loader():
        # Convert consistently with the training loop (the original fed the
        # raw batch element straight into the model here).
        x_data = paddle.to_tensor(data[0])
        y_data = paddle.unsqueeze(paddle.to_tensor(data[1]), 1)
        y_pred = model(x_data)
        losses.append(loss_fn(y_pred, y_data).numpy())
        accuracy.append(paddle.metric.accuracy(y_pred, y_data).numpy())
    return np.mean(accuracy), np.mean(losses)
# Set the training hyper-parameters.
# NOTE(review): MyRNN is not defined anywhere in this file — it must come
# from elsewhere in the project; verify, or substitute the Mnist model
# defined above. TODO confirm.
model = MyRNN()
optimizer= paddle.optimizer.Adam(learning_rate=0.02, parameters=model.parameters())
loss_fn=paddle.nn.CrossEntropyLoss(reduction='mean')
n_epochs=10
batch_size=32
# Start model training.
# NOTE(review): validation_data is also undefined here — only train_data
# and test_data are constructed above; confirm the intended validation split.
training_loop(train_data,validation_data,model, optimizer, loss_fn,n_epochs,batch_size)