1. 调整MLPClassifier分类器的参数solver,比较不同参数的模型在鸢尾花数据集上的分类性能。
import pandas as pd
from sklearn import metrics
from sklearn import model_selection
from sklearn.neural_network import MLPClassifier

# Load the iris data set. The file has no header row, so column names are
# assigned explicitly.
filename = 'data/iris.data'
data = pd.read_csv(filename, header=None)
data.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'class']
data.iloc[0:5]  # first five rows; only echoed in an interactive session

# Encode the species names as integer class labels.
# Replacing the whole column and casting to int gives a real integer dtype.
# Writing the codes in place with .loc leaves an object-dtype column of
# Python ints, which scikit-learn rejects as an unknown label type.
label_codes = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
data['class'] = data['class'].map(label_codes).astype(int)

# Features are the four measurement columns; the target is the encoded class.
x = data.iloc[:, 0:4]
y = data.iloc[:, 4]

# Hold out 30% of the samples for testing; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x, y, test_size=0.3, random_state=1)

# Compare the accuracy of the different solver algorithms. Every other
# hyper-parameter is held constant so that only the solver varies.
solvers = ('lbfgs', 'sgd', 'adam')
models = []
for number, solver in enumerate(solvers, start=1):
    mlp = MLPClassifier(solver=solver, alpha=1e-5,
                        hidden_layer_sizes=(5, 5), random_state=1)
    mlp.fit(x_train, y_train)
    print(f'模型{number}准确率:', mlp.score(x_test, y_test))
    models.append(mlp)
# Keep the per-model names available for later inspection.
mlp1, mlp2, mlp3 = models

# Per-model classification report and confusion matrix on the test split.
predictions = []
for mlp in models:
    y_predicted = mlp.predict(x_test)
    print("Classification report for %s" % mlp)
    print(metrics.classification_report(y_test, y_predicted))
    print("Confusion matrix:\n", metrics.confusion_matrix(y_test, y_predicted))
    predictions.append(y_predicted)
y_predicted1, y_predicted2, y_predicted3 = predictions
分析:
MLP分类器对鸢(读yuan,一声)尾花数据的预测中,参数solver取‘lbfgs’时预测准确率为1,而取‘sgd’与‘adam’时预测准确率为0.6,可见在本实验设置下‘lbfgs’算法效果更好,准确率较高。
2. 在MLPClassifier训练函数fit()前后增加计时功能,设置不同隐藏层数目,比较训练所耗费的时间,以及模型分类的准确性。针对MLP模型,是否结点越多分类性能越好?
【提示】计时函数:import time
先来了解一下计时函数的使用,用一小段代码测试一下:
# Timing demo: measure how long an empty loop takes to run.
import time
print('运行时间:\n')
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; unlike time.time() it cannot jump if the system clock changes.
start = time.perf_counter()
for i in range(2**23):
    pass  # placeholder body: does nothing, only keeps the loop valid
end = time.perf_counter()
print(end - start, 's')
正题:
# Per the task, use the network structures (5,5), (1,1) and (5,5,5,5) and
# compare how width and depth affect training time and classification
# performance.
import time

import pandas as pd
from sklearn import metrics
from sklearn import model_selection
from sklearn.neural_network import MLPClassifier

# Load the iris data set. The file has no header row, so column names are
# assigned explicitly.
filename = 'data/iris.data'
data = pd.read_csv(filename, header=None)
data.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'class']
data.iloc[0:5]  # first five rows; only echoed in an interactive session

# Encode the species names as integer class labels.
# Replacing the whole column and casting to int gives a real integer dtype.
# Writing the codes in place with .loc leaves an object-dtype column of
# Python ints, which scikit-learn rejects as an unknown label type.
label_codes = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
data['class'] = data['class'].map(label_codes).astype(int)

# Features are the four measurement columns; the target is the encoded class.
x = data.iloc[:, 0:4]
y = data.iloc[:, 4]

# Hold out 30% of the samples for testing; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x, y, test_size=0.3, random_state=1)

# Model 1: baseline (5,5); model 2: fewer nodes per layer (1,1);
# model 3: more hidden layers, and therefore more nodes (5,5,5,5).
layer_configs = ((5, 5), (1, 1), (5, 5, 5, 5))
models = []
timings = []
for number, hidden_layers in enumerate(layer_configs, start=1):
    mlp = MLPClassifier(solver='lbfgs', alpha=1e-5,
                        hidden_layer_sizes=hidden_layers, random_state=1)
    # Time only fit(), as the task asks; constructing the estimator is left
    # out of the measurement. perf_counter() is the clock meant for intervals.
    fit_start = time.perf_counter()
    mlp.fit(x_train, y_train)
    fit_end = time.perf_counter()
    print(f'模型{number}运行时间为:', fit_end - fit_start, 's')
    print(f'模型{number}准确率:', mlp.score(x_test, y_test))
    models.append(mlp)
    timings.append((fit_start, fit_end))
# Keep the per-model names available for later inspection.
mlp1, mlp2, mlp3 = models
(start1, end1), (start2, end2), (start3, end3) = timings

# Compare classification performance: per-model report and confusion matrix
# on the test split.
predictions = []
for mlp in models:
    y_predicted = mlp.predict(x_test)
    print("Classification report for %s" % mlp)
    print(metrics.classification_report(y_test, y_predicted))
    print("Confusion matrix:\n", metrics.confusion_matrix(y_test, y_predicted))
    predictions.append(y_predicted)
y_predicted1, y_predicted2, y_predicted3 = predictions
分析:
从结果上看,隐藏层数目相同时,神经节点多的模型训练时间整体上较长(不排除个别情况较短),但准确率不一定更高,分类性能不一定更好。隐藏层数目越多,训练时间不一定越长,准确率也不一定越高。
由此可见,MLP分类器的预测性能与初始化的参数密切相关。