# Program code
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB,GaussianNB,BernoulliNB
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import learning_curve
from sklearn.metrics import classification_report
import numpy as np
# Load the "mnist_784" dataset (version 1) from OpenML; cache=True lets
# scikit-learn reuse a previously downloaded copy.
mnist = fetch_openml("mnist_784", version=1, cache=True)
x = mnist.data
y = mnist.target
print("降维前的特征数:", x.shape[1])

# A float n_components in (0, 1) tells PCA to keep as many components as are
# needed to explain that fraction of the variance (95% here).
# NOTE(review): PCA is fitted on the full dataset before the split, so test
# rows influence the projection; consider fitting it on the training split.
PCA_Transfer = PCA(n_components=0.95)
PCA_x = PCA_Transfer.fit_transform(x)
print("降维后的特征数:", PCA_x.shape[1])

# Hold out 20% of the PCA-reduced samples; random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    PCA_x, y, test_size=0.2, random_state=1
)

# Scalers learn their statistics from the training split only and are then
# applied unchanged to the test split. Re-fitting on the test data would scale
# the two splits differently and leak test statistics into the evaluation.
Standard_Transfer = StandardScaler()
standard_x_train = Standard_Transfer.fit_transform(x_train)
standard_x_test = Standard_Transfer.transform(x_test)

# Min-max scaling makes the training features non-negative, as MultinomialNB
# requires when fitting. Test values may fall slightly outside [0, 1] because
# the range comes from the training split.
MinMax_Transfer = MinMaxScaler()
Guiyihua_x_train = MinMax_Transfer.fit_transform(x_train)
Guiyihua_x_test = MinMax_Transfer.transform(x_test)
# Disabled KNN experiment, kept as a bare (unassigned) triple-quoted string so
# it is never executed. The text sweeps n_neighbors over 1..10, scores each
# model on the standardized test split, plots the scores and prints the best
# score together with its n_neighbors value.
# NOTE(review): the for/if bodies inside the string carry no indentation here,
# so they would need re-indenting before this experiment can be re-enabled.
'''
##### KNN算法 #####
KNN_scores=[]
KNN_BestParam=0
KNN_BestScore=0
for i in range(1,11):
KNN_estimator = KNeighborsClassifier(n_neighbors=i)
KNN_estimator.fit(standard_x_train, y_train)
score=KNN_estimator.score(standard_x_test,y_test)
KNN_scores.append(score)
if score>KNN_BestScore:
KNN_BestScore=score
KNN_BestParam=i
plt.plot(range(1,11),KNN_scores)
plt.show()
print("KNN算法最高准确率:",KNN_BestScore," 最佳超参数:", KNN_BestParam)
'''
# Multinomial naive Bayes on the min-max scaled features: grid-search the
# smoothing parameter alpha with 10-fold cross-validation on all CPU cores,
# then report accuracy of the refitted best model on the held-out test split.
param_dic = {"alpha": [0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1, 1.2]}
Bayes_estimator1 = GridSearchCV(
    MultinomialNB(),
    param_grid=param_dic,
    cv=10,
    n_jobs=-1,
)
Bayes_estimator1.fit(Guiyihua_x_train, y_train)
print(
    "多项式Bayes算法在测试集上的平均预测成功率:",
    Bayes_estimator1.score(Guiyihua_x_test, y_test),
)
# Disabled diagnostics (bare string literal, never executed): classification
# report and learning curve for the multinomial model.
'''
y_predict=Bayes_estimator1.predict(x_test)
Bayes_table1=classification_report(y_test,y_predict)
print(Bayes_table1)
train_sizes,train_scores,test_scores=learning_curve(Bayes_estimator1,x,y)
plt.figure(1)
plt.xlabel("样本个数",fontproperties="Simhei")
plt.ylabel("预测准确率",fontproperties="Simhei")
plt.grid(True)
line1,=plt.plot(train_sizes,np.mean(train_scores,axis=1),color='r',label="train score")
line2,=plt.plot(train_sizes,np.mean(test_scores,axis=1),color='g',label="test score")
plt.legend([line1,line2],["train_score","test_score"])
plt.show()
'''
# Gaussian naive Bayes on the PCA-reduced features. The empty param_grid means
# a single candidate (default settings) is cross-validated over 10 folds
# before the model is refitted on the whole training split.
Bayes_estimator2 = GridSearchCV(
    GaussianNB(),
    param_grid={},
    cv=10,
    n_jobs=-1,
)
Bayes_estimator2.fit(x_train, y_train)
print(
    "高斯Bayes算法在测试集上的平均预测成功率:",
    Bayes_estimator2.score(x_test, y_test),
)
# Disabled diagnostics (bare string literal, never executed): classification
# report and learning curve for the Gaussian model.
'''
y_predict=Bayes_estimator2.predict(x_test)
Bayes_table2=classification_report(y_test,y_predict)
print(Bayes_table2)
train_sizes,train_scores,test_scores=learning_curve(Bayes_estimator2,x,y)
plt.figure(2)
plt.xlabel("样本个数",fontproperties="Simhei")
plt.ylabel("预测准确率",fontproperties="Simhei")
plt.grid(True)
line1,=plt.plot(train_sizes,np.mean(train_scores,axis=1),color='r',label="train score")
line2,=plt.plot(train_sizes,np.mean(test_scores,axis=1),color='g',label="test score")
plt.legend([line1,line2],["train_score","test_score"])
plt.show()
'''
# Bernoulli naive Bayes on the PCA-reduced features, evaluated the same way as
# the other naive Bayes variants: a 10-fold cross-validated GridSearchCV with
# an empty grid (default settings only), then accuracy on the test split.
Bayes_estimator3 = BernoulliNB()
# Wrap the Bernoulli model itself. Wrapping Bayes_estimator2 here would
# replace the fitted Gaussian search with an unfitted one and leave the
# Bernoulli model without cross-validation.
Bayes_estimator3 = GridSearchCV(Bayes_estimator3, cv=10, param_grid={}, n_jobs=-1)
Bayes_estimator3.fit(x_train, y_train)
print("伯努利Bayes算法在测试集上的平均预测成功率:", Bayes_estimator3.score(x_test, y_test))
# Disabled diagnostics (bare string literal, never executed): classification
# report and learning curve for the Bernoulli model.
'''
y_predict=Bayes_estimator3.predict(x_test)
Bayes_table3=classification_report(y_test,y_predict)
print(Bayes_table3)
train_sizes,train_scores,test_scores=learning_curve(Bayes_estimator3,x,y)
plt.figure(3)
plt.xlabel("样本个数",fontproperties="Simhei")
plt.ylabel("预测准确率",fontproperties="Simhei")
plt.grid(True)
line1,=plt.plot(train_sizes,np.mean(train_scores,axis=1),color='r',label="train score")
line2,=plt.plot(train_sizes,np.mean(test_scores,axis=1),color='g',label="test score")
plt.legend([line1,line2],["train_score","test_score"])
plt.show()
'''
# Decision tree split on information gain (entropy), with a fixed seed.
# The bare tree is kept under its own name because the disabled learning-curve
# snippet below refers to it; the GridSearchCV wrapper (empty grid, 10 folds)
# is what gets fitted and scored.
DecisionTree_estimator1 = DecisionTreeClassifier(criterion="entropy", random_state=1)
CV_DecisionTree_estimator1 = GridSearchCV(
    DecisionTree_estimator1,
    param_grid={},
    cv=10,
    n_jobs=-1,
)
CV_DecisionTree_estimator1.fit(x_train, y_train)
print(
    "信息熵决策树算法在测试集上的平均预测成功率:",
    CV_DecisionTree_estimator1.score(x_test, y_test),
)
# Disabled diagnostics (bare string literals, never executed): classification
# report, then learning curve, for the entropy tree.
'''
y_predict=CV_DecisionTree_estimator1.predict(x_test)
CV_DecisionTree_table1=classification_report(y_test,y_predict)
print(CV_DecisionTree_table1)
'''
'''
train_sizes,train_scores,test_scores=learning_curve(DecisionTree_estimator1,x,y)
plt.figure(4)
plt.xlabel("样本个数",fontproperties="Simhei")
plt.ylabel("预测准确率",fontproperties="Simhei")
plt.grid(True)
line1,=plt.plot(train_sizes,np.mean(train_scores,axis=1),color='r',label="train score")
line2,=plt.plot(train_sizes,np.mean(test_scores,axis=1),color='g',label="test score")
plt.legend([line1,line2],["train_score","test_score"])
plt.show()
'''
# Decision tree split on Gini impurity, with a fixed seed. As with the entropy
# tree, the GridSearchCV wrapper (empty grid, 10 folds) is fitted on the
# PCA-reduced training split and scored on the held-out test split.
DecisionTree_estimator2 = DecisionTreeClassifier(criterion="gini", random_state=1)
CV_DecisionTree_estimator2 = GridSearchCV(
    DecisionTree_estimator2,
    param_grid={},
    cv=10,
    n_jobs=-1,
)
CV_DecisionTree_estimator2.fit(x_train, y_train)
print(
    "基尼指数决策树算法在测试集上的平均预测成功率:",
    CV_DecisionTree_estimator2.score(x_test, y_test),
)
# Disabled diagnostics (bare string literal, never executed): classification
# report and learning curve for the Gini tree.
'''
y_predict=CV_DecisionTree_estimator2.predict(x_test)
CV_DecisionTree_table2=classification_report(y_test,y_predict)
print(CV_DecisionTree_table2)
train_sizes,train_scores,test_scores=learning_curve(DecisionTree_estimator2,x,y)
plt.figure(5)
plt.xlabel("样本个数",fontproperties="Simhei")
plt.ylabel("预测准确率",fontproperties="Simhei")
plt.grid(True)
line1,=plt.plot(train_sizes,np.mean(train_scores,axis=1),color='r',label="train score")
line2,=plt.plot(train_sizes,np.mean(test_scores,axis=1),color='g',label="test score")
plt.legend([line1,line2],["train_score","test_score"])
plt.show()
'''
# Disabled random-forest experiment (bare string literal, never executed).
# The text grid-searches n_estimators over the even values 2..20 with cv=10,
# first for an entropy-criterion forest and then for a gini-criterion forest,
# printing each test score and the best parameters found.
# NOTE(review): inside the string, the second GridSearchCV is handed
# RandomForest_estimator1 instead of RandomForest_estimator2, and the second
# print reuses the entropy label. Both look like copy-paste slips to fix
# before re-enabling this section.
'''
##### 随机森林算法 #####
RandomForest_estimator1=RandomForestClassifier(random_state=1,criterion="entropy")
param_dic={"n_estimators":[2,4,6,8,10,12,14,16,18,20]}
CV_RandomForest_estimator1=GridSearchCV(RandomForest_estimator1,cv=10,param_grid=param_dic)
CV_RandomForest_estimator1.fit(x_train,y_train)
print("信息熵随机森林算法在测试集上的平均预测成功率:",CV_RandomForest_estimator1.score(x_test,y_test)," 最佳超参数为:",CV_RandomForest_estimator1.best_params_)
RandomForest_estimator2=RandomForestClassifier(random_state=1,criterion="gini")
param_dic={"n_estimators":[2,4,6,8,10,12,14,16,18,20]}
CV_RandomForest_estimator2=GridSearchCV(RandomForest_estimator1,cv=10,param_grid=param_dic)
CV_RandomForest_estimator2.fit(x_train,y_train)
print("信息熵随机森林算法在测试集上的平均预测成功率:",CV_RandomForest_estimator2.score(x_test,y_test)," 最佳超参数为:",CV_RandomForest_estimator2.best_params_)
'''
# Disabled logistic-regression baseline (bare string literal, never executed).
# The text fits LogisticRegression with default settings on the PCA-reduced
# training split and prints its score on the test split.
'''
##### 逻辑回归算法 #####
logit_estimator=LogisticRegression()
logit_estimator.fit(x_train,y_train)
print("逻辑回归算法在测试集上的平均预测成功率:",logit_estimator.score(x_test,y_test))
'''