1.脉络框架
这里计划使用 logistic 回归对 MNIST 数据集中的图片进行二分类(判断图片中的数字是否为 5),并使用 sklearn 库来实现该功能。
第一步:加载数据集
第二步:划分训练集和测试集
第三步:使用 GridSearchCV 进行网格搜索,寻找最优超参数
第四步:使用超参数训练模型
第五步:分别使用准确率、精确率、召回率和 F1 分数来评估模型
2.代码实现
(1)代码
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.datasets import fetch_mldata
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score,recall_score,f1_score
# Load the data
def load_mnist():
    """Fetch the "MNIST original" dataset via scikit-learn's mldata loader.

    Returns:
        The dataset object returned by ``fetch_mldata``. ``split_data`` reads
        its ``"data"`` and ``"target"`` entries.
    """
    # NOTE(review): fetch_mldata is not available in recent scikit-learn
    # releases (fetch_openml is its replacement) -- confirm the installed
    # version before running.
    return fetch_mldata("MNIST original")
# Separate the training set and the test set
def split_data(dataset):
    """Split the dataset into stratified train/test parts for the "is it a 5" task.

    Args:
        dataset: Mapping with a ``"data"`` feature array and a ``"target"``
            label array that can both be indexed by integer index arrays.

    Returns:
        A tuple ``(train_data, train_label, test_data, test_label)``. The
        labels are boolean: ``True`` where the original target equals 5.
    """
    data = dataset["data"]
    # Collapse the multi-class target into a binary "digit is 5" label.
    label = dataset["target"] == 5

    # Only a single split is ever returned, so request exactly one instead of
    # generating ten and throwing the others away.
    splitter = StratifiedShuffleSplit(
        n_splits=1, test_size=0.2, train_size=0.8, random_state=42
    )
    train_index, test_index = next(splitter.split(data, label))

    return (
        data[train_index],
        label[train_index],
        data[test_index],
        label[test_index],
    )
def train_logistic_l1_param(train_data, train_label):
    """Grid-search the regularization strength ``alpha`` for L1 logistic regression.

    Args:
        train_data: Training feature matrix.
        train_label: Binary training labels.

    Returns:
        The best parameter dict found by the search (it is also printed).
        The original author's note reports 0.01 as the best ``alpha``.
    """
    param_grid = [{"alpha": [1, 0.5, 0.1, 0.05, 0.01]}]
    sgd_clf = SGDClassifier(loss="log", penalty="l1")

    # Use a classification scorer. The previous 'neg_mean_squared_error' is a
    # regression metric; on 0/1 labels it is just the negated error rate, so
    # 'accuracy' ranks the candidates the same way without relying on
    # arithmetic over boolean labels.
    grid_search = GridSearchCV(sgd_clf, param_grid, scoring="accuracy")
    grid_search.fit(train_data, train_label)

    best_params = grid_search.best_params_
    print(best_params)
    # Return the result as well so callers can feed it into training.
    return best_params
def _fit_and_score(penalty, train_data, train_label, test_data, test_label, alpha):
    """Fit an SGD logistic-regression model with ``penalty`` and score it on the test set."""
    # Train the classifier.
    sgd_clf = SGDClassifier(loss="log", penalty=penalty, alpha=alpha)
    sgd_clf.fit(train_data, train_label)

    # Evaluate accuracy, precision, recall and F1 on the held-out data.
    y_predict = sgd_clf.predict(test_data)
    # Vectorized fraction of matching labels (instead of summing element by
    # element with the builtin ``sum``).
    accuracy = np.mean(np.asarray(y_predict) == np.asarray(test_label))
    precision = precision_score(test_label, y_predict)
    recall = recall_score(test_label, y_predict)
    f1 = f1_score(test_label, y_predict)
    return accuracy, precision, recall, f1


# Logistic regression with L1 regularization
def train_logistic_l1(train_data, train_label, test_data, test_label, alpha=0.0001):
    """Train an L1-regularized logistic-regression classifier and evaluate it.

    Args:
        train_data: Training feature matrix.
        train_label: Binary training labels.
        test_data: Test feature matrix.
        test_label: Binary test labels.
        alpha: Regularization strength passed to ``SGDClassifier``.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)`` computed on the test set.
    """
    return _fit_and_score("l1", train_data, train_label, test_data, test_label, alpha)


# Logistic regression with L2 regularization
def train_logistic_l2(train_data, train_label, test_data, test_label, alpha=0.0001):
    """Train an L2-regularized logistic-regression classifier and evaluate it.

    Args:
        train_data: Training feature matrix.
        train_label: Binary training labels.
        test_data: Test feature matrix.
        test_label: Binary test labels.
        alpha: Regularization strength passed to ``SGDClassifier``.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)`` computed on the test set.
    """
    return _fit_and_score("l2", train_data, train_label, test_data, test_label, alpha)


# Logistic regression with elastic-net (L1 + L2) regularization
def train_logistic_l1_l2(train_data, train_label, test_data, test_label, alpha=0.0001):
    """Train an elastic-net-regularized logistic-regression classifier and evaluate it.

    Args:
        train_data: Training feature matrix.
        train_label: Binary training labels.
        test_data: Test feature matrix.
        test_label: Binary test labels.
        alpha: Regularization strength passed to ``SGDClassifier``.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)`` computed on the test set.
    """
    return _fit_and_score(
        "elasticnet", train_data, train_label, test_data, test_label, alpha
    )
if __name__ == "__main__":
    # Load the data.
    mnist = load_mnist()
    # Split it into training and test sets.
    train_data, train_label, test_data, test_label = split_data(mnist)

    # Each report header is paired with the trainer that produces its scores.
    trainers = (
        ("The accuracy precision recall and f1 score of logistic_l1 is:", train_logistic_l1),
        ("The accuracy precision recall and f1 score of logistic_l2 is:", train_logistic_l2),
        ("The accuracy precision recall and f1 score of logistic_l1_l2 is:", train_logistic_l1_l2),
    )

    # Train every logistic classifier first, then print all reports, so the
    # order of work matches a run that trains all three before reporting.
    reports = [
        (header, trainer(train_data, train_label, test_data, test_label))
        for header, trainer in trainers
    ]
    for header, scores in reports:
        print(header)
        # Scores are (accuracy, precision, recall, f1), printed space-separated.
        print(*scores)
(2)结果
The accuracy precision recall and f1 score of logistic_l1 is:
0.9679285714285715 0.8330605564648118 0.8060174188440221 0.8193158953722334
The accuracy precision recall and f1 score of logistic_l2 is:
0.9696428571428571 0.8330683624801272 0.8297703879651623 0.8314161047203491
The accuracy precision recall and f1 score of logistic_l1_l2 is:
0.9645714285714285 0.789433962264151 0.8281868566904196 0.8083462132921175
Process finished with exit code 0