使用sklearn对mnist实现logistic二分类

1.脉络框架
这里计划使用logistic回归,对mnist数据集中的图片做"是否为数字5"的二分类,并使用sklearn库来实现该功能。
第一步:加载数据集
第二步:划分训练集和测试集
第三步:使用GridSearchCV网格搜索寻找最优超参数
第四步:使用超参数训练模型
第五步:分别使用准确率、精确率、召回率和F1值来评估模型

2.代码实现
(1)代码

import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.datasets import fetch_mldata
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score,recall_score,f1_score


# Load the MNIST dataset
def load_mnist():
    """Download MNIST and return it as a scikit-learn ``Bunch``.

    ``fetch_mldata`` relied on the mldata.org service and was removed in
    scikit-learn 0.22, so the dataset is fetched from OpenML instead.

    Returns:
        A ``Bunch`` whose ``"data"`` entry is a NumPy array with one row of
        pixel values per image and whose ``"target"`` entry is a NumPy float
        array holding the digit label of each image.
    """
    # Imported locally so this function brings its own dependency into scope.
    from sklearn.datasets import fetch_openml

    mnist = fetch_openml("mnist_784", version=1)
    # OpenML serves the labels as strings and, depending on the scikit-learn
    # version, may hand back pandas objects; normalise both so positional
    # indexing and numeric comparisons such as ``label == 5`` keep working.
    mnist["data"] = np.asarray(mnist["data"])
    mnist["target"] = np.asarray(mnist["target"]).astype(np.float64)
    return mnist

# Split into training and test sets
def split_data(dataset):
    """Make a stratified 80/20 train/test split for the "is it a 5?" task.

    Args:
        dataset: Mapping with a ``"data"`` array of samples and a
            ``"target"`` array of digit labels.

    Returns:
        Tuple ``(train_data, train_label, test_data, test_label)``. The
        labels are boolean: True where the digit equals 5.
    """
    data = dataset["data"]
    # Collapse the ten digit classes into a binary target.
    label = dataset["target"] == 5
    ss = StratifiedShuffleSplit(n_splits=10, test_size=0.2, train_size=0.8, random_state=42)
    # Only the last generated split is returned. Keep just its index arrays
    # and slice the data once, instead of copying the full train/test arrays
    # for every split and throwing nine of them away.
    *_, (train_index, test_index) = ss.split(data, label)
    train_data, train_label = data[train_index], label[train_index]
    test_data, test_label = data[test_index], label[test_index]
    return train_data, train_label, test_data, test_label

def train_logistic_l1_param(train_data,train_label):
    """Grid-search the regularisation strength of an L1 logistic SGD model.

    Args:
        train_data: Training samples.
        train_label: Binary training labels.

    Returns:
        The best parameter dict found by the search (it is also printed).
    """
    # Candidate values for alpha.
    # NOTE: the original comment recorded 0.01 as the best value found.
    param = [{'alpha': [1, 0.5, 0.1, 0.05, 0.01]}]
    sgd_clf = SGDClassifier(loss="log", penalty="l1")
    # Score with a classification metric. The previous choice,
    # 'neg_mean_squared_error', is a regression metric; on 0/1 labels its
    # value is just the negated error rate, so 'accuracy' ranks the
    # candidates the same way while being valid for class predictions.
    grid_search = GridSearchCV(sgd_clf, param, scoring='accuracy')
    grid_search.fit(train_data, train_label)
    print(grid_search.best_params_)
    return grid_search.best_params_

# Logistic regression with L1 regularisation
def train_logistic_l1(train_data,train_label,test_data,test_label,alpha=0.0001):
    """Train an L1-penalised logistic SGD classifier and score it.

    Args:
        train_data: Training samples.
        train_label: Binary training labels.
        test_data: Held-out samples.
        test_label: Binary labels for ``test_data``.
        alpha: Regularisation strength passed to ``SGDClassifier``; the
            default keeps the previously hard-coded value.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)`` measured on the test set.
    """
    # Fit the classifier.
    sgd_clf = SGDClassifier(loss="log", penalty="l1", alpha=alpha)
    sgd_clf.fit(train_data, train_label)
    # Evaluate on the held-out data.
    y_predict = sgd_clf.predict(test_data)
    precision = precision_score(test_label, y_predict)
    recall = recall_score(test_label, y_predict)
    f1 = f1_score(test_label, y_predict)
    # Fraction of correct predictions, computed in NumPy rather than by
    # summing the comparison array element by element in Python.
    accuracy = np.mean(y_predict == test_label)
    return accuracy, precision, recall, f1

# Logistic regression with L2 regularisation
def train_logistic_l2(train_data,train_label,test_data,test_label,alpha=0.0001):
    """Train an L2-penalised logistic SGD classifier and score it.

    Args:
        train_data: Training samples.
        train_label: Binary training labels.
        test_data: Held-out samples.
        test_label: Binary labels for ``test_data``.
        alpha: Regularisation strength passed to ``SGDClassifier``; the
            default keeps the previously hard-coded value.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)`` measured on the test set.
    """
    # Fit the classifier.
    sgd_clf = SGDClassifier(loss="log", penalty="l2", alpha=alpha)
    sgd_clf.fit(train_data, train_label)
    # Evaluate on the held-out data.
    y_predict = sgd_clf.predict(test_data)
    precision = precision_score(test_label, y_predict)
    recall = recall_score(test_label, y_predict)
    f1 = f1_score(test_label, y_predict)
    # Fraction of correct predictions, computed in NumPy rather than by
    # summing the comparison array element by element in Python.
    accuracy = np.mean(y_predict == test_label)
    return accuracy, precision, recall, f1

# Logistic regression with elastic-net regularisation
def train_logistic_l1_l2(train_data,train_label,test_data,test_label,alpha=0.0001):
    """Train an elastic-net-penalised logistic SGD classifier and score it.

    Args:
        train_data: Training samples.
        train_label: Binary training labels.
        test_data: Held-out samples.
        test_label: Binary labels for ``test_data``.
        alpha: Regularisation strength passed to ``SGDClassifier``; the
            default keeps the previously hard-coded value.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)`` measured on the test set.
    """
    # Fit the classifier.
    sgd_clf = SGDClassifier(loss="log", penalty="elasticnet", alpha=alpha)
    sgd_clf.fit(train_data, train_label)
    # Evaluate on the held-out data.
    y_predict = sgd_clf.predict(test_data)
    precision = precision_score(test_label, y_predict)
    recall = recall_score(test_label, y_predict)
    f1 = f1_score(test_label, y_predict)
    # Fraction of correct predictions, computed in NumPy rather than by
    # summing the comparison array element by element in Python.
    accuracy = np.mean(test_label == y_predict)
    return accuracy, precision, recall, f1

if __name__ == "__main__":
    # Fetch the dataset and carve out the train/test partitions.
    mnist = load_mnist()
    train_data, train_label, test_data, test_label = split_data(mnist)
    partitions = (train_data, train_label, test_data, test_label)

    # Fit one logistic classifier per penalty; each call yields
    # (accuracy, precision, recall, f1).
    scores_l1 = train_logistic_l1(*partitions)
    scores_l2 = train_logistic_l2(*partitions)
    scores_elastic = train_logistic_l1_l2(*partitions)

    # Report the four metrics of every model once all of them are trained.
    print("The accuracy precision recall and f1 score  of logistic_l1 is:")
    print(*scores_l1)
    print("The accuracy precision recall and f1 score of logistic_l2 is:")
    print(*scores_l2)
    print("The accuracy precision recall and f1 score of logistic_l1_l2 is:")
    print(*scores_elastic)
    

(2)结果

The accuracy precision recall and f1 score  of logistic_l1 is:
0.9679285714285715 0.8330605564648118 0.8060174188440221 0.8193158953722334
The accuracy precision recall and f1 score of logistic_l2 is:
0.9696428571428571 0.8330683624801272 0.8297703879651623 0.8314161047203491
The accuracy precision recall and f1 score of logistic_l1_l2 is:
0.9645714285714285 0.789433962264151 0.8281868566904196 0.8083462132921175

Process finished with exit code 0

你可能感兴趣的:(机器学习)