数据挖掘实战(1)——手写数字识别

文章目录

  • 1 导包
  • 2 数据准备
  • 3 数据规范化
  • 4 划分数据集
  • 5 训练
  • 5 查看结果
  • 6 数据降维并画出决策边界
  • 7 混淆矩阵
  • 8 学习曲线
  • 9 验证曲线

1 导包

import time
import pickle
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter
from sklearn.datasets import load_iris, load_digits
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.svm import SVR, SVC
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.cluster import KMeans, DBSCAN
from sklearn.ensemble import BaggingRegressor, BaggingClassifier, RandomForestRegressor, RandomForestClassifier, \
    ExtraTreesClassifier, ExtraTreesRegressor, AdaBoostRegressor, AdaBoostClassifier, GradientBoostingClassifier, \
    GradientBoostingRegressor, VotingClassifier, VotingRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, r2_score, mean_squared_error, log_loss, \
    roc_curve, roc_auc_score, classification_report, confusion_matrix, PrecisionRecallDisplay, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelBinarizer
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve, validation_curve
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from imblearn.over_sampling import RandomOverSampler
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMClassifier
from catboost import CatBoostClassifier, CatBoostRegressor
from mlxtend.classifier import StackingClassifier
from mlxtend.plotting import plot_decision_regions

2 数据准备

# Pandas display settings: show every column and every row, never wrap wide
# frames onto several lines, and cap the width of a single column at 100.
for option_name, option_value in {
    "display.max_columns": None,
    "display.max_rows": None,
    "display.expand_frame_repr": False,
    "display.max_colwidth": 100,
}.items():
    pd.set_option(option_name, option_value)

# Load the digits dataset and pull out the feature matrix and the labels.
digits = load_digits()
X = digits.data
y = digits.target
print(X.shape, y.shape)

3 数据规范化

# Standardise the features: learn the scaling parameters from X, then apply
# them to X.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split that follows, so test-set statistics influence the preprocessing —
# consider fitting it on the training split only.
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

4 划分数据集

后续训练环节共测试了17个模型;这里先将数据按 7:3 划分为训练集和测试集。

# Split the dataset: 70% for training, 30% held out for testing.
# A fixed random_state makes the split (and every score computed from it)
# reproducible between runs; stratify=y keeps the class proportions the same
# in both subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

5 训练

# Training
def _normalized_proba(proba):
    """Return *proba* as valid per-row probability distributions.

    NaNs are replaced by 0, every entry is clipped to at least machine
    epsilon and each row is rescaled to sum to 1. A row in which every
    one-vs-rest estimator reported zero probability therefore becomes a
    uniform distribution instead of breaking ``log_loss``.
    """
    proba = np.nan_to_num(np.asarray(proba, dtype=float), nan=0.0)
    proba = np.clip(proba, np.finfo(float).eps, None)
    return proba / proba.sum(axis=1, keepdims=True)


def multi_train(X_train, y_train, X_test, y_test, clf_names, clfs, average='micro'):
    """Fit every classifier one-vs-rest and collect its test-set metrics.

    Each estimator in ``clfs`` is wrapped in ``OneVsRestClassifier``, fitted
    on the training data and evaluated on the test data. The classification
    report and the confusion matrix of every model are printed along the way.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels.
        clf_names: Display names, paired positionally with ``clfs``.
        clfs: Unfitted estimators that support ``predict_proba``.
        average: Averaging mode passed to precision/recall/F1. With the
            default ``'micro'`` and single-label multiclass targets these
            three scores all equal the accuracy; pass ``'macro'`` or
            ``'weighted'`` for per-class averages.

    Returns:
        A ``pandas.DataFrame`` with one row per model and the columns
        ``model_name``, ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``mse_loss``, ``log_loss`` and ``cost_time`` (seconds spent on
        fitting, predicting, scoring and printing for that model).
    """
    columns = ['model_name', 'accuracy', 'precision', 'recall', 'f1', 'mse_loss', 'log_loss', 'cost_time']
    rows = []

    for name, clf in zip(clf_names, clfs):
        started = time.perf_counter()
        print(name)
        clf = OneVsRestClassifier(clf)
        clf.fit(X_train, y_train)
        y_predict = clf.predict(X_test)  # hard class predictions
        # Class-probability matrix; its columns follow clf.classes_.
        y_predict_prob = _normalized_proba(clf.predict_proba(X_test))

        accuracy = accuracy_score(y_test, y_predict)
        precision = precision_score(y_test, y_predict, average=average)
        recall = recall_score(y_test, y_predict, average=average)
        f1 = f1_score(y_test, y_predict, average=average)
        # Squared error between the class labels treated as plain numbers.
        mse = mean_squared_error(y_test, y_predict)
        # Cross-entropy must be computed from predicted probabilities, not
        # from one-hot encoded hard predictions. Passing labels explicitly
        # keeps the column/label mapping right even when a class is absent
        # from y_test.
        cross_entropy = log_loss(y_test, y_predict_prob, labels=clf.classes_)

        print(classification_report(y_test, y_predict))
        print(confusion_matrix(y_test, y_predict))

        rows.append({
            "model_name": name,
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "mse_loss": mse,
            "log_loss": cross_entropy,
            "cost_time": time.perf_counter() - started,
        })

    # Build the frame once instead of growing it row by row.
    return pd.DataFrame(rows, columns=columns)

# Candidate models, declared as (display name, estimator) pairs so that every
# label sits next to the estimator it describes. The two parallel lists that
# multi_train expects are derived from these pairs.
named_models = [
    ('逻辑回归', LogisticRegression()),
    ('KNN', KNeighborsClassifier()),
    ('支持向量机分类', SVC(probability=True)),
    ('决策树', DecisionTreeClassifier()),
    ('高斯模型', GaussianNB()),
    ('伯努利模型', BernoulliNB()),
    ('Bagging', BaggingClassifier()),
    ('随机森林', RandomForestClassifier()),
    ('极限树', ExtraTreesClassifier()),
    ('AdaBoost', AdaBoostClassifier()),
    ('GBDT', GradientBoostingClassifier()),
    ('XGBoost', XGBClassifier()),
    ('LightGBM', LGBMClassifier()),
    ('CatBoost', CatBoostClassifier(silent=True)),
    (
        'Voting',
        VotingClassifier(
            estimators=[
                ('lr', LogisticRegression()),
                ('rf', RandomForestClassifier()),
                ('gnb', GaussianNB()),
            ],
            voting='soft',
            weights=[2, 1, 1],
            flatten_transform=True,
        ),
    ),
    (
        'Stacking',
        StackingClassifier(
            classifiers=[XGBClassifier(), GaussianNB()],
            meta_classifier=LogisticRegression(),
        ),
    ),
    (
        'MLP',
        MLPClassifier(hidden_layer_sizes=(100, 10), max_iter=1000, activation='relu', solver='adam'),
    ),
]
clf_names = [label for label, _ in named_models]
clfs = [model for _, model in named_models]

# Train and score every candidate; one result row per model.
results_df = multi_train(X_train, y_train, X_test, y_test, clf_names, clfs)

5 查看结果

# Bare expression: in a notebook/REPL this displays the metrics table returned by multi_train.
results_df

数据挖掘实战(1)——手写数字识别_第1张图片

6 数据降维并画出决策边界

# Reduce the training features to two principal components so that the
# decision regions of a classifier trained on them can be drawn on a plane.
pca = PCA(n_components=2)
X_train2 = pca.fit_transform(X_train)

# Logistic regression fitted on the 2-D projection only.
clf = LogisticRegression().fit(X_train2, y_train)

figure_size = (10, 10)
plt.figure(figsize=figure_size)
plot_decision_regions(X_train2, y_train, clf=clf)
plt.show()

数据挖掘实战(1)——手写数字识别_第2张图片

7 混淆矩阵

以逻辑回归(LR)为例

# Fit a logistic-regression model on the training split, then draw its
# confusion matrix on the test split.
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
display = ConfusionMatrixDisplay.from_estimator(lr_model, X_test, y_test)

数据挖掘实战(1)——手写数字识别_第3张图片

8 学习曲线

以MLP为例

# Learning curve
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, activation='relu', solver='adam')
mlp.fit(X_train, y_train)
# The XGBoost model is created here because the validation-curve section
# that follows reuses `xgb`.
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

# Cross-validate on the training split: the held-out test split must not be
# used for model diagnostics, and it is also the smaller subset. The scores
# come from estimators that learning_curve fits itself on growing fractions
# of each CV training fold.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=mlp,
    X=X_train,
    y=y_train,
    cv=5,
    train_sizes=[0.1, 0.25, 0.5, 0.75, 1.0],
    scoring='accuracy',
)

# Average the per-fold scores for every training-set size.
train_scores_mean = np.mean(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)

plt.plot(train_sizes, train_scores_mean, label='Training')
plt.plot(train_sizes, test_scores_mean, label='Cross-validation')
plt.xlabel('Training examples')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

数据挖掘实战(1)——手写数字识别_第4张图片

9 验证曲线

以XGBoost为例

# Validation curve: cross-validated accuracy as a function of max_depth.
param_range = list(range(3, 11))
# Sweep the hyper-parameter on the training split; tuning against the
# held-out test split would leak it into model selection.
train_scores, test_scores = validation_curve(
    estimator=xgb,
    X=X_train,
    y=y_train,
    param_name='max_depth',
    param_range=param_range,
    cv=5,
    scoring='accuracy',
)

# Average the per-fold scores for every candidate value.
train_scores_mean = np.mean(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
plt.plot(param_range, train_scores_mean, 'y', label='Training')
plt.plot(param_range, test_scores_mean, 'm', label='Cross-validation')
plt.xlabel('param')
plt.ylabel('Accuracy')
# Without a legend the two line labels above are never shown.
plt.legend()
plt.show()

数据挖掘实战(1)——手写数字识别_第5张图片

你可能感兴趣的:(#,数据挖掘,人工智能,sklearn,数据挖掘,分类,python,机器学习)