Handwritten Digit Recognition

Code

Fetching the Data

import numpy as np
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X, y = mnist["data"], mnist["target"]  # X holds the image data, y holds the labels
y = y.astype(np.uint8)  # convert the labels from strings to integers
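As a quick sanity check, the shapes should show 70,000 images with 784 features each (28×28 pixels):

X.shape  # (70000, 784)
y.shape  # (70000,)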

A Quick Look at the Data

import matplotlib as mpl
import matplotlib.pyplot as plt

some_digit = X[0]
some_digit_image = some_digit.reshape(28, 28)
plt.imshow(some_digit_image, cmap=mpl.cm.binary)
plt.axis("off")
plt.show()

(Figure 1: the first digit in the dataset, rendered as a 28×28 image)
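To confirm what the plot shows, check the corresponding label; the first image in MNIST is a 5:

y[0]  # 5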

Creating the Training and Test Sets

X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

Training a Binary Classifier

To simplify the problem, we first train a binary classifier, for example one that distinguishes 5s from non-5s.

# Binary target vectors for the training and test sets: True for 5s, False for everything else
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)

Using a Stochastic Gradient Descent Classifier

SGDClassifier is scikit-learn's stochastic gradient descent classifier.

# Use a stochastic gradient descent classifier
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)

sgd_clf.fit(X_train, y_train_5)
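The classifier can now detect images of a 5. For example, using some_digit (the first image, which is a 5) from above:

sgd_clf.predict([some_digit])  # array([ True])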

Performance Evaluation
Evaluate with cross-validation and a confusion matrix.

from sklearn.model_selection import cross_val_predict
# Returns the predictions made on each fold rather than evaluation scores
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)

# Compute the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train_5, y_train_pred)
from sklearn.metrics import precision_score, recall_score
# Get the precision; result: 0.8370879772350012
precision_score(y_train_5, y_train_pred)
# Get the recall; result: 0.6511713705958311
recall_score(y_train_5, y_train_pred)
# Get the F1 score; result: 0.7325171197343846
from sklearn.metrics import f1_score
f1_score(y_train_5, y_train_pred)
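The F1 score is the harmonic mean of precision and recall, so the value above can be reproduced by hand; a minimal check:

# F1 = 2 * precision * recall / (precision + recall)
p = precision_score(y_train_5, y_train_pred)
r = recall_score(y_train_5, y_train_pred)
2 * p * r / (p + r)  # ≈ 0.7325, matches f1_score above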

To trade precision against recall, adjust the decision threshold.
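To see what the threshold does, a minimal sketch that calls decision_function on the single digit from above and compares the score to a threshold by hand (for predict, SGDClassifier effectively uses a threshold of 0):

y_some_digit_score = sgd_clf.decision_function([some_digit])
threshold = 0
y_some_digit_score > threshold  # True means "predicted 5"; raising the threshold can flip this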

# Instead of fold predictions, this returns the decision score of every instance in the training set
y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3,
                             method="decision_function")
from sklearn.metrics import precision_recall_curve
# Compute precision and recall for every possible threshold
precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)

Plotting the Curves

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision", linewidth=2)
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall", linewidth=2)
    plt.legend(loc="center right", fontsize=16)
    plt.xlabel("Threshold", fontsize=16)
    plt.grid(True)
    plt.axis([-50000, 50000, 0, 1])



recall_90_precision = recalls[np.argmax(precisions >= 0.90)]
threshold_90_precision = thresholds[np.argmax(precisions >= 0.90)]


plt.figure(figsize=(8, 4))
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.plot([threshold_90_precision, threshold_90_precision], [0., 0.9], "r:")
plt.plot([-50000, threshold_90_precision], [0.9, 0.9], "r:")
plt.plot([-50000, threshold_90_precision], [recall_90_precision, recall_90_precision], "r:")
plt.plot([threshold_90_precision], [0.9], "ro")
plt.plot([threshold_90_precision], [recall_90_precision], "ro")

plt.show()

(Figure 2: precision and recall versus the decision threshold, with the 90%-precision point marked)
If you decide to target 90% precision:

# Find the lowest threshold that gives at least 90% precision
threshold_90_precision = thresholds[np.argmax(precisions >= 0.90)]
# Make predictions using that threshold
y_train_pred_90 = (y_scores >= threshold_90_precision)

This gives you the predictions made when precision is set to 90%.
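You can verify the trade-off by scoring these predictions with precision_score and recall_score from above; precision comes out at or just above 0.90, at the cost of a noticeably lower recall:

precision_score(y_train_5, y_train_pred_90)  # >= 0.90 by construction of the threshold
recall_score(y_train_5, y_train_pred_90)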

Training with a Random Forest

from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3,
                                    method="predict_proba")

Performance Evaluation

As before, you could evaluate with the F1 score, but here we use the area under the ROC curve (ROC AUC) to evaluate the models. For the SGD classifier the score is 0.9604938554008616:

from sklearn.metrics import roc_auc_score
roc_auc_score(y_train_5, y_scores)

For the random forest the score is 0.9983436731328145:

from sklearn.metrics import roc_auc_score
y_scores_forest = y_probas_forest[:, 1] # score = proba of positive class
roc_auc_score(y_train_5, y_scores_forest)

So the random forest looks better.
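For a visual comparison, a minimal sketch that plots both ROC curves with roc_curve, reusing y_scores (SGD) and y_scores_forest from above:

from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5, y_scores_forest)

plt.plot(fpr, tpr, "b:", label="SGD")
plt.plot(fpr_forest, tpr_forest, label="Random Forest")
plt.plot([0, 1], [0, 1], "k--")  # diagonal = purely random classifier
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="lower right")
plt.show()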

Upgrading to Recognize the Digits 0-9

The SGD classifier can classify instances into multiple classes directly:

sgd_clf.fit(X_train, y_train)
sgd_clf.predict([some_digit])
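Under the hood, scikit-learn trains one binary classifier per class (one-versus-rest for SGDClassifier); decision_function therefore returns ten scores, and the predicted class is the one with the highest score:

some_digit_scores = sgd_clf.decision_function([some_digit])
some_digit_scores              # 10 scores, one per class
np.argmax(some_digit_scores)   # index of the highest score = predicted class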

If you use a support vector machine instead, scikit-learn automatically applies a one-versus-one (OvO) or one-versus-rest (OvR) strategy as appropriate:

from sklearn.svm import SVC

svm_clf = SVC(gamma="auto", random_state=42)
svm_clf.fit(X_train[:1000], y_train[:1000]) # y_train, not y_train_5
svm_clf.predict([some_digit])
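SVC defaults to one-versus-one. If you want to force a one-versus-rest strategy instead, a sketch using scikit-learn's OneVsRestClassifier wrapper:

from sklearn.multiclass import OneVsRestClassifier

ovr_clf = OneVsRestClassifier(SVC(gamma="auto", random_state=42))
ovr_clf.fit(X_train[:1000], y_train[:1000])
ovr_clf.predict([some_digit])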

For scoring, simply use cross-validation:

from sklearn.model_selection import cross_val_score
cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy")

The scores are:
array([0.87365, 0.85835, 0.8689 ])

Exercise

Raise the accuracy to 97%.

Use the k-nearest neighbors algorithm:

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

param_grid = [{'weights': ["uniform", "distance"], 'n_neighbors': [3, 4, 5]}]

knn_clf = KNeighborsClassifier()
grid_search = GridSearchCV(knn_clf, param_grid, cv=5, verbose=3)
grid_search.fit(X_train, y_train)
grid_search.best_params_
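The best cross-validation accuracy found by the search is also available:

grid_search.best_score_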

Prediction accuracy

  • Note that accuracy_score here measures accuracy on the multiclass task
from sklearn.metrics import accuracy_score

y_pred = grid_search.predict(X_test)
accuracy_score(y_test, y_pred)

Artificially Expanding the Data Set

Simply shift each image one pixel in any direction (up, down, left, or right) and add the shifted copy to the training set.

from scipy.ndimage import shift  # the old scipy.ndimage.interpolation path is deprecated

# Shift an image by dx pixels horizontally and dy pixels vertically
def shift_image(image, dx, dy):
    image = image.reshape((28, 28))
    shifted_image = shift(image, [dy, dx], cval=0, mode="constant")
    return shifted_image.reshape([-1])
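A quick usage check of the helper (the index 1000 below is arbitrary), plotting one digit next to copies shifted down and to the left:

image = X_train[1000]
shifted_image_down = shift_image(image, 0, 5)
shifted_image_left = shift_image(image, -5, 0)

plt.figure(figsize=(12, 3))
plt.subplot(131); plt.title("Original"); plt.imshow(image.reshape(28, 28), cmap=mpl.cm.binary)
plt.subplot(132); plt.title("Shifted down"); plt.imshow(shifted_image_down.reshape(28, 28), cmap=mpl.cm.binary)
plt.subplot(133); plt.title("Shifted left"); plt.imshow(shifted_image_left.reshape(28, 28), cmap=mpl.cm.binary)
plt.show()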

# Start from copies of the original training set
X_train_augmented = [image for image in X_train]
y_train_augmented = [label for label in y_train]

# Add a copy of every image shifted by one pixel in each of the four directions
for dx, dy in ((1, 0), (-1, 0), (0, 1), (0, -1)):
    for image, label in zip(X_train, y_train):
        X_train_augmented.append(shift_image(image, dx, dy))
        y_train_augmented.append(label)

X_train_augmented = np.array(X_train_augmented)
y_train_augmented = np.array(y_train_augmented)

# Shuffle so the shifted copies are not grouped together
shuffle_idx = np.random.permutation(len(X_train_augmented))
X_train_augmented = X_train_augmented[shuffle_idx]
y_train_augmented = y_train_augmented[shuffle_idx]

knn_clf = KNeighborsClassifier(**grid_search.best_params_)
knn_clf.fit(X_train_augmented, y_train_augmented)
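Finally, evaluate the retrained classifier on the test set, reusing accuracy_score from above; training on the augmented set typically lifts the accuracy above the earlier result:

y_pred = knn_clf.predict(X_test)
accuracy_score(y_test, y_pred)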
