import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import fetch_openml

# Fetch the MNIST dataset as plain arrays (as_frame=False).
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X = mnist["data"]  # X holds the image data
# y holds the labels; they arrive as text, so convert them to numbers.
y = mnist["target"].astype(np.uint8)

# Display the first sample as a 28x28 image with the axes hidden.
some_digit = X[0]
some_digit_image = some_digit.reshape(28, 28)
plt.imshow(some_digit_image, cmap=mpl.cm.binary)
plt.axis("off")
plt.show()

# The first 60,000 rows form the training set; the remainder is the test set.
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]
为了简化问题,我们先训练一个二分类器,比如这个分类器能识别5和非5
# Build the boolean targets for the binary "5 vs. not-5" task:
# True where the label equals 5, False otherwise (train and test sets).
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)
SGDClassifier:随机梯度下降分类器
# Train a stochastic gradient descent (SGD) classifier on the binary
# 5-vs-not-5 targets; random_state is fixed at 42.
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)
sgd_clf.fit(X_train, y_train_5)
性能评测
使用交叉验证和混淆矩阵评估
from sklearn.metrics import (
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import cross_val_predict

# cross_val_predict does not return an evaluation score; it returns the
# predictions made on each of the 3 folds.
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)

# Confusion matrix of the true targets vs. the cross-validated predictions.
confusion_matrix(y_train_5, y_train_pred)

# Precision (the author's recorded result: 0.8370879772350012).
precision_score(y_train_5, y_train_pred)

# Recall (the author's recorded result: 0.6511713705958311).
recall_score(y_train_5, y_train_pred)

# F1 score (the author's recorded result: 0.7325171197343846).
f1_score(y_train_5, y_train_pred)
如果想调整精度和召回率就调整阈值
# Unlike the earlier cross_val_predict call, this one returns the decision
# score of every training instance (method="decision_function").
y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3,
method="decision_function")
from sklearn.metrics import precision_recall_curve
# Compute precision and recall for all possible thresholds.
precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)
绘制图形
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall as functions of the decision threshold.

    Draws on the current matplotlib figure; does not call ``plt.show()``.

    Args:
        precisions: Precision values; the last element is dropped before
            plotting so the series is plotted against ``thresholds``.
        recalls: Recall values; the last element is dropped likewise.
        thresholds: Decision thresholds used as the x-axis.
    """
    # (values, line format, legend label) for each curve, in drawing order.
    curves = (
        (precisions, "b--", "Precision"),
        (recalls, "g-", "Recall"),
    )
    for values, line_format, legend_label in curves:
        plt.plot(thresholds, values[:-1], line_format,
                 label=legend_label, linewidth=2)
    plt.legend(loc="center right", fontsize=16)
    plt.xlabel("Threshold", fontsize=16)
    plt.grid(True)
    # Fixed view window: thresholds in [-50000, 50000], metric values in [0, 1].
    plt.axis([-50000, 50000, 0, 1])
# np.argmax on the boolean mask yields the first index at which precision
# reaches 0.90 (it yields 0 if no entry satisfies the condition).
idx_90_precision = np.argmax(precisions >= 0.90)
recall_90_precision = recalls[idx_90_precision]
threshold_90_precision = thresholds[idx_90_precision]

plt.figure(figsize=(8, 4))
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)

# Red dotted guide lines: a vertical line at the chosen threshold, then
# horizontal lines at precision 0.9 and at the corresponding recall.
guide_lines = (
    ([threshold_90_precision, threshold_90_precision], [0., 0.9]),
    ([-50000, threshold_90_precision], [0.9, 0.9]),
    ([-50000, threshold_90_precision],
     [recall_90_precision, recall_90_precision]),
)
for guide_x, guide_y in guide_lines:
    plt.plot(guide_x, guide_y, "r:")

# Red dots marking the precision and the recall at the chosen threshold.
for marker_y in (0.9, recall_90_precision):
    plt.plot([threshold_90_precision], [marker_y], "ro")
plt.show()

# Predict positive for every instance whose decision score reaches the
# 90%-precision threshold.
y_train_pred_90 = (y_scores >= threshold_90_precision)
这样就得到了精度设置为90%时候的预测结果
# Random forest on the same binary task, evaluated with 3-fold
# cross_val_predict; method="predict_proba" requests class probabilities
# rather than class predictions.
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3,
method="predict_proba")
像前面的使用F1分数也可以进行评价
但这里使用ROC曲线的面积进行模型的评估
对于随机梯度下降分类器
分数为0.9604938554008616
# ROC AUC of the SGD classifier, computed from its cross-validated
# decision scores (y_scores).
from sklearn.metrics import roc_auc_score
roc_auc_score(y_train_5, y_scores)
对于随机森林 分数为0.9983436731328145
# ROC AUC of the random forest, using the second probability column
# as the score for each instance.
from sklearn.metrics import roc_auc_score
y_scores_forest = y_probas_forest[:, 1] # score = proba of positive class
roc_auc_score(y_train_5, y_scores_forest)
所以随机森林看起来比较好
随机梯度下降,直接就可以将实例分为多个类
# Refit the SGD classifier on the full multiclass labels
# (y_train rather than y_train_5).
sgd_clf.fit(X_train, y_train)
# Predict the class of the sample digit.
sgd_clf.predict([some_digit])
如果用支持向量机,则会根据情况自动调用 OvO 或者 OvR
# Support vector classifier on the multiclass labels.
from sklearn.svm import SVC
svm_clf = SVC(gamma="auto", random_state=42)
# Fit on the first 1,000 training samples only.
svm_clf.fit(X_train[:1000], y_train[:1000]) # y_train, not y_train_5
# Predict the class of the sample digit.
svm_clf.predict([some_digit])
评分直接用交叉验证
# 3-fold cross-validated accuracy of the SGD classifier on the
# multiclass labels.
from sklearn.model_selection import cross_val_score
cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy")
分数为
array([0.87365, 0.85835, 0.8689 ])
使用k近邻算法
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
# KNeighborsClassifier was used below without being imported, which raised
# a NameError; import it here, next to its first use.
from sklearn.neighbors import KNeighborsClassifier

# Hyperparameter grid: both weighting schemes crossed with 3, 4 and 5
# neighbours.
param_grid = [{'weights': ["uniform", "distance"], 'n_neighbors': [3, 4, 5]}]
knn_clf = KNeighborsClassifier()
# Grid search with 5-fold cross-validation; verbose=3 turns on progress
# logging.
grid_search = GridSearchCV(knn_clf, param_grid, cv=5, verbose=3)
grid_search.fit(X_train, y_train)
grid_search.best_params_

# Accuracy of the fitted grid search on the held-out test set.
y_pred = grid_search.predict(X_test)
accuracy_score(y_test, y_pred)
只是对图片向任意方向(上、下、左、右)移动一个像素,然后将其添加到训练集
# `scipy.ndimage.interpolation` is a deprecated namespace; `shift` is
# exposed directly by `scipy.ndimage`.
from scipy.ndimage import shift


def shift_image(image, dx, dy):
    """Return a copy of a 28x28 image translated by (dx, dy) pixels.

    Args:
        image: Array holding 784 values; it is viewed as a 28x28 image.
        dx: Shift along the second axis (columns).
        dy: Shift along the first axis (rows).

    Returns:
        The shifted image flattened back to one dimension. Pixels shifted in
        from outside the image are filled with 0.
    """
    image = image.reshape((28, 28))
    # scipy's shift takes offsets in axis order, i.e. (rows, columns).
    shifted_image = shift(image, [dy, dx], cval=0, mode="constant")
    return shifted_image.reshape([-1])
# Start from the original training set, then append four shifted copies of
# every image (one pixel along each axis, in both directions), each paired
# with its original label.
X_train_augmented = list(X_train)
y_train_augmented = list(y_train)

for dx, dy in ((1, 0), (-1, 0), (0, 1), (0, -1)):
    X_train_augmented.extend(shift_image(image, dx, dy) for image in X_train)
    y_train_augmented.extend(y_train)

X_train_augmented = np.array(X_train_augmented)
y_train_augmented = np.array(y_train_augmented)

# Shuffle images and labels with one shared random permutation so they
# stay aligned.
shuffle_idx = np.random.permutation(len(X_train_augmented))
X_train_augmented = X_train_augmented[shuffle_idx]
y_train_augmented = y_train_augmented[shuffle_idx]

# Refit k-NN on the augmented set using the best grid-search parameters.
knn_clf = KNeighborsClassifier(**grid_search.best_params_)
knn_clf.fit(X_train_augmented, y_train_augmented)