思源湖的鱼

机器学习系列（二） kNN（k近邻算法）会用到scikit 2020.6.4

前言

本节学习kNN算法
这个应该算是最简单最基础的机器学习算法

思想极度简单
效果好
可以解释机器学习中很多细节

本节内容包括

自己实现底层逻辑
使用scikit的库
借用kNN了解机器学习里的一些细节问题

1、kNN的简单实现

kNN的思想简单讲就是
距离新样本最近的k个样本中哪一类最多，新样本属于那一类的概率就最大

可以认为没有模型
也可以认为训练数据集就是模型

如图，k设为3，绿点是新的点，附近红色有2个，蓝色有1个，故认为绿点应该被分类为红色

自己编写逻辑，简单实现如下

import numpy as np
import matplotlib.pyplot as plt
from math import sqrt
from collections import Counter

# Raw data set: ten 2-D sample points and their class labels (0 or 1)
raw_data_X = [[3.393533211, 2.331273381],
              [3.110073483, 1.781539638],
              [1.343808831, 3.368360954],
              [3.582294042, 4.679179110],
              [2.280362439, 2.866990263],
              [7.423436942, 4.696522875],
              [5.745051997, 3.533989803],
              [9.172168622, 2.511101045],
              [7.792783481, 3.424088941],
              [7.939820817, 0.791637231]
             ]
raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
# Use the raw data as the training set
X_train = np.array(raw_data_X)
y_train = np.array(raw_data_y)
# The new point to classify
x = np.array([8.093607318, 3.365731514])
# Show everything in one scatter plot:
# class 0 in green, class 1 in red, the new point in blue
plt.scatter(X_train[y_train==0,0], X_train[y_train==0,1], color='g')
plt.scatter(X_train[y_train==1,0], X_train[y_train==1,1], color='r')
plt.scatter(x[0], x[1], color='b')
plt.show()

# kNN procedure: measure, rank, take the k closest, and let them vote
distances = [sqrt(np.sum((x_train - x)**2)) for x_train in X_train] # Euclidean distance from x to each training point
nearest = np.argsort(distances) # indices that sort the distances in ascending order
k = 6 # number of neighbours to consult
topK_y = [y_train[neighbor] for neighbor in nearest[:k]] # labels of the k nearest points
votes = Counter(topK_y) # number of votes per label among those k points
predict_y = votes.most_common(1)[0][0] # the label that received the most votes

封装成一个可调用的函数

import numpy as np
from math import sqrt
from collections import Counter

"""进行函数封装"""
# 原始版
"""
def kNN_classify(k, X_train, y_train, x):
    # 确保数据合法
    assert 1 <= k <= X_train.shape[0], "k must be valid"
    assert X_train.shape[0] == y_train.shape[0], \
        "the size of X_train must equal to the size of y_train"
    assert X_train.shape[1] == x.shape[0], \
        "the feature number of x must be equal to X_train"
    # kNN计算过程
    distances = [sqrt(np.sum((x_train - x)**2)) for x_train in X_train]
    nearest = np.argsort(distances)
    topK_y = [y_train[i] for i in nearest[:k]]
    votes = Counter(topK_y)
    return votes.most_common(1)[0][0]
    """
    
# 优化版
class KNNClassifier:
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    The classifier keeps a reference to the training data and, for every
    sample to predict, returns the label that occurs most often among the
    ``k`` training samples closest to it.
    """

    def __init__(self, k):
        """Create a classifier that votes among the ``k`` nearest neighbours.

        Raises:
            AssertionError: if ``k`` is smaller than 1.
        """
        assert k >= 1, "k must be valid"
        self.k = k
        # Both stay None until fit() is called; predict() checks for that.
        self._X_train = None
        self._y_train = None

    def fit(self, X_train, y_train):
        """Store the training set and return ``self``.

        Args:
            X_train: 2-D array, one row per training sample.
            y_train: 1-D array with one label per row of ``X_train``.

        Raises:
            AssertionError: if the two arrays have different lengths or if
                there are fewer than ``k`` training samples.
        """
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"
        assert self.k <= X_train.shape[0], \
            "the size of X_train must be at least k."
        self._X_train = X_train
        self._y_train = y_train
        return self

    def predict(self, X_predict):
        """Return an array with one predicted label per row of ``X_predict``.

        Raises:
            AssertionError: if fit() has not been called, or if the number
                of columns differs from the training data.
        """
        assert self._X_train is not None and self._y_train is not None, \
                "must fit before predict!"
        assert X_predict.shape[1] == self._X_train.shape[1], \
                "the feature number of X_predict must be equal to X_train"
        y_predict = [self._predict(x) for x in X_predict]
        return np.array(y_predict)

    def _predict(self, x):
        """Return the predicted label for the single sample ``x``."""
        assert x.shape[0] == self._X_train.shape[1], \
            "the feature number of x must be equal to X_train"
        # One vectorised expression instead of a Python loop over the rows:
        # broadcasting subtracts x from every training sample at once.
        offsets = self._X_train - x
        distances = np.sqrt(np.sum(offsets ** 2, axis=1))
        nearest = np.argsort(distances)
        topK_y = [self._y_train[i] for i in nearest[:self.k]]
        # On a tie, most_common(1) keeps the label counted first, i.e. the
        # one belonging to the nearer neighbour.
        votes = Counter(topK_y)
        return votes.most_common(1)[0][0]

    def __repr__(self):
        return "KNN(k=%d)" % self.k

调用scikit的库，实现如下

from sklearn.neighbors import KNeighborsClassifier
import numpy as np

"""调用scikit里的函数"""
# Raw data set: ten 2-D sample points and their class labels (0 or 1)
raw_data_X = [[3.393533211, 2.331273381],
              [3.110073483, 1.781539638],
              [1.343808831, 3.368360954],
              [3.582294042, 4.679179110],
              [2.280362439, 2.866990263],
              [7.423436942, 4.696522875],
              [5.745051997, 3.533989803],
              [9.172168622, 2.511101045],
              [7.792783481, 3.424088941],
              [7.939820817, 0.791637231]
             ]
raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
# Use the raw data as the training set
X_train = np.array(raw_data_X)
y_train = np.array(raw_data_y)
# The new point to classify
x = np.array([8.093607318, 3.365731514])

# kNN with scikit-learn
kNN_classifier = KNeighborsClassifier(n_neighbors=6) # k, the number of neighbours
kNN_classifier.fit(X_train, y_train) # fit on the training set
# kNN_classifier.predict(x) # passing the 1-D array directly is problematic
X_predict = x.reshape(1, -1) # reshape into a 2-D array holding a single row
y_predict = kNN_classifier.predict(X_predict)
print(y_predict[0])

2、进行性能判断

我们需要判断算法的性能

先留出一部分数据作为test数据
然后进行性能判断，如准确度

分离test数据集

自己实现test数据集分离如下

import numpy as np
from sklearn import datasets

"""分离出train和test的数据集"""
# Iris data set
iris = datasets.load_iris()
X = iris.data
y = iris.target
# Shuffle the sample indices first, so X and y can be split with the same order
shuffled_indexes = np.random.permutation(len(X))
# Size of the test set
test_ratio = 0.2
test_size = int(len(X) * test_ratio)
# Assign indices: the first test_size shuffled indices go to the test set,
# the remaining ones to the training set
test_indexes = shuffled_indexes[:test_size]
train_indexes = shuffled_indexes[test_size:]
# Split
X_train = X[train_indexes]
y_train = y[train_indexes]
X_test = X[test_indexes]
y_test = y[test_indexes]
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

封装成可调用函数如下

import numpy as np

"""进行train-test分离函数封装"""
def train_test_split(X, y, test_ratio=0.2, seed=None):
    """Shuffle X and y together and split them into train and test parts.

    Args:
        X: array of samples, indexed along its first axis.
        y: array of labels with the same length as ``X``.
        test_ratio: fraction of the samples, between 0.0 and 1.0, that goes
            into the test part; the count is rounded down.
        seed: if not None, NumPy's global random generator is seeded with
            this value before shuffling, which makes the split reproducible.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        AssertionError: if X and y differ in length or test_ratio is
            outside [0.0, 1.0].
    """
    assert X.shape[0] == y.shape[0], \
        "the size of X must be equal to the size of y"
    assert 0.0 <= test_ratio <= 1.0, \
        "test_ratio must be valid"
    # Compare against None rather than testing truthiness: 0 is a valid
    # seed and must not be silently ignored.
    if seed is not None:
        np.random.seed(seed)
    # Shuffle the indices, not the data, so X and y stay aligned.
    shuffled_indexes = np.random.permutation(len(X))
    test_size = int(len(X) * test_ratio)
    # The first test_size shuffled indices form the test set.
    test_indexes = shuffled_indexes[:test_size]
    train_indexes = shuffled_indexes[test_size:]
    X_train = X[train_indexes]
    y_train = y[train_indexes]
    X_test = X[test_indexes]
    y_test = y[test_indexes]
    return X_train, X_test, y_train, y_test

调用scikit的库实现如下

import numpy as np
from sklearn import datasets
"""分离出train和test的数据集"""
# Iris data set
iris = datasets.load_iris()
X = iris.data
y = iris.target
"""scikit里的函数"""
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples as the test set; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)

进行准确度判断

函数封装

import numpy as np

"""accuracy封装"""
def accuracy_score(y_true, y_predict):
    """Return the fraction of positions where y_predict equals y_true.

    Both arguments must have the same length along their first axis,
    otherwise an AssertionError is raised.
    """
    n_true, n_predict = y_true.shape[0], y_predict.shape[0]
    assert n_true == n_predict, \
        "the size of y_true must be equal to the size of y_predict"
    hits = y_true == y_predict
    return sum(hits) / len(y_true)

调用scikit的库

"""scikit中的accuracy"""
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import numpy as np
# NOTE(review): X and y are not defined in this snippet; presumably they are
# the data loaded in an earlier snippet — confirm before running standalone.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)
# Fit a 3-neighbour classifier and predict the held-out samples
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(X_train, y_train)
y_predict = knn_clf.predict(X_test)
# Accuracy computed from the stored predictions
accuracy_score(y_test, y_predict)
# Score obtained directly from the classifier, without keeping the predictions
knn_clf.score(X_test, y_test)

3、超参数

超参数：运行算法前就需要指定的参数
模型参数：算法过程中学习的参数

调参调的就是超参数

kNN的超参数是k和距离权重，没有模型参数
还要考虑下闵可夫斯基距离（Minkowski distance）的p

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

"""寻找最好的超参数k、距离权重和明可夫斯基距离的p"""
# Prepare the data: the digits data set, with 20% held out as the test set
digits = datasets.load_digits()
X = digits.data
y = digits.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)
# kNN
"""
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(X_train, y_train)
knn_clf.score(X_test, y_test)
"""

# Find the best k, ignoring distance weighting
# NOTE(review): candidates are ranked by their score on X_test, so best_score
# is measured on the same data that was used to choose k.
best_score = 0.0
best_k = -1
for k in range(1, 11):
    knn_clf = KNeighborsClassifier(n_neighbors=k)
    knn_clf.fit(X_train, y_train)
    score = knn_clf.score(X_test, y_test)
    # Strict '>' keeps the smallest k when several values tie
    if score > best_score:
        best_k = k
        best_score = score
print("best_k =", best_k)
print("best_score =", best_score)

# Also search over the weighting scheme ("uniform" vs. "distance")
# NOTE(review): candidates are ranked by their score on X_test, so best_score
# is measured on the same data that was used to choose the hyperparameters.
best_score = 0.0
best_k = -1
best_method = ""
for method in ["uniform", "distance"]:
    for k in range(1, 11):
        knn_clf = KNeighborsClassifier(n_neighbors=k, weights=method)
        knn_clf.fit(X_train, y_train)
        score = knn_clf.score(X_test, y_test)
        # Strict '>' keeps the first combination tried when several tie
        if score > best_score:
            best_k = k
            best_score = score
            best_method = method
print("best_method =", best_method)
print("best_k =", best_k)
print("best_score =", best_score)

# Find the best p together with k, using distance weighting throughout
# NOTE(review): candidates are ranked by their score on X_test, so best_score
# is measured on the same data that was used to choose the hyperparameters.
best_score = 0.0
best_k = -1
best_p = -1
for k in range(1, 11):
    for p in range(1, 6):
        knn_clf = KNeighborsClassifier(n_neighbors=k, weights="distance", p=p)
        knn_clf.fit(X_train, y_train)
        score = knn_clf.score(X_test, y_test)
        # Strict '>' keeps the first combination tried when several tie
        if score > best_score:
            best_k = k
            best_p = p
            best_score = score
print("best_k =", best_k)
print("best_p =", best_p)
print("best_score =", best_score)

# Grid search: let scikit-learn try every hyperparameter combination
from sklearn.model_selection import GridSearchCV

# Each dict is one sub-grid; all combinations inside a dict are evaluated.
# 'p' only appears in the second sub-grid, together with distance weighting.
param_grid = [
    {
        'weights': ['uniform'],
        'n_neighbors': list(range(1, 11))
    },
    {
        'weights': ['distance'],
        'n_neighbors': list(range(1, 11)),
        'p': list(range(1, 6))
    }
]
knn_clf = KNeighborsClassifier()
grid_search = GridSearchCV(knn_clf, param_grid, n_jobs=-1, verbose=2)  # n_jobs=-1 uses all CPU cores
# Fit exactly once: repeating the same fit would redo the whole search
# without changing its outcome.
grid_search.fit(X_train, y_train)
knn_clf = grid_search.best_estimator_  # the best classifier found by the search
grid_search.best_score_  # cross-validated score of that classifier

4、数值归一化

将所有数据映射到统一尺度，以防某特征影响过大

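最值归一化（Normalization）：把所有数据映射到0-1之间，如图所示，即 $x_{scale} = \frac{x - x_{min}}{x_{max} - x_{min}}$，适用于有明显边界的情况
均值方差归一化（Standardization）：把所有数据归一到均值为0、方差为1的分布中，即 $x_{scale} = \frac{x - x_{mean}}{S}$（$S$ 为标准差），适用于没有明显边界的情况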

数值归一化原理

最值归一化原理如下

import numpy as np
import matplotlib.pyplot as plt

# Min-max scaling (normalization)
# On a 1-D array
x = np.random.randint(0, 100, 100)
x = (x - np.min(x)) / (np.max(x) - np.min(x))
print(x)
# On a matrix: every column is scaled with its own minimum and maximum
X = np.random.randint(0, 100, (50, 2))
X = np.array(X, dtype=float) # convert to float first, before the columns are overwritten in place
X[:,0] = (X[:,0] - np.min(X[:,0])) / (np.max(X[:,0]) - np.min(X[:,0]))
X[:,1] = (X[:,1] - np.min(X[:,1])) / (np.max(X[:,1]) - np.min(X[:,1]))
print(X)
plt.scatter(X[:,0], X[:,1])
plt.show()

均值方差归一化原理如下

import numpy as np
import matplotlib.pyplot as plt
# Mean-variance scaling (standardization)
X2 = np.random.randint(0, 100, (50, 2))
X2 = np.array(X2, dtype=float) # convert to float before the columns are overwritten in place
# Every column is centred on its own mean and divided by its own standard deviation
X2[:,0] = (X2[:,0] - np.mean(X2[:,0])) / np.std(X2[:,0])
X2[:,1] = (X2[:,1] - np.mean(X2[:,1])) / np.std(X2[:,1])
print(X2)
plt.scatter(X2[:,0], X2[:,1])
plt.show()

在kNN中运用数值归一化

注意：对测试数据集做归一化时，要使用训练数据集的均值和方差，而不是测试数据集自己的均值和方差