k近邻算法学习笔记

1、使用模拟数据演示k近邻算法

import numpy as np
import matplotlib.pyplot as plt
from math import sqrt
from collections import Counter

# kNN in one sentence: a sample is assigned the class held by the majority of
# its k nearest neighbours in feature space.

# Toy training set: raw_data_x holds the 2-D feature vectors and raw_data_y the
# class label of the row at the same position.
raw_data_x = [[3.393533211, 2.331273381],
              [3.110073483, 1.781539638],
              [1.343808831, 3.368360954],
              [3.582294042, 4.679779110],
              [2.280362439, 2.866990263],
              [7.423436942, 4.696522875],
              [5.745015997, 3.533989803],
              [9.172168622, 2.511101045],
              [7.792783487, 3.424088941],
              [7.9939820917, 0.791637231]]
raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

x_train = np.array(raw_data_x)
y_train = np.array(raw_data_y)
# The new point whose class is to be predicted.
x = np.array([8.093607318, 3.365731514])

# Class 0 is drawn in red, class 1 in blue and the query point in yellow.
# The figure is only built here; plt.show() is not called.
for label, colour in ((0, 'r'), (1, 'b')):
    members = y_train == label
    plt.scatter(x_train[members, 0], x_train[members, 1], color=colour)
plt.scatter(x[0], x[1], color='y')

# Euclidean distance from every training point to x.
distances = [sqrt(((point - x) ** 2).sum()) for point in x_train]
print(distances)

# np.argsort returns the indices that sort the distances in ascending order,
# i.e. the training points ordered from closest to farthest.
nearest = np.argsort(distances)
print(nearest)

k = 6
# Labels of the k training points closest to x.
top_K = list(y_train[nearest[:k]])
print(top_K)

# Counter stores each label as a key and its number of occurrences as value.
votes = Counter(top_K)
print(votes)

# most_common(1) returns a one-element list holding the (label, count) tuple
# of the most frequent label.
winner = votes.most_common(1)
print(winner)

# The winning label is the kNN prediction for x.
result = winner[0][0]
print(result)

2、使用scikit-learn中的knn算法

import numpy as np
from sklearn.neighbors import KNeighborsClassifier


# Toy training set: raw_data_x holds the feature vectors, raw_data_y the labels.
raw_data_x = [[3.393533211, 2.331273381],
              [3.110073483, 1.781539638],
              [1.343808831, 3.368360954],
              [3.582294042, 4.679779110],
              [2.280362439, 2.866990263],
              [7.423436942, 4.696522875],
              [5.745015997, 3.533989803],
              [9.172168622, 2.511101045],
              [7.792783487, 3.424088941],
              [7.9939820917, 0.791637231]]
raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

X_train = np.array(raw_data_x)
y_train = np.array(raw_data_y)
# The new point whose class is to be predicted.
x = np.array([8.093607318, 3.365731514])

# Build the kNN estimator (k = 6) and fit it on the training data;
# fit returns the estimator itself, so both steps can be chained.
knnClassfier = KNeighborsClassifier(n_neighbors=6).fit(X_train, y_train)

# predict works on a 2-D array with one row per sample, so the single query
# point is reshaped into a 1 x 2 matrix.
print(knnClassfier.predict(x.reshape(1, -1)))

3、手写代码模拟knn算法

编写KNN.py,模拟scikit-learn中knn算法的预测过程:

import numpy as np
from math import sqrt
from collections import Counter

# 注意:python是缩进严格的语言,方法定义def前面没有缩进时会报错
class MyKNeighborsClassfier:
    """Minimal k-nearest-neighbours classifier with a scikit-learn-like API.

    The classifier memorises the training data in ``fit`` and, for every
    sample passed to ``predict``, returns the most frequent label among the
    ``n_neighbors`` training samples closest to it (Euclidean distance).
    """

    def __init__(self, n_neighbors):
        """Create an unfitted classifier.

        Args:
            n_neighbors: number of neighbours that vote; must be >= 1.

        Raises:
            AssertionError: if ``n_neighbors`` is smaller than 1.
        """
        assert n_neighbors >= 1, "n_neighbors must be valid"
        self.k = n_neighbors
        self.X_train = None
        self.y_train = None

    def fit(self, X_train, y_train):
        """Store the training data and return ``self``.

        Args:
            X_train: training samples, one row per sample.
            y_train: label of each training sample, same length as X_train.

        Raises:
            AssertionError: if the number of samples and labels differ.
        """
        # asarray returns ndarray inputs unchanged and also accepts lists.
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        assert X_train.shape[0] == y_train.shape[0], "训练集特征数据与标签集合不符"
        self.X_train = X_train
        self.y_train = y_train
        return self

    def predict(self, X_predict):
        """Predict a label for every row of ``X_predict``.

        A 1-D vector whose length equals the number of training features is
        treated as a single sample, so ``predict(x)`` and ``predict([x])``
        give the same one-element result.

        Args:
            X_predict: samples to classify, one row per sample.

        Returns:
            numpy array holding one predicted label per sample.

        Raises:
            AssertionError: if the classifier has not been fitted.
        """
        assert self.X_train is not None and self.y_train is not None, \
            "must fit before predict"
        X_predict = np.asarray(X_predict)
        # Without this promotion a lone sample would be iterated feature by
        # feature and yield one meaningless prediction per feature.
        if self.X_train.ndim == 2 and X_predict.shape == self.X_train.shape[1:]:
            X_predict = X_predict.reshape(1, -1)
        return np.array([self.__predict(x) for x in X_predict])

    def __predict(self, x):
        """Return the majority label among the k training samples nearest to x."""
        deltas = self.X_train - x
        # Sum the squared differences over every axis except the sample axis.
        feature_axes = tuple(range(1, deltas.ndim))
        distances = np.sqrt(np.sum(deltas ** 2, axis=feature_axes))
        nearest = np.argsort(distances)
        top_k = [self.y_train[i] for i in nearest[:self.k]]
        # On a tie Counter keeps insertion order, so the label seen first
        # (i.e. belonging to the closer neighbour) wins.
        votes = Counter(top_k)
        return votes.most_common(1)[0][0]

使用自定义的KNN分类器对模拟数据进行预测:

import numpy as np
from KNN import MyKNeighborsClassfier

# Toy training set: raw_data_x holds the feature vectors, raw_data_y the labels.
raw_data_x = [[3.393533211, 2.331273381],
              [3.110073483, 1.781539638],
              [1.343808831, 3.368360954],
              [3.582294042, 4.679779110],
              [2.280362439, 2.866990263],
              [7.423436942, 4.696522875],
              [5.745015997, 3.533989803],
              [9.172168622, 2.511101045],
              [7.792783487, 3.424088941],
              [7.9939820917, 0.791637231]
              ]
raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

X_train = np.array(raw_data_x)
y_train = np.array(raw_data_y)
# The new point whose class is to be predicted.
x = np.array([8.093607318, 3.365731514])

kNeighborsClassfier = MyKNeighborsClassfier(n_neighbors=6)
kNeighborsClassfier.fit(X_train, y_train)
# predict classifies one sample per ROW of its argument. Passing the bare 1-D
# point would make it iterate over the two coordinates as if they were two
# samples, so the point is reshaped into a 1 x 2 matrix first.
print(kNeighborsClassfier.predict(x.reshape(1, -1)))

4、模拟数据集拆分

编写model_selection.py文件,模拟scikit-learn中的train_test_split,将数据集拆分为训练集和测试集:

import numpy as np

class TrainTestSplit:
    """Namespace for a hand-written train/test splitting helper."""

    @staticmethod
    def train_test_split(x, y, ratio=0.2, seed=None):
        """Randomly split samples and labels into a training and a test part.

        Args:
            x: samples, indexed along the first axis.
            y: labels, same length as ``x``.
            ratio: fraction of the samples that goes into the test set;
                the test size is ``int(len(x) * ratio)``.
            seed: optional seed passed to ``np.random.seed`` to make the
                shuffle reproducible; ``None`` leaves the global random
                state untouched.

        Returns:
            Tuple ``(x_train, y_train, x_test, y_test)``.

        Raises:
            AssertionError: if ``x`` and ``y`` differ in length or ``ratio``
                is outside [0, 1].
        """
        # asarray lets plain lists be fancy-indexed below; arrays pass through.
        x = np.asarray(x)
        y = np.asarray(y)
        assert len(x) == len(y), "x and y must contain the same number of samples"
        assert 0.0 <= ratio <= 1.0, "ratio must be between 0 and 1"

        # Compare against None explicitly: a plain truthiness test would
        # silently ignore the perfectly valid seed 0.
        if seed is not None:
            np.random.seed(seed)

        # Shuffled sample indices.
        shuffle_index = np.random.permutation(len(x))
        test_size = int(len(x) * ratio)

        # The first test_size shuffled indices form the test set, the
        # remaining ones the training set.
        test_indexs = shuffle_index[:test_size]
        train_indexs = shuffle_index[test_size:]

        return x[train_indexs], y[train_indexs], x[test_indexs], y[test_indexs]

测试:

import numpy as np
from model_selection import TrainTestSplit
from sklearn import datasets
from KNN import MyKNeighborsClassfier

# Load the iris dataset shipped with scikit-learn.
iris = datasets.load_iris()

# Split with the hand-written helper (default test ratio); note its return
# order: training samples, training labels, test samples, test labels.
x_train, y_train, x_test, y_test = TrainTestSplit.train_test_split(
    x=iris.data, y=iris.target)

# Fit the hand-written classifier on the training part and classify the
# held-out samples.
knnClassfier = MyKNeighborsClassfier(n_neighbors=6)
knnClassfier.fit(X_train=x_train, y_train=y_train)
result = knnClassfier.predict(x_test)

# Number of test samples whose predicted label equals the true label.
matches = result == y_test
print(sum(matches))

5、封装准确度方法

编写metrics.py

import numpy as np

def accuracy_score(y_true, y_predict):
    """Return the fraction of predictions that equal the true labels.

    Args:
        y_true: ground-truth labels (array or any sequence).
        y_predict: predicted labels, same shape as ``y_true``.

    Returns:
        Number of matching positions divided by ``len(y_true)``.

    Raises:
        AssertionError: if the inputs differ in shape or are empty.
    """
    # Convert first: comparing two plain lists with == gives a single bool
    # rather than an element-wise result.
    y_true = np.asarray(y_true)
    y_predict = np.asarray(y_predict)
    assert y_true.shape == y_predict.shape, \
        "y_true and y_predict must have the same shape"
    assert len(y_true) > 0, "y_true must not be empty"
    return np.sum(y_true == y_predict, axis=0) / len(y_true)

在KNN.py文件中添加score方法调用metrics.py中的accuracy_score来计算准确度

from metrics import accuracy_score


    def score(self, x_test, y_test):
        """Return the classifier's accuracy on a labelled test set.

        Args:
            x_test: samples to classify, one row per sample.
            y_test: true label of each sample in ``x_test``.

        Returns:
            The value computed by ``accuracy_score`` for the true labels and
            the labels predicted for ``x_test``.
        """
        y_predict = self.predict(X_predict=x_test)
        # accuracy_score is declared as (y_true, y_predict), so the ground
        # truth goes first.
        return accuracy_score(y_test, y_predict)

使用手写数字识别数据集测试准确度方法:

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from model_selection import TrainTestSplit
from KNN import MyKNeighborsClassfier

# Handwritten-digit dataset shipped with scikit-learn.
digits = datasets.load_digits()
print(digits.keys())

# Feature matrix and target labels.
X, y = digits.data, digits.target
print(X.shape)
print(y.shape)

# Hold out 20% of the samples for testing (hand-written splitter).
x_train, y_train, x_test, y_test = TrainTestSplit.train_test_split(
    x=X, y=y, ratio=0.2)

my_knn_cli = MyKNeighborsClassfier(n_neighbors=3)
my_knn_cli.fit(X_train=x_train, y_train=y_train)

# score predicts the test samples and returns the accuracy, replacing the
# manual "sum(result == y_test) / len(y_test)" computation.
accuracy = my_knn_cli.score(x_test, y_test)
print(accuracy)

6、超参数

6.1 超参数概念:

超参数:在算法运行前需要决定的参数
模型参数:算法过程中学习的参数

KNN算法中没有模型参数
KNN算法中的k是典型的超参数

6.2 寻找最好的k,p,weights

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Handwritten-digit dataset shipped with scikit-learn.
digits = datasets.load_digits()
x = digits.data
y = digits.target
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Best combination found so far.
# best_method: "uniform" ignores the neighbour distances, "distance" weights
#              the votes by distance.
# best_p:      exponent of the Minkowski distance (1 = Manhattan,
#              2 = Euclidean); 0 is recorded when p was not searched.
best_method = ""
best_p = 0
best_score = 0.0
best_k = -1

# Enumerate the candidates in the order they have to be tried: every k for
# "uniform" (p not searched), then every (k, p) pair for "distance".
candidates = [("uniform", k, None) for k in range(1, 11)]
candidates += [("distance", k, p) for k in range(1, 11) for p in range(1, 6)]

for method, k, p in candidates:
    params = {"n_neighbors": k, "weights": method}
    if p is not None:
        params["p"] = p
    knnclf = KNeighborsClassifier(**params)
    knnclf.fit(X_train, y_train)
    score = knnclf.score(X_test, y_test)
    # Strict comparison: on equal scores the earlier candidate is kept.
    if score > best_score:
        best_score, best_k, best_method = score, k, method
        best_p = 0 if p is None else p

print(best_method)
print(best_p)
print(best_k)
print(best_score)

6.3 网格搜索与更多超参数

GridSearchCV网格搜索

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
import time

# Handwritten-digit dataset shipped with scikit-learn.
digits = datasets.load_digits()
x, y = digits.data, digits.target
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Two sub-grids: "uniform" only varies k, "distance" additionally varies the
# Minkowski exponent p.
param_grid = [
    {'weights': ['uniform'], 'n_neighbors': list(range(1, 11))},
    {'weights': ['distance'], 'n_neighbors': list(range(1, 11)), 'p': list(range(1, 6))},
]
print(param_grid)

knnclf = KNeighborsClassifier()
# n_jobs: number of CPU cores to use, -1 means all of them.
# verbose: controls the search log; the larger the value, the more detail.
grid_search = GridSearchCV(knnclf, param_grid, n_jobs=-1, verbose=2)

# Wall-clock duration of the search, in seconds.
time_start = time.time()
grid_search.fit(X_train, y_train)
time_end = time.time()
print(time_end - time_start)

# best_estimator_: the KNeighborsClassifier built with the best parameters
# best_params_:    the best parameter combination from param_grid
# best_score_:     mean cross-validated score of that combination
for report in (grid_search.best_estimator_,
               grid_search.best_params_,
               grid_search.best_score_):
    print(report)

# Evaluate the winning estimator on the held-out test set.
knnclf = grid_search.best_estimator_
print(knnclf.score(X_test, y_test))

更多的距离定义,可以通过KNeighborsClassifier对象的metric属性指定,有关可用度量的列表,请参见DistanceMetric的文档。

7、数据归一化

7.1 最值归一化和均值方差归一化

import numpy as np
import matplotlib.pyplot as plt

# --- Min-max normalization: rescale every column to the range [0, 1] ---

x = np.random.randint(0, 100, (50, 2)).astype(float)
print(x)
for col in range(x.shape[1]):
    column = x[:, col]
    low, high = np.min(column), np.max(column)
    x[:, col] = (column - low) / (high - low)
print(x)
# Standard deviation of each rescaled column.
for col in range(x.shape[1]):
    print(np.std(x[:, col]))

# --- Standardization: shift every column to mean 0 and scale to std 1 ---

x = np.random.randint(0, 100, (50, 2)).astype(float)
for col in range(x.shape[1]):
    column = x[:, col]
    x[:, col] = (column - np.mean(column)) / np.std(column)
print(x)
# Standard deviation of each standardized column.
for col in range(x.shape[1]):
    print(np.std(x[:, col]))

# Scatter plot of the standardized data.
plt.scatter(x[:, 0], x[:, 1])
plt.show()
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

iris = datasets.load_iris()
X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y)

# StandardScaler from sklearn.preprocessing standardizes features. It is
# fitted on the training data only; fit returns the scaler itself.
standardScaler = StandardScaler().fit(X_train)
print(standardScaler.mean_)
print(standardScaler.scale_)

# Apply the statistics learned from the training set to both parts.
X_train = standardScaler.transform(X_train)
print(X_train)
X_test = standardScaler.transform(X_test)
print(X_test)

# Train and evaluate kNN on the standardized data.
knnclf = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
print(knnclf.score(X_test, y_test))

7.2 手写代码模拟StandardScaler

编写preprocessing.py文件

import numpy as np

class StanderScaler:
    """Hand-written feature standardizer modelled on sklearn's StandardScaler.

    ``fit`` learns the mean and standard deviation of every feature (column);
    ``transform`` subtracts the mean and divides by the standard deviation.
    """

    def __init__(self):
        # Per-feature mean and standard deviation; None until fit is called.
        self.mean_ = None
        self.scale_ = None

    def fit(self, X):
        """Compute the per-feature mean and standard deviation of ``X``.

        Args:
            X: 2-D data, one row per sample and one column per feature.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            AssertionError: if ``X`` is not 2-D.
        """
        X = np.asarray(X)
        assert X.ndim == 2, "this dimension X must be two"
        # Statistics are taken along axis 0, i.e. per COLUMN. Indexing X[i]
        # would select row i and mix different features together.
        self.mean_ = np.mean(X, axis=0)
        scale = np.std(X, axis=0)
        # A constant feature has zero deviation; use 1 so that transform
        # maps it to 0 instead of dividing by zero.
        scale[scale == 0.0] = 1.0
        self.scale_ = scale
        return self

    def transform(self, X):
        """Standardize ``X`` with the statistics learned by ``fit``.

        Args:
            X: 2-D data with the same number of features as the fitted data.

        Returns:
            New float array of the same shape as ``X``.

        Raises:
            AssertionError: if the scaler is not fitted, ``X`` is not 2-D or
                its feature count differs from the fitted data.
        """
        assert self.mean_ is not None and self.scale_ is not None, \
            "must fit before transform"
        X = np.asarray(X)
        assert X.ndim == 2, "this dimension X must be two"
        assert len(self.scale_) == X.shape[1], "the feature num of X must be equal to scale_ "
        # Broadcasting applies each column's mean and scale to that column.
        return (X - self.mean_) / self.scale_

测试:

from preprocessing import StanderScaler
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

iris = datasets.load_iris()
X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y)

# Fit the hand-written scaler on the training data only.
standardScaler = StanderScaler()
standardScaler.fit(X_train)
for fitted_stat in (standardScaler.mean_, standardScaler.scale_):
    print(fitted_stat)

# Standardize both parts with the statistics learned from the training set.
X_train, X_test = (standardScaler.transform(part) for part in (X_train, X_test))
print(X_train)
print(X_test)

# Train and evaluate scikit-learn's kNN on the standardized data.
knnclf = KNeighborsClassifier(n_neighbors=3)
knnclf.fit(X_train, y_train)
print(knnclf.score(X_test, y_test))

8、总结

k近邻算法学习笔记_第1张图片

你可能感兴趣的:(机器学习算法,近邻算法,学习,python)