knn算法实现

knn.py

本文为 落魄陶陶 原创,转载请注明出处
数据来源及源码参见 GitHub

  • 学习并参考《机器学习实战》第二章
  • 主要使用Pandas库
  • knn.py为基本算法实现,基于数据knn.xlsx
  • knn_dating.py为约会预测,基于数据datingTestSet.txt

knn.py

    # 1.读取数据 得到 每行都是 x1,x2,x3,...,xn,y的dataFrame df
    # 2.计算给定点target = (x1,x2,...,xn)与步骤1中df的所有点的距离
    # 3.对步骤2所得的所有距离排序 asc,取前n个对应的y
    # 4.步骤3中y出现频率最高的为target的y
   from collections import namedtuple
   
   import numpy as np
   import pandas as pd
   
   MinMaxDict = namedtuple('MinMax', 'min max range')
   
   
   def load_data(path='knn.xlsx') -> pd.DataFrame:
       """
       Load the sample set from an Excel workbook.

       :param path: workbook to read; defaults to 'knn.xlsx' so existing
                    calls without arguments keep working
       :return: pd.DataFrame as produced by pd.read_excel; the last column
                is the label and all other columns are the inputs
       """
       return pd.read_excel(path)
   
   
   def norm(df):
       """
       Min-max scale every feature column into the range [0, 1].

       Every column except the last one (the label) is treated as a feature.
       The frame passed in is not modified; a scaled copy is returned.

       :param df: pd.DataFrame whose last column is the label
       :return: (scaled_df, min_max_list); min_max_list holds one
                MinMaxDict(min, max, range) per feature column, in column
                order, so a query point can later be scaled the same way
       """
       # Work on a copy: the input may be a slice of a larger frame, and
       # assigning columns into it would mutate the caller's data (and makes
       # pandas emit chained-assignment warnings).
       df = df.copy()
       min_max_list = []
       for column in df.columns[:-1]:
           col = df[column]
           min_val, max_val = col.min(), col.max()
           length = max_val - min_val
           if length == 0:
               # Constant column: (col - min) / 0 would fill it with NaN.
               df[column] = 0.0
           else:
               df[column] = (col - min_val) / length
           min_max_list.append(MinMaxDict(min_val, max_val, length))
       return df, min_max_list
   
   
   def norm_target(target, min_max_list):
       """
       Scale a query point using previously recorded min/range values.

       :param target: indexable sequence of raw feature values, one per entry
                      of min_max_list (extra trailing values are ignored)
       :param min_max_list: sequence of objects exposing ``min`` and ``range``
                            attributes, one per feature
       :return: tuple of scaled values, ``(value - min) / range`` per feature;
                a feature whose recorded range is 0 is mapped to 0.0 instead
                of dividing by zero
       :raises IndexError: if target has fewer values than min_max_list
       """
       return tuple(
           (target[i] - item.min) / item.range if item.range != 0 else 0.0
           for i, item in enumerate(min_max_list)
       )
   
   
   def classify(n, df, target):
       """
       Predict the label of ``target`` by majority vote of its nearest rows.

       A two-column frame [labels, distance] is built: ``labels`` is the last
       column of ``df`` and ``distance`` is the Euclidean distance between
       ``target`` and the feature columns of each row. The rows are sorted by
       distance, the first ``n`` are kept, and the most frequent label among
       them is returned.

       :param n: number of nearest rows that take part in the vote
       :param df: pd.DataFrame whose last column is the label and whose other
                  columns are the features
       :param target: feature values of the point to classify, one per
                      feature column
       :return: the label occurring most often among the n nearest rows
       """
       feature_columns = df.columns[:-1]
       label_column = df.columns[-1]

       # Per-row squared differences, summed across features, then rooted.
       squared_diff = df.loc[:, feature_columns].sub(target, axis='columns') ** 2
       distances = np.sqrt(np.sum(squared_diff, axis=1))

       ranked = pd.DataFrame({'labels': df[label_column], 'distance': distances})
       ranked = ranked.sort_values(by='distance', ascending=True)

       votes = ranked.iloc[:n]['labels'].value_counts(ascending=False)
       return votes.index[0]
   
   
   if __name__ == '__main__':
       n = 5
       # 1. Load the samples into a DataFrame.
       df = load_data()
       # 2. Min-max scale the features and keep each column's min/max/range,
       #    which is needed to scale the query point the same way.
       df, min_max_list = norm(df)
       # 3. Scale the query point.
       item = (0, 10)
       item = norm_target(item, min_max_list)
       # 4. Compute the Euclidean distance from the query point to every
       #    sample and let the n nearest samples vote on the label.
       result = classify(n, df, item)
       print(result)

约会预测
knn_dating.py

    import pandas as pd
    
    import knn
    
    
    def load_dating_data(file, ratio, header='infer'):
        """
        Read the tab-separated dating data and split it into two parts.

        The four columns are named fly_miles, game_time, ice_cream and labels.
        The textual labels are replaced by integer codes:
        smallDoses -> 1, didntLike -> 2, largeDoses -> 3.

        :param file: path or file-like object accepted by pd.read_csv
        :param ratio: fraction of the rows (from the top) placed in the first
                      returned frame; the remaining rows form the second one
        :param header: forwarded to pd.read_csv. With the default, the first
                       line of the file is consumed as a header row and is
                       NOT part of the data; pass ``header=None`` for a file
                       that has no header line so its first sample is kept.
        :return: (first_part, second_part), two independent DataFrames
        :raises ValueError: if a label is not one of the three known names
        """
        label_order = ['smallDoses', 'didntLike', 'largeDoses']
        df = pd.read_csv(file, sep='\t', header=header)
        df.columns = ['fly_miles', 'game_time', 'ice_cream', 'labels']
        df['labels'] = df['labels'].map(lambda name: label_order.index(name) + 1)
        num = int(df.shape[0] * ratio)
        # Return copies rather than slices of the same frame: callers modify
        # the parts (scaling, index reset), which must not touch shared data.
        return df.iloc[:num].copy(), df.iloc[num:].copy()
    
    
    # class DatingKnn(KNN):
    #
    #     def __init__(self, n, ratio):
    #         super().__init__(n)
    #         num = int(self.df.shape[0] * ratio)
    #         self.testing_set = self.df[num:]
    #         self.df = self.df[:num]
    #
    #     def load_data(self):
    #         return load_dating_data('datingTestSet.txt')
    #
    #     def test(self):
    #         results = pd.Series([self.classify(item[1:-1], False) for item in self.testing_set.itertuples()])
    #         labels = self.testing_set.labels
    #         labels.index = results.index
    #         bingo = (labels - results).value_counts()[0]
    #         print(bingo / results.shape[0])
    #         # print(labels-results)
    #         # print(self.testing_set.labels)
    
    
    if __name__ == '__main__':
        # Optional visual check of the raw data (needs a plotting module bound
        # to ``plt``, which this script does not import):
        # df = load_dating_data('datingTestSet.txt')
        # print(df.labels.unique())
        # fig = plt.figure()
        # ax = fig.add_subplot(111)
        # ax.scatter(df.iloc[:, 0], df.iloc[:, 1], s=15 * df.labels, c=15 * df.labels)
        # plt.show()

        n = 5
        ratio = 0.9  # first 90% of the rows are training data, last 10% test data
        # 1. Load the data and split it into training and test sets.
        training_set, testing_set = load_dating_data('datingTestSet.txt', ratio)
        # 2. Min-max scale the training features and keep each column's
        #    min/max/range, needed to scale the test points the same way.
        df, min_max_list = knn.norm(training_set)

        results_list = []
        for item in testing_set.itertuples(index=False):
            # 3. Drop the label (last field) and scale the remaining features.
            item = knn.norm_target(item[:-1], min_max_list)
            # 4. Let the n nearest training samples vote on the label.
            results_list.append(knn.classify(n, df, item))

        # 5. Accuracy on the test set.
        total = len(results_list)
        # Series are compared by index, so give the true labels the same
        # 0..total-1 index as the predictions.
        labels = testing_set[testing_set.columns[-1]].reset_index(drop=True)
        results = pd.Series(results_list)
        # Count matching positions directly; this also works when there are
        # no correct predictions at all.
        bingo = int((labels == results).sum())
        percent = bingo / total
        print(percent)

你可能感兴趣的:(Python)