本文为 落魄陶陶 原创,转载请注明出处
数据来源及源码参见 GitHub
knn.py
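# 1. Load the data into a DataFrame df whose rows have the form x1, x2, ..., xn, y
# 2. Compute the distance between the query point target = (x1, x2, ..., xn) and every row of df from step 1
# 3. Sort the distances from step 2 in ascending order and take the y values of the n nearest rows
#    (this n is the neighbour count; it is unrelated to the number of features above)
# 4. The y value that occurs most often among those rows is the predicted y of target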
from collections import namedtuple
import numpy as np
import pandas as pd
# Per-column scaling statistics recorded by norm(): the column minimum, the
# column maximum and their difference (max - min).
MinMaxDict = namedtuple('MinMax', 'min max range')
def load_data() -> pd.DataFrame:
    """Read the sample data set from ``knn.xlsx``.

    :return: the spreadsheet as a DataFrame; the last column is the label and
        every other column is an input feature
    """
    frame = pd.read_excel('knn.xlsx')
    return frame
def norm(df):
    """Min-max scale every feature column of ``df`` into the range [0, 1].

    The last column is treated as the label and is left untouched.

    :param df: DataFrame whose last column is the label and whose other
        columns are numeric features; it is not modified
    :return: ``(scaled, min_max_list)`` where ``scaled`` is a normalised copy
        of ``df`` and ``min_max_list`` holds one ``MinMaxDict(min, max, range)``
        per feature column, in column order
    """
    # Work on a copy so the caller's frame (which may itself be a slice of a
    # larger frame) is never modified behind its back.
    scaled = df.copy()
    min_max_list = []
    for column in scaled.columns[:-1]:
        col = scaled[column]
        min_val, max_val = col.min(), col.max()
        length = max_val - min_val
        # A constant column has zero range; dividing by it would fill the
        # column with NaN. Dividing by 1 instead maps such a column to 0.
        divisor = length if length != 0 else 1
        scaled[column] = (col - min_val) / divisor
        # The true range (possibly 0) is still what gets recorded.
        min_max_list.append(MinMaxDict(min_val, max_val, length))
    return scaled, min_max_list
def norm_target(target, min_max_list):
    """Scale a query point with previously recorded per-feature statistics.

    :param target: sequence of raw feature values, ordered like the entries
        of ``min_max_list``
    :param min_max_list: per-feature records exposing ``min`` and ``range``
        attributes
    :return: tuple with one scaled value per entry of ``min_max_list``
    :raises IndexError: if ``target`` has fewer values than ``min_max_list``
    """
    ret = []
    for i, item in enumerate(min_max_list):
        # A zero range means the feature was constant; dividing by it would
        # raise ZeroDivisionError (or yield inf/NaN for numpy scalars), so the
        # range is treated as 1 and only the offset from the minimum is kept.
        divisor = item.range if item.range != 0 else 1
        ret.append((target[i] - item.min) / divisor)
    return tuple(ret)
def classify(n, df, target):
    """Predict the label of ``target`` by majority vote of its nearest rows.

    The Euclidean distance from ``target`` to every row of ``df`` is computed,
    the ``n`` closest rows are selected and the label that occurs most often
    among them is returned. When several labels share the highest count, the
    label of the closest neighbour among them wins, so the result does not
    depend on sort or counting order.

    :param n: number of nearest neighbours that vote; must be at least 1
    :param df: training data; the last column is the label, all other columns
        are numeric features
    :param target: sequence with one value per feature column, in column order
    :return: the predicted label
    :raises ValueError: if ``n`` is smaller than 1 or ``df`` is empty
    """
    if n < 1:
        raise ValueError('n must be at least 1')
    if df.empty:
        raise ValueError('df must contain at least one row')

    label_column = df.columns[-1]
    features = df.loc[:, df.columns[:-1]]
    squared = features.sub(target, axis='columns') ** 2
    result = pd.DataFrame({
        'labels': df[label_column],
        'distance': np.sqrt(squared.sum(axis=1)),
    })
    # mergesort is stable: equidistant rows keep their original order.
    nearest = result.sort_values(
        by='distance', ascending=True, kind='mergesort').head(n)

    votes = nearest['labels'].value_counts()
    leaders = votes[votes == votes.max()].index
    # ``nearest`` is ordered by distance, so after dropping repeats each label
    # sits at the position of its closest neighbour.
    ranked = nearest['labels'].drop_duplicates()
    return ranked[ranked.isin(leaders)].iloc[0]
if __name__ == '__main__':
    n = 5  # number of nearest neighbours that vote
    # 1. Load the data set into a DataFrame.
    df = load_data()
    # 2. Normalise the features and keep every column's min / max / range so
    #    the query point can be scaled the same way.
    df, min_max_list = norm(df)
    # 3. Scale the query point with those statistics.
    item = norm_target((0, 10), min_max_list)
    # 4. Compute the Euclidean distance to every row and let the n nearest
    #    rows vote on the class.
    result = classify(n, df, item)
    print(result)
约会预测
knn_dating.py
import pandas as pd
import knn
def load_dating_data(file, ratio):
    """Load the tab-separated dating data set and split it into two parts.

    :param file: path or file-like object handed to ``pandas.read_csv``
    :param ratio: fraction of the rows, counted from the top, that go into
        the first (training) part
    :return: ``(training_set, testing_set)``, two independent DataFrames with
        the columns ``fly_miles``, ``game_time``, ``ice_cream`` and ``labels``;
        ``labels`` holds the codes 1 (smallDoses), 2 (didntLike) and
        3 (largeDoses)
    :raises ValueError: if a row carries a label other than those three
    """
    # NOTE(review): read_csv treats the first line as a header by default, so
    # if the data file has no header row its first sample is dropped here --
    # confirm against the file and pass header=None if that is the case.
    df = pd.read_csv(file, sep='\t')
    df.columns = ['fly_miles', 'game_time', 'ice_cream', 'labels']

    # One dict lookup per row instead of rebuilding and scanning a list.
    label_codes = {'smallDoses': 1, 'didntLike': 2, 'largeDoses': 3}
    codes = df['labels'].map(label_codes)
    unknown = codes.isna()
    if unknown.any():
        bad = sorted(set(df.loc[unknown, 'labels'].astype(str)))
        raise ValueError('unknown label(s): {}'.format(bad))
    df['labels'] = codes.astype('int64')

    num = int(df.shape[0] * ratio)
    # Return copies rather than slices of ``df`` so callers can modify either
    # part without triggering pandas' chained-assignment warnings.
    return df.iloc[:num].copy(), df.iloc[num:].copy()
# class DatingKnn(KNN):
#
# def __init__(self, n, ratio):
# super().__init__(n)
# num = int(self.df.shape[0] * ratio)
# self.testing_set = self.df[num:]
# self.df = self.df[:num]
#
# def load_data(self):
# return load_dating_data('datingTestSet.txt')
#
# def test(self):
# results = pd.Series([self.classify(item[1:-1], False) for item in self.testing_set.itertuples()])
# labels = self.testing_set.labels
# labels.index = results.index
# bingo = (labels - results).value_counts()[0]
# print(bingo / results.shape[0])
# # print(labels-results)
# # print(self.testing_set.labels)
if __name__ == '__main__':
    n = 5  # number of nearest neighbours that vote
    ratio = 0.9  # 90% of the rows are training data, the remaining 10% test data
    # 1. Load the data and split it into a training and a testing part.
    training_set, testing_set = load_dating_data('datingTestSet.txt', ratio)
    # 2. Normalise the training features and keep every column's
    #    min / max / range so the test rows can be scaled the same way.
    df, min_max_list = knn.norm(training_set)
    results_list = []
    for row in testing_set.itertuples(index=False):
        # 3. Drop the trailing label and scale the remaining feature values.
        item = knn.norm_target(row[:-1], min_max_list)
        # 4. Compute the Euclidean distance to every training row and let the
        #    n nearest rows vote on the class.
        results_list.append(knn.classify(n, df, item))
    # 5. Accuracy on the testing part. Labels and predictions are compared by
    #    position, so no index alignment is involved and a run without any
    #    correct prediction simply yields 0.
    total = len(results_list)
    labels = testing_set[testing_set.columns[-1]].tolist()
    bingo = sum(
        1 for expected, predicted in zip(labels, results_list)
        if expected == predicted
    )
    # An empty testing part has no defined accuracy.
    percent = bingo / total if total else float('nan')
    print(percent)