Given a labeled training set, compute the distance from a new sample to every point in the training set, sort those distances in ascending order, and take the top K to get K labels; the new sample's class is then predicted by majority vote over those K labels.
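For orientation, the same decision rule is available off the shelf; below is a minimal sketch using scikit-learn's KNeighborsClassifier on made-up toy data (the hand-rolled version appears further down):

    import numpy as np
    from sklearn.neighbors import KNeighborsClassifier

    # Toy labeled training set: 4 points, 2 classes (made-up numbers).
    X_train = np.array([[0.0, 0.0], [0.1, 0.2], [0.9, 1.0], [1.0, 0.8]])
    y_train = np.array([0, 0, 1, 1])

    # K=3: take the 3 nearest neighbours, predict by majority vote.
    clf = KNeighborsClassifier(n_neighbors=3)
    clf.fit(X_train, y_train)
    print(clf.predict([[0.2, 0.1]]))  # -> [0], two of the three nearest are class 0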
1: Distance
There are many ways to measure the distance between vector points; which one to use depends on the application scenario. Euclidean distance is the default.
Common distances:
1) The Lp distance family, Lp(x, y) = (Σi |xi − yi|^p)^(1/p): p=1 gives the Manhattan distance, p=2 the Euclidean distance, and p=∞ the maximum (Chebyshev) distance.
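A minimal numpy sketch of the three special cases (the vectors x and y here are made up for illustration):

    import numpy as np

    x = np.array([1.0, 2.0, 3.0])
    y = np.array([4.0, 0.0, 3.0])

    manhattan = np.sum(np.abs(x - y))        # p=1: 3 + 2 + 0 = 5
    euclidean = np.sqrt(np.sum((x - y)**2))  # p=2: sqrt(9 + 4 + 0) ≈ 3.606
    chebyshev = np.max(np.abs(x - y))        # p=∞: max(3, 2, 0) = 3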
1: Advantages
High accuracy; insensitive to outliers (unless the chosen K is unreasonable); no training phase; simple to implement (whether the kd-tree version or the brute-force one).
2: Disadvantages
Space complexity is high: it grows linearly with the number of samples, for the kd-tree version as well as the brute-force one.
Time complexity is high: the brute-force version is linear in both the number of samples and the number of dimensions; the kd-tree optimization only compresses the search phase down to O(log N) complexity.
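For reference, scipy's cKDTree exposes exactly this trade-off: build the tree once over all N samples, then each query only walks roughly O(log N) nodes. A minimal sketch, assuming scipy is available (the data here is random and purely illustrative):

    import numpy as np
    from scipy.spatial import cKDTree

    rng = np.random.default_rng(0)
    data = rng.random((10000, 3))               # 10000 samples, 3 features

    tree = cKDTree(data)                        # build phase over the full set
    dist, idx = tree.query(rng.random(3), k=5)  # search phase: ~O(log N) per query
    print(idx)                                  # indices of the 5 nearest training points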
Suitability:
The amount of labeled data matters: it can't be too small, or the model performs poorly, and it can't be too large, or the model runs too slowly. In short, KNN suits classification at medium data scale. It also requires the features to be numeric (which is usually satisfiable).
This part is implemented as the brute-force version; the kd-tree version isn't implemented yet (the blogger is lazy). If later work calls for it, I'll consider implementing it then. The brute-force version is quite simple, especially in Python. Without further ado, the code logic is below, and I won't over-explain it; my future self should be able to read it, since it's simple after all.
import sys
import numpy as np
import matplotlib.pyplot as plot
from pandas import Series
def LoadData(FileName):
    # Read a tab-separated file: three numeric features, then a label string.
    Fi = open(FileName, 'r')
    DataSet = Fi.readlines()
    Fi.close()
    DataSetLen = len(DataSet)
    DataSetFeat = np.zeros((DataSetLen, 3))  # feature matrix, one row per sample
    DataSetLable = []
    Index = 0
    LableName = {}  # maps each label string to a small integer id
    for line in DataSet:
        line = line.strip()
        LineList = line.split('\t')
        DataSetFeat[Index, :] = LineList[0:3]
        if LineList[-1] not in LableName:
            LableName[LineList[-1]] = len(LableName) + 1
        DataSetLable.append(LableName[LineList[-1]])
        Index += 1
    print(LableName)
    return [DataSetFeat, DataSetLable, LableName]
def Normal(DataSetFeat):
    # Min-max normalization: rescale every column (feature) into [0, 1].
    minVals = DataSetFeat.min(0)  # min along axis 0, i.e. per column
    maxVals = DataSetFeat.max(0)
    ranges = maxVals - minVals
    m = DataSetFeat.shape[0]
    DataSetFeat = DataSetFeat - np.tile(minVals, (m, 1))  # tile repeats the row vector m times
    DataSetFeat = DataSetFeat / np.tile(ranges, (m, 1))
    return DataSetFeat
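# A quick sanity check for Normal (made-up 3x3 matrix): each column is
# rescaled to [0, 1] independently. Note that a constant column would
# divide by zero here.
#demo = np.array([[1.0, 10.0, 100.0], [2.0, 20.0, 200.0], [3.0, 30.0, 300.0]])
#print(Normal(demo))  # rows become [0 0 0], [0.5 0.5 0.5], [1 1 1]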
def classfy0(Feat, DataSetFeat, DataSetLable, K):
    # Pick the K nearest training points and decide by majority vote.
    m = DataSetFeat.shape[0]
    FeatSet = np.tile(Feat, (m, 1))
    FeatSet = (FeatSet - DataSetFeat) * (FeatSet - DataSetFeat)
    Feat = FeatSet.sum(1)
    Feat = np.sqrt(Feat)  # Euclidean distance from the query point to every sample
    FeatIndex = np.argsort(Feat)  # ascending order: nearest points come first
    DataSetLable = np.array(DataSetLable)
    Lable = DataSetLable[FeatIndex[0:K]]
    LableCount = Series(Lable).value_counts()  # already sorted by count, descending
    return LableCount.index[0]
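# Smoke test for classfy0 (four made-up points, labels [1, 1, 2, 2]): with
# K=3, the two nearby label-1 points outvote the single label-2 neighbour.
#DemoFeat = np.array([[0.0, 0.0, 0.0], [0.1, 0.1, 0.1], [1.0, 1.0, 1.0], [0.9, 0.9, 0.9]])
#print(classfy0(np.array([0.05, 0.05, 0.05]), DemoFeat, [1, 1, 2, 2], 3))  # -> 1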
#The larger K is, the simpler the model: approximation error grows, while
#estimation error shrinks up to a point (pushed past that, it grows again).
def Test(DataSetFeat, DataSetLable):
    # Hold-out test: the first TestNum samples are classified against the rest.
    for K in range(1, 200, 10):
        for TestPercent in [10]:  # range(10, 100, 10):
            TestPercent = float(TestPercent) * 0.01
            TotalNum = len(DataSetLable)
            TestNum = int(TotalNum * TestPercent)
            print(TestNum, TotalNum)
            ErrCount = 0
            for i in range(TestNum):
                ResultLabe = classfy0(DataSetFeat[i, :], DataSetFeat[TestNum:TotalNum, :], DataSetLable[TestNum:TotalNum], K)
                if ResultLabe != DataSetLable[i]:
                    ErrCount += 1
                #print("Predict Lable: %d, Real Lable: %d" % (ResultLabe, DataSetLable[i]))
            print("TestPercent:%f, K:%d, Err Ratio: %f" % (TestPercent, K, float(ErrCount) / float(TestNum)))
def TestTrian(DataSetFeat, DataSetLable):
    # Same sweep, but each test sample is also part of the training slice,
    # so this measures error on the training data itself.
    for K in range(1, 200, 10):
        for TestPercent in [90]:  # range(10, 100, 10):
            TestPercent = float(TestPercent) * 0.01
            TotalNum = len(DataSetLable)
            TestNum = int(TotalNum * TestPercent)
            print(TestNum, TotalNum)
            ErrCount = 0
            for i in range(TestNum):
                ResultLabe = classfy0(DataSetFeat[i, :], DataSetFeat[0:TestNum, :], DataSetLable[0:TestNum], K)
                if ResultLabe != DataSetLable[i]:
                    ErrCount += 1
                #print("Predict Lable: %d, Real Lable: %d" % (ResultLabe, DataSetLable[i]))
            print("TrainData: TestPercent:%f, K:%d, Err Ratio: %f" % (TestPercent, K, float(ErrCount) / float(TestNum)))
FileName = sys.argv[1]
[DataSetFeat, DataSetLable, LableName] = LoadData(FileName)
DataSetFeat = Normal(DataSetFeat)
Test(DataSetFeat, DataSetLable)
TestTrian(DataSetFeat, DataSetLable)
#print(DataSetFeat)
#fig = plot.figure()
#ax = fig.add_subplot(111)
#ax.scatter(DataSetFeat[:,0], DataSetFeat[:,1], 15.0 * np.array(DataSetLable), 15.0 * np.array(DataSetLable))
#plot.show()
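To run the script, pass the data file on the command line; LoadData expects tab-separated rows with three numeric features followed by a string label. The script and file names below are hypothetical, and the sample row is made up:

    python knn.py dataset.txt

    # one line of dataset.txt (fields separated by tabs):
    # 40.5	8.32	0.95	labelA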