原理请移步上一篇博客
数据集:链接:https://pan.baidu.com/s/18UZD2p3PwTwjVy-_S53flw 提取码:KNN1
import numpy as np
import pandas as pd
import operator
from os import listdir
from collections import Counter
from numpy import *
def classify0(inX,dataSet,labels,k):
    """
    kNN classifier: return the majority label among the k nearest
    training samples (Euclidean distance) to the query point.

    inX     -- 1-D feature vector to classify
    dataSet -- 2-D array, one training sample per row
    labels  -- sequence of labels aligned with dataSet rows
    k       -- number of neighbours that vote
    """
    # Difference between inX (replicated per row) and every training row.
    deltas = tile(inX, (dataSet.shape[0], 1)) - dataSet
    # Euclidean distance: row-wise sum of squares, then square root.
    dists = (deltas ** 2).sum(axis=1) ** 0.5
    # Row indices ordered from nearest to farthest.
    order = dists.argsort()
    # Tally the labels of the k closest rows.
    votes = {}
    for rank in range(k):
        lbl = labels[order[rank]]
        votes[lbl] = votes.get(lbl, 0) + 1
    # Winner is the label with the highest count (stable sort keeps
    # first-seen label on ties, same as the original).
    ranked = sorted(votes.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[0][0]
def file2matrix(filename):
    """
    Parse a tab-separated dating data file into features and labels.

    Each line holds three numeric feature columns followed by an
    integer class label in the fourth column.

    :param filename: path to the data file
    :return: (returnMat, classLabelVector) -- an (n, 3) float array of
             features and a list of int labels, one per input line.
    """
    # Read the file once; 'with' guarantees the handle is closed.
    # (The original opened the file twice and never closed either handle.)
    with open(filename) as fr:
        lines = fr.readlines()
    returnMat = np.zeros((len(lines), 3))
    classLabelVector = []
    for index, raw in enumerate(lines):
        fields = raw.strip().split('\t')
        # First three columns are features; numpy coerces the strings
        # to float on assignment.
        returnMat[index, :] = fields[0:3]
        # Last column is the integer class label.
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector
def autoNorm(dataSet):
    """
    Min-max normalise each feature column into the [0, 1] range.

    Y = (X - Xmin) / (Xmax - Xmin), computed per column, which removes
    the effect of differing feature magnitudes on the distance metric.

    :param dataSet: (n, d) numeric array
    :return: (normDataSet, ranges, minVals) -- the normalised array,
             the per-column value range, and the per-column minimum.
    """
    # Per-column extrema (axis 0 walks down the rows).
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # NumPy broadcasting applies the per-column shift and scale to
    # every row. This replaces the original tile() copies and removes
    # the dead np.zeros allocation that was immediately overwritten.
    normDataSet = (dataSet - minVals) / ranges
    return normDataSet, ranges, minVals
def datingClassTest():
    """
    Hold-out evaluation of the kNN dating classifier.

    Treats the first 10% of the dataset as unknown test queries and
    the remaining 90% as the known training set, prints each
    prediction next to the true label, then the overall error rate
    and the raw error count.
    """
    holdout = 0.1  # fraction of samples held out as test queries
    # Features and labels from the raw data file.
    feats, labs = file2matrix('2.KNN/datingTestSet2.txt')
    # Only the normalised matrix is used; range/min are discarded.
    norm_feats, _spans, _mins = autoNorm(feats)
    total = norm_feats.shape[0]
    test_count = int(total * holdout)
    errors = 0.0
    for idx in range(test_count):
        # Classify test row idx against the remaining 90% of the rows.
        predicted = classify0(norm_feats[idx, :], norm_feats[test_count:total, :], labs[test_count:total], 3)
        print("the classifier came back with: %d, the real answer is: %d" % (predicted, labs[idx]))
        # Count every prediction that disagrees with the true label.
        if predicted != labs[idx]:
            errors += 1.0
    print("the total error rate is: %f" % (errors / float(test_count)))
    print(errors)
# Run the hold-out evaluation only when executed as a script.
if __name__ == '__main__':
    datingClassTest()
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 1
the total error rate is: 0.050000
5.0