kNN(k-nearest neighbor, KNN)是一个基本的分类和回归算法,在1968年由Cover和Hart提出。
k近邻算法的三个基本要素(模型三要素):k值的选择、距离度量(如:欧式距离)、分类决策规则(如:多数表决majority vote)
例:已知表格的前四部电影,根据fight镜头和kiss镜头判断一个新的电影所属类别?
算法流程:
(1)收集数据:可以使用任何方法。
(2)准备数据:距离计算所需要的数值,最好是结构化的数据格式。
(3)分析数据:可以使用任何方法。
(4)训练算法:此步骤不适用于k-近邻算法。
(5)测试算法:计算错误率
(6)使用算法:首先需要输入样本数据和结构化的输出结果,
然后运行k-近邻算法判断输入数据分别属于哪个分类,
最后应用对计算出的分类执行后续的处理。
1.准备:通过python导入数据
准备数据:对训练数据,用numpy创建数据集和标签
def createDataSet():
    """Build the small hand-made sample set used to demonstrate kNN.

    Returns:
        A ``(group, labels)`` pair: ``group`` is a 4 x 2 NumPy array holding
        one sample per row, and ``labels`` is the list of class labels for
        those rows, in the same order.
    """
    samples = [
        [1.0, 1.1],
        [1.0, 1.0],
        [0, 0],
        [0, 0.1],
    ]
    classes = ['A', 'A', 'B', 'B']
    return np.array(samples), classes
def classify0(inx, dataSet, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean.

    Args:
        inx: Feature vector of the sample to classify (length M, or 1 x M).
        dataSet: Known samples, one per row (N x M).
        labels: Class labels for the rows of ``dataSet``, indexable by row
            number.
        k: Number of nearest neighbours that vote; must satisfy 1 <= k <= N.

    Returns:
        The label that received the most votes.  When several labels tie, the
        one that received its first vote from the closer neighbour wins.

    Raises:
        ValueError: If ``k`` is not within ``1..N``.
    """
    dataSet = np.asarray(dataSet)
    dataSetSize = dataSet.shape[0]  # number of known samples (rows)
    if not 1 <= k <= dataSetSize:
        raise ValueError(
            'k must be between 1 and the number of samples (%d), got %r'
            % (dataSetSize, k)
        )

    # Broadcasting subtracts inx from every row without building the
    # N-times repeated copy that np.tile would allocate.
    diffMat = np.asarray(inx) - dataSet
    distances = np.sqrt((diffMat ** 2).sum(axis=1))

    # argsort() yields row indices ordered from nearest to farthest.
    nearestIndices = distances.argsort()[:k]

    # Tally the votes; dict insertion order records which label was seen first.
    classCount = {}
    for rowIndex in nearestIndices:
        voteIlabel = labels[rowIndex]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # max() returns the first label with the highest count, which matches a
    # stable descending sort on the vote counts.
    return max(classCount, key=classCount.get)
相关函数介绍:
shape函数是numpy.core.fromnumeric中的函数,它的功能是查看矩阵或者数组的维数。
tile函数位于python模块 numpy.lib.shape_base中,功能是重复某个数组。比如tile(A,n),功能是将数组A重复n次,构成一个新的数组。
sorted函数:对所有可迭代的对象进行排序操作。
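sorted(iterable, *, key=None, reverse=False)(Python 3 的写法;Python 2 中的 cmp 参数已被移除)
参数说明:iterable为待排序的可迭代对象;key为从每个元素中提取比较键的函数;reverse为True时按降序排列,默认为升序。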
在约会网站上使用k-近邻算法:
def file2matrix(filename):
    """Parse a tab-separated text file into a feature matrix and label list.

    Blank lines are skipped.  On every other line the first three columns are
    read as numeric features and the last column is taken as the class label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(returnMat, classLabelVector)`` pair: an N x 3 float array with
        the features of each data line, and a list with the N labels kept as
        the strings found in the file.

    Raises:
        ValueError: If a data line has fewer than three columns or one of its
            first three columns is not a number.
    """
    featureRows = []
    classLabelVector = []
    # The with-block guarantees the file is closed even if parsing fails.
    with open(filename) as fr:
        for lineNumber, line in enumerate(fr, start=1):
            line = line.strip()  # drop surrounding whitespace and the newline
            if not line:
                continue
            listFromLine = line.split('\t')
            if len(listFromLine) < 3:
                raise ValueError(
                    '%s, line %d: expected at least 3 tab-separated columns, got %d'
                    % (filename, lineNumber, len(listFromLine))
                )
            featureRows.append([float(value) for value in listFromLine[0:3]])
            classLabelVector.append(listFromLine[-1])

    if not featureRows:
        # Keep the N x 3 shape for a file that holds no data lines.
        return np.zeros((0, 3)), classLabelVector
    return np.array(featureRows), classLabelVector
分析数据:
# Scatter-plot columns 1 and 2 of datingDataMat, one marker style per class.
fig = plt.figure()
ax = fig.add_subplot(111)
plt.xlabel('percentage of time spent playing vedio games')
plt.ylabel('liters of ice cream consumed per year')

# (label text as stored in datingLabels, marker size, colour, legend caption)
classStyles = (
    (str(1), 10, 'blue', 'do not like'),
    (str(2), 20, 'green', 'probably like'),
    (str(3), 30, 'red', 'like'),
)
legendHandles = []
legendCaptions = []
for classLabel, markerSize, markerColor, caption in classStyles:
    rows = [i for i, label in enumerate(datingLabels) if label == classLabel]
    if not rows:
        # A class with no samples gets no series and no legend entry, so the
        # legend never refers to a handle that was not created.
        continue
    # One scatter call per class instead of one per sample.
    handle = ax.scatter(
        datingDataMat[rows, 1],
        datingDataMat[rows, 2],
        marker='x',
        s=markerSize,
        color=markerColor,
    )
    legendHandles.append(handle)
    legendCaptions.append(caption)

plt.title('dating statistics')
plt.legend(legendHandles, legendCaptions)
plt.show()
分析结果:
3. 归一化
直接对原始数据计算欧式距离,会使数值量级较大的特征对距离(进而对分类结果)的
影响权重过大,但实际上各特征的重要性未必如此。若认为各特征同等重要,
就需要先把每个特征归一化到[0,1]区间:newValue = (oldValue - min) / (max - min)。
def autoNorm(dataSet):
    """Rescale every feature column linearly so that it spans [0, 1].

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal is mapped to 0 instead of producing
    NaN from a 0/0 division.

    Args:
        dataSet: 2-D NumPy array with one sample per row.

    Returns:
        A ``(normDataSet, ranges, minVals)`` triple: the rescaled array, the
        per-column ``max - min`` (0 for a constant column), and the per-column
        minimum.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 so they come out as 0 rather than NaN;
    # the returned ranges still report the true span.
    divisors = np.where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column offset and scale to every row.
    normDataSet = (dataSet - minVals) / divisors
    return normDataSet, ranges, minVals
def datingClassTest(hoRatio=0.10, filename='datingTestSet2.txt', k=3):
    """Measure the kNN error rate on the dating data with a hold-out split.

    The first ``hoRatio`` share of the normalised samples is used as the test
    set and the remaining samples as the known data.  Every prediction and the
    final error rate are printed.

    Args:
        hoRatio: Fraction of the samples, taken from the start of the file,
            that is held out for testing.
        filename: Data file passed to ``file2matrix``.
        k: Number of neighbours passed to ``classify0``.

    Raises:
        ValueError: If the split leaves the test set or the known set empty.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    if not 0 < numTestVecs < m:
        # Without this check an empty test set ends in a ZeroDivisionError.
        raise ValueError(
            'hoRatio=%r leaves %d test samples out of %d; need at least one '
            'test sample and one known sample' % (hoRatio, numTestVecs, m)
        )

    # The known data is the same for every test sample, so slice it once.
    knownMat = normMat[numTestVecs:m, :]
    knownLabels = datingLabels[numTestVecs:m]

    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], knownMat, knownLabels, k)
        print('the classifier came back with:', classifierResult, ', the real answer is:', datingLabels[i])
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    errorrate = errorCount / float(numTestVecs)
    print('the total error rate is:', errorrate)
def classifyPerson():
    """Ask for one person's three features and print the predicted category.

    Reads three numbers from standard input, loads and normalises
    'datingTestSet2.txt', classifies the entered sample with its 3 nearest
    neighbours and prints the matching description.
    """
    game_share = float(input('percentage of time spent playing vedio games:'))
    flier_miles = float(input('frequent flier miles earned per year:'))
    ice_cream_liters = float(input('liters of ice cream consumed per year:'))

    samples, sample_labels = file2matrix('datingTestSet2.txt')
    scaled_samples, spans, lows = autoNorm(samples)

    # NOTE(review): the order miles, game share, ice cream presumably mirrors
    # the column order of the data file; confirm against the file.
    query = np.array([flier_miles, game_share, ice_cream_liters])
    # Scale the query with the same offsets and spans as the known samples.
    scaled_query = (query - lows) / spans
    predicted = classify0(scaled_query, scaled_samples, sample_labels, 3)

    # The predicted label is converted with int() and used as a 1-based index.
    verdicts = ('not at all', 'in small does', 'in large does')
    print('you will probably like this person:', verdicts[int(predicted) - 1])
def img2vector(filename):
    """Flatten a 32 x 32 text image of digit characters into a 1 x 1024 vector.

    The first 32 characters of each of the first 32 lines are converted with
    ``int()`` and stored row after row.

    Args:
        filename: Path of the text file holding the image.

    Returns:
        A NumPy float array of shape (1, 1024).

    Raises:
        IndexError: If the file has fewer than 32 lines or one of the first
            32 lines has fewer than 32 characters.
        ValueError: If a character in the image area is not a digit.
    """
    returnVect = np.zeros((1, 1024))
    # The with-block closes the file even when a malformed line raises.
    with open(filename) as fr:
        for i in range(32):
            linestr = fr.readline()
            # Row i of the image occupies positions 32*i .. 32*i + 31.
            returnVect[0, 32 * i:32 * (i + 1)] = [int(linestr[j]) for j in range(32)]
    return returnVect
def _digitLabelFromFileName(fileNameStr):
    """Return the integer found before the first '_' in the file's base name."""
    return int(fileNameStr.split('.')[0].split('_')[0])


def handwritingClassTest(trainingDir='digits/trainingDigits', testDir='digits/testDigits', k=3):
    """Evaluate the kNN classifier on the handwritten-digit text images.

    Every file in ``trainingDir`` becomes one known sample whose label is
    parsed from its file name.  Every file in ``testDir`` is then classified;
    each prediction, the number of errors and the error rate are printed.

    Args:
        trainingDir: Directory holding the known digit images.
        testDir: Directory holding the digit images to classify.
        k: Number of neighbours passed to ``classify0``.

    Raises:
        ValueError: If ``testDir`` contains no files.
    """
    trainingFileList = os.listdir(trainingDir)
    m = len(trainingFileList)
    trainingMat = np.zeros((m, 1024))
    hwLabels = []
    for i, fileNameStr in enumerate(trainingFileList):
        hwLabels.append(_digitLabelFromFileName(fileNameStr))
        trainingMat[i, :] = img2vector(os.path.join(trainingDir, fileNameStr))

    testFileList = os.listdir(testDir)
    mTest = len(testFileList)
    if mTest == 0:
        # Without this check the error-rate division raises ZeroDivisionError.
        raise ValueError('no test files found in %s' % testDir)

    errorCount = 0.0
    for fileNameStr in testFileList:
        classNumStr = _digitLabelFromFileName(fileNameStr)
        vectorUnderTest = img2vector(os.path.join(testDir, fileNameStr))
        # classify0 is defined in this file, so it is called directly.
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, k)
        print('the classifier came back with:', classifierResult)
        print('the real answer is:', classNumStr)
        if classifierResult != classNumStr:
            errorCount += 1.0
    print('\n the total number of error is:', errorCount)
    print('\n the total error rate is:', (errorCount / float(mTest)))
k值的选择:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
if __name__ == "__main__":
    # Demo: fit scikit-learn's k-nearest-neighbours classifier on two feature
    # columns of the iris data set and print its accuracy on a held-out split.
    iris = datasets.load_iris()
    x = iris.data[:, [1, 2]]  # keep only feature columns 1 and 2
    print(x)
    y = iris.target
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

    # Standardise both splits with statistics learned from the training split
    # only, so no information from the test split leaks into the scaling.
    sc = StandardScaler()
    sc.fit(x_train)
    x_train_std = sc.transform(x_train)
    x_test_std = sc.transform(x_test)

    # Build a k-nearest-neighbours model with k = 6.
    knn_classifier = KNeighborsClassifier(n_neighbors=6)

    # Fit on the standardised training data; the scaled arrays were previously
    # computed but never used.
    knn_classifier.fit(x_train_std, y_train)

    # Predict the standardised test data.
    y_predict = knn_classifier.predict(x_test_std)

    # score(X, y) returns the mean accuracy on the given data and labels.
    scores = knn_classifier.score(x_test_std, y_test)
    print(scores)