一、实验名称:
KNN算法实现
二、实验目的:
掌握KNN算法基本原理
三、实验要求
1、使用python实现KNN;
2、测试不同的k值下KNN算法分类效果;
四、实验平台
计算机、Python、Anaconda
五、实验数据
processed.cleveland.data
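UCI公开数据集-heart disease,每条记录共14个属性,依次为:age、sex、cp、trestbps、chol、fbs、restecg、thalach、exang、oldpeak、slope、ca、thal(以上13个为特征)以及 num(类别标签)。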
数据集参考网址:https://archive.ics.uci.edu/ml/datasets/Heart+Disease
import numpy as np
# Load the heart-disease table into per-column lists, then assemble the
# feature matrix `group` (one row per record, 13 columns) and the
# single-column label array `labels`.
age = []
sex = []
cp = []
tresbps = []
chol = []
fbs = []
restecg = []
thalach = []
exang = []
oldpeak = []
slope = []
ca = []
thal = []
num = []

# Column order of one record; the last entry is the class label.
columns = (age, sex, cp, tresbps, chol, fbs, restecg,
           thalach, exang, oldpeak, slope, ca, thal, num)

# `with` guarantees the file is closed even if a row fails to parse.
with open(r'E:\360MoveData\Users\DELL\Desktop\processed.cleveland.data') as f:
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            continue
        fields = [v.strip() for v in line.split(',')]
        # The published UCI file marks missing values with '?'. Such a record
        # cannot be converted to float or used in a distance, so skip it.
        if '?' in fields:
            continue
        if len(fields) < len(columns):
            raise ValueError(
                "line %d: expected %d comma-separated values, got %d"
                % (line_no, len(columns), len(fields))
            )
        d = [float(v) for v in fields]
        for column, value in zip(columns, d):
            column.append(value)

# Build both arrays in one step from however many records were read,
# instead of assuming exactly 303 rows and growing the arrays row by row.
group = np.array(columns[:13], dtype=float).T      # shape (n_records, 13)
labels = np.array(num, dtype=float).reshape(-1, 1)  # shape (n_records, 1)
def kNN_Classify(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest training samples.

    Distances are Euclidean and computed between ``inX`` and every row of
    ``dataSet``.

    Args:
        inX: Feature vector of the sample to classify; its length must equal
            the number of columns of ``dataSet``.
        dataSet: 2-D array with one training sample per row.
        labels: Class label of each training row, either 1-D or a
            single-column 2-D array.
        k: Number of nearest neighbours that vote; 1 <= k <= number of rows.

    Returns:
        The label that occurs most often among the k nearest rows. On a tie
        the class whose first vote came from the closer neighbour wins.

    Raises:
        ValueError: If ``labels`` and ``dataSet`` disagree on the number of
            samples, or ``k`` is outside ``1..len(dataSet)``.
    """
    dataSet = np.asarray(dataSet, dtype=float)
    # Flatten so that labels[idx] is a hashable scalar for both 1-D input
    # and (n, 1) column input.
    flatLabels = np.asarray(labels).ravel()
    dataSetSize = dataSet.shape[0]

    if flatLabels.shape[0] != dataSetSize:
        raise ValueError(
            "labels has %d entries but dataSet has %d rows"
            % (flatLabels.shape[0], dataSetSize)
        )
    if not 1 <= k <= dataSetSize:
        raise ValueError(
            "k must be between 1 and %d, got %r" % (dataSetSize, k)
        )

    # Broadcasting subtracts inX from every row; no explicit tiling needed.
    diffMat = dataSet - np.asarray(inX, dtype=float)
    # Sum over axis 1 (features) to get ONE distance per training row.
    # The builtin sum() would collapse axis 0 and yield one value per feature.
    sqDistances = (diffMat ** 2).sum(axis=1)
    distances = sqDistances ** 0.5
    # A stable sort keeps equidistant rows in their original order, which
    # makes the result deterministic.
    sortedDistIndicies = np.argsort(distances, kind="stable")

    # Count how many of the k nearest rows belong to each class.
    classCount = {}
    for idx in sortedDistIndicies[:k]:
        voteLabel = flatLabels[idx]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    # max() returns the first maximal key in insertion order, i.e. the class
    # first seen at the smallest distance when counts are tied.
    return max(classCount, key=classCount.get)
# Interactive driver: classify one fixed sample with a k read from stdin.
dataSet = group

# 13 feature values, matched positionally against the columns of dataSet.
test = np.array([
    61.0, 1.0, 1.0, 134.0, 234.0, 0.0, 0.0,
    145.0, 0.0, 2.6, 2.0, 2.0, 3.0,
])

print("请输入K值")
k = int(input())

outputLabel = kNN_Classify(inX=test, dataSet=dataSet, labels=labels, k=k)
print("输入的样本", test, "心血管病的类别为 ", outputLabel)
注:以下是阅读代码时的理解过程——把 kNN_Classify 的各个步骤逐条执行,并打印每一步的中间结果。
# Step-by-step walkthrough of the k-NN computation, printing every
# intermediate result. Uses the module-level `group` and `labels` arrays.
test = np.array([61.0, 1.0, 1.0, 134.0, 234.0, 0.0, 0.0, 145.0, 0.0, 2.6, 2.0, 2.0, 3.0])
dataSetSize = group.shape[0]  # number of training rows actually loaded

# Repeat the sample once per training row, then subtract row by row.
diffMat = np.tile(test, (dataSetSize, 1)) - group
print(diffMat)

sqDiffMat = diffMat ** 2
print(sqDiffMat)

# Sum across the features (axis 1) so there is one squared distance per row;
# the builtin sum() would add the rows together instead.
sqDistances = sqDiffMat.sum(axis=1)
print(sqDistances)

distances = sqDistances ** 0.5
print(distances)

sortedDistIndicies = np.argsort(distances)
print(sortedDistIndicies)  # row indices ordered from nearest to farthest

k = 1
classCount = {}
for i in range(k):
    voteLabel = labels[sortedDistIndicies[i]]  # label row of the i-th nearest sample
    print(voteLabel)
    a = sortedDistIndicies[i] + 1  # 1-based row number of that sample
    print(a)
    # `labels` has a single column, so element 0 is the class value.
    label = voteLabel[0]
    classCount[label] = classCount.get(label, 0) + 1
print(classCount)

# Pick the class with the most votes.
maxCount = 0
for key, value in classCount.items():
    if value > maxCount:
        maxCount = value
        maxIndex = key
# A bare `return` is a syntax error at module level; print the result instead.
print(maxIndex)
参考:https://blog.csdn.net/wxwmb11/article/details/106447279,做了一些修改,感谢。