KNN是机器学习中的分类算法,它将一个未知类别的样本归类为距离它最近的k个样本中最多数的类别。它的算法原理如下所示:
1.为了判断未知实例的类别,以所有已知类别的实例作为参照,选择一个参数K。
2.计算未知实例到每个已知实例的距离,
3.将这些距离从小到大排序,选出距离最小的前K个已知实例,
4.将未知实例归类为这K个最近邻样本中最多数的类别。
设计KNN算法实现对空气质量的预测
代码如下(示例):
from collections import Counter

import numpy as np
# The alias must be `pd`: the code in this file calls pd.DataFrame and pd.concat.
import pandas as pd
这里的数据我用字典的形式存储;实际应用中一般是从网上下载数据集,再以CSV文件的形式导入。
# Load data: each entry maps a date to eight numeric measurements followed by
# the air-quality grade label for that day.
train_data = {
    '2019/1/1': [68, 186, 49, 67, 7, 43, 1.07, 19, '良'],
    '2019/1/2': [80, 179, 58, 84, 8, 41, 0.94, 24, '良'],
    '2019/1/3': [86, 177, 63, 92, 6, 42, 0.89, 17, '良'],
    '2019/1/4': [71, 175, 51, 87, 5, 44, 0.9, 35, '良'],
    '2019/1/5': [89, 246, 65, 106, 7, 55, 1.15, 36, '良'],
    '2019/1/6': [113, 263, 84, 131, 7, 57, 1.08, 23, '轻度污染'],
    '2019/1/7': [110, 241, 82, 120, 7, 51, 1.07, 29, '中度污染'],
    '2019/1/8': [119, 276, 89, 139, 10, 62, 1.25, 22, '轻度污染'],
    '2019/1/9': [120, 331, 90, 129, 8, 50, 1.02, 27, '轻度污染'],
    '2019/1/10': [85, 194, 62, 88, 6, 46, 1.24, 35, '良'],
    '2019/1/11': [91, 165, 67, 100, 7, 45, 1.29, 35, '良'],
    '2019/1/12': [125, 219, 94, 139, 9, 62, 1.46, 27, '轻度污染'],
    '2019/1/13': [70, 166, 45, 77, 6, 43, 0.79, 41, '良'],
    '2019/1/14': [80, 270, 56, 96, 7, 46, 0.67, 39, '良'],
    '2019/1/15': [111, 319, 82, 132, 8, 55, 0.86, 24, '中度污染'],
    '2019/1/16': [94, 208, 70, 106, 5, 45, 0.82, 30, '良'],
    '2019/1/17': [78, 159, 56, 91, 7, 46, 0.87, 43, '良'],
    '2019/1/18': [81, 220, 58, 106, 8, 52, 0.89, 48, '良'],
    '2019/1/19': [61, 135, 38, 70, 6, 45, 0.59, 41, '良'],
    '2019/1/20': [93, 227, 68, 109, 9, 50, 0.78, 37, '良'],
}
# The sample to classify: the same eight measurements, without a label.
test_data = {'2019/1/21': [101, 246, 88, 89, 8, 52, 0.65, 33]}
# Transpose so that every date becomes a row and every measurement a column.
df = pd.DataFrame(train_data).T
df.columns = ['AQI指数', '当天AQI排名', 'PM2.5', 'PM10', 'So2', 'No2', 'Co', 'O3', '质量等级']
# Feature matrix without the trailing label column; knn() iterates over its
# rows to compute distances, so it has to be defined here.
df_list = df.values[:, :-1]
def distance(d1, d2):
    """Return the Euclidean distance between two feature vectors.

    Args:
        d1: Indexable sequence of numeric feature values.
        d2: Indexable sequence of numeric feature values.

    Returns:
        The square root of the summed squared differences of the paired
        values. Pairing stops at the end of the shorter sequence, so a
        trailing label on one of the inputs is ignored.
    """
    # Pair the values instead of indexing a fixed range of 8, so the function
    # works for any number of features.
    return sum((a - b) ** 2 for a, b in zip(d1, d2)) ** 0.5
k = 5  # default number of nearest neighbours consulted by knn()


def knn(data, n_neighbors=None):
    """Predict the air-quality grade of one sample by majority vote.

    Args:
        data: Feature values of the sample to classify, in the same order as
            the rows of the module-level ``df_list``.
        n_neighbors: Number of nearest training rows that vote. Defaults to
            the module-level ``k``.

    Returns:
        The grade label held by most of the nearest rows. When several labels
        are tied, the one whose first occurrence is closest to ``data`` wins.

    Raises:
        ValueError: If the neighbour count is smaller than 1.
    """
    top = k if n_neighbors is None else n_neighbors
    if top < 1:
        raise ValueError("number of neighbours must be at least 1")
    # Distance from the sample to every training row, indexed like df so the
    # labels can be looked up afterwards.
    distances = pd.Series(
        [distance(train, data) for train in df_list],
        index=df.index,
        name="distance",
    )
    # A stable sort keeps the original row order among equal distances.
    nearest = distances.sort_values(kind="mergesort").index[:top]
    # Count whatever labels occur instead of comparing three hard-coded ones.
    # Labels are counted from nearest to farthest; Counter keeps first-seen
    # order and most_common() preserves it among equal counts.
    votes = Counter(df.loc[nearest, '质量等级'])
    return votes.most_common(1)[0][0]
from collections import Counter
import numpy as np
import pandas as pd
# Load data: each entry maps a date to eight numeric measurements followed by
# the air-quality grade label for that day.
train_data = {
    '2019/1/1': [68, 186, 49, 67, 7, 43, 1.07, 19, '良'],
    '2019/1/2': [80, 179, 58, 84, 8, 41, 0.94, 24, '良'],
    '2019/1/3': [86, 177, 63, 92, 6, 42, 0.89, 17, '良'],
    '2019/1/4': [71, 175, 51, 87, 5, 44, 0.9, 35, '良'],
    '2019/1/5': [89, 246, 65, 106, 7, 55, 1.15, 36, '良'],
    '2019/1/6': [113, 263, 84, 131, 7, 57, 1.08, 23, '轻度污染'],
    '2019/1/7': [110, 241, 82, 120, 7, 51, 1.07, 29, '中度污染'],
    '2019/1/8': [119, 276, 89, 139, 10, 62, 1.25, 22, '轻度污染'],
    '2019/1/9': [120, 331, 90, 129, 8, 50, 1.02, 27, '轻度污染'],
    '2019/1/10': [85, 194, 62, 88, 6, 46, 1.24, 35, '良'],
    '2019/1/11': [91, 165, 67, 100, 7, 45, 1.29, 35, '良'],
    '2019/1/12': [125, 219, 94, 139, 9, 62, 1.46, 27, '轻度污染'],
    '2019/1/13': [70, 166, 45, 77, 6, 43, 0.79, 41, '良'],
    '2019/1/14': [80, 270, 56, 96, 7, 46, 0.67, 39, '良'],
    '2019/1/15': [111, 319, 82, 132, 8, 55, 0.86, 24, '中度污染'],
    '2019/1/16': [94, 208, 70, 106, 5, 45, 0.82, 30, '良'],
    '2019/1/17': [78, 159, 56, 91, 7, 46, 0.87, 43, '良'],
    '2019/1/18': [81, 220, 58, 106, 8, 52, 0.89, 48, '良'],
    '2019/1/19': [61, 135, 38, 70, 6, 45, 0.59, 41, '良'],
    '2019/1/20': [93, 227, 68, 109, 9, 50, 0.78, 37, '良'],
}

# The sample to classify: the same eight measurements, without a label.
test_data = {
    '2019/1/21': [101, 246, 88, 89, 8, 52, 0.65, 33],
}

# Transpose so that every date becomes a row and every measurement a column.
df = pd.DataFrame(train_data).transpose()
df.columns = [
    'AQI指数',
    '当天AQI排名',
    'PM2.5',
    'PM10',
    'So2',
    'No2',
    'Co',
    'O3',
    '质量等级',
]

# Feature matrix: every column except the trailing label.
df_list = df.to_numpy()[:, :-1]
# Distance
def distance(d1, d2):
    """Return the Euclidean distance between two feature vectors.

    Args:
        d1: Indexable sequence of numeric feature values.
        d2: Indexable sequence of numeric feature values.

    Returns:
        The square root of the summed squared differences of the paired
        values. Pairing stops at the end of the shorter sequence, so a
        trailing label on one of the inputs is ignored.
    """
    # Pair the values instead of indexing a fixed range of 8, so the function
    # works for any number of features.
    return sum((a - b) ** 2 for a, b in zip(d1, d2)) ** 0.5
k = 5  # default number of nearest neighbours consulted by knn()


def knn(data, n_neighbors=None):
    """Predict the air-quality grade of one sample by majority vote.

    Args:
        data: Feature values of the sample to classify, in the same order as
            the rows of the module-level ``df_list``.
        n_neighbors: Number of nearest training rows that vote. Defaults to
            the module-level ``k``.

    Returns:
        The grade label held by most of the nearest rows. When several labels
        are tied, the one whose first occurrence is closest to ``data`` wins.

    Raises:
        ValueError: If the neighbour count is smaller than 1.
    """
    top = k if n_neighbors is None else n_neighbors
    if top < 1:
        raise ValueError("number of neighbours must be at least 1")
    # Distance from the sample to every training row, indexed like df so the
    # labels can be looked up afterwards.
    distances = pd.Series(
        [distance(train, data) for train in df_list],
        index=df.index,
        name="distance",
    )
    # A stable sort keeps the original row order among equal distances.
    nearest = distances.sort_values(kind="mergesort").index[:top]
    # Count whatever labels occur instead of comparing three hard-coded ones.
    # Labels are counted from nearest to farthest; Counter keeps first-seen
    # order and most_common() preserves it among equal counts.
    votes = Counter(df.loc[nearest, '质量等级'])
    return votes.most_common(1)[0][0]
# Classify the held-out day and print the date followed by the predicted grade.
data = '2019/1/21'
print(
    data,
    knn(test_data[data]),
)
k值的选取是一个难点:k太大,预测结果几乎不随输入变化(总是偏向样本最多的类别);k太小,又容易受个别样本的影响。k的取值通常只能凭经验(或借助交叉验证)慢慢调整。
由于各个近邻到待测样本的距离不同,它们的权值也应该不同;这里因为题目简单,没有做加权。实际上权值应该这样计算:
def weighted_vote(res2, res, k):
    """Pick a quality grade by distance-weighted voting among the k nearest rows.

    Args:
        res2: DataFrame sorted by ascending distance that holds at least the
            '质量等级' and "distance" columns.
        res: Distances from the sample to every training row.
        k: Number of nearest rows that take part in the vote.

    Returns:
        The grade whose neighbours have the largest summed weight, where a
        neighbour at distance ``d`` weighs ``1 - d / sum(res)``.
    """
    # Take the first k rows.
    result2 = res2[0:k]
    # Total distance, used to normalise every weight.
    total = sum(res)
    # Accumulate the weights per grade; closer rows contribute more.
    scores = {}
    for label, dist in zip(result2['质量等级'], result2['distance']):
        # With a zero total every row coincides with the sample, so all
        # neighbours count equally instead of dividing by zero.
        weight = 1 - dist / total if total else 1
        scores[label] = scores.get(label, 0) + weight
    return max(scores, key=scores.get)