def classify0(inX, dataSet, labels, k):
    dataSetSize = dataSet.shape[0]
    diffMat = np.tile(inX, (dataSetSize, 1)) - dataSet          # difference to every training sample
    sqDiffMat = diffMat**2
    sqDistances = sqDiffMat.sum(axis=1)
    distances = sqDistances**0.5                                # Euclidean distances
    sortedDistIndicies = distances.argsort()
    #print(sortedDistIndicies)
    #print(len(sortedDistIndicies))
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        voteIlabel = voteIlabel[0]   # labels is an (n, 1) array; take the scalar, otherwise an error is raised
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
        #print('step:', i, ' voteIlabel:', voteIlabel)
    #print(classCount)
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    #print('result:', sortedClassCount[0][0])
    return sortedClassCount[0][0]

def autoNorm(dataSet):
    minVals = dataSet.min(axis=0)
    maxVals = dataSet.max(axis=0)
    ranges = maxVals - minVals
    normDataSet = np.zeros(np.shape(dataSet))
    m = dataSet.shape[0]
    normDataSet = dataSet - np.tile(minVals, (m, 1))
    normDataSet = normDataSet / np.tile(ranges, (m, 1))         # element-wise divide
    return normDataSet, ranges, minVals

def TrafficClassTest():
    normMat, ranges, minVals = autoNorm(data_xnew)
    normMat1, ranges1, minVals1 = autoNorm(Data_xnew)
    numTestVecs = Data_ynew.shape[0]
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat1[i, :], normMat, data_ynew, 3)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, Data_ynew[i]))
        if classifierResult != Data_ynew[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
Table 1. Recognition rate distribution (per-class classification rates; each row sums to 1)

| True label \ Predicted | 1: Walk | 2: Bicycle | 3: Bus  | 4: Car  | Sum    |
| 1: Walk                | 0.9767  | 0.0021     | 0.0163  | 0.0049  | 1.0000 |
| 2: Bicycle             | 0.0024  | 0.9904     | 0.0072  | 0.0000  | 1.0000 |
| 3: Bus                 | 0.0230  | 0.0000     | 0.9641  | 0.0129  | 1.0000 |
| 4: Car                 | 0.0138  | 0.0000     | 0.0453  | 0.9409  | 1.0000 |
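Each row of Table 1 is a row of the confusion matrix divided by the number of test samples whose true mode is that row's label, which is why every row sums to 1. A minimal sketch of the computation (assuming, as in the appendix code below, that Data_ynew holds the true labels and CLASSFIERRESULT the predicted labels, both coded 1-4):

    # sketch only; relies on the Data_ynew and CLASSFIERRESULT arrays built in the appendix
    trueLabels = Data_ynew.ravel()
    rates = np.zeros((4, 4))
    for r, t in enumerate([1, 2, 3, 4]):          # table row: true mode
        rowIdx = np.where(trueLabels == t)[0]
        for c, p in enumerate([1, 2, 3, 4]):      # table column: predicted mode
            rates[r, c] = np.mean(CLASSFIERRESULT[rowIdx] == p)
    print(np.round(rates, 4))                     # each row sums to 1.0000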
Appendix: program source code
# -*- coding: utf-8 -*-
## kNN-based classification of travel modes (walk, bicycle, bus, car)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import operator

## helper that flattens an arbitrarily nested list/tuple into a flat sequence
def flat(l):
    for k in l:
        if not isinstance(k, (list, tuple)):
            yield k
        else:
            yield from flat(k)
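# Example (sketch, not part of the original script): flat() flattens arbitrarily
# nested lists/tuples, e.g. list(flat([[1, 2], (3, [4])])) -> [1, 2, 3, 4]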
SCALE = 60  # half-width of the rolling window; each full window spans 2*SCALE samples
data_x = []
data_y = []  # training set
Data_x = []
Data_y = []  # test set

## read one training file and append its windowed features and labels to data_x / data_y
def choose_file(filenumber):
    traindata = pd.read_excel('train%s.xlsx' % filenumber)
    tempspeed = []
    tempfirstplant = []
    tempmeans = []
    tempspeed.append(list(traindata['speed']))
    tempmeans.append(list(traindata['means']))
    tempfirstplant.append(list(traindata['firstplant']))
    tempspeed = list(flat(tempspeed))
    tempmeans = list(flat(tempmeans))
    tempfirstplant = list(flat(tempfirstplant))
    for i in range(SCALE):  # first SCALE samples of each file: window truncated at the start
        speed = np.array(tempspeed[0:i + SCALE])
        speed_in = tempspeed[i]
        firstplant = tempfirstplant[i]
        mean_speed = speed.mean()
        max_speed = speed.max()
        std_speed = speed.std()
        data_x.append(list([speed_in, firstplant, mean_speed, max_speed, std_speed]))
        data_y.append(list([tempmeans[i]]))
    for i in range(SCALE, (len(tempspeed) - SCALE)):  # samples with a full SCALE window on both sides
        speed = np.array(tempspeed[i - SCALE:i + SCALE])
        speed_in = tempspeed[i]
        firstplant = tempfirstplant[i]
        mean_speed = speed.mean()
        max_speed = speed.max()
        std_speed = speed.std()
        data_x.append(list([speed_in, firstplant, mean_speed, max_speed, std_speed]))
        data_y.append(list([tempmeans[i]]))
    for i in range((len(tempspeed) - SCALE), len(tempspeed)):  # last SCALE samples of each file: window truncated at the end
        speed = np.array(tempspeed[i - SCALE:(len(tempspeed))])
        speed_in = tempspeed[i]
        firstplant = tempfirstplant[i]
        mean_speed = speed.mean()
        max_speed = speed.max()
        std_speed = speed.std()
        data_x.append(list([speed_in, firstplant, mean_speed, max_speed, std_speed]))
        data_y.append(list([tempmeans[i]]))
    return data_x, data_y  # return the feature list x and label list y used to train the model
## read in the travellers' training files (train1.xlsx ... train6.xlsx)
l = [1, 2, 3, 4, 5, 6]
for i in l:
    choose_file(i)
data_y = np.array(data_y)

## random 80/20 split into training and test sets
import random
a = range(len(data_x))
b = random.sample(a, int(0.8 * len(data_x)))   # indices of the training samples
c = list(set(a).difference(set(b)))            # remaining indices form the test set
data_xnew = []
data_ynew = []
Data_xnew = []
Data_ynew = []
for i in b:
    data_xnew.append(data_x[i])
    data_ynew.append(data_y[i])
for i in c:
    Data_xnew.append(data_x[i])
    Data_ynew.append(data_y[i])
data_xnew = np.array(data_xnew)
data_ynew = np.array(data_ynew)
Data_xnew = np.array(Data_xnew)
Data_ynew = np.array(Data_ynew)
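# For illustration: data_xnew is an (m, 5) feature matrix and data_ynew an (m, 1) label array
# holding 80% of the windowed samples; Data_xnew / Data_ynew hold the remaining 20%.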
def classify0(inX, dataSet, labels, k):
    dataSetSize = dataSet.shape[0]
    diffMat = np.tile(inX, (dataSetSize, 1)) - dataSet
    sqDiffMat = diffMat**2
    sqDistances = sqDiffMat.sum(axis=1)
    distances = sqDistances**0.5
    sortedDistIndicies = distances.argsort()
    #print(sortedDistIndicies)
    #print(len(sortedDistIndicies))
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        voteIlabel = voteIlabel[0]   # labels is an (n, 1) array; take the scalar, otherwise an error is raised
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
        #print('step:', i, ' voteIlabel:', voteIlabel)
    #print(classCount)
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    #print('result:', sortedClassCount[0][0])
    return sortedClassCount[0][0]

def autoNorm(dataSet):
    minVals = dataSet.min(axis=0)
    maxVals = dataSet.max(axis=0)
    ranges = maxVals - minVals
    normDataSet = np.zeros(np.shape(dataSet))
    m = dataSet.shape[0]
    normDataSet = dataSet - np.tile(minVals, (m, 1))
    normDataSet = normDataSet / np.tile(ranges, (m, 1))  # element-wise divide
    return normDataSet, ranges, minVals
CLASSFIERRESULT = []
def TrafficClassTest():
    normMat, ranges, minVals = autoNorm(data_xnew)
    normMat1, ranges1, minVals1 = autoNorm(Data_xnew)
    numTestVecs = Data_ynew.shape[0]
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat1[i, :], normMat, data_ynew, 3)
        CLASSFIERRESULT.append(classifierResult)
        #print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, Data_ynew[i]))
        if classifierResult != Data_ynew[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
    print(errorCount, numTestVecs)

TrafficClassTest()
CLASSFIERRESULT = np.array(CLASSFIERRESULT)
# Data_ynew holds the true labels, CLASSFIERRESULT the predicted labels
count1, count2, count3, count4 = 0, 0, 0, 0
true_walk = Data_ynew[Data_ynew == 1].shape[0]
true_walk_index = np.argwhere(Data_ynew == 1)[:, 0]
for o in true_walk_index:
    if CLASSFIERRESULT[o] == 1:
        count1 += 1
    elif CLASSFIERRESULT[o] == 2:
        count2 += 1
    elif CLASSFIERRESULT[o] == 3:
        count3 += 1
    elif CLASSFIERRESULT[o] == 4:
        count4 += 1
    else:
        pass
w_w, w_b, w_s, w_c = count1 / true_walk, count2 / true_walk, count3 / true_walk, count4 / true_walk

count1, count2, count3, count4 = 0, 0, 0, 0
true_bike = Data_ynew[Data_ynew == 2].shape[0]
true_bike_index = np.argwhere(Data_ynew == 2)[:, 0]
for o in true_bike_index:
    if CLASSFIERRESULT[o] == 1:
        count1 += 1
    elif CLASSFIERRESULT[o] == 2:
        count2 += 1
    elif CLASSFIERRESULT[o] == 3:
        count3 += 1
    elif CLASSFIERRESULT[o] == 4:
        count4 += 1
    else:
        pass
b_w, b_b, b_s, b_c = count1 / true_bike, count2 / true_bike, count3 / true_bike, count4 / true_bike

count1, count2, count3, count4 = 0, 0, 0, 0
true_bus = Data_ynew[Data_ynew == 3].shape[0]
true_bus_index = np.argwhere(Data_ynew == 3)[:, 0]
for o in true_bus_index:
    if CLASSFIERRESULT[o] == 1:
        count1 += 1
    elif CLASSFIERRESULT[o] == 2:
        count2 += 1
    elif CLASSFIERRESULT[o] == 3:
        count3 += 1
    elif CLASSFIERRESULT[o] == 4:
        count4 += 1
    else:
        pass
s_w, s_b, s_s, s_c = count1 / true_bus, count2 / true_bus, count3 / true_bus, count4 / true_bus

count1, count2, count3, count4 = 0, 0, 0, 0
true_car = Data_ynew[Data_ynew == 4].shape[0]
true_car_index = np.argwhere(Data_ynew == 4)[:, 0]
for o in true_car_index:
    if CLASSFIERRESULT[o] == 1:
        count1 += 1
    elif CLASSFIERRESULT[o] == 2:
        count2 += 1
    elif CLASSFIERRESULT[o] == 3:
        count3 += 1
    elif CLASSFIERRESULT[o] == 4:
        count4 += 1
    else:
        pass
c_w, c_b, c_s, c_c = count1 / true_car, count2 / true_car, count3 / true_car, count4 / true_car

print('line1', w_w, w_b, w_s, w_c)   # true mode: walk
print('line2', b_w, b_b, b_s, b_c)   # true mode: bicycle
print('line3', s_w, s_b, s_s, s_c)   # true mode: bus
print('line4', c_w, c_b, c_s, c_c)   # true mode: car
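The four per-class blocks above repeat the same counting pattern; for reference, a compact sketch of the equivalent computation with the same Data_ynew and CLASSFIERRESULT arrays:

    # sketch only; reproduces the four print('lineN', ...) lines above
    trueLabels = Data_ynew.ravel()
    for t in [1, 2, 3, 4]:                                   # 1: walk, 2: bicycle, 3: bus, 4: car
        rowIdx = np.argwhere(trueLabels == t)[:, 0]          # test samples whose true mode is t
        rowRates = [np.mean(CLASSFIERRESULT[rowIdx] == p) for p in [1, 2, 3, 4]]
        print('line%d' % t, *rowRates)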