不解释什么是KNN,不说Python语法,不理解的地方请自行搜索,谢谢!
整理杂物时,发现多年前的一本机器学习入门书籍,闲来无事翻阅一下,看能否温故知新。因日常工作大多是和pandas打交道,突发奇想,基于pandas将书内算法按自己的理解重新实现。
数据链接: https://pan.baidu.com/s/1PTeekZFbpzEefquyq7-1cw 提取码: di6p
环境:Python 3.7
构建函数,完成数据读取、数据处理、数据分类。
读取数据:
def readData(filename):
    """Read a tab-separated, header-less data file and split off the labels.

    The last column of the file is treated as the class label; every other
    column is treated as a feature.

    Args:
        filename: Path of the tab-separated text file to read.

    Returns:
        A ``(features, labels)`` tuple: ``features`` is a DataFrame holding
        all columns except the last one, ``labels`` is a Series holding the
        last column.
    """
    # pandas opens (and closes) the file itself; passing the path avoids
    # depending on the locale's default text encoding.
    data = pd.read_csv(filename, sep='\t', header=None)
    labels = data.iloc[:, -1]  # separate the label column from the features
    return data.iloc[:, :-1], labels
拆分训练数据和测试数据集
def dataSplit(data, lables, hoRatio=0.2):
    """Split features and labels into a training part and a hold-out part.

    The split is positional and not shuffled: the last
    ``int(len(data) * hoRatio)`` rows become the test set and all preceding
    rows become the training set.

    Args:
        data: DataFrame of feature rows.
        lables: Series of class labels, one per row of ``data``. (The
            parameter keeps its historical spelling so that keyword callers
            continue to work.)
        hoRatio: Fraction of rows to hold out for testing.

    Returns:
        ``(data_train, label_train, data_test, labels_test)``.
    """
    num_test = int(len(data) * hoRatio)
    # Slice from an explicit boundary: ``iloc[-num_test:]`` would return the
    # whole frame when num_test is 0, because ``-0`` is just ``0``.
    split_at = len(data) - num_test
    data_train = data.iloc[:split_at]
    data_test = data.iloc[split_at:]
    label_train = lables.iloc[:split_at]
    labels_test = lables.iloc[split_at:]
    return data_train, label_train, data_test, labels_test
数据归一化
def aotuNorm(data):
    """Min-max scale every column of ``data`` into the range [0, 1].

    Args:
        data: DataFrame of numeric feature columns. It is not modified.

    Returns:
        ``(scaled, minVals, ranges)`` where ``scaled`` is the normalised
        copy of ``data``, ``minVals`` is the per-column minimum and
        ``ranges`` is the per-column divisor that was applied. Other rows
        can be scaled the same way with ``(row - minVals) / ranges``.
    """
    minVals = data.min()
    ranges = data.max() - minVals
    # A constant column has a zero range; dividing by it would produce NaN
    # and poison every distance computed later. Divide by 1 instead so the
    # column simply becomes all zeros, and return that same divisor so
    # callers scale new rows consistently.
    ranges = ranges.where(ranges != 0, 1)
    scaled = (data - minVals) / ranges
    return scaled, minVals, ranges
knn分类器
def classify(inX, data, labels, k=5, exponent=2):
    """Classify one sample by majority vote among its k nearest neighbours.

    The distance between ``inX`` and a training row is the Minkowski
    distance ``(sum(|x_i - y_i| ** exponent)) ** (1 / exponent)`` computed
    over the first ``len(inX)`` columns of ``data``.

    Args:
        inX: Feature values of the sample to classify (list, array or
            Series); values are used in positional order.
        data: DataFrame of training feature rows.
        labels: Class labels of the training rows, in the same row order
            as ``data``.
        k: Number of nearest neighbours that vote.
        exponent: Order of the Minkowski distance (2 gives Euclidean).

    Returns:
        The most frequent label among the ``k`` nearest training rows.

    Raises:
        ValueError: If ``k`` is smaller than 1, ``data`` is empty, or
            ``labels`` does not have one entry per row of ``data``.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    if len(data) == 0:
        raise ValueError("training data is empty")

    query = list(inX)  # positional values, independent of any index labels
    classes = list(labels)
    if len(classes) != len(data):
        raise ValueError("labels must have one entry per training row")

    # Work on a fresh 0..n-1 index so that distances and labels are paired
    # by row position; pairing by index label silently yields NaN when the
    # training frame does not happen to be indexed 0..n-1.
    features = data.iloc[:, :len(query)].reset_index(drop=True)
    # abs() keeps the distance non-negative for odd exponents as well.
    deltas = features.sub(query, axis='columns').abs()
    distance = deltas.pow(exponent).sum(axis=1).pow(1 / exponent)

    df = pd.DataFrame({'distance': distance, 'class': classes})
    # A stable sort makes the choice among equidistant rows deterministic.
    df = df.sort_values('distance', axis=0, kind='mergesort')
    classVal = df['class'].iloc[:k].value_counts()
    return classVal.index.tolist()[0]
训练及预测
def datingClassTest(data_train=None,
                    label_train=None,
                    data_test=None,
                    labels_test=None,
                    minVals=None,
                    ranges=None,
                    k=5,
                    exponent=2):
    """Classify every test row with ``classify`` and report the results.

    Each test row is scaled with ``(row - minVals) / ranges`` and then
    classified against the training data.

    Any of the six data arguments left as ``None`` is looked up, at call
    time, as a module-level variable of the same name. This keeps the bare
    ``datingClassTest()`` call working without binding the defaults when
    the function is defined, which fails if the data has not been loaded
    yet.

    Args:
        data_train: Normalised training feature rows.
        label_train: Labels of the training rows.
        data_test: Raw (un-normalised) test feature rows.
        labels_test: True labels of the test rows.
        minVals: Per-column minimum used to scale the test rows.
        ranges: Per-column divisor used to scale the test rows.
        k: Number of neighbours passed to ``classify``.
        exponent: Distance order passed to ``classify``.

    Returns:
        A copy of ``data_test`` with two extra columns: ``'labels'`` (the
        true labels) and ``'test'`` (the predicted labels).

    Raises:
        NameError: If an argument is omitted and no module-level variable
            of that name exists.
    """
    supplied = {
        'data_train': data_train,
        'label_train': label_train,
        'data_test': data_test,
        'labels_test': labels_test,
        'minVals': minVals,
        'ranges': ranges,
    }
    module_scope = globals()
    missing = [name for name, value in supplied.items()
               if value is None and name not in module_scope]
    if missing:
        raise NameError(
            'no value given and no module-level variable found for: '
            + ', '.join(missing)
        )
    resolved = {name: (module_scope[name] if value is None else value)
                for name, value in supplied.items()}
    data_train = resolved['data_train']
    label_train = resolved['label_train']
    data_test = resolved['data_test']
    labels_test = resolved['labels_test']
    minVals = resolved['minVals']
    ranges = resolved['ranges']

    # Scale all test rows at once; the Series operands align on columns.
    scaled_test = (data_test - minVals) / ranges
    forecast = [
        classify(row, data_train, label_train, k=k, exponent=exponent)
        for _, row in scaled_test.iterrows()
    ]

    # Work on a copy so the caller's frame (typically a slice of the full
    # data set) is not modified.
    result = data_test.copy()
    result['labels'] = labels_test
    result['test'] = forecast
    return result
约会网站数据分类预测
import pandas as pd
# Load the features and class labels from the tab-separated data file.
data, lables = readData("datingTestSet.txt")

# Hold out the trailing rows (dataSplit's default ratio) as the test set.
data_train, label_train, data_test, labels_test = dataSplit(data, lables)

# Min-max scale the training rows; keep the statistics so the test rows
# can be scaled the same way.
data_train, minVals, ranges = aotuNorm(data_train)

# Classify every test row; the result has 'labels' and 'test' columns.
data_test = datingClassTest(
    data_train=data_train,
    label_train=label_train,
    data_test=data_test,
    labels_test=labels_test,
    minVals=minVals,
    ranges=ranges,
)
查看准确率
# Percentage of test rows whose predicted class ('test') equals the true
# class ('labels'). The original statement lacked print's closing parenthesis.
print('测试结果准确率:{0:.2f}%'.format(sum(data_test['labels'] == data_test['test']) / len(data_test) * 100))