import matplotlib.lines as mlines
import matplotlib.pyplot as plt
import numpy as np
import operator
def classify(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest neighbours.

    Distances are Euclidean. When several labels receive the same number of
    votes, the one met first while walking the neighbours from nearest to
    farthest wins.

    Args:
        inX: Feature vector to classify (length d, or shape (1, d)).
        dataSet: Training samples, one row per sample, shape (n, d).
        labels: Sequence of n labels; ``labels[i]`` belongs to row i.
        k: Number of neighbours that vote. Values larger than n are clamped
            to n instead of indexing past the end of the data.

    Returns:
        The label that collected the most votes.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``dataSet`` has no rows.
    """
    dataSet = np.asarray(dataSet)
    sampleCount = dataSet.shape[0]
    if k < 1:
        raise ValueError("k must be at least 1")
    if sampleCount == 0:
        raise ValueError("dataSet must contain at least one sample")

    # Broadcasting subtracts every row from inX without a tiled copy.
    diffMat = np.asarray(inX) - dataSet
    distance = np.sqrt((diffMat ** 2).sum(axis=1))
    nearest = distance.argsort()[:min(k, sampleCount)]

    classCount = {}
    for idx in nearest:
        voteLabel = labels[idx]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    # max() keeps the first maximal key in insertion order, which matches a
    # stable descending sort on the vote counts.
    return max(classCount, key=classCount.get)
def filematrix(filename):
    """Parse a tab-separated sample file into a feature matrix and labels.

    Each non-blank line must hold at least three numeric fields followed by
    a textual class label in the last field. The first three fields become
    one matrix row; the label is mapped to an integer code
    (``didntLike`` -> 1, ``smallDoses`` -> 2, ``largeDoses`` -> 3).

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)`` where ``returnMat`` is a
        float array of shape (rows, 3) and ``classLabelVector`` is a list of
        integer codes, one per matrix row.

    Raises:
        ValueError: If a line has too few fields, a non-numeric feature, or
            a label that is not one of the three known names. Failing
            loudly keeps the rows and the labels aligned.
    """
    labelCodes = {'didntLike': 1, 'smallDoses': 2, 'largeDoses': 3}
    rows = []
    classLabelVector = []
    # The context manager closes the file even when parsing fails.
    with open(filename) as fr:
        for lineNo, line in enumerate(fr, start=1):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no sample.
                continue
            listFromLine = line.split('\t')
            if len(listFromLine) < 4:
                raise ValueError(
                    "line %d: expected 3 features and a label, got %d fields"
                    % (lineNo, len(listFromLine)))
            label = listFromLine[-1]
            if label not in labelCodes:
                raise ValueError("line %d: unknown label %r" % (lineNo, label))
            rows.append([float(value) for value in listFromLine[0:3]])
            classLabelVector.append(labelCodes[label])
    # reshape keeps the (0, 3) shape for a file without samples.
    returnMat = np.array(rows, dtype=float).reshape(-1, 3)
    return returnMat, classLabelVector
def showdata(dataMat, dataLabel):
    """Show scatter plots of feature-column pairs, coloured by class label.

    Three of the four subplots of a 2x2 figure are filled: column 0 vs 1,
    column 0 vs 2 and column 2 vs 1. The fourth subplot is left empty.
    Labels 1, 2 and 3 are drawn in black, orange and red, and the legend on
    the first subplot uses those same colours.

    Args:
        dataMat: 2-D array with at least three columns.
        dataLabel: Iterable of class labels (1, 2 or 3), one per row.

    Raises:
        ValueError: If a label other than 1, 2 or 3 is encountered; skipping
            it would leave the colour list shorter than the data.
    """
    palette = {1: "black", 2: "orange", 3: "red"}
    labelColor = []
    for label in dataLabel:
        if label not in palette:
            raise ValueError("unknown class label: %r" % (label,))
        labelColor.append(palette[label])

    fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(13, 8))
    axs[0][0].scatter(x=dataMat[:, 0], y=dataMat[:, 1], color=labelColor, s=15, alpha=0.6)
    axs[0][1].scatter(x=dataMat[:, 0], y=dataMat[:, 2], color=labelColor, s=1, alpha=0.6)
    axs[1][0].scatter(x=dataMat[:, 2], y=dataMat[:, 1], color=labelColor, s=15, alpha=1)

    # Legend proxies: each handle takes the colour of the class it stands
    # for, so the legend matches the scatter points.
    x = mlines.Line2D([], [], marker='.', markersize=6, color=palette[1], label='x')
    t = mlines.Line2D([], [], marker='.', markersize=6, color=palette[2], label='t')
    y = mlines.Line2D([], [], marker='.', markersize=6, color=palette[3], label='y')
    axs[0][0].legend(handles=[x, t, y])
    plt.show()
def autoNorm(dataset):
    """Min-max normalise every column of ``dataset`` into the range [0, 1].

    Args:
        dataset: 2-D array-like, one sample per row.

    Returns:
        A tuple ``(normMat, ranges, minVals)``: the normalised matrix, the
        per-column ``max - min`` and the per-column minimum.

    Note:
        A column whose values are all equal has a range of 0. It is
        normalised to 0 rather than dividing by zero; the returned
        ``ranges`` still reports 0 for that column.
    """
    dataset = np.asarray(dataset)
    minVals = dataset.min(0)
    ranges = dataset.max(0) - minVals
    # Replace zero ranges by 1 for the division only.
    safeRanges = np.where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column values to every row.
    normMat = (dataset - minVals) / safeRanges
    return normMat, ranges, minVals
def classifyTest(filename="datingTestSet.txt", testRate=0.1, k=4):
    """Measure the kNN error rate on a hold-out slice of a sample file.

    The file is loaded with ``filematrix`` and normalised with ``autoNorm``.
    The first ``int(testRate * rows)`` rows are classified against the
    remaining rows, each prediction is printed next to the true label, and
    the error rate is printed as a percentage.

    Args:
        filename: Sample file to load.
        testRate: Fraction of the rows (taken from the top) used for testing.
        k: Number of neighbours passed to ``classify``.

    Returns:
        The error rate as a fraction in [0, 1].

    Raises:
        ValueError: If ``testRate`` selects no test rows.
    """
    dataMat, dataLabel = filematrix(filename)
    autoNormMat, _, _ = autoNorm(dataMat)
    dataNum = autoNormMat.shape[0]
    testNum = int(testRate * dataNum)
    if testNum == 0:
        raise ValueError("testRate selects no test samples from %d rows" % dataNum)

    errorCount = 0
    for i in range(testNum):
        classOut = classify(autoNormMat[i, :], autoNormMat[testNum:, :],
                            dataLabel[testNum:], k)
        if classOut != dataLabel[i]:
            errorCount += 1
        print("分类结果%d\t真实结果%d" % (classOut, dataLabel[i]))

    errorRate = errorCount / testNum
    # The message ends in a percent sign, so scale the fraction by 100.
    print("错误率%f%%" % (100.0 * errorRate))
    return errorRate
# Script entry point: run the dating-data evaluation when executed directly.
if __name__ == "__main__":
    classifyTest()
import numpy as np
import operator
from sklearn.neighbors import KNeighborsClassifier as kNN
def classify(inX, dataSet, labels, k):
    """Return the majority label among the k training rows nearest to ``inX``.

    Euclidean distance is used. On a tie in the vote, the label seen first
    when going through the neighbours from nearest to farthest is returned.

    Args:
        inX: Feature vector to classify (length d, or shape (1, d)).
        dataSet: Training samples, one row per sample, shape (n, d).
        labels: Sequence of n labels; ``labels[i]`` belongs to row i.
        k: Number of voting neighbours. Values above n are clamped to n
            instead of indexing past the end of the data.

    Returns:
        The winning label, taken unchanged from ``labels``.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``dataSet`` has no rows.
    """
    dataSet = np.asarray(dataSet)
    sampleCount = dataSet.shape[0]
    if k < 1:
        raise ValueError("k must be at least 1")
    if sampleCount == 0:
        raise ValueError("dataSet must contain at least one sample")

    # Broadcasting replaces the explicit np.tile copy of inX.
    diffMat = np.asarray(inX) - dataSet
    distance = np.sqrt((diffMat ** 2).sum(axis=1))
    nearest = distance.argsort()[:min(k, sampleCount)]

    classCount = {}
    for idx in nearest:
        voteLabel = labels[idx]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    # First maximal key in insertion order, i.e. the same winner as a stable
    # descending sort on the counts.
    return max(classCount, key=classCount.get)
def imgVector(filename):
    """Read a 32x32 grid of digit characters into a (1, 1024) row vector.

    The first 32 characters of each of the first 32 lines are converted
    with ``int`` and stored row after row.

    Args:
        filename: Path of the text file holding the grid.

    Returns:
        A float array of shape (1, 1024).

    Raises:
        ValueError: If a line holds fewer than 32 characters or a character
            is not a digit.
    """
    side = 32
    fileVector = np.zeros((1, side * side))
    # The context manager closes the file even when a line is malformed.
    with open(filename) as fr:
        for i in range(side):
            lineStr = fr.readline().rstrip('\r\n')
            if len(lineStr) < side:
                raise ValueError(
                    "%s: line %d has %d characters, expected at least %d"
                    % (filename, i + 1, len(lineStr), side))
            fileVector[0, side * i:side * (i + 1)] = [int(ch) for ch in lineStr[:side]]
    return fileVector
def handWriteClassify(trainDir="trainingDigits", testDir="testDigits", k=3):
    """Evaluate kNN digit recognition on directories of 32x32 text images.

    Every file name must start with the true digit followed by an
    underscore. All files in ``trainDir`` form the training set; every file
    in ``testDir`` is classified with ``classify`` and compared with the
    digit in its name. Each result and the final error rate are printed.

    Args:
        trainDir: Directory holding the training images.
        testDir: Directory holding the test images.
        k: Number of neighbours passed to ``classify``.

    Returns:
        The fraction of misclassified test files.

    Raises:
        ValueError: If ``testDir`` contains no files.
    """
    # Function-scope import: the visible import sections of this file do not
    # bring in os or listdir.
    import os

    trainList = os.listdir(trainDir)
    hwLabel = []
    trainMat = np.zeros((len(trainList), 1024))
    for i, fileStr in enumerate(trainList):
        hwLabel.append(fileStr.split("_")[0])
        trainMat[i, :] = imgVector(os.path.join(trainDir, fileStr))

    testList = os.listdir(testDir)
    mtest = len(testList)
    if mtest == 0:
        raise ValueError("no test files found in %r" % (testDir,))

    # NOTE: the original author remarks that sklearn's KNeighborsClassifier
    # (n_neighbors=3), fitted on trainMat/hwLabel, works just as well here.
    errorCount = 0
    for testFileStr in testList:
        testVector = imgVector(os.path.join(testDir, testFileStr))
        testLabel = int(testFileStr.split("_")[0])
        classifyOut = int(classify(testVector, trainMat, hwLabel, k))
        print("真实数字%d,识别数字%d" % (testLabel, classifyOut))
        # Compare the prediction, not the whole training-label list, with
        # the true digit.
        if classifyOut != testLabel:
            errorCount += 1

    errorRate = errorCount / mtest
    print("错误率%s" % errorRate)
    return errorRate
# Script entry point: run the handwritten-digit evaluation when executed directly.
if __name__ == "__main__":
    handWriteClassify()
优点:
简单好用,容易理解,精度高,理论成熟,既可以用来做分类也可以用来做回归;
可用于数值型数据和离散型数据;
训练时间复杂度为O(n);无数据输入假定;
对异常值不敏感。
缺点:
计算复杂性高;空间复杂性高;
样本不平衡问题(即有些类别的样本数量很多,而其它样本的数量很少);
数据量很大时一般不使用该算法,因为计算量太大;但每个类别的样本数量又不能太少,否则容易发生误分类。
最大的缺点是无法给出数据的内在含义。
from math import log
def shannonEnt(dataSet):
    """Compute the Shannon entropy (base 2) of the class labels.

    The class label of a sample is its last element.

    Args:
        dataSet: Sequence of samples, each an indexable record.

    Returns:
        The entropy in bits as a float; 0.0 for an empty data set.
    """
    x_num = len(dataSet)
    label_set = {}
    for x in dataSet:
        current_label = x[-1]
        # Count the first occurrence too, so no label ends up with a count
        # of 0 (which would make log() fail).
        label_set[current_label] = label_set.get(current_label, 0) + 1

    shannon_ent = 0.0
    for count in label_set.values():
        prob = count / x_num
        shannon_ent -= prob * log(prob, 2)
    return shannon_ent
def split_dataset(dataSet, axis, value):
    """Select the rows whose column ``axis`` equals ``value``, minus that column.

    Args:
        dataSet: Sequence of list rows.
        axis: Index of the column to test and to remove.
        value: Value the column must equal for a row to be kept.

    Returns:
        A new list of new rows; the input rows are not modified.
    """
    return [
        row[:axis] + row[axis + 1:]
        for row in dataSet
        if row[axis] == value
    ]
def createDataSet():
    """Build the built-in 15-sample demo data set.

    Returns:
        A tuple ``(dataSet, labels)``. ``dataSet`` is a list of 15 rows, each
        holding four integer-coded feature values followed by a ``'yes'`` /
        ``'no'`` class label. ``labels`` lists the four feature names.
    """
    rows = (
        (0, 0, 0, 0, 'no'),
        (0, 0, 0, 1, 'no'),
        (0, 1, 0, 1, 'yes'),
        (0, 1, 1, 0, 'yes'),
        (0, 0, 0, 0, 'no'),
        (1, 0, 0, 0, 'no'),
        (1, 0, 0, 1, 'no'),
        (1, 1, 1, 1, 'yes'),
        (1, 0, 1, 2, 'yes'),
        (1, 0, 1, 2, 'yes'),
        (2, 0, 1, 2, 'yes'),
        (2, 0, 1, 1, 'yes'),
        (2, 1, 0, 1, 'yes'),
        (2, 1, 0, 2, 'yes'),
        (2, 0, 0, 0, 'no'),
    )
    # Hand out fresh mutable lists on every call.
    samples = [list(row) for row in rows]
    feature_names = ['年龄', '有工作', '有自己的房子', '信贷情况']
    return samples, feature_names
def chooseBestFeatureToSplit(dataSet):
    """Find the feature column with the highest information gain.

    The last element of every sample is treated as the class label; all
    other positions are candidate features. For each feature the data set
    is partitioned by value with ``split_dataset`` and the weighted entropy
    of the partitions is subtracted from the entropy of the whole set. The
    gain of every feature and the final choice are printed.

    Args:
        dataSet: Sequence of list rows, the class label being the last
            element of each row.

    Returns:
        The index of the feature with the largest information gain; 0 when
        no feature has a positive gain.
    """
    num_samples = len(dataSet)
    # Features are the columns before the trailing label, not the rows.
    num_feature = len(dataSet[0]) - 1 if num_samples > 0 else 0
    ini_shannon_ent = shannonEnt(dataSet)
    best_information_gain = 0.0
    best_feature = 0
    for i in range(num_feature):
        # Distinct values found in column i across all samples.
        feature_value = {x[i] for x in dataSet}
        feature_entropy = 0.0
        for value in feature_value:
            feature_i = split_dataset(dataSet, i, value)
            prob = len(feature_i) / num_samples
            feature_entropy += prob * shannonEnt(feature_i)
        information_gain = ini_shannon_ent - feature_entropy
        print("第%d个特征的信息增益为%.3f" % (i, information_gain))
        if information_gain > best_information_gain:
            best_information_gain = information_gain
            best_feature = i
    # %.3f instead of %d: the gain is a fraction and %d would print it as 0.
    print("最好的划分点是%d,它的信息增益是是%.3f" % (best_feature, best_information_gain))
    return best_feature
# Script entry point: report the best split feature of the demo data set.
if __name__ == "__main__":
    dataSet, features = createDataSet()
    chooseBestFeatureToSplit(dataSet)