《机器学习实战》学习笔记

  • KNN约会好感预测
import matplotlib.lines as mlines
import matplotlib.pyplot as plt
import numpy as np
import operator

def classify(inX,dataSet,labels,k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Distances are Euclidean, measured between ``inX`` and every row of
    ``dataSet``.

    Args:
        inX: Feature vector to classify.
        dataSet: 2-D array of training samples, one sample per row.
        labels: Sequence of labels indexed by the row number of ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that collected the most votes.
    """
    sample_count = dataSet.shape[0]
    # Repeat the query so it can be subtracted from every training row.
    deltas = np.tile(inX, (sample_count, 1)) - dataSet
    distances = (deltas ** 2).sum(axis=1) ** 0.5
    nearest_first = distances.argsort()

    votes = {}
    for rank in range(k):
        neighbour_label = labels[nearest_first[rank]]
        votes[neighbour_label] = votes.get(neighbour_label, 0) + 1

    # Stable descending sort: the best-voted label ends up in front.
    ranking = list(votes.items())
    ranking.sort(key=operator.itemgetter(1), reverse=True)
    return ranking[0][0]

def filematrix(filename):
    """Load the dating data set from a tab-separated text file.

    Every non-blank line holds three numeric features followed by a textual
    label (``didntLike``, ``smallDoses`` or ``largeDoses``). Blank lines are
    ignored.

    Args:
        filename: Path of the file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: an ``(n, 3)`` float array
        with the features and a list of ``n`` integer labels (1, 2 or 3),
        where ``n`` is the number of non-blank lines.

    Raises:
        ValueError: If a line carries an unknown label or a feature that
            cannot be converted to a float.
    """
    labelCodes = {'didntLike': 1, 'smallDoses': 2, 'largeDoses': 3}

    # The context manager closes the file even if parsing fails below.
    with open(filename) as fr:
        rows = []
        for line in fr:
            line = line.strip()
            if line:
                rows.append(line.split('\t'))

    returnMat = np.zeros((len(rows), 3))
    classLabelVector = []
    for index, listFromLine in enumerate(rows):
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        label = listFromLine[-1]
        if label not in labelCodes:
            # Skipping the label would shift every later label by one row.
            raise ValueError(
                "unknown label %r in record %d of %s" % (label, index + 1, filename)
            )
        classLabelVector.append(labelCodes[label])
    return returnMat, classLabelVector

def showdata(dataMat,dataLabel):
    """Scatter-plot pairs of feature columns, coloured by class label.

    Three of the four axes in a 2x2 grid are used: column 0 vs 1, column 0
    vs 2 and column 2 vs 1. The fourth axes is left empty.

    Args:
        dataMat: 2-D array whose columns 0, 1 and 2 are plotted.
        dataLabel: Iterable of class labels (1, 2 or 3), one per row.

    Raises:
        KeyError: If a label other than 1, 2 or 3 is encountered.
    """
    colorByLabel = {1: "black", 2: "orange", 3: "red"}
    fig, axs = plt.subplots(nrows=2, ncols=2,figsize=(13, 8))
    # One colour per sample; an unknown label fails here instead of producing
    # a colour list that is shorter than the data.
    labelColor = [colorByLabel[label] for label in dataLabel]

    axs[0][0].scatter(x = dataMat[:,0],y = dataMat[:,1],color = labelColor,s = 15,alpha = 0.6)
    axs[0][1].scatter(x=dataMat[:, 0], y=dataMat[:, 2], color=labelColor, s=1, alpha=0.6)
    axs[1][0].scatter(x=dataMat[:, 2], y=dataMat[:, 1], color=labelColor, s=15, alpha=1)

    # Legend handles carry the same colours as the scatter points of
    # labels 1, 2 and 3 respectively.
    x = mlines.Line2D([],[],marker='.',markersize=6,color=colorByLabel[1],label='x')
    t = mlines.Line2D([], [], marker='.',markersize=6,color=colorByLabel[2],label='t')
    y = mlines.Line2D([], [], marker='.',markersize=6,color=colorByLabel[3],label='y')
    axs[0][0].legend(handles=[x,t,y])
    plt.show()

def autoNorm(dataset):
    """Rescale every column of ``dataset`` to the range [0, 1].

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of zero; it is mapped to
    all zeros instead of NaN.

    Args:
        dataset: 2-D numeric array, one sample per row.

    Returns:
        A tuple ``(normMat, ranges, minVals)``: the normalised array, the
        per-column ranges (``max - min``) and the per-column minima.
    """
    minVals = dataset.min(0)
    ranges = dataset.max(0) - minVals
    # Divide constant columns by 1 so that 0/0 does not produce NaN.
    divisors = np.where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normMat = (dataset - minVals) / divisors
    return normMat, ranges, minVals

def classifyTest():
    """Evaluate the kNN classifier on ``datingTestSet.txt``.

    The first 10% of the normalised samples are classified with ``k = 4``
    against the remaining 90%. Every prediction is printed next to the true
    label, followed by the overall error rate as a percentage.

    Returns:
        The fraction of misclassified test samples (0.0 when the test slice
        is empty).
    """
    dataMat,dataLabel = filematrix("datingTestSet.txt")
    autoNormMat, _, _ = autoNorm(dataMat)
    testRate = 0.1
    dataNum = autoNormMat.shape[0]
    testNum = int(testRate*dataNum)
    errorCount = 0
    for i in range(testNum):
        classOut = classify(autoNormMat[i,:], autoNormMat[testNum:,:],dataLabel[testNum:],4)
        if classOut != dataLabel[i]:
            errorCount += 1
        print("分类结果%d\t真实结果%d"%(classOut,dataLabel[i]))
    # Fewer than ten samples leave no test rows; avoid dividing by zero.
    errorRate = errorCount/testNum if testNum else 0.0
    # The message ends in '%', so the fraction is scaled to a percentage.
    print("错误率%f%%"%(errorRate*100))
    return errorRate


# Run the dating-data evaluation only when this file is executed as a script.
if __name__ == '__main__':
    classifyTest()
  • KNN手写识别
import numpy as np
import operator
from sklearn.neighbors import KNeighborsClassifier as kNN

def classify(inX,dataSet,labels,k):
    """Return the majority label among the ``k`` training rows closest to ``inX``.

    Closeness is the Euclidean distance between ``inX`` and each row of
    ``dataSet``.

    Args:
        inX: Feature vector to classify.
        dataSet: 2-D array of training samples, one sample per row.
        labels: Sequence of labels indexed by the row number of ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that collected the most votes.
    """
    row_total = dataSet.shape[0]
    offsets = np.tile(inX, (row_total, 1)) - dataSet
    squared_sums = (offsets ** 2).sum(axis=1)
    by_distance = (squared_sums ** 0.5).argsort()

    tally = {}
    for row_index in (by_distance[position] for position in range(k)):
        vote = labels[row_index]
        tally[vote] = tally.get(vote, 0) + 1

    # Labels ordered by descending vote count; the sort is stable.
    ordered_labels = sorted(tally, key=tally.get, reverse=True)
    return ordered_labels[0]

def imgVector(filename):
    """Read a 32x32 text image into a ``(1, 1024)`` row vector.

    The first 32 characters of each of the first 32 lines are converted with
    ``int`` and stored row after row.

    Args:
        filename: Path of the text image.

    Returns:
        A ``(1, 1024)`` float array holding the converted characters.

    Raises:
        IndexError: If a line has fewer than 32 characters or the file has
            fewer than 32 lines.
        ValueError: If a character is not a digit.
    """
    fileVector = np.zeros((1,1024))
    # The context manager closes the file even if a line is malformed.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            fileVector[0, 32*i:32*(i + 1)] = [int(lineStr[j]) for j in range(32)]
    return fileVector

def handWriteClassify():
    """Classify every image in ``testDigits`` with kNN and report the error rate.

    Training images are read from ``trainingDigits``. The digit of an image
    is the part of its file name before the first underscore. Each test
    image is classified with ``k = 3``; the true and predicted digits are
    printed per image and the overall error rate once at the end.

    Returns:
        The fraction of misclassified test images (0.0 when ``testDigits``
        is empty).
    """
    # Imported locally because the surrounding code never imports listdir.
    from os import listdir

    trainList = listdir("trainingDigits")
    m = len(trainList)
    hwLabel = []
    trainMat = np.zeros((m,1024))
    for i, fileStr in enumerate(trainList):
        hwLabel.append(fileStr.split("_")[0])
        trainMat[i,:] = imgVector("trainingDigits/%s" % fileStr)

    testList = listdir("testDigits")
    mtest = len(testList)
    errorCount = 0
    # sklearn's built-in kNN works just as well:
    #   neigh = kNN(n_neighbors=3, algorithm='auto')
    #   neigh.fit(trainMat, hwLabel)
    #   classifyOut = neigh.predict(testVector)
    for testFileStr in testList:
        testVector = imgVector("testDigits/%s"% testFileStr)
        testLabel = int(testFileStr.split("_")[0])
        classifyOut = int(classify(testVector,trainMat,hwLabel,3))
        print("真实数字%d,识别数字%d"%(testLabel,classifyOut))
        # Compare the prediction, not the list of training labels.
        if classifyOut != testLabel:
            errorCount += 1

    # Report once, after all test images have been seen.
    errorRate = errorCount/mtest if mtest else 0.0
    print("错误率%s"%errorRate)
    return errorRate

# Run the handwritten-digit evaluation only when executed as a script.
if __name__ == '__main__':
    handWriteClassify()




  • KNN优缺点

优点:

简单好用,容易理解,精度高,理论成熟,既可以用来做分类也可以用来做回归;
可用于数值型数据和离散型数据;
训练时间复杂度为O(n);无数据输入假定;
对异常值不敏感。

缺点:

计算复杂性高;空间复杂性高;
样本不平衡问题(即有些类别的样本数量很多,而其它样本的数量很少);
一般数据量很大的时候不用这个,计算量太大;但每个类别的样本数量又不能太少,否则容易发生误分。
最大的缺点是无法给出数据的内在含义。

  • 决策树
from math import log


def shannonEnt(dataSet):
    """Compute the Shannon entropy (base 2) of the class labels in ``dataSet``.

    Args:
        dataSet: Sequence of rows; the last element of each row is its class
            label.

    Returns:
        The entropy in bits as a float; ``0.0`` for an empty data set.
    """
    x_num = len(dataSet)
    label_set = {}
    for x in dataSet:
        current_label = x[-1]
        # Count the first occurrence too; starting at 0 would undercount
        # every label by one and feed log() a zero probability.
        label_set[current_label] = label_set.get(current_label, 0) + 1
    shannon_ent = 0.0
    for count in label_set.values():
        prob = float(count)/x_num
        shannon_ent -= prob * log(prob,2)
    return shannon_ent

def split_dataset(dataSet,axis,value):
    """Select the rows whose column ``axis`` equals ``value`` and drop that column.

    Args:
        dataSet: Sequence of list rows.
        axis: Index of the column to test and remove.
        value: Value the column must equal for a row to be kept.

    Returns:
        A new list of new rows; the input rows are left untouched.
    """
    return [
        row[:axis] + row[axis + 1:]
        for row in dataSet
        if row[axis] == value
    ]


def createDataSet():
    """Build the toy data set used by the decision-tree example.

    Returns:
        A tuple ``(dataSet, labels)``: 15 rows of four integer-coded features
        followed by a ``'yes'``/``'no'`` class, and the list of the four
        feature names.
    """
    feature_names = ['年龄', '有工作', '有自己的房子', '信贷情况']
    samples = [
        [0, 0, 0, 0, 'no'],
        [0, 0, 0, 1, 'no'],
        [0, 1, 0, 1, 'yes'],
        [0, 1, 1, 0, 'yes'],
        [0, 0, 0, 0, 'no'],
        [1, 0, 0, 0, 'no'],
        [1, 0, 0, 1, 'no'],
        [1, 1, 1, 1, 'yes'],
        [1, 0, 1, 2, 'yes'],
        [1, 0, 1, 2, 'yes'],
        [2, 0, 1, 2, 'yes'],
        [2, 0, 1, 1, 'yes'],
        [2, 1, 0, 1, 'yes'],
        [2, 1, 0, 2, 'yes'],
        [2, 0, 0, 0, 'no'],
    ]
    return samples, feature_names

def chooseBestFeatureToSplit(dataSet):
    """Find the feature whose split yields the largest information gain.

    For every feature column the data set is partitioned by the column's
    distinct values; the gain is the entropy of the whole set minus the
    size-weighted entropy of the partitions. The gain of every feature and
    the final choice are printed.

    Args:
        dataSet: Sequence of rows; all elements but the last are features,
            the last element is the class label.

    Returns:
        The index of the feature with the highest information gain, or 0
        when no feature has a positive gain.
    """
    sample_count = len(dataSet)
    # Features are the columns; the last column is the class label.
    num_feature = len(dataSet[0]) - 1 if sample_count else 0
    ini_shannon_ent = shannonEnt(dataSet)
    best_information_gain = 0.0
    best_feature = 0
    for i in range(num_feature):
        # Distinct values found in column i across all rows.
        feature_value = {x[i] for x in dataSet}
        feature_entropy = 0.0
        for value in feature_value:
            feature_i = split_dataset(dataSet,i,value)
            prob = len(feature_i)/sample_count
            feature_entropy += prob*shannonEnt(feature_i)
        information_gain = ini_shannon_ent-feature_entropy
        print("第%d个特征的信息增益为%.3f"%(i,information_gain))
        if information_gain>best_information_gain:
            best_information_gain=information_gain
            best_feature = i
    # The gain is a float; '%d' would truncate it to 0.
    print("最好的划分点是%d,它的信息增益是是%.3f"%(best_feature,best_information_gain))
    return best_feature


# Build the sample data and print the information gain of each feature
# only when this file is executed as a script.
if __name__ == '__main__':
    dataSet, features = createDataSet()
    chooseBestFeatureToSplit(dataSet)

你可能感兴趣的:(python)