Machine Learning in Action: kNN Notes

import numpy as np
import operator
'''
kNN classification from "Machine Learning in Action", with annotations
'''
#Create a small toy data set (returned as an ndarray so classify0 can use .shape)
def createDataSet():
    group = np.array([[1.0, 1.1], [0.9, 1.0], [0.1, 0.2], [0.2, 0.3]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels
#np.tile repeats [1,2] three times down and four times across, giving a (3, 8) array
tile_result = np.tile([1,2],(3,4))
print(tile_result)
[[1 2 1 2 1 2 1 2]
 [1 2 1 2 1 2 1 2]
 [1 2 1 2 1 2 1 2]]
shape_result = tile_result.shape
print(shape_result)
print(shape_result[0])
print(shape_result[1])
(3, 8)
3
8
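To make the tile demo concrete, here is a small sketch with made-up points (the _demo names are mine) mirroring what classify0 does below: the query point is repeated once per row of the data set so the two arrays can be subtracted element-wise. Plain NumPy broadcasting (dataSet - inX) would give the same result; tile is kept to match the book's code.
inX_demo = np.array([1.0, 1.0])
dataSet_demo = np.array([[1.0, 1.1], [0.1, 0.2]])
diffMat_demo = np.tile(inX_demo, (dataSet_demo.shape[0], 1)) - dataSet_demo
print(diffMat_demo)                               # [[ 0.  -0.1] [ 0.9  0.8]]
print(((diffMat_demo ** 2).sum(axis=1)) ** 0.5)   # Euclidean distances, about [0.1  1.2042]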
#kNN classifier
def classify0(inX, dataSet, labels, k):
    #inX: the query point; dataSet: n samples of m features, an n*m array
    dataSetSize = dataSet.shape[0]
    #Difference between inX and every sample, giving an n*m array
    diffMat = np.tile(inX, (dataSetSize,1)) - dataSet
    sqDiffMat = diffMat ** 2
    #np.sum(axis=0/1) sums along an axis; the built-in sum(sequence[, start]) sums and then adds start
    sqDistances = sqDiffMat.sum(axis = 1)
    distances = sqDistances ** 0.5
    #np.argsort(x) returns the indices that would sort x in ascending order. http://blog.csdn.net/maoersong/article/details/21875705
    sortedDistIndicies = distances.argsort()
    classCount = {}
    for i in range(k):
        voteLabel = labels[sortedDistIndicies[i]]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1
    #http://www.cnblogs.com/zle1992/p/6271105.html
    #http://blog.csdn.net/dongtingzhizi/article/details/12068205
    #key = operator.itemgetter(1) would sort by the second field of each item (the vote count)
    #Python 3 dicts have no iteritems(); use dict.items() instead
    #key = operator.itemgetter(1)
    sortedClassCount = sorted(classCount.items(), key = lambda x:x[1], reverse = True)
    #sortedClassCount[0] is the (label, count) pair with the most votes; the second [0] picks out the label
    return sortedClassCount[0][0]
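A quick sanity check added to these notes: the point [0.0, 0.0] sits next to the two B samples in the toy data set, so with k = 3 its nearest neighbours are B, B, A and the vote should come back 'B'.
group, labels = createDataSet()
print(classify0(np.array([0.0, 0.0]), group, labels, 3))   # expected: 'B'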
#Convert a text (txt) data file into a NumPy matrix plus a label list
def file2matrix(filename):
    with open(filename,'r') as fr:
        #readlines() reads the whole file into a list of lines
        arrayLines = fr.readlines()
        numberOfLines = len(arrayLines)
        #Pre-allocate a zeros matrix with one row per line
        returnMat = np.zeros((numberOfLines, 3))
        classLabelVector = []
        index = 0
        for line in arrayLines:
            #Strip the trailing newline / surrounding whitespace
            line = line.strip()
            #Split the line on the '\t' between its fields
            listFromLine = line.split('\t')
            #Could also write: for index, line in enumerate(arrayLines): instead of tracking index by hand
            returnMat[index,:] = listFromLine[0:3]
            #The last field is the class label
            classLabelVector.append(int(listFromLine[-1]))
            index += 1
        return returnMat,classLabelVector
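file2matrix expects one sample per line: three tab-separated numeric features followed by an integer class label. A self-contained check with a throw-away file (the file name 'knn_format_demo.txt' is made up for this sketch):
sample = "40920\t8.326976\t0.953952\t3\n14488\t7.153469\t1.673904\t2\n"
with open('knn_format_demo.txt', 'w') as f:
    f.write(sample)
demo_mat, demo_labels = file2matrix('knn_format_demo.txt')
print(demo_mat.shape, demo_labels)   # (2, 3) [3, 2]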
datingDataMat,datingLabels = file2matrix('datingTestSet2.txt')
datingDataMat
array([[  4.09200000e+04,   8.32697600e+00,   9.53952000e-01],
       [  1.44880000e+04,   7.15346900e+00,   1.67390400e+00],
       [  2.60520000e+04,   1.44187100e+00,   8.05124000e-01],
       ..., 
       [  2.65750000e+04,   1.06501020e+01,   8.66627000e-01],
       [  4.81110000e+04,   9.13452800e+00,   7.28045000e-01],
       [  4.37570000e+04,   7.88260100e+00,   1.33244600e+00]])
datingLabels[0:20]
[3, 2, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 2, 3]
#Normalize features to [0, 1]: newValue = (oldValue - min)/(max - min)
def autoNorm(dataSet):
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    #np.shape(a) works for both lists and arrays; the .shape attribute only exists on arrays, not on plain lists
    normDataSet = np.zeros(np.shape(dataSet))
    m = dataSet.shape[0]
    #Subtract the per-column minimum from every row
    normDataSet = dataSet - np.tile(minVals, (m,1))
    #Divide by the per-column range to get the normalized data
    normDataSet = normDataSet/np.tile(ranges, (m,1))
    return normDataSet, ranges, minVals
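A worked example of the formula on a tiny hand-made matrix (the demo_* names are mine): each column should map its minimum to 0 and its maximum to 1.
demo = np.array([[1.0, 100.0], [2.0, 300.0], [3.0, 500.0]])
demo_norm, demo_ranges, demo_min = autoNorm(demo)
print(demo_norm)      # [[0.  0. ] [0.5 0.5] [1.  1. ]]
print(demo_ranges)    # [  2. 400.]
print(demo_min)       # [  1. 100.]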
normMat, ranges, minVals = autoNorm(datingDataMat)
normMat
array([[ 0.44832535,  0.39805139,  0.56233353],
       [ 0.15873259,  0.34195467,  0.98724416],
       [ 0.28542943,  0.06892523,  0.47449629],
       ..., 
       [ 0.29115949,  0.50910294,  0.51079493],
       [ 0.52711097,  0.43665451,  0.4290048 ],
       [ 0.47940793,  0.3768091 ,  0.78571804]])
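As a sanity check on the real data (not in the original notes): after min-max scaling every column of normMat should span exactly [0, 1].
print(normMat.min(axis=0))   # [0. 0. 0.]
print(normMat.max(axis=0))   # [1. 1. 1.]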
def datingClassTest(k):
    #Fraction of the data held out as the test set
    hoRatio = 0.1
    #'datingTestSet2.txt' works; 'datingTestSet.txt' does not, because its labels are text strings and int() in file2matrix fails
    datingDataMat,datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    #Index that splits the test samples from the training samples
    numTestVecs = int(m*hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        #The first numTestVecs rows are test samples; rows numTestVecs..m form the training set
        classifierResult = classify0(normMat[i,:], normMat[numTestVecs:m,:],
                                     datingLabels[numTestVecs:m], k)
        print("the classifier came back with: {0}, the real answer is: {1}".format(
            classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: {0}".format(errorCount/float(numTestVecs)))
datingClassTest(3)
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 1
the total error rate is: 0.05
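Finally, a minimal sketch (not from the original notes) of how ranges and minVals returned by autoNorm would be reused on a brand-new sample: the new point must be scaled with the training set's ranges/minVals before being passed to classify0. The feature values in newSample are hypothetical.
newSample = np.array([30000.0, 8.5, 0.7])           # hypothetical feature values
datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
normMat, ranges, minVals = autoNorm(datingDataMat)
predicted = classify0((newSample - minVals) / ranges, normMat, datingLabels, 3)
print(predicted)                                    # one of the labels 1, 2 or 3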
