2.2 示例:使用K-近邻算法改进约会网站的配对效果
1、数据:机器学习实战源码及数据集
密码:6irz
2、准备数据
# Convert a tab-separated text file into a NumPy feature matrix.
def fileToMatrix(filename):
    """Parse a tab-separated data file into features and class labels.

    Each non-blank line is split on tabs. The first three fields are
    stored as floats in one row of the feature matrix, and the last
    field is converted to ``int`` and used as that row's class label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)`` where ``returnMat`` is
        an ``(n, 3)`` float array and ``classLabelVector`` is a list of
        ``n`` integer labels, ``n`` being the number of non-blank lines.

    Raises:
        ValueError: If a field cannot be converted to a number.
    """
    featureRows = []
    classLabelVector = []
    # The context manager closes the file even when parsing raises.
    with open(filename) as fr:
        for line in fr:
            line = line.strip()
            if not line:
                # Skip blank lines (e.g. a trailing newline at end of file).
                continue
            listFromLine = line.split('\t')
            # Copy the feature fields.
            featureRows.append([float(value) for value in listFromLine[0:3]])
            # Copy the class label.
            classLabelVector.append(int(listFromLine[-1]))
    returnMat = zeros((len(featureRows), 3))
    if featureRows:
        returnMat[:, :] = featureRows
    return returnMat, classLabelVector
# Normalise the data set.
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` using its min and max.

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of zero; such a
    column is mapped to 0 instead of producing NaN from a 0/0 division.

    Args:
        dataSet: NumPy array of samples; assumed to be 2-D with one
            sample per row and one feature per column.

    Returns:
        A tuple ``(normDataSet, ranges, minValues)``: the rescaled data,
        the per-column ``max - min`` (zeros are reported as-is) and the
        per-column minimum.
    """
    # min(0) / max(0) reduce along axis 0, i.e. one value per column.
    minValues = dataSet.min(0)
    maxValues = dataSet.max(0)
    # Value range of each column.
    ranges = maxValues - minValues
    # Divide constant columns by 1 so they normalise to 0 rather than NaN.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minValues) / safeRanges
    return normDataSet, ranges, minValues
3、分析数据
import matplotlib.pyplot as plt
from chapter2.KNN import *
# Load the feature matrix and the class labels from the data file.
datingDataMat, datingLabels = fileToMatrix("datingTestSet2.txt")
# A single axes object filling the whole figure.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
# Plot column 0 against column 1; marker size (s) and colour (c) are both
# the class label scaled by 15, so each class is drawn differently.
ax.scatter(
    datingDataMat[:, 0],
    datingDataMat[:, 1],
    s=15.0 * array(datingLabels),
    c=15.0 * array(datingLabels),
)
plt.show()
可以自行设置横纵坐标轴表示的数据,即scatter()的前两个参数,如下
5、测试算法
# !/usr/bin/env python
# -*- coding: utf-8 -*-
from chapter2.KNN import *
# Hold-out test of the classifier on the dating-site data.
def datingClass(hoRatio=0.10, filename="datingTestSet2.txt", k=3):
    """Evaluate the kNN classifier on a hold-out slice of the data file.

    The first ``int(m * hoRatio)`` normalised rows are used as test
    samples and the remaining rows as the training set. Every prediction
    and the final error rate are printed.

    Args:
        hoRatio: Fraction of the rows reserved for testing.
        filename: Data file passed to ``fileToMatrix``.
        k: Number of nearest neighbours passed to ``classfiy0``.

    Raises:
        ValueError: If the hold-out slice is empty, which would otherwise
            end in a division by zero when computing the error rate.
    """
    datingDataMat, datingLabels = fileToMatrix(filename)
    normMat, _ranges, _minValues = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    if numTestVecs == 0:
        raise ValueError("hold-out set is empty: need more rows or a larger hoRatio")
    # The training slice is the same for every test sample, so build it once.
    trainingMat = normMat[numTestVecs:m, :]
    trainingLabels = datingLabels[numTestVecs:m]
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classfiy0(normMat[i, :], trainingMat, trainingLabels, k)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is : %f" % (errorCount / float(numTestVecs)))


# Run the evaluation only when executed as a script, not on import.
if __name__ == "__main__":
    datingClass()
另外附上完整的KNN.py文件(Python3)
from numpy import *
import operator
# k-nearest-neighbours classifier.
def classfiy0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest samples.

    Distances are Euclidean. When several classes share the highest vote
    count, the class that was encountered first while walking the
    neighbours from nearest to farthest wins.

    Args:
        inX: Feature vector to classify (sequence or array that
            broadcasts against a row of ``dataSet``).
        dataSet: 2-D NumPy array of training samples, one per row.
        labels: Sequence of class labels, indexed like the rows of
            ``dataSet``.
        k: Number of neighbours that vote. Values larger than the number
            of training samples use every sample.

    Returns:
        The label with the most votes among the nearest neighbours.

    Raises:
        ValueError: If ``dataSet`` has no rows or ``k`` is less than 1.
    """
    # shape[0] is the number of training samples (rows).
    dataSetSize = dataSet.shape[0]
    if dataSetSize == 0 or k < 1:
        raise ValueError("classfiy0 needs a non-empty dataSet and k >= 1")

    # 1. Distance from inX to every training sample; broadcasting
    #    subtracts inX from each row.
    diffMat = asarray(inX) - dataSet
    sqDistance = (diffMat ** 2).sum(axis=1)
    distances = sqDistance ** 0.5

    # 2. Indices of the training samples ordered from nearest to farthest.
    sortedDistIndicies = distances.argsort()

    # 3. Count the class of each of the k nearest samples. Slicing keeps
    #    this safe when k exceeds the number of samples.
    classCount = {}
    for neighbourIndex in sortedDistIndicies[:k]:
        voteIlabel = labels[neighbourIndex]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # 4. Order the classes by vote count; the sort is stable, so ties keep
    #    their first-seen order.
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)

    # 5. The most frequent class among the neighbours.
    return sortedClassCount[0][0]
# Convert a tab-separated text file into a NumPy feature matrix.
def fileToMatrix(filename):
    """Parse a tab-separated data file into features and class labels.

    Each non-blank line is split on tabs. The first three fields are
    stored as floats in one row of the feature matrix, and the last
    field is converted to ``int`` and used as that row's class label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)`` where ``returnMat`` is
        an ``(n, 3)`` float array and ``classLabelVector`` is a list of
        ``n`` integer labels, ``n`` being the number of non-blank lines.

    Raises:
        ValueError: If a field cannot be converted to a number.
    """
    featureRows = []
    classLabelVector = []
    # The context manager closes the file even when parsing raises.
    with open(filename) as fr:
        for line in fr:
            line = line.strip()
            if not line:
                # Skip blank lines (e.g. a trailing newline at end of file).
                continue
            listFromLine = line.split('\t')
            # Copy the feature fields.
            featureRows.append([float(value) for value in listFromLine[0:3]])
            # Copy the class label.
            classLabelVector.append(int(listFromLine[-1]))
    returnMat = zeros((len(featureRows), 3))
    if featureRows:
        returnMat[:, :] = featureRows
    return returnMat, classLabelVector
# Normalise the data set.
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` using its min and max.

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of zero; such a
    column is mapped to 0 instead of producing NaN from a 0/0 division.

    Args:
        dataSet: NumPy array of samples; assumed to be 2-D with one
            sample per row and one feature per column.

    Returns:
        A tuple ``(normDataSet, ranges, minValues)``: the rescaled data,
        the per-column ``max - min`` (zeros are reported as-is) and the
        per-column minimum.
    """
    # min(0) / max(0) reduce along axis 0, i.e. one value per column.
    minValues = dataSet.min(0)
    maxValues = dataSet.max(0)
    # Value range of each column.
    ranges = maxValues - minValues
    # Divide constant columns by 1 so they normalise to 0 rather than NaN.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minValues) / safeRanges
    return normDataSet, ranges, minValues
6、使用算法:输入特征数据以判断对方是否是自己喜欢的类型
# !/usr/bin/env python
# -*- coding: utf-8 -*-
from chapter2.KNN import *
def classifyPerson():
    """Prompt for three feature values and print the predicted category.

    The training data is loaded from ``datingTestSet2.txt`` and
    normalised with ``autoNorm``. The user's input is rescaled with the
    same per-column minimum and range before classification, so that
    every feature contributes on the same scale as the training data.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    percentTats = float(input("percentage of time spent playing video games?"))
    ffMiles = float(input("frequent flier miles earned per year?"))
    iceCream = float(input("liters of ice cream consumed per year?"))
    datingMat, datingLabels = fileToMatrix("datingTestSet2.txt")
    normMat, ranges, minValues = autoNorm(datingMat)
    # NOTE(review): presumably the data file's columns are ordered
    # (flier miles, game time, ice cream) - confirm against the data set.
    inArry = array([ffMiles, percentTats, iceCream])
    # Apply the training set's scaling to the input; a zero range is
    # replaced by 1 so a constant column cannot cause a division by zero.
    safeRanges = where(ranges == 0, 1, ranges)
    normInput = (inArry - minValues) / safeRanges
    classifierResult = classfiy0(normInput, normMat, datingLabels, 3)
    # Labels in the data file are 1-based, resultList is 0-based.
    print("You will probably like this person: " + resultList[classifierResult - 1])


# Run the interactive prompt only when executed as a script, not on import.
if __name__ == "__main__":
    classifyPerson()
2.3 示例:手写识别系统
1、收集数据:见上
2、准备数据:编写imgToVector(), 将图像格式转换为分类器使用的向量格式
# Flatten a 32x32 text image of digits into a single row vector.
def imgToVector(filename):
    """Read a 32x32 character image file into a ``(1, 1024)`` array.

    The first 32 characters of each of the first 32 lines are converted
    with ``int`` and stored row after row, so the character at line
    ``i``, column ``j`` ends up at index ``32 * i + j``.

    Args:
        filename: Path of the text image file.

    Returns:
        A ``(1, 1024)`` float array holding the image values.

    Raises:
        IndexError: If a line has fewer than 32 characters.
        ValueError: If a character is not a digit.
    """
    returnVect = zeros((1, 1024))
    # The context manager closes the file even when parsing raises.
    with open(filename) as fr:
        for i in range(32):
            linestr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(linestr[j])
    return returnVect
3、测试算法:编写函数使用提供的部分数据集作为测试样本,测试样本与非测试样本的区别在于测试样本是已经完成分类的数据,如果预测分类与实际类别不同,则标记为一个错误。
# !/usr/bin/env python
# -*- coding: utf-8 -*-
from os import listdir
from chapter2.KNN import *
# Flatten a 32x32 text image of digits into a single row vector.
def imgToVector(filename):
    """Read a 32x32 character image file into a ``(1, 1024)`` array.

    The first 32 characters of each of the first 32 lines are converted
    with ``int`` and stored row after row, so the character at line
    ``i``, column ``j`` ends up at index ``32 * i + j``.

    Args:
        filename: Path of the text image file.

    Returns:
        A ``(1, 1024)`` float array holding the image values.

    Raises:
        IndexError: If a line has fewer than 32 characters.
        ValueError: If a character is not a digit.
    """
    returnVect = zeros((1, 1024))
    # The context manager closes the file even when parsing raises.
    with open(filename) as fr:
        for i in range(32):
            linestr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(linestr[j])
    return returnVect
# Handwritten-digit recognition test.
def handwritingClassTest():
    """Classify every image in ``testDigits`` and print the error rate.

    Training vectors are loaded from the ``trainingDigits`` directory and
    test vectors from the ``testDigits`` directory. The true class of a
    file is the integer before the first underscore in its name
    (presumably names look like ``<digit>_<index>.txt`` - confirm against
    the data set). Every prediction, the number of errors and the error
    rate are printed.

    Raises:
        ValueError: If ``testDigits`` contains no files, which would
            otherwise end in a division by zero.
    """
    hwLabels = []
    # Training data set.
    trainingFileList = listdir('trainingDigits')
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        # The class label is encoded in the file name.
        filestr = fileNameStr.split('.')[0]
        hwLabels.append(int(filestr.split('_')[0]))
        trainingMat[i, :] = imgToVector("trainingDigits/%s" % fileNameStr)

    # Test data set.
    testFileList = listdir("testDigits")
    mTest = len(testFileList)
    if mTest == 0:
        raise ValueError("testDigits contains no files to classify")
    errorCount = 0.0
    for fileNameStr in testFileList:
        filestr = fileNameStr.split('.')[0]
        classNum = int(filestr.split('_')[0])
        # Read the image from the test directory, not the training one.
        vectorUnderTest = imgToVector("testDigits/%s" % fileNameStr)
        classifierResult = classfiy0(vectorUnderTest, trainingMat, hwLabels, 3)
        print("the classifier came back with: %d, the real answer is : %d" % (classifierResult, classNum))
        if classifierResult != classNum:
            errorCount += 1.0
    print("the total number of errors is : %d" % errorCount)
    print("the total error rate is : %f" % (errorCount / float(mTest)))


# Run the test only when executed as a script, not on import.
if __name__ == "__main__":
    handwritingClassTest()
个人问题及总结: