本人是一名数学系研究生,于2017年底第一次接触python和机器学习,作为一名新手,欢迎与大家交流。
我主要给大家讲解代码,理论部分给大家推荐3本书:
《机器学习实战中文版》
《机器学习》周志华
《统计学习方法》李航
以上3本书,第一本是基于python2的代码实现;剩余两本主要作为第一本书理论省略部分的补充,理论大部分都讲得很细。
博客上关于机器学习实战理论解释都很多,参差不齐,好作品也大都借鉴了以上3本书,网上有很多电子版的书。
与其看看一些没用的博客,真心不如以上3本书有收获。
说实话,学习一定要静下心来,切忌浮躁。不懂可以每天看一点,每天你懂一点,天天积累就多了。
操作系统:windows8.1
python版本:python3.6
运行环境:spyder(anaconda)
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 16 21:38:34 2018
@author: Loulch C.C
"""
from numpy import *
def loadDataSet(fileName):
    """
    Load a tab-delimited numeric data file.

    fileName: path to the file; every line holds tab-separated numbers.
    Returns: list of rows, each row a list of floats.
    """
    dataMat = []
    # 'with' guarantees the handle is closed even on error
    # (the original opened the file and never closed it).
    with open(fileName) as fr:
        for line in fr.readlines():
            curLine = line.strip().split('\t')
            # Python 3 map() returns an iterator, so materialize with list()
            fltLine = list(map(float, curLine))
            dataMat.append(fltLine)
    return dataMat
def binSplitDataSet(dataSet, feature, value):
    """
    Binary-split a data matrix on one feature.

    dataSet: numpy matrix of samples.
    feature: column index to split on.
    value:   threshold for the split.
    Returns (mat0, mat1): rows where feature > value, and rows where
    feature <= value.  nonzero(...)[0] gives the satisfying row indices;
    fancy indexing then copies those rows into each subset.
    """
    aboveIdx = nonzero(dataSet[:, feature] > value)[0]
    belowIdx = nonzero(dataSet[:, feature] <= value)[0]
    return dataSet[aboveIdx, :], dataSet[belowIdx, :]
def regLeaf(dataSet):
    """
    Create a regression-tree leaf: the mean of the target (last)
    column of the samples that reach this leaf.  Called by
    chooseBestSplit() when it decides not to split further.
    """
    return dataSet[:, -1].mean()
def regErr(dataSet):
    """
    Total squared error of the target (last) column: population
    variance times the number of samples.
    """
    sampleCount = shape(dataSet)[0]
    return var(dataSet[:, -1]) * sampleCount
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    """
    Find the best binary split of dataSet for a regression tree.

    dataSet:  numpy matrix; last column is the target variable.
    leafType: function that creates a leaf value when no split is made.
    errType:  error measure (total squared error for regression trees).
    ops:      (tolS, tolN) user-set pre-pruning parameters.

    Returns (None, leaf value) when no worthwhile split exists (three
    early-exit cases below); otherwise (best feature index, best value).
    """
    tolS = ops[0] # minimum error reduction required to accept a split
    tolN = ops[1] # minimum number of samples allowed in each child
    # tolS and tolN together act as a form of pre-pruning
    ############ exit 1: all target values are identical ############
    if len(set(dataSet[:,-1].T.tolist()[0])) == 1:
        # collect the distinct target values in a set; if only one
        # remains there is nothing to gain by splitting
        return None,leafType(dataSet)
    ####################################################################################
    m,n = shape(dataSet)
    S = errType(dataSet)
    # S is compared against each candidate split's error to check
    # whether splitting actually lowers the error
    bestS = inf; bestIndex = 0; bestValue = 0
    for featIndex in range(n-1): # every feature (last column is the target)
        # for splitVal in set(dataSet[:,featIndex]): raises in python3, rewritten below
        for splitVal in set((dataSet[:, featIndex].T.A.tolist())[0]): # every distinct value of this feature
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal) # candidate binary split
            if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN): continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS: # keep the split with the smallest total error
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS
    if (S - bestS) < tolS:
        # exit 2: the best split barely reduces the error -> make a leaf
        return None,leafType(dataSet)
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
        # exit 3: a child would hold fewer than tolN samples -> make a leaf
        return None, leafType(dataSet)
    return bestIndex,bestValue # feature index and value of the chosen split
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    """
    Recursively build a regression/model tree.

    dataSet:  numpy matrix; last column is the target variable.
    leafType: function that creates a leaf value.
    errType:  error-measure function.
    ops:      (tolS, tolN) pre-pruning parameters, passed through.
    Returns a leaf value when chooseBestSplit() declines to split,
    otherwise a dict {'spInd', 'spVal', 'left', 'right'}.
    """
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    # chooseBestSplit signals "make a leaf" with feat == None;
    # test identity with 'is None' rather than '== None'
    if feat is None:
        return val
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    # split the data and recurse into both children
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    retTree['left'] = createTree(lSet, leafType, errType, ops)
    retTree['right'] = createTree(rSet, leafType, errType, ops)
    return retTree
"""
if __name__=='__main__':
myDat=loadDataSet('ex00.txt')
myMat=mat(myDat)
retTree=createTree(myMat)
print(retTree)
myDat1=loadDataSet('ex0.txt')
myMat1=mat(myDat1)
retTree1=createTree(myMat1)
print(retTree1)
#"""
import matplotlib.pyplot as plt
def showDataSet():
    """
    Scatter-plot the three sample data sets (ex00.txt, ex0.txt, ex2.txt)
    as a single column of three subplots.
    """
    fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(6, 19))
    # (file, x-column, y-column) for each panel, top to bottom
    panels = [('ex00.txt', 0, 1), ('ex0.txt', 1, 2), ('ex2.txt', 0, 1)]
    for ax, (fname, xCol, yCol) in zip(axs, panels):
        dataMat = mat(loadDataSet(fname))
        ax.scatter(dataMat[:, xCol].flatten().A[0],
                   dataMat[:, yCol].flatten().A[0], c='blue')
    plt.show()
"""
if __name__=='__main__':
showDataSet()
#"""
def isTree(obj):
    """
    Return True when obj is an internal tree node (stored as a dict),
    False when it is a leaf value.
    """
    # isinstance is the idiomatic type check; it also accepts dict
    # subclasses, which the original type(obj).__name__ == 'dict'
    # comparison would wrongly reject
    return isinstance(obj, dict)
#
def getMean(tree):
    """
    Collapse a (sub)tree to a single value: recursively replace each
    child subtree by its mean, then return the average of the two
    children.  Mutates tree in place while descending.
    """
    for side in ('right', 'left'):
        if isTree(tree[side]):
            tree[side] = getMean(tree[side])
    return (tree['left'] + tree['right']) / 2.0
#
def prune(tree, testData):
    """
    Post-prune a regression tree against held-out test data.

    tree:     tree built by createTree().
    testData: test samples used to decide whether merging two leaves
              lowers the error.
    Returns the (possibly pruned) tree, or a single leaf value when an
    entire subtree is collapsed.
    """
    if shape(testData)[0] == 0: return getMean(tree) # no test data reaches this node: collapse the subtree to its mean
    if (isTree(tree['right']) or isTree(tree['left'])): # at least one child is still a subtree
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
    if isTree(tree['left']): tree['left'] = prune(tree['left'], lSet)
    if isTree(tree['right']): tree['right'] = prune(tree['right'], rSet)
    if not isTree(tree['left']) and not isTree(tree['right']): # after pruning, are both children plain leaves?
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
        errorNoMerge = sum(power(lSet[:, -1] - tree['left'], 2)) + \
        sum(power(rSet[:, -1] - tree['right'], 2)) # test error if the split is kept
        treeMean = (tree['left'] + tree['right']) / 2.0
        errorMerge = sum(power(testData[:, -1] - treeMean, 2)) # test error if the two leaves are merged
        if errorMerge < errorNoMerge: # merging lowers the test error
            print("merging")
            return treeMean
        else:
            return tree
    else:
        return tree
"""
if __name__ == '__main__':
myDat2=loadDataSet('ex2.txt')
#print('myDat2=',myDat2)
myMat2=mat(myDat2)
myTree=createTree(myMat2,ops=(0,1))
print('myTree=',myTree)
myDat2Test=loadDataSet('ex2test.txt')
myMat2Test=mat(myDat2Test)
print('prune(myTree,myMat2Test)=',prune(myTree,myMat2Test))
#"""
# --- Model trees ---
def linearSolve(dataSet):
    """
    Format dataSet into X (features with a leading bias column of ones)
    and Y (last column), then fit ordinary least squares.

    Returns (ws, X, Y) where ws are the regression weights.
    Raises NameError when X'X is singular (not invertible).
    """
    m, n = shape(dataSet)
    X = mat(ones((m, n)))
    Y = mat(ones((m, 1)))
    X[:, 1:n] = dataSet[:, 0:n-1]  # column 0 stays all-ones (bias term)
    Y = dataSet[:, -1]
    xTx = X.T * X
    # the normal equations require an invertible X'X
    if linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannot do inverse,\
try increasing the second value of ops')
    ws = xTx.I * (X.T * Y)  # normal-equation solution
    return ws, X, Y
def modelLeaf(dataSet):
    """
    Leaf generator for model trees: fit a linear model on the data
    that reaches the leaf and return its weight vector.
    """
    ws, _, _ = linearSolve(dataSet)
    return ws
def modelErr(dataSet):
    """
    Error measure for model trees: total squared residual of the
    linear fit on dataSet (used by chooseBestSplit to pick splits).
    """
    ws, X, Y = linearSolve(dataSet)
    residual = Y - X * ws
    return sum(power(residual, 2))
"""
if __name__ == '__main__':
myDat2=loadDataSet('exp2.txt')
#print('myDat2=',myDat2)
myMat2=mat(myDat2)
myTree=createTree(myMat2,modelLeaf,modelErr,ops=(1,10))
print('myTree=',myTree)
plt.scatter(myMat2[:,0].flatten().A[0],myMat2[:,1].flatten().A[0], c = 'blue')
#"""
# --- Prediction with tree regression ---
def regTreeEval(model, inDat):
    """
    Evaluate a regression-tree leaf: the stored value itself is the
    prediction.  inDat is unused; the parameter exists so the signature
    matches modelTreeEval.
    """
    return float(model)
def modelTreeEval(model, inDat):
    """
    Evaluate a model-tree leaf: prepend a bias term of 1 to the input
    row and return the linear prediction X * ws as a float.
    """
    n = shape(inDat)[1]
    # build the row vector [1, inDat...] expected by the stored weights
    X = mat(ones((1, n + 1)))
    X[:, 1:n + 1] = inDat
    return float(X * model)
#
def treeForeCast(tree, inData, modelEval=regTreeEval):
    """
    Predict one value for a single input row by walking the tree from
    the root to a leaf, then evaluating the leaf with modelEval
    (regTreeEval for regression trees, modelTreeEval for model trees).
    """
    if not isTree(tree):
        # the whole "tree" is already a bare leaf
        return modelEval(tree, inData)
    # descend left when the split feature exceeds the split value
    branch = 'left' if inData[tree['spInd']] > tree['spVal'] else 'right'
    child = tree[branch]
    if isTree(child):
        return treeForeCast(child, inData, modelEval)
    return modelEval(child, inData)
# Apply the tree model to a whole data set
def createForeCast(tree, testData, modelEval=regTreeEval):
    """
    Predict every row of testData with the given tree.
    Returns an m x 1 matrix of predictions.
    """
    m = len(testData)
    yHat = mat(zeros((m, 1)))
    for i in range(m):
        row = mat(testData[i])
        yHat[i, 0] = treeForeCast(tree, row, modelEval)
    return yHat
"""
if __name__ == '__main__':
trainMat=mat(loadDataSet('bikeSpeedVsIq_train.txt'))
testMat=mat(loadDataSet('bikeSpeedVsIq_test.txt'))
###################################画散点图####################################
fig=plt.figure()
ax=fig.add_subplot(111)
ax.scatter(testMat[:,0].A.reshape(1,-1),testMat[:,1].A.reshape(1,-1), c = 'blue')
plt.xlabel('speed of riding')
plt.ylabel('IQ')
plt.show()
###############################################################################
myTree=createTree(trainMat,ops=(1,20))
# print('myTree=',myTree)
# myTree= {'spInd': 0, 'spVal': 10.0, 'left': {'spInd': 0, 'spVal': 17.0, 'left':
# {'spInd': 0, 'spVal': 20.0, 'left': 168.34161286956524, 'right': 157.0484078846154},
# 'right': {'spInd': 0, 'spVal': 14.0, 'left': 141.06067981481482,
# 'right': 122.90893026923078}}, 'right': {'spInd': 0, 'spVal': 7.0,
# 'left': 94.7066578125, 'right': {'spInd': 0, 'spVal': 5.0,
# 'left': 69.02117757692308,
# 'right': 50.94683665}}}
yHat=createForeCast(myTree,testMat[:,0])
print('回归树的相关性R^2:',corrcoef(yHat,testMat[:,1],rowvar=0)[0,1])
# 回归树的相关性R^2: 0.9640852318222145
#corrcoef()计算预测值和实际值的相关性R^2,R^2越接近1越好。
myTree1=createTree(trainMat,modelLeaf,modelErr,ops=(1,20))
yHat=createForeCast(myTree1,testMat[:,0],modelTreeEval)
print('模型树的相关性R^2:',corrcoef(yHat,testMat[:,1],rowvar=0)[0,1])
# 模型树的相关性R^2: 0.9760412191380623
ws,X,Y=linearSolve(trainMat)
print('ws=',ws)
# ws= [[37.58916794]
# [ 6.18978355]]
for i in range(shape(testMat)[0]):
yHat[i]=testMat[i,0]*ws[1,0]+ws[0,0]
print('标准线性回归的相关性R^2:',corrcoef(yHat,testMat[:,1],rowvar=0)[0,1])
# 标准线性回归的相关性R^2: 0.9434684235674762
#"""
"""
#python Tkinter库创建GUI
import regTrees
import matplotlib
matplotlib.use('TkAgg') #设置后端TkAgg
#将TkAgg和matplotlib链接起来
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
from matplotlib.figure import Figure
def reDraw(tolS, tolN):
reDraw.f.clf() #清空之前的图像
reDraw.a = reDraw.f.add_subplot(111)#重新添加新图
if chkBtnVar.get():#检查选框model tree是否被选中
if tolN < 2: tolN = 2
myTree = regTrees.createTree(reDraw.rawDat, regTrees.modelLeaf,regTrees.modelErr, (tolS, tolN))
yHat = regTrees.createForeCast(myTree, reDraw.testDat, regTrees.modelTreeEval)
else:
myTree = regTrees.createTree(reDraw.rawDat, ops=(tolS, tolN))
yHat = regTrees.createForeCast(myTree, reDraw.testDat)
reDraw.a.scatter(reDraw.rawDat[:, 0], reDraw.rawDat[:, 1], s=5) # 绘制真实值
reDraw.a.plot(reDraw.testDat, yHat, linewidth=2.0) # 绘制预测值
reDraw.canvas.show()
def getInputs():#获取输入
try:#期望输入是整数
tolN = int(tolNentry.get())
except:#清楚错误用默认值替换
tolN = 10
print("enter Integer for tolN")
tolNentry.delete(0, END)
tolNentry.insert(0, '10')
try:#期望输入是浮点数
tolS = float(tolSentry.get())
except:
tolS = 1.0
print("enter Float for tolS")
tolSentry.delete(0, END)
tolSentry.insert(0, '1.0')
return tolN, tolS
def drawNewTree():
tolN, tolS = getInputs() # 从输入文本框中获取参数
reDraw(tolS, tolN) #绘制图
root = Tk()
reDraw.f = Figure(figsize=(5, 4), dpi=100) # 创建画布
reDraw.canvas = FigureCanvasTkAgg(reDraw.f, master=root)
reDraw.canvas.show()
reDraw.canvas.get_tk_widget().grid(row=0, columnspan=3)
Label(root, text="tolN").grid(row=1, column=0)
tolNentry = Entry(root)
tolNentry.grid(row=1, column=1)
tolNentry.insert(0, '10')
Label(root, text="tolS").grid(row=2, column=0)
tolSentry = Entry(root)
tolSentry.grid(row=2, column=1)
tolSentry.insert(0, '1.0')
Button(root, text="ReDraw", command=drawNewTree).grid(row=1, column=2, rowspan=3)
chkBtnVar = IntVar()
chkBtn = Checkbutton(root, text="Model Tree", variable=chkBtnVar)
chkBtn.grid(row=3, column=0, columnspan=2)
reDraw.rawDat = mat(regTrees.loadDataSet('sine.txt'))
reDraw.testDat = arange(min(reDraw.rawDat[:, 0]), max(reDraw.rawDat[:, 0]), 0.01)
reDraw(1.0, 10)
root.mainloop()
"""