# CART
import matplotlib.pyplot as plt
import numpy as np
def loadDataSet(fileName):
    """
    Desc:
        Load a tab-separated numeric data file.
    Parameters:
        fileName - path of the file; each non-blank line holds tab-separated
                   fields that are converted with float()
    Returns:
        dataMat - list of rows, each row a list of floats
    Raises:
        ValueError - if a field cannot be parsed as a float
    """
    dataMat = []
    # The context manager closes the handle even when parsing fails; the
    # previous version never closed the file.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat
def binSplitDataSet(dataSet, feature, value):
    """
    Desc:
        Split a data set in two by thresholding one feature column.
    Parameters:
        dataSet - 2-D data set, one sample per row
        feature - index of the column to split on
        value - threshold compared against that column
    Returns:
        mat0 - rows whose feature value is greater than value
        mat1 - rows whose feature value is less than or equal to value
    """
    column = dataSet[:, feature]
    # np.nonzero(...)[0] yields the row indices where the condition holds.
    above_rows = np.nonzero(column > value)[0]
    not_above_rows = np.nonzero(column <= value)[0]
    return dataSet[above_rows, :], dataSet[not_above_rows, :]
def regLeaf(dataSet):
    """
    Desc:
        Build a regression-tree leaf value.
    Parameters:
        dataSet - 2-D data set whose last column is the target variable
    Returns:
        mean of the target column
    """
    targets = dataSet[:, -1]
    return np.mean(targets)
def regErr(dataSet):
    """
    Desc:
        Error measure for a regression-tree node.
    Parameters:
        dataSet - 2-D data set whose last column is the target variable
    Returns:
        variance of the target column multiplied by the number of rows
        (the total squared deviation from the mean)
    """
    row_count = np.shape(dataSet)[0]
    target_variance = np.var(dataSet[:, -1])
    return target_variance * row_count
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """
    Desc:
        Search every feature/value pair for the binary split with the lowest
        total error.
    Parameters:
        dataSet - data matrix (the code relies on matrix-only attributes such
                  as .A); the last column is the target
        leafType - function that builds a leaf from a data set
        errType - function that measures the error of a data set
        ops - (tolS, tolN): minimum error reduction required to accept a
              split, and minimum number of rows allowed on each side
    Returns:
        (None, leafType(dataSet)) when the node should become a leaf,
        otherwise (bestIndex, bestValue): the feature index and threshold of
        the chosen split
    """
    tolS, tolN = ops[0], ops[1]

    def too_small(part0, part1):
        # True when either side of a split has fewer than tolN rows.
        return (np.shape(part0)[0] < tolN) or (np.shape(part1)[0] < tolN)

    # All targets identical: nothing to gain from splitting.
    if len(set(dataSet[:, -1].T.tolist()[0])) == 1:
        return None, leafType(dataSet)

    _, n = np.shape(dataSet)
    S = errType(dataSet)
    bestS, bestIndex, bestValue = float('inf'), 0, 0
    for featIndex in range(n - 1):
        candidates = set(dataSet[:, featIndex].T.A.tolist()[0])
        for splitVal in candidates:
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            if too_small(mat0, mat1):
                continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestIndex, bestValue, bestS = featIndex, splitVal, newS

    # Reject the split when the error reduction is below the tolerance.
    if (S - bestS) < tolS:
        return None, leafType(dataSet)
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    if too_small(mat0, mat1):
        return None, leafType(dataSet)
    return bestIndex, bestValue
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """
    Desc:
        Recursively build a tree by repeatedly choosing the best split.
    Parameters:
        dataSet - data matrix; the last column is the target
        leafType - function that builds a leaf from a data set
        errType - function that measures the error of a data set
        ops - tuple of user-defined parameters forwarded to chooseBestSplit
    Returns:
        retTree - a leaf value when no split is chosen, otherwise a dict with
                  keys 'spInd', 'spVal', 'left' and 'right'
    """
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    # Identity test: None means "make a leaf"; feature index 0 is a valid
    # split and must never be confused with it.
    if feat is None:
        return val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    return {
        'spInd': feat,
        'spVal': val,
        'left': createTree(lSet, leafType, errType, ops),
        'right': createTree(rSet, leafType, errType, ops),
    }
def plotDataSet(filename):
    """
    Desc:
        Scatter-plot the first two columns of a data file and show the figure.
    Parameters:
        filename - file name, read with loadDataSet
    Returns:
        None
    """
    rows = loadDataSet(filename)
    xcord = [row[0] for row in rows]
    ycord = [row[1] for row in rows]
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord, ycord, s=20, c='blue', alpha=.5)
    plt.title('DataSet')
    plt.xlabel('X')
    plt.show()
if __name__ == '__main__':
    # Build a regression tree from ex2.txt and print its nested-dict form.
    # np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed.
    myData = loadDataSet('ex2.txt')
    myMat = np.asmatrix(myData)
    print(createTree(myMat))
import matplotlib.pyplot as plt
import numpy as np
def loadDataSet(fileName):
    """
    Desc:
        Load a tab-separated numeric data file.
    Parameters:
        fileName - path of the file; each non-blank line holds tab-separated
                   fields that are converted with float()
    Returns:
        dataMat - list of rows, each row a list of floats
    Raises:
        ValueError - if a field cannot be parsed as a float
    """
    dataMat = []
    # The context manager closes the handle even when parsing fails; the
    # previous version never closed the file.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat
def plotDataSet(filename):
    """
    Desc:
        Scatter-plot column 1 against column 2 of a data file and show the
        figure (column 0 is not plotted).
    Parameters:
        filename - file name, read with loadDataSet
    Returns:
        None
    """
    rows = loadDataSet(filename)
    xcord = [row[1] for row in rows]
    ycord = [row[2] for row in rows]
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord, ycord, s=20, c='blue', alpha=.5)
    plt.title('DataSet')
    plt.xlabel('X')
    plt.show()
if __name__ == '__main__':
    # Show the scatter plot for the ex0.txt sample data.
    plotDataSet('ex0.txt')
import matplotlib.pyplot as plt
import numpy as np
import types
def loadDataSet(fileName):
    """
    Desc:
        Load a tab-separated numeric data file.
    Parameters:
        fileName - path of the file; each non-blank line holds tab-separated
                   fields that are converted with float()
    Returns:
        dataMat - list of rows, each row a list of floats
    Raises:
        ValueError - if a field cannot be parsed as a float
    """
    dataMat = []
    # The context manager closes the handle even when parsing fails; the
    # previous version never closed the file.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat
def plotDataSet(filename):
    """
    Desc:
        Scatter-plot the first two columns of a data file and show the figure.
    Parameters:
        filename - file name, read with loadDataSet
    Returns:
        None
    """
    rows = loadDataSet(filename)
    xcord = [row[0] for row in rows]
    ycord = [row[1] for row in rows]
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord, ycord, s=20, c='blue', alpha=.5)
    plt.title('DataSet')
    plt.xlabel('X')
    plt.show()
def binSplitDataSet(dataSet, feature, value):
    """
    Desc:
        Split a data set in two by thresholding one feature column.
    Parameters:
        dataSet - 2-D data set, one sample per row
        feature - index of the column to split on
        value - threshold compared against that column
    Returns:
        mat0 - rows whose feature value is greater than value
        mat1 - rows whose feature value is less than or equal to value
    """
    column = dataSet[:, feature]
    # np.nonzero(...)[0] yields the row indices where the condition holds.
    above_rows = np.nonzero(column > value)[0]
    not_above_rows = np.nonzero(column <= value)[0]
    return dataSet[above_rows, :], dataSet[not_above_rows, :]
def regLeaf(dataSet):
    """
    Desc:
        Build a regression-tree leaf value.
    Parameters:
        dataSet - 2-D data set whose last column is the target variable
    Returns:
        mean of the target column
    """
    targets = dataSet[:, -1]
    return np.mean(targets)
def regErr(dataSet):
    """
    Desc:
        Error measure for a regression-tree node.
    Parameters:
        dataSet - 2-D data set whose last column is the target variable
    Returns:
        variance of the target column multiplied by the number of rows
        (the total squared deviation from the mean)
    """
    row_count = np.shape(dataSet)[0]
    target_variance = np.var(dataSet[:, -1])
    return target_variance * row_count
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """
    Desc:
        Search every feature/value pair for the binary split with the lowest
        total error; the tolS/tolN checks act as pre-pruning.
    Parameters:
        dataSet - data matrix (the code relies on matrix-only attributes such
                  as .A); the last column is the target
        leafType - function that builds a leaf from a data set
        errType - function that measures the error of a data set
        ops - (tolS, tolN): minimum error reduction required to accept a
              split, and minimum number of rows allowed on each side
    Returns:
        (None, leafType(dataSet)) when the node should become a leaf,
        otherwise (bestIndex, bestValue): the feature index and threshold of
        the chosen split
    """
    tolS, tolN = ops[0], ops[1]

    def too_small(part0, part1):
        # True when either side of a split has fewer than tolN rows.
        return (np.shape(part0)[0] < tolN) or (np.shape(part1)[0] < tolN)

    # All targets identical: nothing to gain from splitting.
    if len(set(dataSet[:, -1].T.tolist()[0])) == 1:
        return None, leafType(dataSet)

    _, n = np.shape(dataSet)
    S = errType(dataSet)
    bestS, bestIndex, bestValue = float('inf'), 0, 0
    for featIndex in range(n - 1):
        candidates = set(dataSet[:, featIndex].T.A.tolist()[0])
        for splitVal in candidates:
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            if too_small(mat0, mat1):
                continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestIndex, bestValue, bestS = featIndex, splitVal, newS

    # Reject the split when the error reduction is below the tolerance.
    if (S - bestS) < tolS:
        return None, leafType(dataSet)
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    if too_small(mat0, mat1):
        return None, leafType(dataSet)
    return bestIndex, bestValue
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """
    Desc:
        Recursively build a tree by repeatedly choosing the best split.
    Parameters:
        dataSet - data matrix; the last column is the target
        leafType - function that builds a leaf from a data set
        errType - function that measures the error of a data set
        ops - tuple of user-defined parameters forwarded to chooseBestSplit
    Returns:
        retTree - a leaf value when no split is chosen, otherwise a dict with
                  keys 'spInd', 'spVal', 'left' and 'right'
    """
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    # Identity test: None means "make a leaf"; feature index 0 is a valid
    # split and must never be confused with it.
    if feat is None:
        return val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    return {
        'spInd': feat,
        'spVal': val,
        'left': createTree(lSet, leafType, errType, ops),
        'right': createTree(rSet, leafType, errType, ops),
    }
def isTree(obj):
    """
    Desc:
        Tell whether an object is a tree node; trees are stored as dicts.
    Parameters:
        obj - object to test
    Returns:
        True if obj is a dict (or an instance of a dict subclass), else False
    """
    # isinstance replaces the type-name string comparison, which rejected
    # dict subclasses and would accept any unrelated class named 'dict'.
    return isinstance(obj, dict)
def getMean(tree):
    """
    Desc:
        Collapse a tree to a single value: the mean of its two children, with
        sub-trees collapsed recursively first. The input tree is not modified.
    Parameters:
        tree - tree node (dict with 'left' and 'right' entries)
    Returns:
        mean of the collapsed left and right values
    """
    # Work on local values instead of writing the collapsed means back into
    # the dict, so a caller that still holds the tree keeps its structure.
    right = getMean(tree['right']) if isTree(tree['right']) else tree['right']
    left = getMean(tree['left']) if isTree(tree['left']) else tree['left']
    return (left + right) / 2.0
def prune(tree, testData):
    """
    Desc:
        Post-prune a tree: merge two sibling leaves into their mean whenever
        that lowers the squared error on the test data. Child entries of
        tree are replaced in place.
    Parameters:
        tree - tree node (dict) to prune
        testData - test set routed to this node; the last column is the target
    Returns:
        the pruned tree, or a single value when the node was collapsed
    """
    # No test rows reach this node: collapse the whole subtree to its mean.
    if np.shape(testData)[0] == 0:
        return getMean(tree)

    lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
    if isTree(tree['left']):
        tree['left'] = prune(tree['left'], lSet)
    if isTree(tree['right']):
        tree['right'] = prune(tree['right'], rSet)

    # Merging is only considered once both children are leaves.
    if isTree(tree['left']) or isTree(tree['right']):
        return tree

    left_err = np.sum(np.power(lSet[:, -1] - tree['left'], 2))
    right_err = np.sum(np.power(rSet[:, -1] - tree['right'], 2))
    errorNoMerge = left_err + right_err
    treeMean = (tree['left'] + tree['right']) / 2.0
    errorMerge = np.sum(np.power(testData[:, -1] - treeMean, 2))
    return treeMean if errorMerge < errorNoMerge else tree
if __name__ == '__main__':
    # Build a tree on the training file, then post-prune it with the test
    # file, printing the tree before and after.
    # np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed.
    train_filename = 'ex2.txt'
    train_Data = loadDataSet(train_filename)
    train_Mat = np.asmatrix(train_Data)
    tree = createTree(train_Mat)
    print("剪枝前:", tree)
    test_filename = 'ex2test.txt'
    test_Data = loadDataSet(test_filename)
    test_Mat = np.asmatrix(test_Data)
    print("\n剪枝后:", prune(tree, test_Mat))
import matplotlib.pyplot as plt
import numpy as np
import types
def loadDataSet(fileName):
    """
    Desc:
        Load a tab-separated numeric data file.
    Parameters:
        fileName - path of the file; each non-blank line holds tab-separated
                   fields that are converted with float()
    Returns:
        dataMat - list of rows, each row a list of floats
    Raises:
        ValueError - if a field cannot be parsed as a float
    """
    dataMat = []
    # The context manager closes the handle even when parsing fails; the
    # previous version never closed the file.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat
def plotDataSet(filename):
    """
    Desc:
        Scatter-plot the first two columns of a data file. The figure is
        drawn but not shown here (this function does not call plt.show()).
    Parameters:
        filename - file name, read with loadDataSet
    Returns:
        None
    """
    rows = loadDataSet(filename)
    xcord = [row[0] for row in rows]
    ycord = [row[1] for row in rows]
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord, ycord, s=20, c='blue', alpha=.5)
    plt.title('DataSet')
    plt.xlabel('X')
def binSplitDataSet(dataSet, feature, value):
    """
    Desc:
        Split a data set in two by thresholding one feature column.
    Parameters:
        dataSet - 2-D data set, one sample per row
        feature - index of the column to split on
        value - threshold compared against that column
    Returns:
        mat0 - rows whose feature value is greater than value
        mat1 - rows whose feature value is less than or equal to value
    """
    column = dataSet[:, feature]
    # np.nonzero(...)[0] yields the row indices where the condition holds.
    above_rows = np.nonzero(column > value)[0]
    not_above_rows = np.nonzero(column <= value)[0]
    return dataSet[above_rows, :], dataSet[not_above_rows, :]
def regLeaf(dataSet):
    """
    Desc:
        Build a regression-tree leaf value.
    Parameters:
        dataSet - 2-D data set whose last column is the target variable
    Returns:
        mean of the target column
    """
    targets = dataSet[:, -1]
    return np.mean(targets)
def regErr(dataSet):
    """
    Desc:
        Error measure for a regression-tree node.
    Parameters:
        dataSet - 2-D data set whose last column is the target variable
    Returns:
        variance of the target column multiplied by the number of rows
        (the total squared deviation from the mean)
    """
    row_count = np.shape(dataSet)[0]
    target_variance = np.var(dataSet[:, -1])
    return target_variance * row_count
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """
    Desc:
        Search every feature/value pair for the binary split with the lowest
        total error; the tolS/tolN checks act as pre-pruning.
    Parameters:
        dataSet - data matrix (the code relies on matrix-only attributes such
                  as .A); the last column is the target
        leafType - function that builds a leaf from a data set
        errType - function that measures the error of a data set
        ops - (tolS, tolN): minimum error reduction required to accept a
              split, and minimum number of rows allowed on each side
    Returns:
        (None, leafType(dataSet)) when the node should become a leaf,
        otherwise (bestIndex, bestValue): the feature index and threshold of
        the chosen split
    """
    tolS, tolN = ops[0], ops[1]

    def too_small(part0, part1):
        # True when either side of a split has fewer than tolN rows.
        return (np.shape(part0)[0] < tolN) or (np.shape(part1)[0] < tolN)

    # All targets identical: nothing to gain from splitting.
    if len(set(dataSet[:, -1].T.tolist()[0])) == 1:
        return None, leafType(dataSet)

    _, n = np.shape(dataSet)
    S = errType(dataSet)
    bestS, bestIndex, bestValue = float('inf'), 0, 0
    for featIndex in range(n - 1):
        candidates = set(dataSet[:, featIndex].T.A.tolist()[0])
        for splitVal in candidates:
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            if too_small(mat0, mat1):
                continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestIndex, bestValue, bestS = featIndex, splitVal, newS

    # Reject the split when the error reduction is below the tolerance.
    if (S - bestS) < tolS:
        return None, leafType(dataSet)
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    if too_small(mat0, mat1):
        return None, leafType(dataSet)
    return bestIndex, bestValue
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """
    Desc:
        Recursively build a tree by repeatedly choosing the best split.
    Parameters:
        dataSet - data matrix; the last column is the target
        leafType - function that builds a leaf from a data set
        errType - function that measures the error of a data set
        ops - tuple of user-defined parameters forwarded to chooseBestSplit
    Returns:
        retTree - a leaf value when no split is chosen, otherwise a dict with
                  keys 'spInd', 'spVal', 'left' and 'right'
    """
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    # Identity test: None means "make a leaf"; feature index 0 is a valid
    # split and must never be confused with it.
    if feat is None:
        return val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    return {
        'spInd': feat,
        'spVal': val,
        'left': createTree(lSet, leafType, errType, ops),
        'right': createTree(rSet, leafType, errType, ops),
    }
def isTree(obj):
    """
    Desc:
        Tell whether an object is a tree node; trees are stored as dicts.
    Parameters:
        obj - object to test
    Returns:
        True if obj is a dict (or an instance of a dict subclass), else False
    """
    # isinstance replaces the type-name string comparison, which rejected
    # dict subclasses and would accept any unrelated class named 'dict'.
    return isinstance(obj, dict)
def getMean(tree):
    """
    Desc:
        Collapse a tree to a single value: the mean of its two children, with
        sub-trees collapsed recursively first. The input tree is not modified.
    Parameters:
        tree - tree node (dict with 'left' and 'right' entries)
    Returns:
        mean of the collapsed left and right values
    """
    # Work on local values instead of writing the collapsed means back into
    # the dict, so a caller that still holds the tree keeps its structure.
    right = getMean(tree['right']) if isTree(tree['right']) else tree['right']
    left = getMean(tree['left']) if isTree(tree['left']) else tree['left']
    return (left + right) / 2.0
def prune(tree, testData):
    """
    Desc:
        Post-prune a tree: merge two sibling leaves into their mean whenever
        that lowers the squared error on the test data. Child entries of
        tree are replaced in place.
    Parameters:
        tree - tree node (dict) to prune
        testData - test set routed to this node; the last column is the target
    Returns:
        the pruned tree, or a single value when the node was collapsed
    """
    # No test rows reach this node: collapse the whole subtree to its mean.
    if np.shape(testData)[0] == 0:
        return getMean(tree)

    # One split serves both the recursion and the merge test; spInd/spVal
    # do not change in between.
    lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
    if isTree(tree['left']):
        tree['left'] = prune(tree['left'], lSet)
    if isTree(tree['right']):
        tree['right'] = prune(tree['right'], rSet)

    # Merging is only considered once both children are leaves.
    if isTree(tree['left']) or isTree(tree['right']):
        return tree

    # Both halves are scored on the target (last) column. The right half
    # previously used column 1, which is only the target for 2-column data.
    left_err = np.sum(np.power(lSet[:, -1] - tree['left'], 2))
    right_err = np.sum(np.power(rSet[:, -1] - tree['right'], 2))
    errorNoMerge = left_err + right_err
    treeMean = (tree['left'] + tree['right']) / 2.0
    errorMerge = np.sum(np.power(testData[:, -1] - treeMean, 2))
    return treeMean if errorMerge < errorNoMerge else tree
def linearSolve(dataSet):
    """
    Desc:
        Ordinary least-squares fit of the last column on the other columns.
    Parameters:
        dataSet - data matrix; the last column is the target
    Returns:
        ws - regression coefficients (intercept first)
        X - feature matrix whose first column is all ones (the intercept)
        Y - target column
    Raises:
        NameError - if X.T * X has a determinant of exactly 0.0
    """
    m, n = np.shape(dataSet)
    # np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed.
    # Column 0 keeps the value 1.0; the features fill columns 1..n-1.
    X = np.asmatrix(np.ones((m, n)))
    X[:, 1:n] = dataSet[:, 0:n-1]
    Y = dataSet[:, -1]
    xTx = X.T * X
    if np.linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannont do inverse,\n'
                        'try increasing the second value of ops')
    ws = xTx.I * (X.T * Y)
    return ws, X, Y
def modelLeaf(dataSet):
    """
    Desc:
        Build a model-tree leaf: the linear-regression coefficients of the
        data set.
    Parameters:
        dataSet - data set passed to linearSolve
    Returns:
        ws - regression coefficients
    """
    coefficients, _features, _targets = linearSolve(dataSet)
    return coefficients
def modelErr(dataSet):
    """
    Desc:
        Error of a linear model fitted to the data set.
    Parameters:
        dataSet - data set passed to linearSolve
    Returns:
        sum of squared residuals between the targets and the fitted values,
        as a scalar
    """
    ws, X, Y = linearSolve(dataSet)
    yHat = X * ws
    # np.sum reduces to a scalar. The builtin sum() iterated the matrix row
    # by row in Python and returned a 1x1 matrix instead.
    return np.sum(np.power(Y - yHat, 2))
if __name__ == '__main__':
    # Fit a model tree to exp2.txt, print it, and overlay the two fitted
    # line segments on the scatter plot.
    # np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed.
    train_filename = 'exp2.txt'
    train_Data = loadDataSet(train_filename)
    dataMat = np.asmatrix(train_Data)
    Tree = createTree(dataMat, modelLeaf, modelErr, (1, 10))
    print(Tree)
    plotDataSet(train_filename)
    # NOTE(review): the overlay assumes the root split has two leaf children
    # (coefficient matrices) and that x spans 0..1 - confirm for other data.
    # Coefficients are read with [row, 0] so float() receives a scalar, not
    # a 1x1 matrix.
    x1 = np.linspace(0, Tree['spVal'])
    plt.plot(x1, float(Tree['right'][1, 0])*x1 + float(Tree['right'][0, 0]), 'r--')
    x2 = np.linspace(Tree['spVal'], 1)
    plt.plot(x2, float(Tree['left'][1, 0])*x2 + float(Tree['left'][0, 0]), 'r--')
    plt.show()
import matplotlib.pyplot as plt
import numpy as np
def loadDataSet(fileName):
    """
    Desc:
        Load a tab-separated numeric data file.
    Parameters:
        fileName - path of the file; each non-blank line holds tab-separated
                   fields that are converted with float()
    Returns:
        dataMat - list of rows, each row a list of floats
    Raises:
        ValueError - if a field cannot be parsed as a float
    """
    dataMat = []
    # The context manager closes the handle even when parsing fails; the
    # previous version never closed the file.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat
def plotDataSet(filename):
    """
    Desc:
        Scatter-plot the first two columns of a data file and show the figure.
    Parameters:
        filename - file name, read with loadDataSet
    Returns:
        None
    """
    rows = loadDataSet(filename)
    xcord = [row[0] for row in rows]
    ycord = [row[1] for row in rows]
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord, ycord, s=20, c='blue', alpha=.5)
    plt.title('DataSet')
    plt.xlabel('X')
    plt.show()
def binSplitDataSet(dataSet, feature, value):
    """
    Desc:
        Split a data set in two by thresholding one feature column.
    Parameters:
        dataSet - 2-D data set, one sample per row
        feature - index of the column to split on
        value - threshold compared against that column
    Returns:
        mat0 - rows whose feature value is greater than value
        mat1 - rows whose feature value is less than or equal to value
    """
    column = dataSet[:, feature]
    # np.nonzero(...)[0] yields the row indices where the condition holds.
    above_rows = np.nonzero(column > value)[0]
    not_above_rows = np.nonzero(column <= value)[0]
    return dataSet[above_rows, :], dataSet[not_above_rows, :]
def regLeaf(dataSet):
    """
    Desc:
        Build a regression-tree leaf value.
    Parameters:
        dataSet - 2-D data set whose last column is the target variable
    Returns:
        mean of the target column
    """
    targets = dataSet[:, -1]
    return np.mean(targets)
def regErr(dataSet):
    """
    Desc:
        Error measure for a regression-tree node.
    Parameters:
        dataSet - 2-D data set whose last column is the target variable
    Returns:
        variance of the target column multiplied by the number of rows
        (the total squared deviation from the mean)
    """
    row_count = np.shape(dataSet)[0]
    target_variance = np.var(dataSet[:, -1])
    return target_variance * row_count
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """
    Desc:
        Search every feature/value pair for the binary split with the lowest
        total error; the tolS/tolN checks act as pre-pruning.
    Parameters:
        dataSet - data matrix (the code relies on matrix-only attributes such
                  as .A); the last column is the target
        leafType - function that builds a leaf from a data set
        errType - function that measures the error of a data set
        ops - (tolS, tolN): minimum error reduction required to accept a
              split, and minimum number of rows allowed on each side
    Returns:
        (None, leafType(dataSet)) when the node should become a leaf,
        otherwise (bestIndex, bestValue): the feature index and threshold of
        the chosen split
    """
    tolS, tolN = ops[0], ops[1]

    def too_small(part0, part1):
        # True when either side of a split has fewer than tolN rows.
        return (np.shape(part0)[0] < tolN) or (np.shape(part1)[0] < tolN)

    # All targets identical: nothing to gain from splitting.
    if len(set(dataSet[:, -1].T.tolist()[0])) == 1:
        return None, leafType(dataSet)

    _, n = np.shape(dataSet)
    S = errType(dataSet)
    bestS, bestIndex, bestValue = float('inf'), 0, 0
    for featIndex in range(n - 1):
        candidates = set(dataSet[:, featIndex].T.A.tolist()[0])
        for splitVal in candidates:
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            if too_small(mat0, mat1):
                continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestIndex, bestValue, bestS = featIndex, splitVal, newS

    # Reject the split when the error reduction is below the tolerance.
    if (S - bestS) < tolS:
        return None, leafType(dataSet)
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    if too_small(mat0, mat1):
        return None, leafType(dataSet)
    return bestIndex, bestValue
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """
    Desc:
        Recursively build a tree by repeatedly choosing the best split.
    Parameters:
        dataSet - data matrix; the last column is the target
        leafType - function that builds a leaf from a data set
        errType - function that measures the error of a data set
        ops - tuple of user-defined parameters forwarded to chooseBestSplit
    Returns:
        retTree - a leaf value when no split is chosen, otherwise a dict with
                  keys 'spInd', 'spVal', 'left' and 'right'
    """
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    # Identity test: None means "make a leaf"; feature index 0 is a valid
    # split and must never be confused with it.
    if feat is None:
        return val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    return {
        'spInd': feat,
        'spVal': val,
        'left': createTree(lSet, leafType, errType, ops),
        'right': createTree(rSet, leafType, errType, ops),
    }
def isTree(obj):
    """
    Desc:
        Tell whether an object is a tree node; trees are stored as dicts.
    Parameters:
        obj - object to test
    Returns:
        True if obj is a dict (or an instance of a dict subclass), else False
    """
    # isinstance replaces the type-name string comparison, which rejected
    # dict subclasses and would accept any unrelated class named 'dict'.
    return isinstance(obj, dict)
def getMean(tree):
    """
    Desc:
        Collapse a tree to a single value: the mean of its two children, with
        sub-trees collapsed recursively first. The input tree is not modified.
    Parameters:
        tree - tree node (dict with 'left' and 'right' entries)
    Returns:
        mean of the collapsed left and right values
    """
    # Work on local values instead of writing the collapsed means back into
    # the dict, so a caller that still holds the tree keeps its structure.
    right = getMean(tree['right']) if isTree(tree['right']) else tree['right']
    left = getMean(tree['left']) if isTree(tree['left']) else tree['left']
    return (left + right) / 2.0
def prune(tree, testData):
    """
    Desc:
        Post-prune a tree: merge two sibling leaves into their mean whenever
        that lowers the squared error on the test data. Child entries of
        tree are replaced in place.
    Parameters:
        tree - tree node (dict) to prune
        testData - test set routed to this node; the last column is the target
    Returns:
        the pruned tree, or a single value when the node was collapsed
    """
    # No test rows reach this node: collapse the whole subtree to its mean.
    if np.shape(testData)[0] == 0:
        return getMean(tree)

    # One split serves both the recursion and the merge test; spInd/spVal
    # do not change in between.
    lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
    if isTree(tree['left']):
        tree['left'] = prune(tree['left'], lSet)
    if isTree(tree['right']):
        tree['right'] = prune(tree['right'], rSet)

    # Merging is only considered once both children are leaves.
    if isTree(tree['left']) or isTree(tree['right']):
        return tree

    # Both halves are scored on the target (last) column. The right half
    # previously used column 1, which is only the target for 2-column data.
    left_err = np.sum(np.power(lSet[:, -1] - tree['left'], 2))
    right_err = np.sum(np.power(rSet[:, -1] - tree['right'], 2))
    errorNoMerge = left_err + right_err
    treeMean = (tree['left'] + tree['right']) / 2.0
    errorMerge = np.sum(np.power(testData[:, -1] - treeMean, 2))
    return treeMean if errorMerge < errorNoMerge else tree
def linearSolve(dataSet):
    """
    Desc:
        Ordinary least-squares fit of the last column on the other columns.
    Parameters:
        dataSet - data matrix; the last column is the target
    Returns:
        ws - regression coefficients (intercept first)
        X - feature matrix whose first column is all ones (the intercept)
        Y - target column
    Raises:
        NameError - if X.T * X has a determinant of exactly 0.0
    """
    m, n = np.shape(dataSet)
    # np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed.
    # Column 0 keeps the value 1.0; the features fill columns 1..n-1.
    X = np.asmatrix(np.ones((m, n)))
    X[:, 1:n] = dataSet[:, 0:n-1]
    Y = dataSet[:, -1]
    xTx = X.T * X
    if np.linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannont do inverse,\n'
                        'try increasing the second value of ops')
    ws = xTx.I * (X.T * Y)
    return ws, X, Y
def modelLeaf(dataSet):
    """
    Desc:
        Build a model-tree leaf: the linear-regression coefficients of the
        data set.
    Parameters:
        dataSet - data set passed to linearSolve
    Returns:
        ws - regression coefficients
    """
    coefficients, _features, _targets = linearSolve(dataSet)
    return coefficients
def modelErr(dataSet):
    """
    Desc:
        Error of a linear model fitted to the data set.
    Parameters:
        dataSet - data set passed to linearSolve
    Returns:
        sum of squared residuals between the targets and the fitted values,
        as a scalar
    """
    ws, X, Y = linearSolve(dataSet)
    yHat = X * ws
    # np.sum reduces to a scalar. The builtin sum() iterated the matrix row
    # by row in Python and returned a 1x1 matrix instead.
    return np.sum(np.power(Y - yHat, 2))
def regTreeEval(model, inDat):
    """
    Desc:
        Evaluate a regression-tree leaf: the prediction is the leaf value
        itself, converted to a Python float.
    Parameters:
        model - leaf value of the tree
        inDat - input data; not used here
    Returns:
        the leaf value as a float
    """
    prediction = float(model)
    return prediction
def modelTreeEval(model, inDat):
    """
    Desc:
        Evaluate a model-tree leaf on one sample.
    Parameters:
        model - leaf coefficients, a column of n+1 values (intercept first)
        inDat - one sample as a 1 x n row of feature values
    Returns:
        the prediction X * model as a float, where X is inDat with a leading 1
    """
    n = np.shape(inDat)[1]
    # np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed.
    X = np.asmatrix(np.ones((1, n + 1)))
    X[:, 1:n + 1] = inDat
    # X * model is a 1x1 matrix; take its single element explicitly, because
    # float() on an array with ndim > 0 is deprecated since NumPy 1.25.
    return float((X * model)[0, 0])
def treeForeCast(tree, inData, modelEval=regTreeEval):
    """
    Desc:
        Predict the value for a single sample by walking the tree.
    Parameters:
        tree - tree structure (dict) or a bare leaf
        inData - one sample: a 1 x n row matrix or a flat sequence of n
                 feature values
        modelEval - function (leaf, inData) -> prediction used at the leaves
    Returns:
        the predicted value for inData
    """
    if not isTree(tree):
        return modelEval(tree, inData)
    # Flatten first so spInd selects a feature. Indexing a 1 x n matrix with
    # a single integer selects a row, which only worked for one-feature data.
    featureValue = np.asarray(inData).ravel()[tree['spInd']]
    branch = tree['left'] if featureValue > tree['spVal'] else tree['right']
    # The recursive call evaluates the leaf itself when branch is not a tree.
    return treeForeCast(branch, inData, modelEval)
def createForeCast(tree, testData, modelEval=regTreeEval):
    """
    Desc:
        Predict a value for every sample of a test set.
    Parameters:
        tree - tree structure
        testData - test samples; len(testData) is the sample count and
                   testData[i] is the i-th sample
        modelEval - function (leaf, inData) -> prediction used at the leaves
    Returns:
        yHat - m x 1 matrix of predictions
    """
    m = len(testData)
    # np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed.
    yHat = np.asmatrix(np.zeros((m, 1)))
    for i in range(m):
        yHat[i, 0] = treeForeCast(tree, np.asmatrix(testData[i]), modelEval)
    return yHat
if __name__ == '__main__':
    # Compare a regression tree, a model tree and plain linear regression by
    # the correlation between their predictions and the test targets.
    # np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed.
    trainMat = np.asmatrix(loadDataSet('bikeSpeedVsIq_train.txt'))
    testMat = np.asmatrix(loadDataSet('bikeSpeedVsIq_test.txt'))

    # Regression tree (constant leaves).
    myTree = createTree(trainMat, ops=(1, 20))
    yHat = createForeCast(myTree, testMat[:, 0])
    RegressionTree = np.corrcoef(yHat, testMat[:, 1], rowvar=0)[0, 1]
    print("回归树拟合精度 = ", RegressionTree)

    # Model tree (linear-model leaves).
    myTree = createTree(trainMat, modelLeaf, modelErr, ops=(1, 20))
    yHat = createForeCast(myTree, testMat[:, 0], modelTreeEval)
    ModelTree = np.corrcoef(yHat, testMat[:, 1], rowvar=0)[0, 1]
    print("模型树拟合精度 = ", ModelTree)

    # Single linear regression; yHat is reused as the output buffer.
    ws, X, Y = linearSolve(trainMat)
    for i in range(np.shape(testMat)[0]):
        yHat[i] = testMat[i, 0] * ws[1, 0] + ws[0, 0]
    LinearRegression = np.corrcoef(yHat, testMat[:, 1], rowvar=0)[0, 1]
    print("线性回归拟合精度 = ", LinearRegression)
import matplotlib
matplotlib.use('TkAgg')
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
from matplotlib.figure import Figure
import numpy as np
import tkinter as tk
import CART
def reDraw(tolS, tolN):
    """
    Desc:
        Redraw the figure: a scatter plot of the raw data plus the curve
        predicted by a freshly built tree. A model tree is built when the
        check button is ticked, otherwise a regression tree.
    Parameters:
        tolS - minimum error reduction allowed for a split
        tolN - minimum number of samples per split (raised to 2 for model
               trees)
    Returns:
        None
    """
    reDraw.f.clf()
    reDraw.a = reDraw.f.add_subplot(111)
    use_model_tree = chkBtnVar.get()
    if not use_model_tree:
        fitted = CART.createTree(reDraw.rawDat, ops=(tolS, tolN))
        yHat = CART.createForeCast(fitted, reDraw.testDat)
    else:
        if tolN < 2:
            tolN = 2
        fitted = CART.createTree(reDraw.rawDat, CART.modelLeaf, CART.modelErr, (tolS, tolN))
        yHat = CART.createForeCast(fitted, reDraw.testDat, CART.modelTreeEval)
    xs = reDraw.rawDat[:, 0].tolist()
    ys = reDraw.rawDat[:, 1].tolist()
    reDraw.a.scatter(xs, ys, s=5)
    reDraw.a.plot(reDraw.testDat, yHat, 'b', linewidth=2.0)
    reDraw.canvas.draw()
def getInputs():
    """
    Desc:
        Read tolN and tolS from the two entry widgets. When a value does not
        parse, fall back to the default (10 / 1.0), print a hint and reset
        the widget text to that default.
    Parameters:
        None
    Returns:
        tolN - int read from tolNentry (10 on bad input)
        tolS - float read from tolSentry (1.0 on bad input)
    """
    try:
        tolN = int(tolNentry.get())
    except ValueError:
        tolN = 10
        print("enter Integer for tolN")
        # tk.END: only `tkinter as tk` is imported, so the bare name END
        # raised NameError on exactly the path meant to recover from bad input.
        tolNentry.delete(0, tk.END)
        tolNentry.insert(0, '10')
    try:
        tolS = float(tolSentry.get())
    except ValueError:
        tolS = 1.0
        print("enter Float for tolS")
        tolSentry.delete(0, tk.END)
        tolSentry.insert(0, '1.0')
    return tolN, tolS
def drawNewTree():
    """
    Desc:
        Button callback: read the parameters from the entry widgets and
        redraw the plot with them.
    Parameters:
        None
    Returns:
        None
    """
    min_samples, min_gain = getInputs()
    # getInputs returns (tolN, tolS); reDraw expects (tolS, tolN).
    reDraw(min_gain, min_samples)
# Build the window: the matplotlib canvas on row 0, the parameter widgets below.
root = tk.Tk()
reDraw.f = Figure(figsize=(5,4), dpi=100)
reDraw.canvas = FigureCanvasTkAgg(reDraw.f, master=root)
reDraw.canvas.draw()
reDraw.canvas.get_tk_widget().grid(row=0, columnspan=3)
tk.Label(root, text="tolN").grid(row=1, column=0)
tolNentry = tk.Entry(root)
tolNentry.grid(row=1, column=1)
tolNentry.insert(0, '10')
tk.Label(root, text="tolS").grid(row=2, column=0)
tolSentry = tk.Entry(root)
tolSentry.grid(row=2, column=1)
tolSentry.insert(0, '1.0')
tk.Button(root, text="ReDraw", command=drawNewTree).grid(row=1, column=2, rowspan=3)
chkBtnVar = tk.IntVar()
chkBtn = tk.Checkbutton(root, text="Model Tree", variable=chkBtnVar)
chkBtn.grid(row=3, column=0, columnspan=2)
# np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed.
reDraw.rawDat = np.asmatrix(CART.loadDataSet('sine.txt'))
# .min()/.max() give scalar bounds; the builtin min()/max() iterate the
# matrix rows and hand 1x1 matrices to np.arange.
reDraw.testDat = np.arange(reDraw.rawDat[:, 0].min(), reDraw.rawDat[:, 0].max(), 0.01)
reDraw(1.0, 10)
root.mainloop()