决策树算法(DecisionTree)

手动实现决策树算法

  1. 算法使用如下数据集(来自《统计学习方法》):

    https://github.com/xu1995yong/ml/tree/master/python%E5%AE%9E%E7%8E%B0

  2. 代码:

    from pandas import Series
    import numpy as np
    
    class DecisionTree:
        '''
        Decision tree classifier that chooses split features by information gain.

        A fitted tree is a nested dict of the form
        ``{str(feature_index): {feature_value: subtree_or_label}}``; a leaf is a
        bare class label taken from ``y``. ``feature_index`` is the column
        position inside the feature matrix seen at that recursion level, i.e.
        after the columns of features already used higher up have been removed.
        '''

        def __init__(self):
            # Result of the most recent train() call; None until train() runs.
            self.tree = None

        def entropy(self, data, y):
            '''
            Return the empirical entropy (base 2) of the labels ``y``.

            data: array whose first dimension gives the sample count; it must
                have the same length as ``y``.
            y: 1-D array of class labels.
            '''
            m = data.shape[0]
            labels = np.asarray(y)
            # Per-class sample counts, visited in order of first appearance.
            counts = np.array(
                [np.count_nonzero(labels == v) for v in Series(labels).unique()]
            )
            p = counts / m
            return np.dot(-1 * p, np.log2(p))

        def conditionalEntropy(self, xi, y):
            '''
            Return the empirical conditional entropy of ``y`` given feature ``xi``.

            xi: 1-D array holding one feature column.
            y: 1-D array of class labels aligned with ``xi``.
            '''
            m = xi.shape[0]
            s = 0
            for value in Series(xi).value_counts().index:
                mask = xi == value  # computed once, reused for both arrays
                sub_xi = xi[mask]
                # Entropy of the subset, weighted by the subset's share of samples.
                s += 1.0 * sub_xi.shape[0] / m * self.entropy(sub_xi, y[mask])
            return s

        def infoGain(self, x, y):
            '''
            Find the feature with the largest information gain.

            x: 2-D feature matrix (samples x features).
            y: 1-D array of class labels.

            Returns ``(index, gain)``. Ties keep the lowest column index. When
            ``x`` has no columns the result is ``(None, None)``.
            '''
            n = x.shape[1]
            base = self.entropy(y, y)  # loop-invariant, so computed only once
            index = None
            g = None
            for i in range(n):
                t = base - self.conditionalEntropy(x[:, i], y)
                if g is None or t > g:
                    g = t
                    index = i
            return index, g

        def train(self, x, y, theta):
            '''
            Build the tree from ``x`` / ``y``, store it in ``self.tree``, return it.

            x: 2-D feature matrix (samples x features).
            y: 1-D array of class labels; must be non-empty.
            theta: information-gain threshold; a node whose best gain is below
                it becomes a leaf holding the most frequent class.

            ``self.tree`` is always updated, including when the whole tree
            collapses to a single leaf label.
            '''
            self.tree = self._build(x, y, theta)
            return self.tree

        def _build(self, x, y, theta):
            '''Recursively build the subtree (or leaf label) for ``x`` / ``y``.'''
            if np.sum(y == y[0]) == y.shape[0]:  # every sample has the same class
                return y[0]
            if x.shape[1] == 0:
                # Every feature is already used but the labels are still mixed:
                # fall back to the most frequent class instead of comparing
                # a None gain with theta.
                return Series(y).value_counts().index[0]

            i, g = self.infoGain(x, y)
            if g < theta:
                # Best split is not informative enough: most frequent class.
                return Series(y).value_counts().index[0]

            # Partition the samples by the chosen feature's values and recurse
            # on each part with that feature's column removed.
            column = x[:, i]
            remaining = np.delete(x, i, axis=1)
            branches = {}
            for val in Series(column).unique():
                mask = column == val
                branches[val] = self._build(remaining[mask], y[mask], theta)
            return {str(i): branches}
    if __name__ == '__main__':
        # Read the feature matrix and the labels from text files in the
        # working directory, fit a tree with a zero gain threshold, and
        # print the resulting nested-dict tree.
        features = np.loadtxt('x.txt')
        labels = np.loadtxt('y.txt')
        classifier = DecisionTree()
        classifier.train(features, labels, 0)
        print(classifier.tree)
    
  3. 输出结果:
    {'2': {0.0: {'1': {0.0: 0.0, 1.0: 1.0}}, 1.0: 1.0}}

  4. 树的可视化:

你可能感兴趣的:(机器学习,机器学习,python,决策树,决策树算法)