《统计学习方法》 Decision Trees (ID3, C4.5): a Python Implementation

#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Thu Nov  2 10:51:33 2017
@author: lee
"""
from collections import defaultdict
import math
def CreateDataSet():
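    # Toy dataset: each row is [no surfacing, flippers, class label];
    # the class label is always the last column, which the rest of the code assumes.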
    data = [[1, 1, 'yes'],
            [1, 1, 'yes'],
            [1, 0, 'no'],
            [0, 1, 'no'],
            [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return data, labels

def CreateTree(data, labels, eps=-1):
    # eps is the information-gain threshold (epsilon in the book); the default -1 disables the cut-off.
    # Class labels of the current training subset (last column of each row)
    data_labels = [y[-1] for y in data]
    # Remaining feature columns (everything except the label)
    unique_feature = set(data[0][:-1])
    # (1) If every instance belongs to the same class, return a single-node tree.
    # Pitfall: you cannot simply test len(labels) == 1 here.
    if data_labels.count(data_labels[0]) == len(data_labels):
        return data_labels[0]
    # (2) If the feature set is empty, return a single-node tree labelled with
    # the majority class of the training subset.
    if len(unique_feature) == 0:
        dic = defaultdict(int)
        for i in range(len(data_labels)):
            dic[data_labels[i]] += 1
# =============================================================================
#         # Taking the max of a dict:
#         max(dic, key=dic.get)                 # key with the largest value
#         max(dic.items(), key=lambda x: x[1])  # (key, value) pair with the largest value
# =============================================================================
        return max(dic, key=dic.get)
    # (3) General case: choose the splitting feature.
    # ID3 feature selection (information gain)
    maxfindex, maxgain = Maxinformation_gain(data, labels)
    # C4.5 feature selection (information gain ratio); swap in to use C4.5
    #maxfindex, maxgain = Maxinformation_gain_ratio(data, labels)

    # (4) If the largest gain is below the threshold eps, return a single-node
    # tree labelled with the majority class.
    if maxgain < eps:
        dic = defaultdict(int)
        for i in range(len(data_labels)):
            dic[data_labels[i]] += 1
        return max(dic, key=dic.get)
    # (5) Otherwise split on the chosen feature Ag.
    # Take the feature name and delete it from the remaining label list.
    maxfeature = labels[maxfindex]
    del labels[maxfindex]
    # Column of values this feature takes in the current subset
    featureVal = [value[maxfindex] for value in data]
    # Represent the tree as a nested dict
    idtree = {maxfeature: {}}
    # Distinct values of the chosen feature
    featureValunq = set(featureVal)
    # (6) Recursively build one subtree per feature value.
    for fval in featureValunq:
        idtree[maxfeature][fval] = CreateTree(SplitData(data, maxfindex, fval), labels[:], eps)
    return idtree
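# The returned tree is a nested dict of the form
#   {feature_name: {feature_value: subtree_or_leaf_label, ...}}
# Note that SplitData drops the used feature column and the matching name is
# deleted from labels, so column index f in the recursive call still lines up
# with labels[f].
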
# Pick the feature with the largest information gain; return (feature index, gain).
def Maxinformation_gain(data, labels):
    feature_gain = {}
    data_labels = [y[-1] for y in data]
    entropyD = entropy(data_labels)
    # Information gain of each feature: g(D, A) = H(D) - H(D|A)
    for f in range(len(labels)):
        featureVal = [value[f] for value in data]
        entropyF = ConditionalEntropy(featureVal, data_labels)
        feature_gain[f] = entropyD - entropyF

    result = max(feature_gain.items(), key=lambda x: x[1])
    return result[0], result[1]
# Split the dataset: keep the rows whose value in column split_index equals
# split_value, and drop that column from the kept rows.
def SplitData(data, split_index, split_value):
    subdata = []
    for row in data:
        if row[split_index] == split_value:
            temp1 = row[:split_index]
            temp2 = row[split_index+1:]
            subdata.append(temp1 + temp2)
    return subdata
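# For example, on the toy dataset above (hand-checked):
#   SplitData(data, 0, 1) -> [[1, 'yes'], [1, 'yes'], [0, 'no']]
# i.e. the three rows with 'no surfacing' == 1, with that column removed.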

# Empirical entropy of a list of class labels: H(D) = -sum_k p_k * log2(p_k)
def entropy(y):
    n = len(y)
    elist = defaultdict(int)
    result = 0.
    for i in range(len(y)):
        elist[y[i]] += 1

    for i in elist.keys():
        # Every key was observed at least once, so p > 0 and log2(p) is defined.
        p = elist[i] / float(n)
        result -= p * math.log(p, 2)
    return result
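# Sanity check against the toy labels ['yes', 'yes', 'no', 'no', 'no']:
#   H(D) = -(2/5)*log2(2/5) - (3/5)*log2(3/5) ≈ 0.971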

# Empirical conditional entropy H(D|A) = sum_i |D_i|/|D| * H(D_i),
# where D_i is the subset of D on which feature A takes its i-th value.
def ConditionalEntropy(datalist, y):
    data_u = set(datalist)
    data_dic = defaultdict(int)
    y_dic = defaultdict(list)
    n = len(datalist)
    result = 0.
    # Count how often each feature value occurs and collect the labels per value
    for i in range(len(datalist)):
        data_dic[datalist[i]] += 1
        y_dic[datalist[i]].append(y[i])
    for i in data_u:
        result += data_dic[i] / float(n) * entropy(y_dic[i])
    return result
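# Hand-worked numbers for the toy dataset:
#   H(D | no surfacing) = 3/5 * 0.918 + 2/5 * 0 ≈ 0.551,  gain ≈ 0.971 - 0.551 = 0.420
#   H(D | flippers)     = 4/5 * 1.000 + 1/5 * 0 = 0.800,  gain ≈ 0.171
# so ID3 splits on 'no surfacing' first.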

# C4.5 feature selection: pick the feature with the largest information gain
# ratio g_R(D, A) = g(D, A) / H_A(D); return (feature index, gain ratio).
def Maxinformation_gain_ratio(data, labels):
    result = {}
    data_labels = [y[-1] for y in data]
    entropyD = entropy(data_labels)
    # Gain ratio of each feature
    for f in range(len(labels)):
        featureVal = [value[f] for value in data]
        entropyF = ConditionalEntropy(featureVal, data_labels)
        feature_gain = entropyD - entropyF
        feature_data_en = FeatureDataEntropy(featureVal)
        # If the feature takes a single value, H_A(D) = 0 and its gain is also 0,
        # so define the ratio as 0 to avoid dividing by zero.
        result[f] = feature_gain / feature_data_en if feature_data_en > 0 else 0.
    best = max(result.items(), key=lambda x: x[1])
    return best[0], best[1]
        
# Entropy of the dataset with respect to the values of one feature:
# H_A(D) = -sum_i |D_i|/|D| * log2(|D_i|/|D|)
def FeatureDataEntropy(datalist):
    data_dic = defaultdict(int)
    result = 0.
    n = len(datalist)
    # Count how often each feature value occurs
    for i in range(n):
        data_dic[datalist[i]] += 1
    for i in data_dic.keys():
        # Every key was observed at least once, so p > 0.
        p = data_dic[i] / float(n)
        result -= p * math.log(p, 2)
    return result
  
data,label = CreateDataSet()
print CreateTree(data,label)
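
For the toy dataset above, the ID3 branch should print a nested dict of the following shape (hand-checked against the gains worked out in the comments; key order may vary):

{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}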
