Machine Learning: CART (Classification and Regression Trees) in Python

 

# -*- coding: utf-8 -*-  

import re
import operator
import itertools
import pandas as pd

# Gini impurity of a data set: 1 - sum(p_k^2) over the class proportions p_k.
# Each row is [feature_1, ..., feature_n, class_label]; the label is the last column.
def calGini(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    gini = 1.0
    for label in labelCounts:
        prop = float(labelCounts[label]) / numEntries
        gini -= prop * prop
    return gini
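# A quick sanity check (toy rows, not from the CSV used below): an evenly
# split two-class set should score Gini = 1 - 0.5**2 - 0.5**2 = 0.5.
# >>> calGini([[0, 'yes'], [1, 'yes'], [2, 'no'], [3, 'no']])
# 0.5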

# Takes a list of distinct feature values and returns every way to split them
# into two non-empty groups (the binary partitions CART needs for a categorical feature).
def featuresplit(features):
    count = len(features)  # number of distinct feature values
    if count < 2:
        # Only one value (e.g. 'cold_blood'): the feature cannot be split.
        print("please check sample's features, only one feature value")
        return (features,)  # wrap in a tuple so the caller can detect this case

    # Each branch needs at least one value, so we take combinations of size 1..count-1.
    # itertools.combinations(features, i) yields all i-element combinations.
    combinationsList = []
    for i in range(1, count):
        combinationsList.extend(itertools.combinations(features, i))
    combiLen = len(combinationsList)
    # combinations() emits subsets in a fixed order, and the list read backwards
    # gives the complements in the same order, so zipping the first half with the
    # reversed second half pairs each subset with its complement exactly once.
    resList = list(zip(combinationsList[:combiLen // 2],
                       combinationsList[combiLen - 1:combiLen // 2 - 1:-1]))
    return resList  # all binary partitions of the feature values
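# Illustration (hypothetical values): for three values each subset is paired
# with its complement, so featuresplit(['young', 'middle', 'old']) returns
# [(('young',), ('middle', 'old')), (('middle',), ('young', 'old')), (('old',), ('young', 'middle'))]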

#def splitDataSet(dataSet, axis, values):
#    retDataSet = []
#    for featVec in dataSet:
#        for value in values:
#            if featVec[axis] == value:
#                reducedFeatVec = featVec[:axis]     # drop the chosen feature column
#                reducedFeatVec.extend(featVec[axis+1:])
#                retDataSet.append(reducedFeatVec)
#    return retDataSet   # rows whose value on `axis` is in `values`, with that column removed

#def splitDataSet(dataSet, axis, values):    # variant that lets a feature be reused on later splits
#    retDataSet = []
#    if len(values) < 2:
#        for featVec in dataSet:             # this branch carries only one feature value,
#            if featVec[axis] == values[0]:  # so the feature is exhausted: drop its column
#                reducedFeatVec = featVec[:axis]
#                reducedFeatVec.extend(featVec[axis+1:])
#                retDataSet.append(reducedFeatVec)
#    else:
#        for featVec in dataSet:
#            for value in values:
#                if featVec[axis] == value:  # several values remain: keep the column for reuse
#                    retDataSet.append(featVec)
#
#    return retDataSet

# Split on a continuous feature: keep rows with featVec[axis] <= value when
# threshold is 'lt', and rows with featVec[axis] > value otherwise.
def splitDataSet(dataSet, axis, value, threshold):
    retDataSet = []
    if threshold == 'lt':
        for featVec in dataSet:
            if featVec[axis] <= value:
                retDataSet.append(featVec)
    else:
        for featVec in dataSet:
            if featVec[axis] > value:
                retDataSet.append(featVec)
    return retDataSet
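# Example (toy rows in [feature, label] form):
# rows = [[0.5, 'no'], [1.5, 'yes'], [2.5, 'yes']]
# splitDataSet(rows, 0, 1.5, 'lt')  ->  [[0.5, 'no'], [1.5, 'yes']]
# splitDataSet(rows, 0, 1.5, 'gt')  ->  [[2.5, 'yes']]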


# Discrete-feature version (disabled; pairs with the disabled discrete splitDataSet
# above): returns the best feature and the best binary split of its values.
"""def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1      # the last column is the class label
    bestGiniGain = 1.0
    bestFeature = -1
    bestBinarySplit = ()
    for i in range(numFeatures):           # iterate over features
        featList = [example[i] for example in dataSet]   # the feature column
        uniqueVals = list(set(featList))   # distinct values of this feature
        # Binary splits of three values look like:
        #   [(('young',), ('old', 'middle')), (('old',), ('young', 'middle')), (('middle',), ('young', 'old'))]
        for split in featuresplit(uniqueVals):   # all binary partitions of the values
            GiniGain = 0.0
            if len(split) == 1:     # only a single feature value (e.g. 'cold_blood'):
                continue            # nothing to split on, move to the next candidate
            (left, right) = split

            # weighted Gini of each candidate binary split
            # left branch
            left_subDataSet = splitDataSet(dataSet, i, left)
            left_prob = len(left_subDataSet) / float(len(dataSet))
            GiniGain += left_prob * calGini(left_subDataSet)
            # right branch
            right_subDataSet = splitDataSet(dataSet, i, right)
            right_prob = len(right_subDataSet) / float(len(dataSet))
            GiniGain += right_prob * calGini(right_subDataSet)
            if GiniGain <= bestGiniGain:   # lower weighted Gini is better
                bestGiniGain = GiniGain    # remember the best split so far
                bestFeature = i
                bestBinarySplit = (left, right)
    return bestFeature, bestBinarySplit
"""

# Continuous-feature version: for every feature and every observed value,
# evaluate the binary split x <= value vs. x > value and keep the one with the
# lowest weighted Gini impurity.
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1
    bestGiniGain = 1.0
    bestFeature = -1
    bestValue = ""
    for i in range(numFeatures):                        # iterate over features
        featList = [example[i] for example in dataSet]  # the feature column
        uniqueVals = list(set(featList))                # distinct values of this feature
        uniqueVals.sort()
        for value in uniqueVals:                        # try each value as a threshold
            GiniGain = 0.0
            # Gini of the left branch (<= value)
            left_subDataSet = splitDataSet(dataSet, i, value, 'lt')
            left_prob = len(left_subDataSet) / float(len(dataSet))
            GiniGain += left_prob * calGini(left_subDataSet)

            # Gini of the right branch (> value)
            right_subDataSet = splitDataSet(dataSet, i, value, 'gt')
            right_prob = len(right_subDataSet) / float(len(dataSet))
            GiniGain += right_prob * calGini(right_subDataSet)

            if GiniGain < bestGiniGain:                 # lower weighted Gini is better
                bestGiniGain = GiniGain                 # remember the best split so far
                bestFeature = i
                bestValue = value
    return bestFeature, bestValue
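# Note: trying every observed value includes the maximum, where the '>' branch
# is empty and the split is degenerate. A common variant (a sketch, not what
# the code above does) evaluates midpoints between consecutive sorted values:
# thresholds = [(a + b) / 2.0 for a, b in zip(uniqueVals[:-1], uniqueVals[1:])]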


# Majority vote: return the most frequent class label in classList.
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]   # the winning label
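# Example: majorityCnt(['yes', 'no', 'yes']) returns 'yes' (two votes to one).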


"""def createTree(dataSet,labels):
    classList = [example[-1] for example in dataSet]
#    9/0
    # print dataSet
    if classList.count(classList[0]) == len(classList): 
        return classList[0]       #所有的类别都一样,就不用再划分了
    if len(dataSet) == 1:         #如果没有继续可以划分的特征,就多数表决决定分支的类别
        # print "here"
        return majorityCnt(classList)
    bestFeat,bestBinarySplit = chooseBestFeatureToSplit(dataSet)
#    9/0
    # print bestFeat,bestBinarySplit,labels
    bestFeatLabel = labels[bestFeat]
    if bestFeat==-1:
        return majorityCnt(classList)
    myTree = {bestFeatLabel:{}}
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = list(set(featValues))
#    9/0
    for value in bestBinarySplit:
#        9/0
        subLabels = labels[:]       #拷贝防止其他地方修改  特征标签
        if len(value)<2:
            del(subLabels[bestFeat])
#        9/0
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value),subLabels)
#        9/0
    return myTree 
"""

# Build the tree for continuous features; labels holds the feature names.
def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]

    if classList.count(classList[0]) == len(classList):
        return classList[0]      # all labels identical: this branch is pure, stop splitting
    if len(dataSet) == 1:        # no further split possible: majority vote decides
        return majorityCnt(classList)

    bestFeat, bestValue = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    if bestFeat == -1:
        return majorityCnt(classList)
    myTree = {bestFeatLabel: {}}
    subLabels = labels[:]

    # Branch keys encode the threshold, e.g. 'money<=97.0' and 'money>97.0'.
    lt_key = bestFeatLabel + '<=' + str(round(float(bestValue), 3))
    gt_key = bestFeatLabel + '>' + str(round(float(bestValue), 3))
    myTree[bestFeatLabel][lt_key] = createTree(splitDataSet(dataSet, bestFeat, bestValue, 'lt'), subLabels)
    myTree[bestFeatLabel][gt_key] = createTree(splitDataSet(dataSet, bestFeat, bestValue, 'gt'), subLabels)

    return myTree
# Works end to end without problems!
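# The tree comes back as a nested dict keyed by feature name and threshold.
# With the hypothetical feature 'money' and threshold 97 mentioned in the
# comments below, a small tree might look like (a sketch, not actual output):
# {'money': {'money<=97.0': 'no',
#            'money>97.0': {'density': {'density<=0.5': 'no', 'density>0.5': 'yes'}}}}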


#### Classification / testing
# In the tree, a continuous feature's branch keys have the form 'feature<=value',
# so we use regular expressions to recover the feature name and the threshold
# (any other string-splitting approach would work just as well).
def classify(inputTree, featLabels, testVec):
    branchKeys = list(list(inputTree.values())[0].keys())  # the two branch keys of the root split
    if '<=' not in branchKeys[0]:
        branchKeys.reverse()    # make sure the '<=' branch comes first

    featvalue = float(re.compile("(<=.+)").search(branchKeys[0]).group()[2:])  # e.g. 97 in 'money<=97'
    featkey = re.compile("(.+<=)").search(branchKeys[0]).group()[:-2]          # e.g. 'money'
    featIndex = featLabels.index(featkey)   # position of this feature in the label list

    if testVec[featIndex] <= featvalue:     # compare the sample's value with the threshold
        secondDict = list(inputTree.values())[0][branchKeys[0]]
    else:
        secondDict = list(inputTree.values())[0][branchKeys[1]]
    if type(secondDict).__name__ == 'dict':
        classLabel = classify(secondDict, featLabels, testVec)  # internal node: recurse
    else:
        classLabel = secondDict                                 # leaf: take its label
    # discrete features are not handled here
    return classLabel    # predicted class label
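# Example call (a sketch; featLabels and testVec must share the training column
# order): with the tree sketched above, classify(myTree, ['money', 'density'],
# [80, 0.3]) follows the 'money<=97.0' branch and returns 'no'.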
  
  
# Accuracy on a held-out test set: compare predictions with the true labels.
def testing(myTree, data_test, labels):
    error = 0.0
    for i in range(len(data_test)):                                     # classify one sample at a time
        if classify(myTree, labels, data_test[i]) != data_test[i][-1]:  # prediction differs from truth
            error += 1
    print('myTree %f' % ((len(data_test) - error) / len(data_test)))    # %f here: accuracy is a float, not an int
    return None



df = pd.read_csv('C:/Users/test_5.csv')
data = df.values[:280, 1:].tolist()        # rows 0-279, columns from 1 on: features plus the class label

data_full = data[:]
data_test = df.values[280:, 1:].tolist()   # remaining rows form the test set
labels = df.columns.values[1:-1].tolist()  # feature names, e.g. color root knocks texture navel touch
labels_full = labels[:]
myTree = createTree(data, labels)

testing(myTree, data_test, labels_full)

import treePlotter  # external tree-plotting helper (e.g. the treePlotter module from 'Machine Learning in Action')
treePlotter.createPlot(myTree)

 
