决策树的建立步骤(西瓜书例题)

决策树的建立步骤:引入包 ——> 数据的获取与处理 ——> 获取属性名称与类别标记 ——> 选择样本数最多的类作为类别标记 ——> 计算信息熵(重点) ——> 构建子数据集 ——> 计算信息增益 ——> 选择最优属性 ——> 建立决策树。搞懂了这些步骤,基本就理解了决策树的原理。

import pandas as pd
import numpy as np
from collections import Counter
from math import log2


# In[56]:


# Data loading and preprocessing
def getData(file_Path):
    """Read the Excel workbook at ``file_Path`` with pandas.

    The path supplied by the caller is the one that is opened; no
    machine-specific location is baked into the function.

    Args:
        file_Path: Path (or any other source accepted by
            ``pandas.read_excel``) of the spreadsheet to load.

    Returns:
        Whatever ``pandas.read_excel`` returns for that source.
    """
    return pd.read_excel(file_Path)

def dataDeal(data):
    """Turn the table into a list of row lists without the first column.

    Args:
        data: Tabular input that ``numpy.array`` can convert (the script
            passes the loaded spreadsheet).

    Returns:
        A list with one list per row, each missing its first element.
        NOTE(review): the dropped column is presumably a sample id; confirm
        against the spreadsheet layout.
    """
    rows = np.array(data).tolist()  # array -> nested Python lists
    return [row[1:] for row in rows]


# In[57]:


# Attribute names
def getLabels(data):
    """Return the column names between the first and the last column.

    Args:
        data: Object exposing a ``columns`` attribute (the script passes the
            loaded spreadsheet).

    Returns:
        A list of the column names with the first and last ones removed.
    """
    columnNames = list(data.columns)
    return columnNames[1:-1]


# In[58]:


# Class labels
def targetClass(dataSet):
    """Collect the distinct values found in the last position of each row.

    Args:
        dataSet: Iterable of rows; the last element of a row is treated as
            its class label.

    Returns:
        A set of the distinct class labels.
    """
    return {row[-1] for row in dataSet}


# In[59]:


# Mark a branch node as a leaf: pick the class with the most samples
def majorityRule(dataSet):
    """Return the class label that occurs most often in ``dataSet``.

    Args:
        dataSet: Iterable of rows whose last element is the class label.

    Returns:
        The most frequent last-column value.

    Raises:
        IndexError: If ``dataSet`` is empty.
    """
    tally = Counter(row[-1] for row in dataSet)
    winner, _count = tally.most_common(1)[0]
    return winner


# In[60]:


# Information entropy
def infoEntropy(dataSet):
    """Compute the base-2 entropy of the class labels in ``dataSet``.

    Args:
        dataSet: Sequence of rows whose last element is the class label.

    Returns:
        The entropy ``-sum(p_k * log2(p_k))`` over the class proportions;
        the integer ``0`` when ``dataSet`` is empty.
    """
    total = len(dataSet)
    entropy = 0
    for count in Counter(row[-1] for row in dataSet).values():
        share = count / total  # proportion of samples in this class
        entropy -= share * log2(share)
    return entropy


# In[61]:


# Sub-dataset construction
def makeAttributeData(dataSet, value, iColumn):
    """Select rows whose ``iColumn`` entry equals ``value`` and drop that column.

    Args:
        dataSet: Iterable of row lists.
        value: Attribute value a row must have at ``iColumn`` to be kept.
        iColumn: Index of the column to filter on and then remove.

    Returns:
        A new list of new row lists; the input rows are left untouched.
    """
    return [
        row[:iColumn] + row[iColumn + 1:]
        for row in dataSet
        if row[iColumn] == value
    ]


# In[62]:

# Information gain
def infoGain(dataSet, iColumn):
    """Compute the information gain of splitting ``dataSet`` on one column.

    The gain is the entropy of the whole set minus the size-weighted
    entropy of the subsets produced by each distinct value in ``iColumn``.

    Args:
        dataSet: Sequence of row lists whose last element is the class label.
        iColumn: Index of the attribute column to split on.

    Returns:
        The information gain as a float; ``0.0`` for an empty ``dataSet``.
    """
    total = len(dataSet)
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # floating-point summation order is the same on every run (iterating a
    # set of strings can change order between interpreter sessions).
    distinctValues = dict.fromkeys(row[iColumn] for row in dataSet)
    conditionalEntropy = 0.0
    for value in distinctValues:
        subset = makeAttributeData(dataSet, value, iColumn)
        conditionalEntropy += len(subset) / total * infoEntropy(subset)
    # Computed once after the loop, so it is defined even when there are
    # no values to iterate over.
    return infoEntropy(dataSet) - conditionalEntropy


# In[63]:


# Optimal attribute selection
def selectOptimalAttribute(dataSet, labels):
    """Find the attribute column with the largest information gain.

    Prints each attribute name together with its gain as it goes.

    Args:
        dataSet: Sequence of row lists whose last element is the class label.
        labels: Attribute names, one per attribute column (the class column
            is not included).

    Returns:
        Index of the first column whose gain is strictly greater than every
        earlier one; ``0`` when no column has a positive gain.
    """
    bestIndex = 0
    bestGain = 0
    for index, name in enumerate(labels):
        gain = infoGain(dataSet, index)
        if gain > bestGain:
            bestGain, bestIndex = gain, index
        print(name, gain)
    return bestIndex


# In[64]:


# Decision tree construction
def createTree(dataSet, labels):
    """Recursively build a decision tree as nested dictionaries.

    Args:
        dataSet: Non-empty sequence of row lists; each row holds one value
            per entry of ``labels`` followed by the class label.
        labels: Attribute names for the columns of ``dataSet``. The list is
            not modified.

    Returns:
        A class label when the node is a leaf, otherwise a dict of the form
        ``{attribute_name: {attribute_value: subtree, ...}}``.

    Raises:
        ValueError: If ``dataSet`` is empty.
    """
    if not dataSet:
        raise ValueError("dataSet must contain at least one sample")

    classification = targetClass(dataSet)  # distinct class labels
    if len(classification) == 1:
        # Every sample has the same class: this node is a pure leaf.
        return next(iter(classification))
    if not labels:
        # No attribute left to split on: fall back to the majority class.
        return majorityRule(dataSet)

    sequence = selectOptimalAttribute(dataSet, labels)
    optimalAttribute = labels[sequence]
    # Build the reduced label list without touching the caller's list.
    subLabels = labels[:sequence] + labels[sequence + 1:]

    branches = {}
    # dict.fromkeys keeps first-seen order, giving a reproducible key order.
    for value in dict.fromkeys(row[sequence] for row in dataSet):
        subset = makeAttributeData(dataSet, value, sequence)
        branches[value] = createTree(subset, subLabels)
    return {optimalAttribute: branches}


# In[67]:


# Driver: run the pipeline end to end and print the resulting tree.
# Path handed to getData.
filePath = 'watermelonData.xls'
# Load the spreadsheet.
data = getData(filePath)
# Convert to a list of rows with the first column removed.
dataSet = dataDeal(data)
# Column names excluding the first and last columns.
labels = getLabels(data)
# Build the tree from the rows and attribute names.
myTree = createTree(dataSet,labels)
print(myTree)

 

你可能感兴趣的:(机器学习)