# ID3 decision tree demo (weather dataset + sklearn iris dataset)

import math
import time
from collections import Counter

import numpy as np
import scipy as sp
import sklearn
import sklearn.datasets
import sklearn.metrics
import sklearn.neighbors
import sklearn.tree
from sklearn.model_selection import train_test_split




# 计算信息熵
# Shannon entropy of a dataset's class labels
def cal_entropy(dataset):
    """Return the Shannon entropy (in bits) of the class labels of *dataset*.

    Each row of *dataset* is a sequence whose LAST element is the class label.
    An empty dataset yields 0.0 (matches the original behavior: the loop over
    labels simply never runs).
    """
    numInstances = len(dataset)
    # Count occurrences of each class label in one pass.
    labelCounts = Counter(featVec[-1] for featVec in dataset)
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = count / numInstances
        shannonEnt -= prob * math.log(prob, 2)
    return shannonEnt

# 提取数据集
# Extract the subset of rows matching a feature value
def draw_dataset(dataset, axis, value):
    """Return the rows of *dataset* whose element at *axis* equals *value*,
    with that column removed (the chosen attribute is consumed by the split).
    """
    return [row[:axis] + row[axis + 1:] for row in dataset if row[axis] == value]

#选择最好的特征划分
def choose_best_feature(dataset):
    #决策属性不算
    numFeatures = len(dataset[0]) - 1
    baseEntropy = cal_entropy(dataset)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        #把第i列属性的值取出来生成一维数组
        featList = [example[i] for example in dataset]
        #剔除重复值
        uniqueVals = set(featList)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = draw_dataset(dataset, i, value)
            prob = len(subDataSet) / float(len(dataset))
            newEntropy += prob*cal_entropy(subDataSet)
        infoGain = baseEntropy - newEntropy
        if(infoGain > bestInfoGain):
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature

#如果剩下的数据中无特征,则直接按最大百分比形成叶节点
# Majority vote when no features remain: form a leaf from the dominant class
def build_leaf_node(classList):
    """Return the most common class label in *classList* (majority vote).

    The original implementation crashed three ways: `classCount += 1` added an
    int to a dict (TypeError), `dict.iteritems` is Python-2-only, and
    `np.operator.itemgette` does not exist. Counter.most_common does the same
    job correctly.
    """
    return Counter(classList).most_common(1)[0][0]


# 创建决策树
# Recursively build an ID3 decision tree
def create_decision_tree(dataset, paraFeatureName):
    """Build an ID3 tree as nested dicts: {featureName: {value: subtree_or_label}}.

    *dataset* rows are [feature..., label]; *paraFeatureName* names the feature
    columns. The caller's name list is copied, never mutated.
    """
    featureName = paraFeatureName.copy()
    classList = [row[-1] for row in dataset]

    # Guard: every row carries the same class -> pure leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]

    # Guard: only the class column is left -> majority-vote leaf.
    if len(dataset[0]) == 1:
        return build_leaf_node(classList)

    bestFeat = choose_best_feature(dataset)
    bestFeatureName = featureName[bestFeat]
    del featureName[bestFeat]  # consumed by this split

    tree = {bestFeatureName: {}}
    for value in {row[bestFeat] for row in dataset}:
        subset = draw_dataset(dataset, bestFeat, value)
        tree[bestFeatureName][value] = create_decision_tree(subset, featureName[:])
    return tree

# 分类以及返回准确率
# Classify a test set with an ID3 tree and return the accuracy
def id3Classify(paraTree, paraTestingSet, featureNames, classValues):
    """Walk *paraTree* for every row of *paraTestingSet* and return accuracy.

    Each test row is [feature..., trueLabel]. A node is a dict keyed by one
    feature name; a branch value that appears in *classValues* is a leaf.

    Fixes vs. the original: the bare ``except: i = 1`` silently swallowed every
    error and could leave ``splitFeature`` unbound (or stale, spinning the
    ``while True`` forever); an explicit membership test replaces it. An empty
    test set now returns 0.0 instead of dividing by zero.
    """
    tempCorrect = 0.0
    tempTotal = len(paraTestingSet)
    if tempTotal == 0:
        return 0.0
    for featureVector in paraTestingSet:
        print("Instance: ", featureVector)
        tempTree = paraTree
        while True:
            # The current subtree is keyed by exactly one feature name.
            splitFeature = next(f for f in featureNames if f in tempTree)
            attributeValue = featureVector[featureNames.index(splitFeature)]
            print(splitFeature, " = ", attributeValue)

            tempPrediction = tempTree[splitFeature][attributeValue]
            if tempPrediction in classValues:
                break  # reached a leaf label
            tempTree = tempPrediction  # descend into the subtree
        print("Prediction = ", tempPrediction)
        if featureVector[-1] == tempPrediction:
            tempCorrect += 1

    return tempCorrect / tempTotal

def mfID3Test():
    """Train ID3 on the classic 14-row play-tennis weather data, print the
    resulting tree, then measure (training-set) accuracy."""
    # Step 1. The dataset: [Outlook, Temperature, Humidity, Windy, Play?].
    weatherData = [
        ['Sunny', 'Hot', 'High', 'FALSE', 'N'],
        ['Sunny', 'Hot', 'High', 'TRUE', 'N'],
        ['Overcast', 'Hot', 'High', 'FALSE', 'P'],
        ['Rain', 'Mild', 'High', 'FALSE', 'P'],
        ['Rain', 'Cool', 'Normal', 'FALSE', 'P'],
        ['Rain', 'Cool', 'Normal', 'TRUE', 'N'],
        ['Overcast', 'Cool', 'Normal', 'TRUE', 'P'],
        ['Sunny', 'Mild', 'High', 'FALSE', 'N'],
        ['Sunny', 'Cool', 'Normal', 'FALSE', 'P'],
        ['Rain', 'Mild', 'Normal', 'FALSE', 'P'],
        ['Sunny', 'Mild', 'Normal', 'TRUE', 'P'],
        ['Overcast', 'Mild', 'High', 'TRUE', 'P'],
        ['Overcast', 'Hot', 'Normal', 'FALSE', 'P'],
        ['Rain', 'Mild', 'High', 'TRUE', 'N'],
    ]
    featureName = ['Outlook', 'Temperature', 'Humidity', 'Windy']
    classValues = ['P', 'N']

    # Step 2. Build and show the tree (featureName is copied, not consumed).
    tempTree = create_decision_tree(weatherData, featureName)
    print(tempTree)

    # Step 3. Evaluate on the same data.
    print("Before classification, feature names = ", featureName)
    tempAccuracy = id3Classify(tempTree, weatherData, featureName, classValues)
    print("The accuracy of ID3 classifier is {}".format(tempAccuracy))

# Iris dataset test
def iris_test():
    """Train and evaluate the ID3 tree on sklearn's iris dataset.

    Bug fix: the original passed the sklearn ``Bunch`` object directly to
    ``create_decision_tree``/``id3Classify``, which expect rows shaped
    ``[feature..., label]`` (the abandoned commented-out loop shows the intent).
    Here the rows are assembled from ``data`` and ``target``, mapping each
    numeric target to its named class.
    """
    # Load the dataset.
    data_iris = sklearn.datasets.load_iris()
    x = data_iris.data
    y = data_iris.target

    print(x[1])
    feature_name = ['花萼的长', '花萼的宽', '花瓣的长', '花瓣的宽']
    classValues = ['山鸢尾Setosa', '杂色鸢尾Versicolour', '维吉尼亚鸢尾Virginica']

    # Row format required by create_decision_tree: [feature..., label].
    dataset = [list(features) + [classValues[label]] for features, label in zip(x, y)]

    temp_tree = create_decision_tree(dataset, feature_name)
    print(temp_tree)

    print("Before classification, feature names = ", feature_name)
    tempAccuracy = id3Classify(temp_tree, dataset, feature_name, classValues)
    print("The accuracy of ID3 classifier is {}".format(tempAccuracy))


def main():
    """Entry point: run the iris demo (other demos left available, commented out)."""
    # sklearnDecisionTreeTest()
    # mfID3Test()
    iris_test()


# Guard the entry call so importing this module has no side effects.
if __name__ == "__main__":
    main()





# Related topics: machine learning, sklearn, python