"""Random forest implemented with Python multi-threading.

The program assumes feature values scaled to the range [-1, 1]; run it
after preprocessing the data accordingly.
"""


import math
import operator
import threading
import random

def loadDataSet(filename, boundry):
    """Load a CSV file and split its rows into training and test sets.

    Column 0 of each row is skipped (id/prefix column); the following
    ``num`` columns (counted from the first line) are parsed as floats,
    the last of which is used downstream as the class label.  Rows whose
    index falls inside the inclusive range ``boundry = (lo, hi)`` go to
    the test set; all other rows go to the training set.

    Returns (dataMat, testMat).
    """
    dataMat, testMat = [], []
    lo, hi = boundry
    # One open, guaranteed close (the original opened the file twice and
    # closed neither handle).
    with open(filename) as fr:
        first = fr.readline()
        if not first:
            return dataMat, testMat
        num = len(first.split(',')) - 1
        fr.seek(0)  # the first line is data too, so rewind and re-read it
        for ii, line in enumerate(fr):
            curLine = line.strip().split(',')
            # columns 1..num as floats, same span as curLine[i+1] for i in range(num)
            lineArr = [float(v) for v in curLine[1:num + 1]]
            if lo <= ii <= hi:
                testMat.append(lineArr)
            else:
                dataMat.append(lineArr)
    return dataMat, testMat

def calcShannonEnt(dataSet):
    """Return the Shannon entropy (base 2) of the class labels in *dataSet*.

    The class label is the last element of each row.  An empty data set
    has zero entropy (the loops simply do not run).
    """
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        # dict.get replaces the separate membership test of the original
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = count / float(numEntries)
        shannonEnt -= prob * math.log(prob, 2)
    return shannonEnt
    
def binSplitDataSet(dataSet, feature, value):
    """Partition *dataSet* on column *feature* against *value*.

    Returns (above, belowOrEqual): rows with row[feature] > value, then
    the rest, each preserving the original row order.
    """
    above, belowOrEqual = [], []
    for row in dataSet:
        if row[feature] > value:
            above.append(row)
        else:
            belowOrEqual.append(row)
    return above, belowOrEqual
    
def chooseBestFeatureToSplit(dataSet):
    """Pick the best (feature, threshold) split among a random feature subset.

    The last column of each row is the class label and must NOT be offered
    as a split candidate (the original included it via
    ``range(len(dataSet[0]))``, which lets the tree split on the answer
    itself).  sqrt(m) features are sampled at random, random-forest style,
    and each is tried against a fixed threshold grid suitable for features
    scaled to [-1, 1].

    Returns (bestFeature, threshold), or (None, None) when no candidate
    split yields a positive information gain.
    """
    numFeatures = len(dataSet[0]) - 1  # exclude the label column
    selectedFeatures = random.sample(range(numFeatures),
                                     int(math.sqrt(numFeatures)))
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = None
    threshold = None
    Vals = [-0.9, -0.5, 0, 0.5, 0.9]
    for i in selectedFeatures:
        for value in Vals:
            m0, m1 = binSplitDataSet(dataSet, i, value)
            prob1 = len(m0) / float(len(dataSet))
            prob2 = len(m1) / float(len(dataSet))
            newEntropy = prob1 * calcShannonEnt(m0) + prob2 * calcShannonEnt(m1)
            infoGain = baseEntropy - newEntropy
            # strict '>' keeps the first threshold that reaches the best gain
            if infoGain > bestInfoGain:
                bestInfoGain = infoGain
                bestFeature = i
                threshold = value
    return bestFeature, threshold

def majorityCnt(classList):
    """Return the most frequent label in *classList* (first seen wins ties)."""
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # dict.iteritems() no longer exists on Python 3; max() replaces the
    # full descending sort since only the top entry is needed.
    return max(classCount.items(), key=operator.itemgetter(1))[0]

def createTree(dataSet):
    """Recursively grow a decision tree over *dataSet*.

    A leaf is a class label; an internal node is
    ``{(featureIndex, threshold): {0: leftSubtree, 1: rightSubtree}}``
    where branch 1 holds rows with value > threshold.
    """
    classList = [example[-1] for example in dataSet]
    # pure node: all rows share one label
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    bestFeat, threshold = chooseBestFeatureToSplit(dataSet)
    if bestFeat is None:
        # No split gives positive information gain.  Without this guard
        # (commented out in the original) binSplitDataSet(dataSet, None, None)
        # raises TypeError.
        return majorityCnt(classList)
    rSet, lSet = binSplitDataSet(dataSet, bestFeat, threshold)
    myTree = {(bestFeat, threshold): {}}
    myTree[(bestFeat, threshold)][0] = createTree(lSet)
    myTree[(bestFeat, threshold)][1] = createTree(rSet)
    return myTree

def classify(tree, testVec):
    """Route *testVec* down *tree* and return the predicted class label.

    A node is {(featureIndex, threshold): {0: left, 1: right}}; values
    greater than the threshold follow branch 1, otherwise branch 0.
    """
    # tree.keys()[0] raised TypeError on Python 3 (keys() is a view);
    # next(iter(...)) grabs the node's single key portably.
    node = next(iter(tree))
    branches = tree[node]
    featVal = testVec[node[0]]
    subtree = branches[1] if featVal > node[1] else branches[0]
    if isinstance(subtree, dict):
        return classify(subtree, testVec)
    return subtree

def storeTree(inputTree, filename):
    """Pickle *inputTree* to *filename*.

    The file is opened in binary mode: pickle needs a bytes stream on
    Python 3 (text mode raised TypeError), and 'with' guarantees the
    handle is closed even if dump() fails.
    """
    import pickle
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)
    
def grabTree(filename):
    """Unpickle and return a tree previously saved with storeTree().

    Binary mode is required for pickle on Python 3; 'with' closes the
    handle the original leaked.
    """
    import pickle
    with open(filename, 'rb') as fr:
        return pickle.load(fr)

def bootstrap(dataSet):
    """Return a bootstrap resample: len(dataSet) rows drawn with replacement.

    Uses the same random.randint calls as before, so the RNG sequence is
    unchanged for a given seed.
    """
    n = len(dataSet)
    return [dataSet[random.randint(0, n - 1)] for _ in range(n)]

def learning(dataSet, times, trees):
    """Grow *times* trees, each on a fresh bootstrap resample of *dataSet*.

    Trees are appended to the caller-supplied list so the function can be
    used directly as a threading.Thread target.
    """
    for _ in range(times):
        trees.append(createTree(bootstrap(dataSet)))

def classify_RF(trees, testVecs):
    """Classify each test vector by majority vote over *trees*.

    A vector counts as a hit when the forest's vote equals its true label
    (last element).  Prints the hit count for this slice — Python 3
    requires print() as a function; 'print count' was a syntax error —
    and also returns it for non-threaded callers.
    """
    count = 0
    for testVec in testVecs:
        votes = [classify(tree, testVec) for tree in trees]
        if majorityCnt(votes) == testVec[-1]:
            count += 1
    print(count)
    return count

if __name__ == '__main__':

    # Rows 0..1000 (inclusive) become the test set; the rest trains.
    trainData, testData = loadDataSet('train_temp.csv', (0, 1000))

    # --- training: 40 trees total, 10 per thread (3 workers + main) ---
    trees, trees1, trees2, trees3 = [], [], [], []
    # daemon=True at construction replaces the deprecated setDaemon().
    t1 = threading.Thread(target=learning, args=(trainData, 10, trees1), daemon=True)
    t2 = threading.Thread(target=learning, args=(trainData, 10, trees2), daemon=True)
    t3 = threading.Thread(target=learning, args=(trainData, 10, trees3), daemon=True)
    # Start the workers BEFORE the main thread trains its own share, so the
    # four batches overlap (the original ran its batch first, serializing).
    t1.start()
    t2.start()
    t3.start()
    learning(trainData, 10, trees)

    t1.join()
    t2.join()
    t3.join()

    trees.extend(trees1)
    trees.extend(trees2)
    trees.extend(trees3)

    # --- evaluation: score four slices of the test set in parallel ---
    # '//' keeps slice indices integral; '/' yields a float on Python 3
    # and makes every testData[...] slice raise TypeError.
    offset = len(testData) // 4
    t1 = threading.Thread(target=classify_RF, args=(trees, testData[offset:2 * offset]), daemon=True)
    t2 = threading.Thread(target=classify_RF, args=(trees, testData[2 * offset:3 * offset]), daemon=True)
    t3 = threading.Thread(target=classify_RF, args=(trees, testData[3 * offset:]), daemon=True)
    t1.start()
    t2.start()
    t3.start()
    classify_RF(trees, testData[:offset])

    t1.join()
    t2.join()
    t3.join()


# Related topic: machine learning