Isolation forest的python代码实现

近期一直在看Isolation forest,random forest,k-d tree,有关Isolation forest的资料很少,Isolation forest在异常检测方面效果不错

Isolation forest算法思想如下:

建立多棵树(iTree,理论上树的个数取100时效果最好),每棵树对应的训练子集大小一样(子采样的最佳大小为256),子集中的数据从训练集中随机抽取。建树的过程中节点分为外部节点和内部节点:当达到树高限制,或者所选特征对应的那一列值全部相等时,数据划分终止于外部节点,外部节点里面存储划分到该节点下的数据集的尺寸。每次划分所用的特征是随机选择的。那如何检测异常数据?对于每个实例x,计算它被划分在哪个外部节点下,并且求出该节点所处的树高h(x);E(h(x))表示实例x在多棵树中的树高h(x)的平均值,异常得分为 $s(x, n) = 2^{-E(h(x))/c(n)}$

$c(n) = 2H(n-1) - \frac{2(n-1)}{n}$

where H(i) is the harmonic number and it can be estimated by ln(i) + 0.5772156649 (Euler’s constant). As c(n) is the average of h(x) given n, we use it to normalise h(x).

• when E(h(x)) ->c(n), s -> 0.5;
• when E(h(x)) -> 0, s -> 1;
• and when E(h(x)) -> n − 1, s -> 0.
s is monotonic to h(x). Figure 2 illustrates the relationship between E(h(x)) and s, and the following conditions applied where 0 < s ≤ 1 for 0 < h(x) ≤ n − 1. Using the anomaly score s, we are able to make the following assessment:
• (a) if instances return s very close to 1, then they are
definitely anomalies,
• (b) if instances have s much smaller than 0.5, then they
are quite safe to be regarded as normal instances, and
• (c) if all the instances return s ≈ 0.5, then the entire
sample does not really have any distinct anomaly.

(今天不在状态,写的有点混乱,改天再重写)

__author__ = 'hzlihongxia'

import math
import numpy
import dataTransform
import random


# Training data, loaded at import time through the project-local dataTransform module.
# WARNING: dataSet_global is a module-level global; the __main__ block passes it to iForest.
# NOTE(review): `label` is not used anywhere in the code visible here - presumably the
# class labels of the training rows; confirm against dataTransform.dataFromTrain.
dataSet_global, label = dataTransform.dataFromTrain()


# preprocess in dataSet
def preprocess(dataSet):
    """Return a copy of ``dataSet`` with columns 3, 4 and 5 one-hot encoded.

    For each of the three columns the minimum and maximum over ``dataSet``
    are taken, and every value ``v`` is replaced by a list of
    ``int(max - min) + 1`` zeros with a single 1 at position ``int(v - min)``.

    The input rows are NOT modified: each row is shallow-copied first.  The
    earlier in-place version failed with ``TypeError`` as soon as the same
    row object appeared twice (sampling with replacement, or the same row
    being drawn again for a later tree), because the second visit tried to
    call ``float()`` on the already-encoded list.  Callers must use the
    return value.

    :param dataSet: sequence of rows; every row needs at least 6 columns and
        columns 3-5 must be convertible with ``float()``.
    :return: a new list of new row lists (empty list for empty input).
    """
    # Work on copies so shared/duplicated row objects are never encoded twice.
    encoded = [list(row) for row in dataSet]
    if not encoded:
        return encoded

    for col in (3, 4, 5):
        # Read the raw values from the untouched input rows.
        values = [float(row[col]) for row in dataSet]
        # Real min/max instead of the -1 / 10000 sentinels, which were wrong
        # for negative values and for values above 10000.
        low = min(values)
        width = int(max(values) - low) + 1
        for new_row, value in zip(encoded, values):
            one_hot = [0] * width
            one_hot[int(value - low)] = 1
            new_row[col] = one_hot
    return encoded


# dataSet is input data,t is number of trees,p is sub-sampling size
def iForest(dataSet, t, p):
    Forest = []
    limit = math.ceil(math.log(p, 2))

    for i in range(t):
        dataSet_New = sample(dataSet, p)

        # test
        count = 0
        for v in dataSet_New:
            print count, v[1]
            count += 1

        attr_List = [0, 1, 2, 3, 4, 5, 6]
        root = inNode()
        dataSet_Newest = preprocess(dataSet_New)
        iTree(root, dataSet_Newest, attr_List, 0, limit)
        Forest.append(root)
        #test
        print 'length:::', PathLength(dataSet_Newest[0], root, 0)

    print Forest


# p is size of sample and p has a limit in dataSet
# do with dataSet
def sample(dataSet, p):
    """Draw ``p`` rows from ``dataSet`` at random, with replacement.

    All ``p`` random indices are generated first and the rows are looked up
    afterwards.  The returned list holds references to the original row
    objects (no copies), so the same row may appear more than once.

    :param dataSet: indexable collection of rows.
    :param p: number of rows to draw.
    :return: list of ``p`` rows.
    """
    last_index = len(dataSet) - 1
    picks = [random.randint(0, last_index) for _ in range(p)]
    return [dataSet[int(position)] for position in picks]


class exNode(object):
    """External (leaf) node of an isolation tree."""

    def __init__(self, size=0):
        # Number of rows that were routed into this leaf.
        self.size = size

    def __repr__(self):
        # Readable form for the debug prints instead of a bare object address.
        return 'exNode(size=%r)' % (self.size,)


class inNode(object):
    """Node of an isolation tree.

    A freshly created node has ``splitAtt == -1``; the code in this file
    treats such a node as a leaf and reads its ``exNode`` attribute.
    """

    def __init__(self, splitAtt=-1, splitValue=-1, left=None, right=None, exNode=None):
        # Index of the column used for the split (-1 while no split is set).
        self.splitAtt = splitAtt
        # Threshold (numeric column) or one-hot position (encoded column).
        self.splitValue = splitValue
        # Child nodes.
        self.left = left
        self.right = right
        # Leaf payload; note that this parameter name shadows the exNode class
        # inside __init__ only.
        self.exNode = exNode

    def __repr__(self):
        # Children are left out on purpose so printing a root stays short.
        return 'inNode(splitAtt=%r, splitValue=%r, exNode=%r)' % (
            self.splitAtt, self.splitValue, self.exNode)


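# NOTE: the next two comment lines describe iTree (defined further below), not iTree_remove.
# iTree establishes one tree, limited in height by l
# dataSet is the input data, e is the current tree height, l is the height limit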
def iTree_remove(dataSet, attr):
    """Pick a random attribute and split value and partition ``dataSet``.

    One-hot encoded columns (value is a list) are split on a random position
    of the encoding via ``filterData_category``; all other columns are split
    on a random threshold between the column minimum and maximum via
    ``filterData``.

    Fixes compared to the earlier version:
      * the one-hot position is drawn with ``random.randrange`` - truncating
        ``random.uniform(0, len - 1)`` practically never produced the last
        position;
      * the numeric minimum/maximum are the real ones, not limited by the
        ``-1`` / ``10000`` sentinels;
      * the numeric threshold is no longer truncated to an int, which made
        every split degenerate for fractional data (e.g. values in (0, 1));
      * the column kind is detected with ``isinstance(..., list)``, the same
        test ``iTree`` uses, instead of the hard-coded indices 3, 4, 5.

    :param dataSet: non-empty list of rows.
    :param attr: list of candidate column indices.
    :return: tuple ``(dataSetB, dataSetS, i, r)`` - the rows on the "greater"
        side, the rows on the "smaller" side, the chosen column index and
        the chosen split value / one-hot position.
    """
    i = random.choice(attr)
    if isinstance(dataSet[0][i], list):
        # Every position of the encoding gets the same probability.
        r = random.randrange(len(dataSet[0][i]))
        dataSetB = filterData_category(dataSet, i, r, '>')
        dataSetS = filterData_category(dataSet, i, r, '<')
        return dataSetB, dataSetS, i, r

    values = [float(v[i]) for v in dataSet]
    r = random.uniform(min(values), max(values))
    dataSetB = filterData(dataSet, i, r, '>')
    dataSetS = filterData(dataSet, i, r, '<')
    return dataSetB, dataSetS, i, r


def iTree(node, dataSet, attr, e, l):
    if e >= l or len(dataSet) <= 1:
        exNode_t = exNode()
        exNode_t.size = len(dataSet)
        node.exNode = exNode_t
        # test
        print exNode_t

        return exNode_t
    else:
        dataSetB, dataSetS, i, r = iTree_remove(dataSet, attr)

        if isinstance(dataSet[0][i], list):
            if len(set([v[r] for v in list([value[i] for value in dataSet])])) == 1:
                exNode_t = exNode()
                exNode_t.size = len(dataSet)
                node.exNode = exNode_t
                # test
                print 'attr is the same to data', i, exNode_t

                return exNode_t

        elif len(set([v[i] for v in dataSet])) == 1:
            exNode_t = exNode()
            exNode_t.size = len(dataSet)
            node.exNode = exNode_t
            # test
            print 'attr is the same to data', i, exNode_t

            return exNode_t

        node.splitAtt = i
        node.splitValue = r
        # test
        print node.splitAtt, node.splitValue
        print 'dataSetB=', dataSetB
        print 'dataSetS=', dataSetS

        node.right = inNode()
        node.left = inNode()
        iTree(node.left, dataSetS, attr, e + 1, l)
        iTree(node.right, dataSetB, attr, e + 1, l)

        # test
        print node


# r is random number which between minV and maxV , r can divide from dataSet  in attribute i
def filterData(dataSet, i, r, k):
    """Select the rows of ``dataSet`` on one side of threshold ``r``.

    :param dataSet: iterable of rows.
    :param i: index of the column to compare (converted with ``float()``).
    :param r: threshold.
    :param k: ``'<'`` keeps rows with value <= r, ``'>'`` keeps rows with
        value > r; any other value yields an empty list.
    :return: new list with the selected rows (same row objects).
    """
    if k == '<':
        return [row for row in dataSet if float(row[i]) <= r]
    if k == '>':
        return [row for row in dataSet if float(row[i]) > r]
    return []


def filterData_category(dataSet, i, r, k):
    """Select rows by position ``r`` of the list stored in column ``i``.

    :param dataSet: iterable of rows whose column ``i`` is a sequence.
    :param i: index of the column holding the sequence.
    :param r: position inside that sequence to test.
    :param k: ``'<'`` keeps rows whose entry is < 1, ``'>'`` keeps rows whose
        entry is >= 1; any other value yields an empty list.
    :return: new list with the selected rows (same row objects).
    """
    if k == '<':
        return [row for row in dataSet if float(row[i][r]) < 1]
    if k == '>':
        return [row for row in dataSet if float(row[i][r]) >= 1]
    return []


# PathLength is according to data in x
def PathLength(x, T, e):
    """Return the path length of row ``x`` in the tree rooted at ``T``.

    The row is routed with the same rules that were used to partition the
    training rows:
      * one-hot column (value is a list): entry at position ``splitValue``
        < 1 goes left, otherwise right (mirrors ``filterData_category``);
      * any other column: ``float(value) <= splitValue`` goes left,
        otherwise right (mirrors ``filterData``).

    The earlier version compared ``x[i] <= splitValue`` directly, which
    compared a whole one-hot list with the split position (always "right" on
    Python 2, ``TypeError`` on Python 3) and skipped the ``float()``
    conversion used while building the tree.

    ``x`` has to be encoded the same way as the rows the tree was built from.

    :param x: row to score.
    :param T: current node; a node with ``splitAtt == -1`` is a leaf.
    :param e: number of edges traversed so far (0 for the root).
    :return: ``e`` at the leaf plus ``c(leaf size)``.
    """
    while T.splitAtt != -1:
        value = x[T.splitAtt]
        if isinstance(value, list):
            go_left = float(value[T.splitValue]) < 1
        else:
            go_left = float(value) <= T.splitValue
        T = T.left if go_left else T.right
        e += 1
    return e + c(T.exNode.size)


def c(n):
    """Return the normalisation term 2*H(n-1) - 2*(n-1)/n.

    H(i) is approximated by ``ln(i) + 0.5772156649``.  For ``n`` not greater
    than 1 the result is 0.
    """
    if not n > 1:
        return 0
    harmonic = math.log(n - 1, math.e) + 0.5772156649
    return 2 * harmonic - (2 * n - 2) / float(n)


if __name__ == '__main__':
    # Demo run on the globally loaded training data: 1 tree, sub-sampling size 8.
    iForest(dataSet_global, 1, 8)

你可能感兴趣的:(机器学习)