近期一直在看Isolation forest,random forest,k-d tree,有关Isolation forest的资料很少,Isolation forest在异常检测方面效果不错
Isolation forest算法思想如下:
建立多个tree(理论上最好的效果tree的个数为100),其对应的训练集大小一样(训练集最佳大小为256),训练集中的数据随机,建树的过程中节点分为外部节点和内部节点,对于每个外部节点由于树高或者特征对应的那一列值相等的限制,让数据划分终止于外部节点,外部节点里面存储划分到该节点下的数据集的尺寸。对于特征的选择是随机的,那如何检测异常数据?对于每个实例,计算它被划分在哪个外部节点下并且求出该节点所处于的树高,E(h(x))表示实例x在多个树中的树高h(x)的平均值,异常分数为 s(x, n) = 2^(−E(h(x))/c(n))
c(n) = 2H(n − 1) − (2(n − 1)/n)
where H(i) is the harmonic number and it can be estimated by ln(i) + 0.5772156649 (Euler’s constant). As c(n) is the average of h(x) given n, we use it to normalise h(x).
• when E(h(x)) ->c(n), s -> 0.5;
• when E(h(x)) -> 0, s -> 1;
• and when E(h(x)) -> n − 1, s -> 0.
s is monotonic to h(x). Figure 2 illustrates the relationship between E(h(x)) and s, and the following conditions applied where 0 < s ≤ 1 for 0 < h(x) ≤ n − 1. Using the anomaly score s, we are able to make the following assessment:
• (a) if instances return s very close to 1, then they are
definitely anomalies,
• (b) if instances have s much smaller than 0.5, then they
are quite safe to be regarded as normal instances, and
• (c) if all the instances return s ≈ 0.5, then the entire
sample does not really have any distinct anomaly.
(今天不在状态,写的有点混乱,改天再重写)
__author__ = 'hzlihongxia'
import math
import numpy
import dataTransform
import random
# dataSet in train
# warning:dataSet is the global variable
dataSet_global, label = dataTransform.dataFromTrain()
# preprocess in dataSet
def preprocess(dataSet):
for i in [3, 4, 5]:
maxV = -1
minV = 10000
for v in dataSet:
if float(v[i]) > maxV:
maxV = float(v[i])
if float(v[i]) < minV:
minV = float(v[i])
for v in dataSet:
listD = [0] * (int(maxV - minV) + 1)
listD[int(float(v[i]) - minV)] = 1
v[i] = listD
return dataSet
# dataSet is input data,t is number of trees,p is sub-sampling size
def iForest(dataSet, t, p):
Forest = []
limit = math.ceil(math.log(p, 2))
for i in range(t):
dataSet_New = sample(dataSet, p)
# test
count = 0
for v in dataSet_New:
print count, v[1]
count += 1
attr_List = [0, 1, 2, 3, 4, 5, 6]
root = inNode()
dataSet_Newest = preprocess(dataSet_New)
iTree(root, dataSet_Newest, attr_List, 0, limit)
Forest.append(root)
#test
print 'length:::', PathLength(dataSet_Newest[0], root, 0)
print Forest
# sample: draw p rows from dataSet uniformly at random (with replacement);
# p should not exceed what the caller considers a sensible sub-sample size
def sample(dataSet, p):
data = []
dataSet_len = len(dataSet)
j = []
for k in range(p):
j.append(random.randint(0, dataSet_len - 1))
for i in j:
data.append(dataSet[int(i)])
return data
class exNode(object):
def __init__(self, size=0):
self.size = size
class inNode(object):
def __init__(self, splitAtt=-1, splitValue=-1, left=None, right=None, exNode=None):
self.splitAtt = splitAtt
self.splitValue = splitValue
self.left = left
self.right = right
self.exNode = exNode
# iTree_remove: helper for iTree -- pick a random attribute and split value,
# then partition dataSet into its '>' and '<' sides
def iTree_remove(dataSet, attr):
i = random.choice(attr)
if i == 3 or i == 4 or i == 5:
r = int(random.uniform(0, len(dataSet[0][i]) - 1))
dataSetB = filterData_category(dataSet, i, r, '>')
dataSetS = filterData_category(dataSet, i, r, '<')
return dataSetB, dataSetS, i, r
else:
maxV = -1
minV = 10000
for v in dataSet:
if float(v[i]) > maxV:
maxV = float(v[i])
if float(v[i]) < minV:
minV = float(v[i])
r = int(random.uniform(minV, maxV))
dataSetB = filterData(dataSet, i, r, '>')
dataSetS = filterData(dataSet, i, r, '<')
return dataSetB, dataSetS, i, r
def iTree(node, dataSet, attr, e, l):
if e >= l or len(dataSet) <= 1:
exNode_t = exNode()
exNode_t.size = len(dataSet)
node.exNode = exNode_t
# test
print exNode_t
return exNode_t
else:
dataSetB, dataSetS, i, r = iTree_remove(dataSet, attr)
if isinstance(dataSet[0][i], list):
if len(set([v[r] for v in list([value[i] for value in dataSet])])) == 1:
exNode_t = exNode()
exNode_t.size = len(dataSet)
node.exNode = exNode_t
# test
print 'attr is the same to data', i, exNode_t
return exNode_t
elif len(set([v[i] for v in dataSet])) == 1:
exNode_t = exNode()
exNode_t.size = len(dataSet)
node.exNode = exNode_t
# test
print 'attr is the same to data', i, exNode_t
return exNode_t
node.splitAtt = i
node.splitValue = r
# test
print node.splitAtt, node.splitValue
print 'dataSetB=', dataSetB
print 'dataSetS=', dataSetS
node.right = inNode()
node.left = inNode()
iTree(node.left, dataSetS, attr, e + 1, l)
iTree(node.right, dataSetB, attr, e + 1, l)
# test
print node
# r is random number which between minV and maxV , r can divide from dataSet in attribute i
def filterData(dataSet, i, r, k):
dataSetP = []
if k == '<':
for v in dataSet:
if float(v[i]) <= r:
dataSetP.append(v)
if k == '>':
for v in dataSet:
if float(v[i]) > r:
dataSetP.append(v)
return dataSetP
def filterData_category(dataSet, i, r, k):
dataSetP = []
if k == '<':
for v in dataSet:
if float(v[i][r]) < 1:
dataSetP.append(v)
if k == '>':
for v in dataSet:
if float(v[i][r]) >= 1:
dataSetP.append(v)
return dataSetP
# PathLength is according to data in x
def PathLength(x, T, e):
if T.splitAtt == -1:
return e + c(T.exNode.size)
else:
i = T.splitAtt
if x[i] <= T.splitValue:
return PathLength(x, T.left, e + 1)
else:
return PathLength(x, T.right, e + 1)
def c(n):
if n > 1:
return 2 * (math.log(n - 1, math.e) + 0.5772156649) - (2 * n - 2) / float(n)
return 0
if __name__ == '__main__':
iForest(dataSet_global, 1, 8)