随机森林是一种以决策树(常用CART树)为基学习器的bagging算法。
sklearn随机森林模型中有一个oob_score参数:对每个oob样本(即未被某些基学习器的bootstrap采样选中的样本),用所有不包含该样本的基学习器进行预测,再汇总这些预测计算平均得分。
# Train a small random forest on a synthetic binary-classification task
# and predict on a held-out split.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

RANDOM_STATE = 42

# 1000 samples with 25 features, 15 of them informative.
X, y = make_classification(
    n_samples=1000,
    n_features=25,
    n_clusters_per_class=1,
    n_informative=15,
    random_state=RANDOM_STATE,
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

# A deliberately tiny forest (3 trees) so the example runs instantly.
clf = RandomForestClassifier(
    n_estimators=3, max_depth=10, random_state=RANDOM_STATE
)
clf.fit(X_train, y_train)
results = clf.predict(X_test)
孤立森林(Isolation Forest):用于连续特征数据的异常检测。
基本思想
多次随机选取特征和对应的分割点以分开空间中样本点,那么异常点很容易在较早的几次分割中就已经与其他样本隔开,正常点由于较为紧密故需要更多的分割次数才能将其分开。
异常值判断指标
假设样本数为 $n$,记 $h_x$ 为样本 $x$ 在一棵孤立树中的路径长度,$c(n)$ 为路径长度的归一化常数,异常得分定义为 $s(x,n)=2^{-\mathbb{E}h_x/c(n)}$。于是:
当 $\mathbb{E}h_x\rightarrow 0$ 时,$s(x,n)\rightarrow 1$(极易被隔离,判为异常);
当 $\mathbb{E}h_x\rightarrow n-1$ 时,$s(x,n)\rightarrow 0$(极难被隔离,判为正常);
当 $\mathbb{E}h_x\rightarrow c(n)$ 时,$s(x,n)\rightarrow \frac{1}{2}$(与平均水平相当,无法判断)。
最大树高
因为

$$\lim_{n\to\infty}\bigl(H_n-\log n\bigr)=\gamma$$

其中 $\gamma\approx 0.5772$ 为欧拉常数,$H_n$ 为调和级数前 $n$ 项和,故 $\log n$ 与 $c(n)$ 数量级相同。因此,最大树高可设为 $\log n$。
算法步骤
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import IsolationForest as sklearnIF
def cal_h(n):
    """Return the n-th harmonic number H_n = 1 + 1/2 + ... + 1/n.

    For large n the exact sum is replaced by the asymptotic expansion
    ln(n) + gamma, with gamma ~ 0.5772 (Euler-Mascheroni constant).
    """
    if n >= 10000:
        return np.log(n) + 0.5772
    return sum(1 / i for i in range(1, n + 1))
class Node:
    """A node of an isolation tree."""

    def __init__(self, depth):
        # Depth of this node within its tree (the root sits at depth 0).
        self.depth = depth
        # Children; both remain None while the node is a leaf.
        self.left = self.right = None
        # Split feature index and threshold; None until the node is split.
        self.feature = self.pivot = None
class Tree:
    """A single isolation tree: recursively split the data with random
    axis-aligned cuts until each sample is isolated or the height limit
    is reached (Liu et al., "Isolation Forest")."""

    def __init__(self, max_height):
        self.root = Node(0)
        self.max_height = max_height
        # c(n) for the training-set size; filled in by build().
        self.c = None

    def _build(self, node, X):
        """Recursively grow the subtree under `node` on the rows of X."""
        # Fully isolated (one sample) or an empty partition: stop.
        # (The original only checked == 1; an empty array would reach
        # the split code and crash on .min().)
        if X.shape[0] <= 1:
            return
        # Height limit hit: leave the m remaining samples unsplit and
        # credit the node with c(m), their expected extra path length.
        if node.depth + 1 > self.max_height:
            node.depth += self._c(X.shape[0])
            return
        # BUG FIX: only features that actually vary can separate the
        # samples. The original picked any feature; a constant one makes
        # uniform(min, max) degenerate, leaving one partition empty and
        # crashing (or recursing forever on duplicated rows, which the
        # bootstrap sub-sampling produces routinely).
        spread = X.max(axis=0) - X.min(axis=0)
        splittable = np.flatnonzero(spread > 0)
        if splittable.size == 0:
            # All remaining rows are identical: treat as an unsplit leaf.
            node.depth += self._c(X.shape[0])
            return
        node.feature = splittable[np.random.randint(splittable.size)]
        pivot_min = X[:, node.feature].min()
        pivot_max = X[:, node.feature].max()
        # Random cut point strictly inside the feature's observed range.
        node.pivot = np.random.uniform(pivot_min, pivot_max)
        node.left, node.right = Node(node.depth + 1), Node(node.depth + 1)
        self._build(node.left, X[X[:, node.feature] < node.pivot])
        self._build(node.right, X[X[:, node.feature] >= node.pivot])

    def build(self, X):
        """Fit the tree on sample matrix X of shape (n_samples, n_features)."""
        self.c = self._c(X.shape[0])
        self._build(self.root, X)

    def _c(self, n):
        """Normalisation constant c(n) = 2*(H_{n-1} - (n-1)/n): the average
        path length of an unsuccessful BST search over n samples."""
        if n <= 1:  # 0 or 1 sample: no further path length to account for
            return 0
        return 2 * (cal_h(n - 1) - (n - 1) / n)

    def _get_h_score(self, node, x):
        """Walk sample x down to a leaf and return the leaf's depth
        (already c-adjusted for unsplit leaves)."""
        if node.left is None and node.right is None:
            return node.depth
        if x[node.feature] < node.pivot:
            return self._get_h_score(node.left, x)
        return self._get_h_score(node.right, x)

    def get_h_score(self, x):
        """Return the path length h(x) of a single sample x."""
        return self._get_h_score(self.root, x)
class IsolationForest:
    """Minimal isolation-forest anomaly detector.

    Each tree is grown on a sub-sample of the data; anomaly score
    s(x) = 2**(-E[h(x)]/c(psi)) is in (0, 1], larger = more anomalous.
    """

    def __init__(self, n_estimators=100, max_samples=256):
        self.n_estimator = n_estimators  # number of trees to grow
        self.max_samples = max_samples   # sub-sample size per tree
        self.trees = []

    def fit(self, X):
        """Grow n_estimator trees, each on a random sub-sample of X."""
        for _ in range(self.n_estimator):
            # Sub-sample rows with replacement.
            random_X = X[np.random.randint(0, X.shape[0], self.max_samples)]
            # Height limit ~ log(sub-sample size), per the write-up above.
            tree = Tree(np.log(random_X.shape[0]))
            # BUG FIX: build on the sub-sample. The original passed the
            # full X, which made the sub-sampling dead code and trained
            # every tree on identical data with a mismatched height limit.
            tree.build(random_X)
            self.trees.append(tree)

    def predict(self, X):
        """Return the anomaly score for each row of X as a 1-D array."""
        result = []
        for x in X:
            h = 0.0
            # Average the normalised path length over all trees.
            for tree in self.trees:
                h += tree.get_h_score(x) / tree.c
            score = np.power(2, -h / len(self.trees))
            result.append(score)
        return np.array(result)
if __name__ == "__main__":
    rng = np.random.RandomState(42)

    # Training data: two Gaussian blobs centred at (2, 2) and (-2, -2).
    blob = 0.3 * rng.randn(1000, 2)
    X_train = np.r_[blob + 2, blob - 2]

    # Regular novel observations drawn from the same two blobs.
    blob = 0.3 * rng.randn(50, 2)
    X_normal = np.r_[blob + 2, blob - 2]

    # Abnormal novel observations spread uniformly over the plane.
    X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
    X_test = np.vstack([X_normal, X_outliers])

    # Score the test set with our implementation and with sklearn's.
    IF = IsolationForest()
    IF.fit(X_train)
    res = IF.predict(X_test)
    result = sklearnIF(max_samples=100, random_state=rng).fit(X_train).predict(X_test)

    # Our score: larger = more anomalous; flag the top 10%.
    abnormal_X = X_test[res > np.quantile(res, 0.90)]
    # sklearn marks outliers with label -1.
    abnormal_X2 = X_test[result == -1]

    # Plot: all test points, with our outliers circled in red and
    # sklearn's in green.
    b1 = plt.scatter(X_test[:, 0], X_test[:, 1], s=5)
    b2 = plt.scatter(
        abnormal_X[:, 0], abnormal_X[:, 1],
        s=50, edgecolors="Red", facecolor="none",
    )
    b3 = plt.scatter(
        abnormal_X2[:, 0], abnormal_X2[:, 1],
        s=30, edgecolors="green", facecolor="none",
    )
    plt.legend(
        [b1, b2, b3],
        ["test observations",
         "class IsolationForest outliers", "sklearnIF outliers"],
        loc="upper left",
    )
    plt.show()
[参考]:
DataWhale树模型与集成学习