pyod —— 模型组合实例
代码参考地址:https://github.com/yzhao062/pyod/blob/master/examples/comb_example.py
离群值检测由于其无监督的特性,经常受到模型不稳定的困扰。因此,建议将多个检测器的输出组合起来(例如取平均值),以提高其鲁棒性。
此演示介绍以下四种分数组合机制(下方代码还额外演示了按中位数组合):
1.平均值:所有检测器的平均分数。
2.最大化:所有检测器的最高分。
3.最大值的平均值(AOM, Average of Maximum):将基本检测器划分为子组,并为每个子组获取最大分数。最终分数是所有子组分数的平均值。
4.平均值的最大值(MOA, Maximum of Average):将基本检测器划分为子组,并获取每个子组的平均分数。最终分数是所有子组分数中的最高分数。
import numpy as np
from sklearn.model_selection import train_test_split
from pyod.models.knn import KNN # kNN detector
from pyod.models.combination import aom, moa, average, maximization, median
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print
from pyod.utils.utility import standardizer
# Draw a synthetic labelled data set. With train_only=True the call is
# unpacked into a single (samples, labels) pair.
X, y = generate_data(train_only=True)

# Hold out 40% of the samples for evaluating the combined scores.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
)

# Standardize both splits together before any detector sees them.
X_train_norm, X_test_norm = standardizer(X_train, X_test)
# Neighbourhood sizes for the base detectors: 10, 20, ..., 200.
k_list = list(range(10, 201, 10))
# Number of base detectors; derived from k_list so the two cannot drift apart.
n_clf = len(k_list)

# Score matrices: one row per sample, one column per base detector.
train_scores = np.zeros([X_train.shape[0], n_clf])
test_scores = np.zeros([X_test.shape[0], n_clf])

# Fit one kNN detector per neighbourhood size and record its outlier scores.
for i, k in enumerate(k_list):
    clf = KNN(n_neighbors=k, method='largest')
    clf.fit(X_train_norm)

    # Training scores are stored on the fitted detector; test scores are
    # computed by scoring the held-out samples.
    train_scores[:, i] = clf.decision_scores_
    test_scores[:, i] = clf.decision_function(X_test_norm)

# Decision scores have to be normalized before combination, otherwise
# detectors with different k would contribute on different scales.
train_scores_norm, test_scores_norm = standardizer(train_scores, test_scores)
# Each combiner below collapses the normalized per-detector test scores into
# one score per sample, which is then evaluated against the true labels.

# Combine by average
y_by_average = average(test_scores_norm)
evaluate_print('Combination by Average', y_test, y_by_average)

# Combine by maximization
y_by_maximization = maximization(test_scores_norm)
evaluate_print('Combination by Maximization', y_test, y_by_maximization)

# Combine by median. Stored under its own name so the maximization result
# above is not overwritten.
y_by_median = median(test_scores_norm)
evaluate_print('Combination by Median', y_test, y_by_median)

# Combine by AOM (Average of Maximum), using 5 detector buckets
y_by_aom = aom(test_scores_norm, n_buckets=5)
evaluate_print('Combination by AOM', y_test, y_by_aom)

# Combine by MOA (Maximum of Average), using 5 detector buckets
y_by_moa = moa(test_scores_norm, n_buckets=5)
evaluate_print('Combination by MOA', y_test, y_by_moa)