MAD异常检测算法

一、MAD介绍

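MAD(median absolute deviation),即绝对中位差。
在统计学中,MAD是对单变量数值型数据样本离散程度的一种鲁棒性度量,即用来描述单变量定量数据变异性的一种指标。其定义为各样本点与样本中位数之差的绝对值的中位数:$\mathrm{MAD} = \operatorname{median}\left(\left|X_i - \operatorname{median}(X)\right|\right)$。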

 

MAD异常检测算法_第1张图片

 MAD异常检测算法_第2张图片

 

 

from __future__ import division
from __future__ import print_function
import numpy as np
from sklearn.utils import check_array
from .base import BaseDetector

def _check_dim(X):
    """Check that ``X`` holds univariate data.

    A 1-D array is accepted as a single feature. An array with two or more
    axes must have exactly one entry along its second axis.

    Parameters
    ----------
    X : numpy array
        Array to check; only its ``ndim`` and ``shape`` are inspected.

    Raises
    ------
    ValueError
        If ``X`` is 0-dimensional, or if its second axis does not have
        length 1.
    """
    if X.ndim == 1:
        # A flat vector of observations is one feature. Without this branch
        # ``X.shape[1]`` would raise an unrelated IndexError.
        return

    if X.ndim < 1 or X.shape[1] != 1:
        # A 0-d array has no second axis, so report it as zero features.
        n_features = X.shape[1] if X.ndim > 1 else 0
        raise ValueError('MAD algorithm is just for univariate data. '
                         'Got Data with {} Dimensions.'.format(n_features))


class MAD(BaseDetector):
    """Univariate outlier detector based on the median absolute deviation.

    Every sample is scored as ``0.6745 * |x - median_| / median_diff_``,
    where ``median_`` is the NaN-ignoring median of the training data and
    ``median_diff_`` is the NaN-ignoring median of the absolute deviations
    from it. Training samples whose score exceeds ``threshold`` are labelled
    1 (outlier); the rest are labelled 0.

    Parameters
    ----------
    threshold : float or int, default=3.5
        Score above which a sample is labelled as an outlier. NumPy
        floating and integer scalars are accepted as well.

    Attributes
    ----------
    threshold_ : number
        Copy of ``threshold`` taken in ``fit``.
    median_ : float
        Median of the training data, ignoring NaN.
    median_diff_ : float
        Median absolute deviation of the training data, ignoring NaN.
    decision_scores_ : numpy array of shape (n_samples,)
        Scores of the training data.
    labels_ : numpy array of shape (n_samples,)
        Integer 0/1 labels of the training data.
    """

    def __init__(self, threshold=3.5):
        # contamination is unneeded since threshold must be
        # decided manually by the user
        super(MAD, self).__init__()
        # NumPy scalars (e.g. np.float32, np.int64) are not subclasses of
        # the builtin float/int, so they are listed explicitly.
        if not isinstance(threshold, (float, int, np.floating, np.integer)):
            raise TypeError(
                'threshold must be a number. Got {}'.format(type(threshold)))
        self.threshold = threshold

    def fit(self, X, y=None):
        """Learn the median and MAD of ``X`` and score the training data.

        Parameters
        ----------
        X : array-like
            Univariate training data. NaN and infinite values pass
            validation (``force_all_finite=False``).
        y : optional
            Passed to ``_set_n_classes``, which is not defined in this class
            (presumably inherited from ``BaseDetector``).

        Returns
        -------
        self : MAD
            The fitted detector.
        """
        X = check_array(X, ensure_2d=False, force_all_finite=False)
        _check_dim(X)
        self._set_n_classes(y)
        self.threshold_ = self.threshold
        # Reset the learned statistics so that ``_mad`` recomputes them from
        # this training set rather than reusing those of a previous fit.
        self.median_ = None
        self.median_diff_ = None
        # ``X`` is already validated, so score it directly. Going through
        # ``decision_function`` would trip its fitted-state guard while the
        # statistics are still None.
        self.decision_scores_ = self._mad(X)
        self._process_decision_scores()
        return self

    def decision_function(self, X):
        """Score ``X`` using the statistics learned in ``fit``.

        Parameters
        ----------
        X : array-like
            Univariate data to score. NaN and infinite values pass
            validation (``force_all_finite=False``).

        Returns
        -------
        scores : numpy array of shape (n_samples,)
            Larger values mean a larger deviation from the training median.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If the detector has no learned statistics, i.e. ``fit`` was never
            called or did not complete. The exception derives from both
            ``ValueError`` and ``AttributeError``.
        """
        X = check_array(X, ensure_2d=False, force_all_finite=False)
        _check_dim(X)
        if (getattr(self, 'median_', None) is None
                or getattr(self, 'median_diff_', None) is None):
            # Without this guard ``_mad`` would either fail with a bare
            # AttributeError or silently learn the statistics from the data
            # being scored.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                'This MAD instance is not fitted yet. Call fit() before '
                'decision_function().')
        return self._mad(X)

    def _mad(self, X):
        """Compute the MAD-based score of every observation in ``X``.

        ``median_`` and ``median_diff_`` are computed from ``X`` only when
        they are currently None, which ``fit`` arranges right before calling
        this method. Otherwise the stored values are reused.

        Parameters
        ----------
        X : numpy array
            Observations; reshaped to a single column internally.

        Returns
        -------
        scores : numpy array of shape (n_samples,)
            ``0.6745 * |x - median_| / median_diff_`` passed through
            ``np.nan_to_num``.
        """
        obs = np.reshape(X, (-1, 1))
        if self.median_ is None:
            self.median_ = np.nanmedian(obs)
        diff = np.abs(obs - self.median_)
        if self.median_diff_ is None:
            self.median_diff_ = np.nanmedian(diff)
        # 0.6745 is approximately the 0.75 quantile of the standard normal
        # distribution, the usual modified z-score constant.
        # ``nan_to_num`` maps NaN scores (NaN inputs, or 0/0 when the MAD is
        # zero) to 0 and infinite scores to very large finite numbers.
        return np.nan_to_num(np.ravel(0.6745 * diff / self.median_diff_))

    def _process_decision_scores(self):
        """Derive training labels and score statistics.

        Sets ``labels_`` to 1 where ``decision_scores_`` exceeds
        ``threshold`` and 0 elsewhere, and stores the NaN-ignoring mean and
        standard deviation of the scores in ``_mu`` and ``_sigma``.

        Returns
        -------
        self : MAD
        """
        self.labels_ = (self.decision_scores_ > self.threshold).astype('int').ravel()
        # calculate for predict_proba()
        self._mu = np.nanmean(self.decision_scores_)
        self._sigma = np.nanstd(self.decision_scores_)

        return self

你可能感兴趣的:(算法,python,开发语言)