基于孤立森林算法的异常值检测

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import IsolationForest
%matplotlib inline
# Load the consumption series and use the "date" column as the index.
data = pd.read_csv("g_cons.csv").set_index("date")
# Preview the first rows.
data.head()
g_cons
date
2019-04-19 NaN
2019-04-20 658.0
2019-04-21 86.0
2019-04-22 764.0
2019-04-23 808.0
# Scatter plot of the g_cons values against the date index, as loaded.
plt.scatter(
    data.index,
    data["g_cons"],
    s=15,
    marker="o",
    c="b",
)

基于孤立森林算法的异常值检测_第1张图片

# Box-plot statistics: quartiles and interquartile range of g_cons.
q1 = data["g_cons"].quantile(0.25)
q2 = data["g_cons"].median()
q3 = data["g_cons"].quantile(0.75)
iqr = q3 - q1

# Asymmetric fences: three times the distance from the median to each
# quartile. The symmetric 1.5 * IQR fences that used to be assigned first
# were overwritten immediately and never read, so they have been removed.
ub = q3 + 3 * (q3 - q2)
lb = q1 - 3 * (q2 - q1)

# Mean of the values lying strictly between lb and ub.
# NaN never satisfies either comparison, so missing readings are excluded.
# The vectorised mask replaces a per-row iloc loop; if nothing falls inside
# the fences, avg becomes NaN instead of raising ZeroDivisionError.
values = data["g_cons"]
in_range = values[(values > lb) & (values < ub)]
n = len(in_range)
avg = in_range.mean()
# Fill missing readings with 0, then replace every 0 with the column median.
# The median is taken after the zero-fill, so the filled zeros take part in it.
# The result is assigned back to the column rather than using inplace=True on
# data["g_cons"]: an in-place call on a column selection is a chained
# assignment, which does not update `data` under pandas copy-on-write.
filled = data["g_cons"].fillna(0.0)
data["g_cons"] = filled.replace(0.0, filled.median())
# Fit an isolation forest on the single g_cons feature.
features = data[["g_cons"]]
model = IsolationForest(n_estimators=300, max_samples="auto", contamination=0.1)
model.fit(features)

# Store the decision-function score and the predicted label per row
# (the label -1 is treated as an anomaly further down), then export.
data["scores"] = model.decision_function(features)
data["anomaly"] = model.predict(features)
data.to_excel("scores.xlsx")
D:\SOFT\anaconda3\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but IsolationForest was fitted with feature names
  warnings.warn(
# Fraction of rows labelled as anomalies (-1).
data["anomaly"].eq(-1).mean()
0.10050568900126422
# Overwrite g_cons on the anomalous rows with the current column median,
# then draw a box plot of the cleaned column.
is_anomaly = data["anomaly"] == -1
median_value = data["g_cons"].median()
data.loc[is_anomaly, ["g_cons"]] = median_value
plt.boxplot(data["g_cons"])
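{'whiskers': [<matplotlib.lines.Line2D at 0x...>,
  <matplotlib.lines.Line2D at 0x...>],
 'caps': [<matplotlib.lines.Line2D at 0x...>,
  <matplotlib.lines.Line2D at 0x...>],
 'boxes': [<matplotlib.lines.Line2D at 0x...>],
 'medians': [<matplotlib.lines.Line2D at 0x...>],
 'fliers': [<matplotlib.lines.Line2D at 0x...>],
 'means': []}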

基于孤立森林算法的异常值检测_第2张图片

# Scatter plot of the current g_cons values against the date index.
plt.scatter(
    data.index,
    data["g_cons"],
    s=15,
    marker="o",
    c="b",
)

基于孤立森林算法的异常值检测_第3张图片


你可能感兴趣的:(机器学习,python,机器学习,异常值检测)