import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import IsolationForest
%matplotlib inline
# Load the consumption table and use its "date" column as the row index.
data = pd.read_csv("g_cons.csv").set_index("date")
# Preview the first rows; as the last expression of the cell it is rendered
# by the notebook.
data.head()
| date       | g_cons |
|------------|--------|
| 2019-04-19 | NaN    |
| 2019-04-20 | 658.0  |
| 2019-04-21 | 86.0   |
| 2019-04-22 | 764.0  |
| 2019-04-23 | 808.0  |
# Scatter plot of the raw consumption values over time.
# The CSV is read without date parsing, so the index holds date *strings*;
# matplotlib would treat those as categories (one tick per row, evenly
# spaced regardless of gaps).  Converting to datetimes gives a real time
# axis; the call is a no-op if the index is already datetime-like.
plt.scatter(pd.to_datetime(data.index), data["g_cons"], s=15, marker='o', c='b')
plt.xlabel("date")
plt.ylabel("g_cons")
# Finish this figure so later plots do not draw onto the same axes.
plt.show()

# Quartiles of the consumption column (pandas skips NaN when computing them).
consumption = data["g_cons"]
q1 = consumption.quantile(0.25)
q2 = consumption.median()
q3 = consumption.quantile(0.75)
iqr = q3 - q1

# Outlier fences.  The original code first computed the classic Tukey fences
# (q3 + 1.5 * iqr, q1 - 1.5 * iqr) and then immediately overwrote them with
# the median-based, asymmetric fences below, so only these ever took effect.
ub = q3 + 3 * (q3 - q2)
lb = q1 - 3 * (q2 - q1)

# Mean of the values lying strictly between the fences.  NaN compares False
# on both sides and is therefore excluded, exactly as in a row-by-row loop.
in_range = (consumption > lb) & (consumption < ub)
n = int(in_range.sum())
# Guard against an empty selection instead of dividing by zero.
avg = consumption[in_range].mean() if n else float("nan")
# Impute missing readings.  Both NaN and 0 are treated as "no reading"
# (the original filled NaN with 0 and then replaced every 0).
missing = data["g_cons"].isna() | data["g_cons"].eq(0.)
observed = data.loc[~missing, "g_cons"]
# Take the median from genuine readings only: computing it after NaN had been
# turned into 0 lets the placeholders drag the imputed value down.
# With no genuine readings at all, fall back to 0 as the original effectively did.
fill_value = observed.median() if not observed.empty else 0.
# Assign the result back to the column instead of calling an in-place method
# on `data["g_cons"]`: that form is chained assignment, which is deprecated
# and does not modify `data` when pandas Copy-on-Write is enabled.
data["g_cons"] = data["g_cons"].mask(missing, fill_value)
# Detect anomalous consumption values with an Isolation Forest.
# Use a plain NumPy array for fitting and scoring alike: fitting on a
# DataFrame records feature names, and affected scikit-learn versions then
# warn "X does not have valid feature names" when the estimator scores the
# training data internally to derive the contamination threshold.
features = data[["g_cons"]].to_numpy()
model = IsolationForest(
    n_estimators=300,
    max_samples='auto',
    contamination=0.1,  # expected share of anomalies in the data
    random_state=0,     # fixed seed so scores and labels are reproducible
)
model.fit(features)
# decision_function: negative scores mark outliers.
data['scores'] = model.decision_function(features)
# predict: -1 for outliers, 1 for inliers.
data['anomaly'] = model.predict(features)
# Persist values, scores and labels for inspection.
data.to_excel("scores.xlsx")
D:\SOFT\anaconda3\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but IsolationForest was fitted with feature names
warnings.warn(
# Share of rows labelled as anomalies (-1): the mean of the boolean mask.
data["anomaly"].eq(-1).mean()
0.10050568900126422
# Overwrite every reading labelled as an anomaly (-1) with the column median.
# The median is taken before the assignment, i.e. over the current values.
flagged = data["anomaly"] == -1
median_value = data["g_cons"].median()
data.loc[flagged, "g_cons"] = median_value
# Box plot of the column after the replacement.
plt.boxplot(data["g_cons"])
{'whiskers': [,
],
'caps': [,
],
'boxes': [],
'medians': [],
'fliers': [],
'means': []}

# Scatter plot of the consumption values as they stand at this point in the
# script.
# The CSV is read without date parsing, so the index holds date *strings*;
# matplotlib would treat those as categories (one tick per row, evenly
# spaced regardless of gaps).  Converting to datetimes gives a real time
# axis; the call is a no-op if the index is already datetime-like.
plt.scatter(pd.to_datetime(data.index), data["g_cons"], s=15, marker='o', c='b')
plt.xlabel("date")
plt.ylabel("g_cons")
# Finish this figure so it is rendered on its own.
plt.show()
