箱型图查找异常值并替换
定义值大于上四分位数或者小于下四分位数的值为异常值；若是异常值，则用 mean 替换。这是项目中遇到的做法，但是它可能会将 0 值也替换掉，而 0 值不是异常值，故而舍弃这种方法。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
'''
异常值分析
'''
def get_data(df, row):
    """Parse one entry of the second column of ``df`` into a float Series.

    The second column (positional index 1) is selected, the entry looked up
    with ``column[row]`` is split on commas, and each field is converted to
    ``float``. The resulting Series is printed to stdout before it is
    returned.

    :param df: DataFrame whose second column holds comma-separated numeric
        strings.
    :param row: key used to index the second column (``column[row]``).
    :return: ``pandas.Series`` holding the parsed floats.
    :raises ValueError: if any comma-separated field is not a valid float.
    """
    column = df.iloc[:, 1]
    raw_text = column[row]
    values = [float(field) for field in raw_text.split(',')]
    series = pd.Series(values)
    print(series)
    return series
def box_plot_outliers(data_ser, box_scale):
    """Split a series into outliers and normal values using box-plot fences.

    The fences are placed ``box_scale * IQR / 2`` below the lower quartile
    and above the upper quartile, where ``IQR = Q3 - Q1``. With
    ``box_scale=3`` this is the usual 1.5 * IQR rule.

    :param data_ser: ``pandas.Series`` of numeric values.
    :param box_scale: fence scale factor; 3 gives the 1.5 * IQR rule.
    :return: tuple ``(outlier, normal_val, val_low, val_up)`` where
        ``outlier`` holds the values strictly outside the fences,
        ``normal_val`` holds the values inside the fences (inclusive), and
        ``val_low`` / ``val_up`` are the lower and upper fence values.
    """
    q1 = data_ser.quantile(0.25)
    q3 = data_ser.quantile(0.75)
    margin = box_scale * (q3 - q1) * 0.5
    val_low = q1 - margin
    # The upper fence is measured from Q3; measuring it from Q1 would put it
    # too low and flag ordinary values as outliers.
    val_up = q3 + margin
    outlier = data_ser[(data_ser < val_low) | (data_ser > val_up)]
    # Inclusive bounds so a value sitting exactly on a fence is classified
    # as normal instead of disappearing from both result sets.
    normal_val = data_ser[(data_ser >= val_low) & (data_ser <= val_up)]
    return outlier, normal_val, val_low, val_up
def replace_outliers(series, val_low, val_up, replacement):
    """Return a copy of ``series`` with out-of-range values replaced.

    :param series: ``pandas.Series`` of numeric values.
    :param val_low: lower bound; values strictly below it are replaced.
    :param val_up: upper bound; values strictly above it are replaced.
    :param replacement: value substituted for every out-of-range entry.
    :return: new ``pandas.Series``; the input series is not modified.
    """
    is_outlier = (series < val_low) | (series > val_up)
    return series.mask(is_outlier, replacement)


if __name__ == '__main__':
    df = pd.read_csv(
        r'C:\Users\yelinfeng\Downloads\photovoltaics-master\photovoltaics-master\test.csv')
    series = get_data(df, 2)
    outlier, normal_val, val_low, val_up = box_plot_outliers(series, 3)
    print('异常值:' + str(outlier))
    print('val_up的类型:{}'.format(type(val_up)))
    print('normal_val的类型:{}'.format(type(normal_val)))
    print(normal_val)
    # Series.mean() avoids shadowing the builtin ``sum`` and yields NaN
    # (rather than dividing by zero) when there are no normal values.
    mean = normal_val.mean()
    print('均值:{}'.format(mean))
    # An empty result is the "no outliers" case; it is never None.
    if len(outlier) == 0:
        print('没有异常值')
    else:
        # Vectorized replacement; avoids np.float, which was removed in
        # NumPy 1.24.
        series = replace_outliers(series, val_low, val_up, mean)
    print(series)
    # Spot check of one element; skipped when the row has fewer than 36
    # values instead of raising KeyError.
    if 35 in series.index:
        print(series[35])