一、背景
最近想学习一下pytorch,于是参加了一个比赛练练手,结果pytorch没有学到,反而学到了sklearn。
比赛链接:风电机组异常数据识别与清洗
二、录入数据:将时间转化为以天为单位的数值,方便后续处理
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.linear_model import LinearRegression,RANSACRegressor
from sklearn.preprocessing import PolynomialFeatures,StandardScaler
# Load the raw records; the 'Time' column holds strings such as '2019/01/31 12:30'.
df = pd.read_csv('dataset.csv')

# Parse every 'Time' string into epoch seconds (time.mktime interprets the
# parsed struct as local time).
df['timeStamp'] = df['Time'].map(
    lambda stamp: time.mktime(time.strptime(stamp, '%Y/%m/%d %H:%M'))
)

# Whole days since the epoch, truncated toward zero by int().
df['day'] = (df['timeStamp'] / (3600 * 24)).map(int)
# Per-turbine operating limits. Each list is indexed directly by WindNumber
# (1..12); index 0 holds a -1 placeholder so no offset is needed on lookup.
# NOTE(review): units are presumably rpm (rotor) and m/s (wind) -- confirm
# against the competition's data description.
rotor_speed_limit = [
    -1,
    8.33, 8.33, 8.33, 8.33, 5.5, 8.33,
    8.33, 8.33, 8.33, 8.33, 5.0, 5.5,
]
rotor_speed_limit_up = [
    -1,
    16.8, 16.8, 16.8, 16.8, 17, 16.8,
    16.8, 16.8, 16.8, 16.8, 14.0, 17.0,
]
wind_speed_limit = [
    -1,
    3, 3, 3, 3, 3, 3,
    3, 3, 3, 3, 2.5, 3,
]
wind_speed_limit_up = [
    -1,
    25, 25, 25, 25, 22, 25,
    25, 25, 25, 25, 19, 22,
]
三、开始处理
3.1 规则处理一波
# Rule-based first pass: label a record 1 (abnormal) when
#   * Power, WindSpeed or RotorSpeed is non-positive, or
#   * WindSpeed lies outside the turbine's [lower, upper] wind-speed limits
#     while the turbine still reports positive Power.
# Every other record is labelled 0.
#
# NOTE(review): the published snippet had lost the text between '<' and '>'
# in the wind-speed comparison (it read
# "x['WindSpeed']wind_speed_limit_up[...]"); the "below lower limit OR above
# upper limit" test below is the reconstruction -- confirm against the
# author's original notebook.
turbine_idx = df['WindNumber'].to_numpy(dtype=int)
lower_wind = np.asarray(wind_speed_limit)[turbine_idx]
upper_wind = np.asarray(wind_speed_limit_up)[turbine_idx]

non_positive = (
    (df['Power'] <= 0) | (df['WindSpeed'] <= 0) | (df['RotorSpeed'] <= 0)
)
outside_wind_limits = (
    (df['WindSpeed'] < lower_wind) | (df['WindSpeed'] > upper_wind)
)

df['label'] = (
    non_positive | (outside_wind_limits & (df['Power'] > 0))
).astype(int)
3.2 用精心挑选的数据train一个线性模型 [笑哭]
注意: 进行分层取样
# Hand-picked 'day' windows used to select training data: entry k holds the
# bounds for turbine k + 1, and only records with min_ < day < max_ are used.
max_ = [20000, 20000, 17750, 20000, 18200, 20000,
        20000, 20000, 20000, 20000, 20000, 20000]
min_ = [17780, 17800, 17650, 18100, 18100, 17780,
        17800, 17770, 17770, 18100, 18100, 18100]

models = []  # models[k] is the fitted regressor for turbine k + 1
scal = 10    # width of the Power bins used for stratified sampling

for turbine in range(1, 13):
    one = df[df.WindNumber == turbine]

    # NOTE(review): the published snippet had lost the text between '<' and
    # '>' here (it read "df.daymin_[i-1]"); the open interval
    # min_ < day < max_ is the reconstruction -- confirm the exact operators.
    # The masks are built from `one` itself so they align with the subset
    # instead of relying on pandas re-indexing a full-frame mask.
    in_window = (one.day < max_[turbine - 1]) & (one.day > min_[turbine - 1])
    # .copy() so that adding 'Power_n' does not write into a slice of df.
    to_train_regression = one[in_window & (one.Power < 2000)].copy()

    # Stratified sampling: bucket Power into bins of width `scal`, then cap
    # every bin at the 40th percentile of the bin sizes so that densely
    # populated power ranges do not dominate the fit.
    to_train_regression['Power_n'] = np.floor(to_train_regression.Power / scal)
    bin_sizes = to_train_regression.groupby('Power_n')['Power_n'].count()
    target_num = int(bin_sizes.quantile(0.4))

    sample_datas = [
        bin_rows.sample(n=min(target_num, len(bin_rows)))
        for _, bin_rows in to_train_regression.groupby('Power_n', sort=False)
    ]
    d = pd.concat(sample_datas)

    # Fit WindSpeed as a cubic polynomial of Power. The target is kept
    # two-dimensional so that predict() returns an (n, 1) array.
    x = d[['Power']]
    y = d[['WindSpeed']]
    # 'squared_error' is the current name of the loss formerly spelled
    # 'squared_loss' (deprecated in scikit-learn 1.0, removed in 1.2).
    poly_linear_model = RANSACRegressor(LinearRegression(), loss='squared_error')
    poly_linear_model.fit(PolynomialFeatures(degree=3).fit_transform(x), y)
    models.append(poly_linear_model)
3.3 预测过滤数据
# Use each turbine's fitted model to flag records whose wind speed deviates
# too far from the value predicted from Power, then fold the result into
# df['label'].
max_p = 2000  # re-computed per turbine inside the loop
resx = []
f = -1.1      # residual band: a record is kept when f < residual < -f * 1.5

for turbine in range(1, 13):
    model = models[turbine - 1]
    # .copy() so that adding 'ds' does not write into a slice of df.
    q1 = df[df.WindNumber == turbine].copy()

    py = model.predict(
        PolynomialFeatures(degree=3).fit_transform(q1[['Power']])
    )
    # Residual = predicted wind speed - measured wind speed, flattened so it
    # works whether predict() returned shape (n,) or (n, 1).
    residual = np.ravel(py) - q1['WindSpeed'].to_numpy()

    # Records within 150 of the turbine's maximum Power are always kept.
    max_p = q1.Power.max() - 150
    within_band = (residual > f) & (residual < -f * 1.5)
    near_max_power = (q1.Power > max_p).to_numpy()

    # ds == 1: consistent with the model (keep); ds == 0: outlier.
    q1['ds'] = (within_band | near_max_power).astype(int)
    resx.append(q1)

# sort_index() returns a new frame, so its result must be assigned.
r = pd.concat(resx).sort_index()
df['ds'] = r['ds']  # aligned on the index

# Final label: abnormal if the rule pass already flagged it or the model
# marked it as an outlier. Rows without a 'ds' value keep their rule label.
df['label'] = ((df['label'] == 1) | (df['ds'] == 0)).astype(int)
四、想法
- 之前想用pytorch深度学习模型做自编码,结果维度太少,没法看
- 之前想法很多,结果简单的东西反而效果会好一些,重要的是做好记录不出bug呀