工业数据预处理过程(带时间标签):
带时间标签的数据统一时间戳
由于不同来源的工业数据采样间隔和采样时间点可能不同,因此需要按统一的时间点对齐数据。
数据插值(时间对齐):以时间为横坐标做线性插值,在固定的时间范围内将数据规整为每5min一个数据点。
from scipy import interpolate
import pandas as pd
import numpy as np
def interpolation():
    """Resample one indoor-temperature series onto a fixed 5-minute grid.

    Reads ``self.indoor_temp_filter_dic[key]``, taking column 0 as the
    timestamps and column 2 as the values, linearly interpolates the values
    onto a grid running from 2018-12-22 00:00:00 to 2019-03-18 00:00:00 in
    5-minute steps, and stores the result in
    ``self.indoor_temp_interpolation_dic[key]`` as a two-column array of
    (Timestamp, interpolated value).

    NOTE(review): ``self`` and ``key`` are not parameters of this function;
    they have to be supplied by the enclosing scope -- confirm against the
    caller.

    Raises:
        ValueError: ``interp1d`` is used with its default bounds handling,
            which (per the SciPy documentation) rejects grid points lying
            outside the range of the source timestamps.
    """
    series = self.indoor_temp_filter_dic[key]
    y = series[:, 2]
    # Convert the dates to seconds so they can serve as a numeric x axis.
    x1 = [pd.Timestamp(stamp).timestamp() for stamp in series[:, 0]]

    # Fixed time range 2018-12-22 00:00:00 - 2019-03-18 00:00:00, sampled
    # every 5 minutes, all expressed in seconds.
    start1 = pd.Timestamp('2018-12-22 00:00:00').timestamp()
    end1 = pd.Timestamp('2019-03-18 00:00:00').timestamp()
    # total_seconds() is the full duration; .seconds is only the seconds
    # component of the Timedelta.
    interval1 = pd.Timedelta(minutes=5).total_seconds()
    # np.linspace needs an integer sample count; the plain division yields a
    # float, which current NumPy versions refuse.
    steps = int(round((end1 - start1) / interval1))
    kk = np.linspace(start1, end1, steps + 1)

    f = interpolate.interp1d(x1, y, kind='linear')
    ynew = f(kk)

    # Convert the grid from seconds back to Timestamp objects for storage.
    re_date = [pd.Timestamp(k, unit='s') for k in kk]
    self.indoor_temp_interpolation_dic[key] = np.column_stack((re_date, ynew))
相关性分析(热图)
def Heatmap(dataset):
    """Plot and save a heat map of the correlation matrix of *dataset*.

    Args:
        dataset: Array-like, converted with ``np.array``. Its columns are
            treated as the variables (``rowvar=0``). Because the axis labels
            come from the fixed ``key_list``, it must have exactly one column
            per entry of that list (8 columns).

    Side effects:
        Writes the figure to ``Random.png`` in the current working directory
        and then displays it with ``plt.show()``.
    """
    key_list = ['indoor_temp','water_supply_temp','outdoor_temp','wind_pow','wind_air','light_time','humidity','air_pre']
    dataset = np.array(dataset)
    cor_matri = np.corrcoef(dataset.tolist(), rowvar=0)

    # Create the figure first and draw onto its axes, so that the heat map,
    # the title and the axis labels all end up on the same figure.
    fig, ax = plt.subplots(figsize=(8, 8))
    sns.heatmap(pd.DataFrame(cor_matri, columns=key_list, index=key_list), annot=True, vmax=1, vmin=0,
                xticklabels=True,
                yticklabels=True, square=True, cmap="YlGnBu", ax=ax)
    ax.set_title(' Heat Map ', fontsize=18)
    ax.set_ylabel('Y', fontsize=18)
    ax.set_xlabel('X', fontsize=18)

    # Save through the figure object and before showing: once the window
    # opened by plt.show() is closed, the current figure may no longer hold
    # the plot.
    fig.savefig('Random.png')
    plt.show()
皮尔逊相关系数衡量两个序列相关性
from scipy.stats import pearsonr
# Pearson correlation coefficient of the two sequences: element 0 of the
# pearsonr result (the remaining element is discarded).
# NOTE(review): x and y are not defined in this snippet -- they must be
# supplied by the surrounding code.
p = pearsonr(x, y)[0]
滤波+坏点剔除
滑动均值滤波:
def Filter(inputs, per):
    """Smooth a one-dimensional sequence with a centred moving average.

    Each point that has ``int(per / 2)`` neighbours on both sides is replaced
    by the mean of the window centred on it; points too close to either end
    are passed through unchanged.

    Args:
        inputs: One-dimensional sequence of numbers supporting ``len``,
            iteration and slicing.
        per: Number of adjacent points to average. The window is symmetric
            (``int(per / 2)`` points on each side plus the centre), so an
            even ``per`` effectively averages ``per + 1`` points.

    Returns:
        A list with one single-element list per input point, holding the
        smoothed (or passed-through) value.

    Raises:
        ValueError: If ``per`` is negative.
    """
    if per < 0:
        raise ValueError("per must be non-negative")

    half = int(per / 2)
    last_full = len(inputs) - half - 1  # last index with a complete window
    smoothed = []
    for i, element in enumerate(inputs):
        if half <= i <= last_full:
            window = inputs[i - half:i + half + 1]
            # Divide by the real window length: for an even ``per`` the
            # window holds per + 1 points, so dividing by ``per`` would
            # overstate the mean.
            smoothed.append([sum(window) / len(window)])
        else:
            smoothed.append([element])
    return smoothed
坏点剔除常用方法:
数据集随机抽取4/5做训练集,1/5做测试集
# Randomly split dataSet by rows: the first `proportion` share of a shuffled
# index array selects the training rows, the remainder the test rows.
proportion = 4/5
dataNumber = len(dataSet)  # total number of samples
index = int(dataNumber * proportion)  # number of training samples

indexes = np.array(range(dataNumber))
random.shuffle(indexes)  # shuffles the index array in place
train_indexes, test_indexes = indexes[:index], indexes[index:]

trainDataSet = np.array(dataSet)[train_indexes, :]  # training data set
testDataSet = np.array(dataSet)[test_indexes, :]  # test data set