最近在做多变量时间序列异常检测相关的工作,顺带也整理了目前市面上比较常用的五个多变量时间序列异常检测数据集,测试集都带有标注好的 label。这五个数据集应该是该领域最常用的 benchmark 数据集,整理主要来自多篇顶会论文的对比实验。
本文主要介绍五个数据集的具体信息和对应的标准化处理,并给出处理的代码和最终标准化的格式。
wget https://s3-us-west-2.amazonaws.com/telemanom/data.zip && unzip data.zip && rm data.zip
cd data && wget https://raw.githubusercontent.com/khundman/telemanom/master/labeled_anomalies.csv
最终的格式为:
时间统一为 datetime 格式;很多数据集的时间信息是匿名的,此时用 0 到 len(dataset) 的序号代替。label 列名统一用 label,0 表示正常,1 表示异常。
import ast
import csv
import os
import sys
from pickle import dump
import pandas as pd
import numpy as np
# Destination directory for all normalised CSVs produced by the
# SMD/SMAP/MSL conversion functions below.
output_folder = 'processed_csv'
os.makedirs(output_folder, exist_ok=True)
def load_and_save(category, filename, dataset, dataset_folder):
    """Convert one raw SMD-style text matrix into a per-machine CSV.

    Reads ``<dataset_folder>/<category>/<filename>`` as a float32
    comma-separated matrix, names its columns col_0..col_{d-1}, and adds
    a 0-based ``timestamp`` column.  For the test split, the matching
    file under ``test_label`` is left-joined in as a ``label`` column.
    The result is written to
    ``<output_folder>/<stem>/<dataset>_<category>.csv``.
    """
    stem = filename.split('.')[0]
    os.makedirs(os.path.join(output_folder, stem), exist_ok=True)
    raw = np.genfromtxt(os.path.join(dataset_folder, category, filename),
                        dtype=np.float32, delimiter=',')
    columns = ["col_%d" % c for c in range(raw.shape[1])]
    frame = pd.DataFrame(raw, columns=columns).reset_index()
    frame.rename(columns={'index': 'timestamp'}, inplace=True)
    if category == "test":
        raw_labels = np.genfromtxt(
            os.path.join(dataset_folder, "test_label", filename),
            dtype=np.float32, delimiter=',')
        label_frame = pd.DataFrame(raw_labels, columns=["label"]).reset_index()
        label_frame.rename(columns={'index': 'timestamp'}, inplace=True)
        frame = pd.merge(frame, label_frame, how="left", on='timestamp')
    print(dataset, category, filename, raw.shape)
    frame.to_csv(
        os.path.join(output_folder, stem, dataset + "_" + category + ".csv"),
        index=False)
def load_data(dataset):
    """Normalise one anomaly-detection benchmark into the unified CSV layout.

    Parameters
    ----------
    dataset : str
        'SMD' (per-machine .txt matrices with parallel test_label files)
        or 'SMAP' / 'MSL' (NASA telemetry stored as per-channel .npy
        files, with anomaly spans listed in labeled_anomalies.csv).

    Side effects: writes ``<name>_{train,test}.csv`` under
    ``output_folder``; test CSVs carry a 0/1 ``label`` column.
    """
    if dataset == 'SMD':
        dataset_folder = 'ServerMachineDataset'
        file_list = os.listdir(os.path.join(dataset_folder, "train"))
        for filename in file_list:
            if filename.endswith('.txt'):
                # Fix: filename.strip('.txt') strips any of the chars
                # {'.', 't', 'x'} from BOTH ends of the name (e.g.
                # 'text.txt' -> 'e'); splitext removes exactly the
                # extension.
                machine = os.path.splitext(filename)[0]
                load_and_save('train', filename, machine, dataset_folder)
                load_and_save('test', filename, machine, dataset_folder)
    elif dataset == 'SMAP' or dataset == 'MSL':
        dataset_folder = 'data'
        with open(os.path.join(dataset_folder, 'labeled_anomalies.csv'), 'r') as file:
            csv_reader = csv.reader(file, delimiter=',')
            res = [row for row in csv_reader][1:]  # drop the header row
        res = sorted(res, key=lambda k: k[0])
        label_folder = os.path.join(dataset_folder, 'test_label')
        os.makedirs(label_folder, exist_ok=True)
        # Channel P-2 is excluded, following the upstream OmniAnomaly
        # preprocessing this script is adapted from.
        data_info = [row for row in res if row[1] == dataset and row[0] != 'P-2']
        labels = []
        for row in data_info:
            anomalies = ast.literal_eval(row[2])  # e.g. "[[1000, 2000], ...]"
            length = int(row[-1])
            # Fix: np.int was deprecated in NumPy 1.20 and removed in
            # 1.24; the builtin int is the documented replacement.
            label = np.zeros([length], dtype=int)
            for anomaly in anomalies:
                label[anomaly[0]:anomaly[1] + 1] = 1  # spans are inclusive
            labels.extend(label)
        labels = np.asarray(labels)
        print(dataset, 'test_label', labels.shape)
        labels = pd.DataFrame(labels, columns=["label"]).reset_index()
        labels.rename(columns={'index': 'timestamp'}, inplace=True)

        def concatenate_and_save(category):
            # Concatenate every channel's .npy in data_info order so the
            # rows stay aligned with the label vector built above.
            data = []
            for row in data_info:
                filename = row[0]
                print(os.path.join(dataset_folder, category, filename + '.npy'))
                temp = np.load(os.path.join(dataset_folder, category, filename + '.npy'))
                data.extend(temp)
            data = np.asarray(data)
            print(dataset, category, data.shape)
            header_list = ["col_%d" % i for i in range(data.shape[1])]
            frame = pd.DataFrame(data, columns=header_list).reset_index()
            frame.rename(columns={'index': 'timestamp'}, inplace=True)
            if category == "test":
                frame = pd.merge(frame, labels, how="left", on='timestamp')
            print(dataset, category, filename, temp.shape)
            frame.to_csv(
                os.path.join(output_folder, dataset + "_" + category + ".csv"),
                index=False)

        for c in ['train', 'test']:
            concatenate_and_save(c)
if __name__ == '__main__':
    # The three datasets this script can convert; only MSL is actually
    # processed here -- pass 'SMD' or 'SMAP' to load_data to convert
    # the others.
    datasets = ['SMD', 'SMAP', 'MSL']
    load_data('MSL')
改于:https://github.com/NetManAIOps/OmniAnomaly/blob/master/data_preprocess.py
import pandas as pd
# WADI ships two revisions: A1 (9 Oct 2017) and A2 (19 Nov 2019).  The
# A1 files are loaded only to recover a datetime for each A2 row via the
# shared 'Row' key further below.
train_new = pd.read_csv('./WADI.A2_19 Nov 2019/WADI_14days_new.csv')
# skiprows=1: the first line of the LABLE file is skipped -- presumably
# a banner line above the real header; verify against the raw file.
test_new = pd.read_csv('./WADI.A2_19 Nov 2019/WADI_attackdataLABLE.csv', skiprows=1)
test = pd.read_csv('./WADI.A1_9 Oct 2017/WADI_attackdata.csv')
# skiprows=4: four leading lines skipped before the A1 train header.
train = pd.read_csv('./WADI.A1_9 Oct 2017/WADI_14days.csv', skiprows=4)
def recover_date(str1, str2):
    """Join a date string and a time string with a single space."""
    return " ".join([str1, str2])
# Rebuild a proper datetime for each A1 train row from its Date/Time
# columns, then transfer it onto the A2 rows by matching on 'Row'.
train["datetime"] = train.apply(lambda x : recover_date(x['Date'], x['Time']), axis=1)
train["datetime"] = pd.to_datetime(train['datetime'])
train_time = train[['Row', 'datetime']]
train_new_time = pd.merge(train_new, train_time, how='left', on='Row')
# Drop the join key and the raw date/time columns, now redundant.
del train_new_time['Row']
del train_new_time['Date']
del train_new_time['Time']
train_new_time.to_csv('./processing/WADI_train.csv', index=False)
# Same datetime reconstruction for the attack (test) split.
test["datetime"] = test.apply(lambda x : recover_date(x['Date'], x['Time']), axis=1)
test["datetime"] = pd.to_datetime(test['datetime'])
# NOTE(review): with a default RangeIndex, .loc[-2:, :] is a LABEL slice
# starting at -2 and therefore selects every row; presumably some leading
# rows were meant to be dropped -- confirm against the raw files.
test = test.loc[-2:, :]
# The LABLE file's first column header carries a trailing space ('Row ').
test_new = test_new.rename(columns={'Row ':'Row'})
test_time = test[['Row', 'datetime']]
test_new_time = pd.merge(test_new, test_time, how='left', on='Row')
del test_new_time['Row']
del test_new_time['Date ']  # trailing space in this header as well
del test_new_time['Time']
# Map the original encoding (1 = no attack, -1 = attack) onto the
# unified 0 = normal / 1 = anomaly labels.
test_new_time = test_new_time.rename(columns={'Attack LABLE (1:No Attack, -1:Attack)':'label'})
test_new_time.loc[test_new_time['label'] == 1, 'label'] = 0
test_new_time.loc[test_new_time['label'] == -1, 'label'] = 1
test_new_time.to_csv('./processing/WADI_test.csv', index=False)
import numpy as np
import pandas as pd
# SWaT: load both captures; the attack export is semicolon-separated.
normal = pd.read_csv("input/SWaT_Dataset_Normal_v1.csv")
attack = pd.read_csv("input/SWaT_Dataset_Attack_v0.csv",sep=";")
normal['Timestamp'] = pd.to_datetime(normal['Timestamp'])
# The train capture is all-normal, so its label column is simply dropped.
del normal['Normal/Attack']
normal = normal.rename(columns={'Timestamp':'datetime'})
# Park the datetime column aside so the float cast below only sees
# sensor columns.
datetime = normal['datetime']
del normal['datetime']
# Values are round-tripped through str.replace before casting --
# presumably the export uses decimal commas; verify against the file.
for i in list(normal):
normal[i]=normal[i].apply(lambda x: str(x).replace("," , "."))
normal = normal.astype(float)
normal['datetime']= datetime
normal.to_csv('SWaT_train.csv', index=False)
# SWaT attack (test) capture: same cleanup as the train split, plus a
# binary label derived from the 'Normal/Attack' column.
attack['Timestamp'] = pd.to_datetime(attack['Timestamp'])
attack = attack.rename(columns={'Timestamp':'datetime'})
datetime = attack['datetime']
del attack['datetime']
# 0.0 for rows marked 'Normal', 1.0 for anything else (attack rows).
labels = [ float(label!= 'Normal' ) for label in attack["Normal/Attack"].values]
del attack['Normal/Attack']
# Comma-to-dot round-trip before the float cast, as for the train split.
for i in list(attack):
attack[i]=attack[i].apply(lambda x: str(x).replace("," , "."))
attack = attack.astype(float)
attack['datetime'] = datetime
attack['label'] = labels
attack.to_csv('SWaT_test.csv', index=False)