【长时间序列预测】Autoformer 代码详解之[1]数据预处理及数据读取

1. 数据预处理及数据读取

1.1 数据格式

文件格式:CSV

特点:

        第一列:date(标准时间,如 2002-01-01 00:00:00)

        最后一列:OT(用于预测单变量)

        其他列:数据的其他特征

如下表:(需要将第一列和最后一列分别命名为date和OT)

【长时间序列预测】Autoformer 代码详解之[1]数据预处理及数据读取_第1张图片

可用以下脚本转换时间格式:

import pandas as pd
from datetime import datetime

if __name__ == '__main__':
    # Convert the timestamp column of a.csv from 'YYYY/MM/DD HH:MM' to
    # 'YYYY-MM-DD HH:MM:SS' and write the result to b.csv.
    # NOTE(review): the column is addressed as 'data' here, while the loader
    # further below expects a column named 'date' -- confirm the header of the
    # raw CSV and rename the column if necessary.
    df = pd.read_csv("a.csv")
    df['data'] = pd.to_datetime(df['data'], format='%Y/%m/%d %H:%M')

    df['data'] = df['data'].dt.strftime('%Y-%m-%d %H:%M:%S')
    print(df)
    # to_csv() returns None when a path is given, so its result must not be
    # bound back to df (the original then printed None).
    df.to_csv('b.csv', index=False)

1.2 数据读取

入口:exp/exp_main.py:class Exp_Main: _get_data(self, flag) # flag 分别为 'train'、'val'和 'test'。

data_set, data_loader = data_provider(self.args, flag) # data_provider returns both the Dataset and its DataLoader. NOTE(author): data_set looks unused at this call site, so returning only data_loader might suffice -- confirm against the callers in exp_main.py.

def data_provider(args, flag):
    """Build the dataset and its DataLoader for one split.

    Parameters
    ----------
    args
        Run configuration; this function reads ``data``, ``embed``,
        ``batch_size``, ``freq``, ``root_path``, ``data_path``, ``seq_len``,
        ``label_len``, ``pred_len``, ``features``, ``target`` and
        ``num_workers`` from it.
    flag
        Split selector. ``'test'`` and ``'pred'`` get dedicated loader
        settings; any other value is treated as training/validation.

    Returns
    -------
    tuple
        ``(data_set, data_loader)``.
    """
    # Dataset class registered under args.data (e.g. 'custom' -> Dataset_Custom).
    dataset_cls = data_dict[args.data]
    # 1 selects the 'timeF' time-feature encoding, 0 the plain calendar fields.
    timeenc = 1 if args.embed == 'timeF' else 0

    if flag == 'pred':
        # Prediction: one window at a time, keep every sample, fixed order.
        dataset_cls = Dataset_Pred
        shuffle_flag, drop_last, batch_size = False, False, 1
    else:
        # Only the test split keeps its order; train/val are shuffled.
        shuffle_flag = flag != 'test'
        drop_last = True
        batch_size = args.batch_size
    freq = args.freq

    window = [args.seq_len, args.label_len, args.pred_len]
    data_set = dataset_cls(
        root_path=args.root_path,
        data_path=args.data_path,
        flag=flag,
        size=window,
        features=args.features,
        target=args.target,
        timeenc=timeenc,
        freq=freq,
    )
    print(flag, len(data_set))

    # DataLoader here is torch.utils.data.DataLoader.
    data_loader = DataLoader(
        data_set,
        batch_size=batch_size,
        shuffle=shuffle_flag,
        num_workers=args.num_workers,
        drop_last=drop_last,
    )
    return data_set, data_loader

自定义的 Dataset_Custom 类继承自父类 Dataset,并重写(override)两个特殊方法(魔术方法):__len__ 和 __getitem__。

class Dataset_Custom(Dataset):
    """Sliding-window dataset over a CSV file with a 'date' column and a target column.

    The rows are split in file order: the first 70% for training, the last
    20% for testing and the remainder for validation. Each item is a pair of
    windows (model input and label+prediction segment) together with the
    encoded timestamps of both windows.
    """

    def __init__(self, root_path, flag='train', size=None,
                 features='S', data_path='ETTh1.csv',
                 target='OT', scale=True, timeenc=0, freq='h'):
        """Store the configuration and load the requested split.

        Args:
            root_path: Directory that contains the CSV file.
            flag: One of 'train', 'val' or 'test'.
            size: ``[seq_len, label_len, pred_len]``; when None the defaults
                ``[384, 96, 96]`` are used.
            features: 'M' or 'MS' keeps every non-date column, 'S' keeps only
                the target column.
            data_path: CSV file name relative to ``root_path``.
            target: Name of the target column.
            scale: Whether to standardise the values with the scaler.
            timeenc: 0 encodes timestamps as month/day/weekday/hour,
                1 delegates to ``time_features`` with ``freq``.
            freq: Frequency string forwarded to ``time_features``.
        """
        # size [seq_len, label_len, pred_len]
        if size is None:
            self.seq_len = 24 * 4 * 4
            self.label_len = 24 * 4
            self.pred_len = 24 * 4
        else:
            self.seq_len = size[0]
            self.label_len = size[1]
            self.pred_len = size[2]
        # Map the split name to the index used for the border tables below.
        assert flag in ['train', 'test', 'val']
        type_map = {'train': 0, 'val': 1, 'test': 2}
        self.set_type = type_map[flag]

        self.features = features
        self.target = target
        self.scale = scale
        self.timeenc = timeenc
        self.freq = freq

        self.root_path = root_path
        self.data_path = data_path
        self.__read_data__()

    def __read_data__(self):
        """Read the CSV, select and scale the columns, and slice out this split.

        Sets ``self.data_x``, ``self.data_y`` and ``self.data_stamp``.

        Raises:
            ValueError: If ``self.features`` or ``self.timeenc`` holds an
                unsupported value.
        """
        # StandardScaler is provided by the surrounding module.
        self.scaler = StandardScaler()
        df_raw = pd.read_csv(os.path.join(self.root_path,
                                          self.data_path))

        # Reorder the columns to ['date', ...other features..., target] so
        # the layout no longer depends on the column order inside the file.
        cols = list(df_raw.columns)
        cols.remove(self.target)
        cols.remove('date')
        df_raw = df_raw[['date'] + cols + [self.target]]

        # Example with 100 rows: 70 train, 20 test, 10 validation.
        num_train = int(len(df_raw) * 0.7)
        num_test = int(len(df_raw) * 0.2)
        num_vali = len(df_raw) - num_train - num_test
        # The val and test ranges start seq_len rows before their nominal
        # boundary, so the input part of their first windows overlaps the
        # preceding split. With seq_len=5: starts [0, 65, 75], ends [70, 80, 100].
        border1s = [0, num_train - self.seq_len, len(df_raw) - num_test - self.seq_len]
        border2s = [num_train, num_train + num_vali, len(df_raw)]
        border1 = border1s[self.set_type]
        border2 = border2s[self.set_type]

        if self.features == 'M' or self.features == 'MS':
            # Multivariate: every column except the leading date column.
            cols_data = df_raw.columns[1:]
            df_data = df_raw[cols_data]
        elif self.features == 'S':
            # Univariate: the target column only.
            df_data = df_raw[[self.target]]
        else:
            raise ValueError(
                f"Unsupported features value {self.features!r}; "
                "expected 'M', 'MS' or 'S'")

        if self.scale:
            # Fit on the training rows only, then transform the whole table.
            train_data = df_data[border1s[0]:border2s[0]]
            self.scaler.fit(train_data.values)
            data = self.scaler.transform(df_data.values)
        else:
            data = df_data.values

        # Timestamps of the current split; copy so that adding columns below
        # does not write into a slice of df_raw.
        df_stamp = df_raw[['date']][border1:border2].copy()
        df_stamp['date'] = pd.to_datetime(df_stamp.date)
        if self.timeenc == 0:
            # Plain calendar fields; weekday() is 0 for Monday.
            df_stamp['month'] = df_stamp['date'].apply(lambda row: row.month)
            df_stamp['day'] = df_stamp['date'].apply(lambda row: row.day)
            df_stamp['weekday'] = df_stamp['date'].apply(lambda row: row.weekday())
            df_stamp['hour'] = df_stamp['date'].apply(lambda row: row.hour)
            # axis must be given by keyword: pandas 2.x rejects drop(labels, 1).
            data_stamp = df_stamp.drop(columns=['date']).values
        elif self.timeenc == 1:
            # Frequency-dependent features computed by time_features; the
            # result is transposed so that rows correspond to timestamps.
            data_stamp = time_features(pd.to_datetime(df_stamp['date'].values), freq=self.freq)
            data_stamp = data_stamp.transpose(1, 0)
        else:
            raise ValueError(
                f"Unsupported timeenc value {self.timeenc!r}; expected 0 or 1")

        # `data` still covers the whole file, so the slice is what restricts
        # x and y to the current split.
        self.data_x = data[border1:border2]
        self.data_y = data[border1:border2]
        self.data_stamp = data_stamp

    def __getitem__(self, index):
        """Return ``(seq_x, seq_y, seq_x_mark, seq_y_mark)`` for window ``index``.

        ``seq_x`` spans ``seq_len`` rows starting at ``index``; ``seq_y``
        starts ``label_len`` rows before the end of ``seq_x`` and spans
        ``label_len + pred_len`` rows. The ``*_mark`` values are the matching
        slices of the encoded timestamps.
        """
        s_begin = index
        s_end = s_begin + self.seq_len
        r_begin = s_end - self.label_len
        r_end = r_begin + self.label_len + self.pred_len

        seq_x = self.data_x[s_begin:s_end]
        seq_y = self.data_y[r_begin:r_end]
        seq_x_mark = self.data_stamp[s_begin:s_end]
        seq_y_mark = self.data_stamp[r_begin:r_end]

        return seq_x, seq_y, seq_x_mark, seq_y_mark

    def __len__(self):
        """Return the number of start positions that leave room for a full
        ``seq_len + pred_len`` window inside the split."""
        return len(self.data_x) - self.seq_len - self.pred_len + 1

    def inverse_transform(self, data):
        """Map scaled values back to the original scale using the stored scaler."""
        return self.scaler.inverse_transform(data)
def time_features_from_frequency_str(freq_str: str) -> List[TimeFeature]:
    """
    Pick the time-feature extractors that suit a given frequency string.

    Parameters
    ----------
    freq_str
        Frequency string of the form [multiple][granularity] such as
        "12H", "5min", "1D" etc.

    Returns
    -------
    A list with one freshly constructed instance of every feature class
    registered for the offset type that ``freq_str`` parses to.

    Raises
    ------
    RuntimeError
        If the parsed offset matches none of the registered offset types.
    """
    # Each finer granularity extends the feature list of the coarser one.
    daily = [DayOfWeek, DayOfMonth, DayOfYear]
    hourly = [HourOfDay] + daily          # 'h' -> 4 features
    minutely = [MinuteOfHour] + hourly    # 't' -> 5 features
    secondly = [SecondOfMinute] + minutely

    offset_to_features = {
        offsets.YearEnd: [],
        offsets.QuarterEnd: [MonthOfYear],
        offsets.MonthEnd: [MonthOfYear],
        offsets.Week: [DayOfMonth, WeekOfYear],
        offsets.Day: daily,
        offsets.BusinessDay: list(daily),
        offsets.Hour: hourly,
        offsets.Minute: minutely,
        offsets.Second: secondly,
    }

    # NOTE(review): a multiple such as '15min' should parse to a Minute
    # offset and therefore match the Minute entry -- confirm against
    # pandas' to_offset.
    parsed = to_offset(freq_str)

    # The first registered offset type the parsed offset is an instance of wins.
    for offset_cls in offset_to_features:
        if isinstance(parsed, offset_cls):
            return [feature() for feature in offset_to_features[offset_cls]]

    # Nothing matched: report the frequencies listed as supported.
    unsupported_msg = f"""
    Unsupported frequency {freq_str}
    The following frequencies are supported:
        Y   - yearly
            alias: A
        M   - monthly
        W   - weekly
        D   - daily
        B   - business days
        H   - hourly
        T   - minutely
            alias: min
        S   - secondly
    """
    raise RuntimeError(unsupported_msg)

因此,Autoformer 把时间戳根据传入的freq来解析,比如freq='h',那一个时间就解析为4个值。

for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(train_loader):

那么输入的数据shape为:

# freq = 'h'
batch_x: batch_size, seq_len, d_in
batch_x_mark: batch_size, seq_len, 4
batch_y: batch_size, label_len+pred_len, d_in
batch_y_mark: batch_size, label_len+pred_len, 4

为什么要编码时间戳作为输入?

        因为在真实世界的场景中,时间戳通常可以获得,并且信息丰富,但 Transformer 模型中很少使用。Informer 模型首先将时间戳编码为位置编码,然后使用一个嵌入层进行编码;后面的 Autoformer 模型和 FEDformer 模型采用了相同的方案。

参考:

https://github.com/thuml/Autoformer

你可能感兴趣的:(时间序列预测以及异常检测,时间序列预测)