机器学习时间特征处理方法汇总
- 时间特征基本处理
- 基本处理基础上二次处理
- 差分、滞后、滑窗、指数加权
- 参考文献:https://zhuanlan.zhihu.com/p/466773545
时间特征基本处理
# Basic decomposition of a datetime column into calendar/clock features.
class HandleDateFeature:
    """Derive basic calendar and clock features from a datetime column.

    Every method adds new columns to ``self.df`` in place and returns that
    same frame. New column names are ``col`` followed by a fixed suffix.
    """

    def __init__(self, df, col):
        """Store the frame and the name of its datetime column.

        Args:
            df: DataFrame holding ``col``; it is modified in place.
            col: Name of a column that supports the pandas ``.dt`` accessor.
        """
        self.df = df
        self.col = col

    def datetime_transfrom(self):
        """Add year/quarter/month/week/day/hour/minute/second columns.

        The (misspelled) method name is kept so existing callers keep working.

        Added suffixes: ``_year``, ``_quarter``, ``_month``, ``_week``
        (ISO week number), ``_day`` (day of year), ``_month_day`` (day of
        month), ``_dayofweek``, ``_hour``, ``_minute``, ``_second``.

        Returns:
            ``self.df`` with the new columns attached.
        """
        frame, col = self.df, self.col
        stamps = frame[col].dt

        frame[col + '_year'] = stamps.year
        frame[col + '_quarter'] = stamps.quarter
        frame[col + '_month'] = stamps.month
        # ``dt.weekofyear`` was removed in pandas 2.0; ``isocalendar().week``
        # is its replacement (result uses the nullable UInt32 dtype).
        frame[col + '_week'] = stamps.isocalendar().week
        frame[col + '_day'] = stamps.dayofyear
        frame[col + '_month_day'] = stamps.day
        frame[col + '_dayofweek'] = stamps.dayofweek
        frame[col + '_hour'] = stamps.hour
        frame[col + '_minute'] = stamps.minute
        frame[col + '_second'] = stamps.second
        return frame

    def date_isbegin(self):
        """Add boolean flags for the start/end of year, quarter and month.

        Added suffixes: ``_year_start``, ``_year_end``, ``_quarter_start``,
        ``_quarter_end``, ``_month_start``, ``_month_end``.

        Returns:
            ``self.df`` with the new columns attached.
        """
        frame, col = self.df, self.col
        stamps = frame[col].dt

        frame[col + '_year_start'] = stamps.is_year_start
        frame[col + '_year_end'] = stamps.is_year_end
        frame[col + '_quarter_start'] = stamps.is_quarter_start
        frame[col + '_quarter_end'] = stamps.is_quarter_end
        frame[col + '_month_start'] = stamps.is_month_start
        frame[col + '_month_end'] = stamps.is_month_end
        return frame
基本处理基础上二次处理
class TwoHandleDateFeature:
    """Second-stage features built on already-extracted date parts.

    The methods read columns named ``col + '_month'``, ``col + '_hour'`` and
    so on (the names produced by a basic date decomposition step), add new
    columns to ``self.df`` in place, and return that same frame.
    """

    # (source-column suffix, cycle length) in output column order.
    # The lengths for day-of-month (30), week (52) and day-of-year (365) are
    # fixed approximations of the real cycle length.
    _CYCLES = (
        ('_month', 12),
        ('_month_day', 30),
        ('_week', 52),
        ('_day', 365),
        ('_dayofweek', 7),
        ('_hour', 24),
        ('_minute', 60),
    )

    # (output-column suffix, hours flagged with 1) in output column order.
    _DAY_PARTS = (
        # NOTE(review): 16 is absent from the working hours -- confirm intent.
        ('_work_hours', (9, 10, 11, 14, 15, 17)),
        ('_early_bird__hours', (8, 18)),
        ('_blackleg__hours', (7, 19, 20, 21)),
        ('_dawn_hours', (4, 5)),
        ('_early_morning_hours', (6, 7)),
        ('_later_morning_hours', (8, 9, 10)),
        ('_noon_hours', (11, 12, 13)),
        ('_afternoon_hours', (14, 15, 16)),
        ('_evening_hours', (17, 18, 19)),
        ('_night_hours', (20, 21, 22)),
        # Hour 0 belongs to the midnight bucket; 24 is kept only so inputs
        # that encode midnight as 24 are still flagged as before.
        ('_midnight_hours', (23, 24, 0, 1, 2, 3)),
    )

    def __init__(self, df, col):
        """Store the frame and the base name of the date-part columns.

        Args:
            df: DataFrame holding the ``col + '_<part>'`` columns; it is
                modified in place.
            col: Base column name used as prefix for inputs and outputs.
        """
        self.df = df
        self.col = col

    def cycle_fea(self):
        """Add sine/cosine encodings of the periodic parts and a weekend flag.

        For each part ``p`` in month, month_day, week, day, dayofweek, hour
        and minute, writes ``col + '_sin_' + p`` and ``col + '_cos_' + p``
        computed from ``2 * pi / period * value``. ``col + '_weekend'`` is 1
        where ``col + '_dayofweek'`` is 5 or 6, else 0.

        Returns:
            ``self.df`` with the new columns attached.
        """
        frame, col = self.df, self.col
        for suffix, period in self._CYCLES:
            # One vectorised pass per part instead of a Python call per row.
            angle = frame[col + suffix] * (np.pi * 2 / period)
            frame[col + '_sin' + suffix] = np.sin(angle)
            frame[col + '_cos' + suffix] = np.cos(angle)
        frame[col + '_weekend'] = np.where(
            frame[col + '_dayofweek'].isin([5, 6]), 1, 0)
        return frame

    def part_day(self):
        """Add 0/1 indicator columns for parts of the day.

        Each indicator is 1 when ``col + '_hour'`` falls in the hour set
        listed for it in ``_DAY_PARTS`` (working hours, early/late overtime
        buckets, dawn, early morning, late morning, noon, afternoon,
        evening, night, midnight), else 0. The buckets may overlap.

        Returns:
            ``self.df`` with the new columns attached.
        """
        frame, col = self.df, self.col
        hours = frame[col + '_hour']
        for suffix, members in self._DAY_PARTS:
            frame[col + suffix] = np.where(hours.isin(list(members)), 1, 0)
        return frame
差分、滞后、滑窗、指数加权
class DateShiftRollingEwm:
    """Per-group difference, lag, rolling-window and EWM features.

    Every method groups ``self.df`` by ``self.group``, derives new columns
    from ``self.col``, writes them to ``self.df`` in place and returns that
    same frame. Rows are processed in their existing order within each
    group, so the frame should already be sorted by time.
    """

    def __init__(self, df, col, group, windows=None, alpha=None, shift=None):
        """Store the frame and the feature parameters.

        Args:
            df: DataFrame to enrich; it is modified in place.
            col: Name of the value column the features are derived from.
            group: List of column names to group by.
            windows: Rolling window size used by ``rollings``; also used
                (as a label only) in the column names written by ``ewms``.
            alpha: Smoothing factor passed to ``ewm`` by ``ewms``.
            shift: Number of periods used by ``shifts``.
        """
        self.df = df
        self.col = col
        self.shift = shift
        self.windows = windows
        self.alpha = alpha
        # group is a list of column names
        self.group = group

    def diffs(self):
        """Add first- and second-order differences within each group.

        Writes ``col + '_diff_1'`` and ``col + '_diff_2'`` (the difference
        of ``_diff_1``).

        Returns:
            ``self.df`` with the new columns attached.
        """
        frame, col = self.df, self.col
        first = col + '_diff_1'
        frame[first] = frame.groupby(self.group)[col].diff(periods=1)
        frame[col + '_diff_2'] = frame.groupby(self.group)[first].diff(periods=1)
        return frame

    def shifts(self):
        """Add ``col`` shifted by ``self.shift`` periods within each group.

        Writes ``col + '_shift_<shift>'``. Gaps are forward-filled inside
        each group only, so a value is never carried from one group into
        the next.

        Returns:
            ``self.df`` with the new column attached.

        Raises:
            ValueError: If ``shift`` was not provided.
        """
        if self.shift is None:
            raise ValueError("shift must be set to compute shift features")
        frame, col, periods = self.df, self.col, self.shift
        frame[col + '_shift_' + str(periods)] = frame.groupby(self.group)[col].transform(
            lambda values: values.shift(periods).ffill())
        return frame

    def rollings(self):
        """Add rolling mean/max/min of the previous rows within each group.

        Values are shifted by one row before rolling, so the current row is
        excluded from its own window. The mean uses a triangular window
        (``win_type="triang"``, which pandas delegates to scipy); max and
        min use a plain window. At least ``min(3, windows)`` observations
        are required, otherwise the result is NaN.

        Writes ``col + '_rolling_<windows>_mean'``, ``..._max`` and
        ``..._min``. Other statistics (std, skew, kurt, quantile) can be
        added with the same pattern.

        Returns:
            ``self.df`` with the new columns attached.

        Raises:
            ValueError: If ``windows`` was not provided.
        """
        if self.windows is None:
            raise ValueError("windows must be set to compute rolling features")
        frame, col, window = self.df, self.col, self.windows
        # A fixed min_periods of 3 is rejected by pandas when window < 3.
        min_obs = min(3, window)
        prefix = col + '_rolling_' + str(window)
        grouped = frame.groupby(self.group)[col]

        def lagged_window(values, **options):
            return values.shift(1).rolling(
                window=window, min_periods=min_obs, **options)

        frame[prefix + '_mean'] = grouped.transform(
            lambda values: lagged_window(values, win_type="triang").mean())
        frame[prefix + '_max'] = grouped.transform(
            lambda values: lagged_window(values).max())
        frame[prefix + '_min'] = grouped.transform(
            lambda values: lagged_window(values).min())
        return frame

    def ewms(self):
        """Add exponentially weighted statistics of the previous rows.

        Values are shifted by one row within each group and then passed to
        ``ewm(alpha=self.alpha)``. Only one of pandas' decay parameters
        (com, span, halflife, alpha) may be given to ``ewm``; this class
        uses ``alpha`` (0 < alpha <= 1).

        Writes ``col + '_ewm_<windows>_mean'``, ``..._std`` and ``..._corr``.
        The column names embed ``windows`` (not ``alpha``), as before.
        ``corr`` is called without another series.

        Returns:
            ``self.df`` with the new columns attached.

        Raises:
            ValueError: If ``alpha`` was not provided.
        """
        if self.alpha is None:
            raise ValueError("alpha must be set to compute ewm features")
        frame, col, alpha = self.df, self.col, self.alpha
        prefix = col + '_ewm_' + str(self.windows)
        grouped = frame.groupby(self.group)[col]

        frame[prefix + '_mean'] = grouped.transform(
            lambda values: values.shift(1).ewm(alpha=alpha).mean())
        frame[prefix + '_std'] = grouped.transform(
            lambda values: values.shift(1).ewm(alpha=alpha).std())
        frame[prefix + '_corr'] = grouped.transform(
            lambda values: values.shift(1).ewm(alpha=alpha).corr())
        return frame
参考文献:https://zhuanlan.zhihu.com/p/466773545