本次参加2020CCF大数据与计算智能大赛,在Serverless工作负载预测赛道侥幸进入决赛,在这里首先感谢我的队友在比赛中的不懈努力,然后和大家分享一下我个人的模型(进入决赛的得分为模型融合的结果),本文给出的方案A榜线上成绩为0.344,简易改进后得分为0.346和0.348,根据我们队伍A/B榜波动看来,模型相对稳定,B榜成绩预估在top20左右
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
# Load the raw training data and keep only rows whose categorical values
# also occur in the test set (per the original note, the test data does not
# contain the other discrete values of these columns).
train = pd.read_csv('../data/train.csv')
keep_mask = (
    (train.STATUS == 'available')
    & (train.PLATFORM == 'x86_64')
    & (train.RESOURCE_TYPE == 'vm')
    & (train.QUEUE_TYPE != 'spark')
)
train = train[keep_mask].reset_index(drop=True)
# After the filtering these three columns are constant, so drop them.
train = train.drop(columns=['STATUS', 'PLATFORM', 'RESOURCE_TYPE'])
# Parse timestamps, order chronologically, and for duplicate
# (timestamp, queue) pairs keep only the latest record.
train.DOTTING_TIME = train.DOTTING_TIME.astype('datetime64[ms]')
train = train.sort_values(by='DOTTING_TIME')
train.drop_duplicates(subset=['DOTTING_TIME', 'QUEUE_ID'], keep='last', inplace=True)
print(train.shape)
# Regroup the training rows: within each QUEUE_ID, whenever the gap to the
# previous record exceeds 30 minutes, start a new sub-group.  (The task asks
# for predictions 5-25 minutes ahead; 30 is a loose cut-off chosen by the
# author.)  Features and targets are later built inside these gap-free
# groups, so diffs/shifts never span a large time hole.  Only adjacent rows
# are compared — the data was already sorted by time above.
train_new = []
train['qid_group'] = ''
# Time gap to the previous record of the same queue (NaT on each queue's
# first row; the NaT > timedelta comparison below is False, as intended).
train['diff_time'] = train.groupby('QUEUE_ID').DOTTING_TIME.diff(1)
gap_limit = pd.to_timedelta(30, 'min')
for qid in tqdm(train.QUEUE_ID.unique()):
    k = 0
    for _, row in train[train.QUEUE_ID == qid].iterrows():
        # A large gap opens a new sub-group.  The original code duplicated
        # the label assignment in both branches; only the increment differs.
        if row.diff_time > gap_limit:
            k += 1
        row['qid_group'] = '{}_{}'.format(row.QUEUE_ID, k)
        train_new.append(row)
train_new = pd.DataFrame(np.array(train_new), columns=train.columns)
train_new.to_csv('../data/train_new.csv', index=False)
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
df_train = pd.read_csv('../data/train_new.csv')
df_test = pd.read_csv('../data/evaluation_public.csv')
# The constant columns were already removed from the regenerated train set.
df_test = df_test.drop(columns=['STATUS', 'PLATFORM', 'RESOURCE_TYPE'])
# Encode the categorical queue type: 'sql' -> 0, 'general' -> 1.
for frame in (df_train, df_test):
    frame.loc[frame.QUEUE_TYPE == 'sql', 'QUEUE_TYPE'] = 0
    frame.loc[frame.QUEUE_TYPE == 'general', 'QUEUE_TYPE'] = 1
    frame.QUEUE_TYPE = frame.QUEUE_TYPE.astype(int)
# Keep only groups large enough to build both the features and the targets:
# 4 preceding rows for diff features + 5 future rows for labels + the row
# itself means at least 10 records per group.
group_sizes = df_train.groupby('qid_group').qid_group.agg('count')
big_groups = group_sizes[group_sizes > 9].index
df_train = df_train[df_train.qid_group.isin(big_groups)]
def _to_sorted_time(frame):
    """Cast DOTTING_TIME to millisecond datetimes and sort chronologically."""
    frame.DOTTING_TIME = frame.DOTTING_TIME.astype('datetime64[ms]')
    frame = frame.sort_values(by='DOTTING_TIME')
    # Already datetime64 after the cast; kept as a no-op safety net,
    # matching the original code.
    frame.DOTTING_TIME = pd.to_datetime(frame.DOTTING_TIME)
    return frame

df_train = _to_sorted_time(df_train)
df_test = _to_sorted_time(df_test)
# Fractional hour of day.  The test set anonymises year/month/day, so
# hour-of-day is the only usable calendar granularity.
df_train['hour'] = df_train.DOTTING_TIME.dt.hour + df_train.DOTTING_TIME.dt.minute / 60
df_test['hour'] = df_test.DOTTING_TIME.dt.hour + df_test.DOTTING_TIME.dt.minute / 60
for frame in (df_train, df_test):
    # Jobs launched but not yet running.
    frame['to_run_jobs'] = frame['LAUNCHING_JOB_NUMS'] - frame['RUNNING_JOB_NUMS']
    # Job counts normalised by the queue capacity (CU).
    frame['Ljobs_cu_ratio'] = frame['LAUNCHING_JOB_NUMS'] / frame['CU']
    frame['Rjobs_cu_ratio'] = frame['RUNNING_JOB_NUMS'] / frame['CU']
    # Ratio of CPU usage to memory usage (may be inf/NaN when MEM_USAGE is 0).
    frame['cpu_mem_ratio'] = frame.CPU_USAGE / frame.MEM_USAGE
    # Jobs that terminated without succeeding: cancelled + failed.
    frame['cancel_job'] = frame.CANCELLED_JOB_NUMS + frame.FAILED_JOB_NUMS
# Per-queue aggregate statistics of the core load signals.  Statistics are
# computed on the training data and merged onto the test set by QUEUE_ID.
# NOTE(review): pd.merge defaults to an inner join, so a test row whose
# QUEUE_ID never appears in training would be silently dropped — the
# original author relies on every test qid being present in the train set.
# (The original code repeated the transform+merge pair six times per
# statistic; collapsed into a table-driven loop, same column order.)
_STAT_COLUMNS = [
    ('cpu', 'CPU_USAGE'),
    ('mem', 'MEM_USAGE'),
    ('ljob', 'LAUNCHING_JOB_NUMS'),
    ('sjob', 'SUCCEED_JOB_NUMS'),
    ('rjob', 'RUNNING_JOB_NUMS'),
    ('cjob', 'cancel_job'),
]
for stat in ['mean', 'std', 'median', 'max', 'min']:
    for prefix, column in _STAT_COLUMNS:
        feature = '{}_{}'.format(prefix, stat)
        df_train[feature] = df_train.groupby('QUEUE_ID')[column].transform(stat)
        df_test = pd.merge(
            df_test,
            df_train[['QUEUE_ID', feature]].drop_duplicates(),
            on='QUEUE_ID',
        )
# Difference (lag) features over the previous 1..4 records.  Test rows are
# grouped by sample ID; training rows by the gap-free qid_group so that a
# difference never spans a large time gap.  (The original repeated the same
# line for each of five columns; collapsed into a table-driven loop with
# identical column creation order.)
_DIFF_COLUMNS = [
    ('cpu', 'CPU_USAGE'),
    ('mem', 'MEM_USAGE'),
    ('rjob', 'RUNNING_JOB_NUMS'),
    ('sjob', 'SUCCEED_JOB_NUMS'),
    ('ljob', 'LAUNCHING_JOB_NUMS'),
]
for lag in range(1, 5):
    for prefix, column in _DIFF_COLUMNS:
        feature = '{}_diff_{}'.format(prefix, lag)
        df_test[feature] = df_test.groupby('ID')[column].diff(lag)
        df_train[feature] = df_train.groupby('qid_group')[column].diff(lag)
def make_label(data):
    """Append the next five observations as prediction targets.

    For every row, ``CPU_USAGE_k`` and ``LAUNCHING_JOB_NUMS_k`` (k = 1..5)
    hold the values observed k records later within the same (already
    time-sorted) group.  Rows lacking five future records — and any row
    containing other NaNs — are dropped by the final ``dropna``.

    The two loops are kept separate (all CPU columns first, then all job
    columns) to preserve the original column order exactly.
    """
    for k in range(1, 6):
        data['CPU_USAGE_{}'.format(k)] = data.CPU_USAGE.shift(-k)
    for k in range(1, 6):
        data['LAUNCHING_JOB_NUMS_{}'.format(k)] = data.LAUNCHING_JOB_NUMS.shift(-k)
    return data.dropna()
# Build the five-step-ahead targets inside every gap-free group.
df_train = df_train.groupby('qid_group').apply(make_label).reset_index(drop=True)
df_test.drop(['DOTTING_TIME'], axis=1, inplace=True)
# Test rows lacking 4 preceding records carry NaN diff features; drop them.
df_test = df_test.dropna()
# Columns removed before modelling (QUEUE_ID served only to build the
# per-queue statistics above).
del_features = ['SUCCEED_JOB_NUMS', 'CANCELLED_JOB_NUMS', 'FAILED_JOB_NUMS',
                'DISK_USAGE', 'QUEUE_ID']
df_train.drop(del_features, axis=1, inplace=True)
df_test.drop(del_features, axis=1, inplace=True)
# Remember the submission IDs before stripping bookkeeping columns.
ids = df_test.ID
df_train.drop(['diff_time', 'DOTTING_TIME', 'qid_group'], axis=1, inplace=True)
df_test.drop(['ID'], axis=1, inplace=True)
targets_names = ['CPU_USAGE_{}'.format(k) for k in range(1, 6)]
# Pop each target series out of the training frame (data/target separation).
targets = [df_train.pop(name) for name in targets_names]
job_col = ['LAUNCHING_JOB_NUMS_{}'.format(k) for k in range(1, 6)]
# Last observed launching-job count per test row; reused later to fill the
# submission's job columns.
df_job = df_test['LAUNCHING_JOB_NUMS']
df_train.drop(job_col, axis=1, inplace=True)
# 10-fold cross-validation with default LightGBM parameters; per-fold test
# predictions are averaged into the submission.
from sklearn.model_selection import KFold
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
df = pd.DataFrame()
df['ID'] = ids
# Habit: set a huge round budget and rely on early stopping instead.
model = LGBMRegressor(n_estimators=100000, metric='mse')
kf = KFold(n_splits=10, random_state=10100, shuffle=True)
for target in targets:
    df[target.name] = 0
    # BUG FIX: the accumulator used to be re-initialised inside the fold
    # loop, so the printed "total" loss was only the last fold's MSE.
    # It now sums the validation MSE over all 10 folds.  Unused locals
    # (DIST, n, job_all_loss) and a duplicated predict() call were removed.
    cpu_all_loss = 0
    for train_idx, valid_idx in kf.split(df_train, target):
        train_x = df_train.iloc[train_idx]
        valid_x = df_train.iloc[valid_idx]
        train_y = target.iloc[train_idx]
        valid_y = target.iloc[valid_idx]
        model.fit(train_x, train_y, eval_set=(valid_x, valid_y),
                  early_stopping_rounds=100, verbose=False)
        pre = model.predict(valid_x, num_iteration=model.best_iteration_)
        cpu_all_loss += mean_squared_error(pre, valid_y)
        cpu = model.predict(df_test, num_iteration=model.best_iteration_)
        # CPU usage is a percentage: clip predictions into [0, 100].
        cpu = np.clip(cpu, 0, 100)
        df[target.name] += cpu / 10
    print(target.name, 'cpu_all_loss:', cpu_all_loss)
job_col = ['LAUNCHING_JOB_NUMS_1', 'LAUNCHING_JOB_NUMS_2', 'LAUNCHING_JOB_NUMS_3',
           'LAUNCHING_JOB_NUMS_4', 'LAUNCHING_JOB_NUMS_5']
# Predict the launching-job counts as the last observed value per test row.
for col in job_col:
    df[col] = df_job
# Arrange columns in the submission order expected by the grader.
df = df[['ID', 'CPU_USAGE_1', 'LAUNCHING_JOB_NUMS_1', 'CPU_USAGE_2', 'LAUNCHING_JOB_NUMS_2',
         'CPU_USAGE_3', 'LAUNCHING_JOB_NUMS_3', 'CPU_USAGE_4', 'LAUNCHING_JOB_NUMS_4',
         'CPU_USAGE_5', 'LAUNCHING_JOB_NUMS_5']]
submit = pd.read_csv('../data/submit_example.csv')[['ID']]
submit = pd.merge(submit, df.astype(int), on='ID')
submit.to_csv('../subs/3444.csv', index=False)
以上为LGB模型获得A榜0.344分数的建模方法
1、目标变化,将预测的目标变化为预测当前与未来的差值
2、增加如下特征进行差分特征构建,并在模型训练时删除以下两特征
df_train['cpu_use'] = df_train.CPU_USAGE * df_train.CU /100
df_test['cpu_use'] = df_test.CPU_USAGE * df_test.CU /100
df_train['mem_use'] = df_train.MEM_USAGE * df_train.CU * 0.04
df_test['mem_use'] = df_test.MEM_USAGE * df_test.CU * 0.04
3、其余操作不变,以上更改可将A榜成绩由0.344提升为0.3468
在上一次提升的基础上,在十折交叉验证中,加入RFE递归特征消除做特征筛选,使用LGB模型作为筛选方式,保留30个特征,其余操作不变
rfe = RFE(estimator=LGBMRegressor(), n_features_to_select=30)
rfe_train = pd.DataFrame(rfe.fit_transform(df_train, targets[i]))
rfe_test = pd.DataFrame(rfe.transform(df_test))
以上修改可将A榜成绩由0.3468提升到0.348
在比赛过程中,发现预测出的赛题数据相对真实数据整体偏低,将预测出的结果整体乘一个较小的系数,能使最终结果有所提升,此处未进行尝试,预估结果在0.35以上
在比赛过程中,也有过一些其他想法,然而效果不佳,遂放弃,也可能是使用方法有误,放在这里,若有兴趣,可以进行一些尝试
在本次比赛中,与队友一起奋斗,学习到了很多东西,认识了一些大佬,在这里感谢一下我的队友们,感谢lan哥、K哥、鱼佬、丫哥,希望有机会还能与各位一起打比赛。