Reposted from https://mp.weixin.qq.com/s/j5YICHrkHLDm7OldPFPOjw
This article shares the approach used in the 2019 Tencent Advertising Algorithm Competition. The short, efficient competition code and a walkthrough of it are attached at the end.
Problem understanding
Feature engineering
Model overview
Reflections and summary
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
path = './data/'
def get_base_data():
    # Load the static ad attributes and the B-round test set.
    ad_static = pd.read_csv(path + '/map_ad_static.out',
                            names=['aid', 'create_time', 'adv_id', 'product_id', 'ad_cate_id',
                                   'industry_id', 'creative_size'], sep='\t')
    ad_static['create_time_int'] = ad_static['create_time']
    # Convert the creation timestamp to Beijing time (UTC+8).
    ad_static['create_time'] = pd.to_datetime(ad_static['create_time'] + 8 * 3600, unit='s')
    cond_req = pd.read_csv(path + '/Btest_select_request_20190424.out',
                           names=['aid', 'req_set'], sep='\t')
    # Queue length of each test ad on the test day (number of requests in its request set).
    cond_req['req_set_len'] = cond_req['req_set'].apply(lambda x: len(x.split('|')))
    test = pd.read_csv(path + '/Btest_sample_bid.out',
                       names=['sample_id', 'aid', 'target_type', 'bid_type', 'bid'], sep='\t')
    test['day'] = 24
    test['aid_day_count'] = test['aid'].map(cond_req[['aid', 'req_set_len']].set_index('aid')['req_set_len'])
    return ad_static, test
ad_static, test = get_base_data()
def get_train_data():
    # Count, per (aid, day), the queue length and the number of exposures (the label).
    aid_day_label_stat = {}
    aid_day_count_stat = {}
    for day in range(10, 23):
        temp_track_log = pd.read_csv(path + '/track_log/track_log_201904' + str(day) + '.out', names=[
            'req_id', 'req_time', 'uid', 'loc_id', 'comp_info'], sep='\t')
        for comp_info in temp_track_log['comp_info']:
            for ad in comp_info.split(';'):
                ad = ad.split(',')
                # Encode (aid, day) as a single integer key.
                aid_day = int(ad[0]) * 100 + day
                try:
                    aid_day_count_stat[aid_day] = aid_day_count_stat[aid_day] + 1
                except KeyError:
                    aid_day_count_stat[aid_day] = 1
                if ad[6] == '1':  # this field marks an exposure
                    try:
                        aid_day_label_stat[aid_day] = aid_day_label_stat[aid_day] + 1
                    except KeyError:
                        aid_day_label_stat[aid_day] = 1
    aid_day_label = pd.DataFrame(
        {'aid_day': list(aid_day_label_stat.keys()), 'label': list(aid_day_label_stat.values())})
    aid_day_count = pd.DataFrame(
        {'aid_day': list(aid_day_count_stat.keys()), 'aid_day_count': list(aid_day_count_stat.values())})
    train = aid_day_count.merge(aid_day_label, 'left', 'aid_day').fillna(0)
    train['aid'] = train['aid_day'] // 100
    train['day'] = train['aid_day'] % 100
    # Keep ads that appear in the test set, plus busy ads with more than 100 requests on a day.
    train = train[(train['aid'].isin(test.aid.unique()) | (train['aid_day_count'] > 100))]
    return train, aid_day_label
train, aid_day_label = get_train_data()
def get_test_expos():
    # Rebuild exposure counts for April 23 from its track log: the first ad in each queue
    # whose last field is '0' is counted as exposed, all other ads as not exposed.
    track_test_a = pd.read_csv(path + '/test_tracklog_20190423.last.out', names=[
        'req_id', 'req_time', 'uid', 'loc_id', 'comp_info'], sep='\t')
    aid_expos = []
    aid_not_expos = []
    for i in track_test_a['comp_info']:
        tmp_expos = False
        for ad in i.split(';'):
            ad = ad.split(',')
            if ad[-1] == '0' and tmp_expos is False:
                tmp_expos = True
                aid_expos.append(int(ad[0]))
            else:
                aid_not_expos.append(int(ad[0]))
    aid_not_expos_df = pd.Series(aid_not_expos).value_counts()
    aid_not_expos_df = aid_not_expos_df.reset_index().rename(columns={'index': 'aid', 0: 'not_count'})
    aid_expos_df = pd.Series(aid_expos).value_counts()
    aid_expos_df = aid_expos_df.reset_index().rename(columns={'index': 'aid', 0: 'label'})
    test_expos = aid_expos_df.merge(aid_not_expos_df, 'outer', 'aid')
    test_expos = test_expos.fillna(0)
    test_expos['aid_day_count'] = test_expos['label'] + test_expos['not_count']
    test_expos['day'] = 23
    test_expos = test_expos[test_expos['aid_day_count'] > 50]
    test_23 = pd.read_csv(path + '/final_select_test_request.out', names=['aid', 'req_set'], sep='\t')
    test_23['day'] = 23
    test_23['aid_day_count'] = test_23['req_set'].apply(lambda x: len(x.split('|')))
    return test_expos, test_23
test_expos, test_23 = get_test_expos()
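# Stack the training days (10-22), the reconstructed day 23 and the day-24 test set,
# attach the static ad attributes, and derive the basic label / ratio columns.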
data = pd.concat([train, test_expos, test_23, test], ignore_index=True)
data = data.merge(ad_static, 'left', 'aid')
data['olabel'] = data['label']
data['oaid_day_count'] = data['aid_day_count']
data['not_count'] = data['aid_day_count'] - data['olabel']
data['aid_day'] = data['aid'] * 100 + data['day']
data['label'] = np.log1p(data['label'])
data['rate_label'] = data['olabel'] / data['aid_day_count']
data['sample_id'] = data['sample_id'].fillna(0).astype(int)
data['week'] = (data['day'] - 1) % 7
data['create_year'] = data['create_time'].dt.year
data['day_keep'] = data['day'] - data['create_time'].dt.month * 31 - data['create_time'].dt.day
ad_static_fea = ['adv_id', 'product_id', 'ad_cate_id', 'industry_id', 'creative_size']
cnt_feat = ['cnt_static_' + i for i in ad_static_fea + ['create_time']]
# Frequency encoding of the static attributes.
for i in ad_static_fea + ['create_time']:
    data['cnt_static_' + i] = data[i].map(ad_static[i].value_counts())
data['day_unique'] = data.groupby('aid')['day'].transform('unique')
day_nunique = []
# For each row, count on how many recent days (a roughly one-week look-back) the ad appeared.
for x in data[['day', 'day_unique']].values:
    x_1 = x[0] - 1
    day_nunique.append(len([i for i in x[1] if i < x_1 and i > x_1 - 7]))
data['day_nunique'] = day_nunique
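# Day-level exposure totals, day-23 per-ad statistics, the previous day's queue length,
# and normalized versions of the label.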
aid_day_label['day'] = aid_day_label['aid_day'] % 100
data['day_count'] = data['day'].map(aid_day_label.groupby('day')['label'].sum())
data_23 = data[data.day == 23].set_index('aid')
data['rate_label_23'] = data['aid'].map(data_23['rate_label'])
data['label_23'] = data['aid'].map(data_23['label'])
data['aid_day_count_23'] = data['aid'].map(data_23['aid_day_count'])
data['aid_day_1'] = data['aid_day'] + 1
data['aid_day_count_1'] = data['aid_day'].map(data[data.day < 24].set_index('aid_day_1')['aid_day_count'])
data['pred_23'] = data['rate_label_23'] * data['aid_day_count']
data['n_label'] = data['label'] / data['day_count']
data['n_rate_label'] = data['rate_label'] / data.groupby('day')['rate_label'].transform('mean')
shift_feat = []
data['aid_day'] = data['aid'] * 100 + data['day']
for i in range(1, 15):
    shift_feat.append('aid_day_label_' + str(i))
    shift_feat.append('day_label_' + str(i))
    shift_feat.append('day_not_' + str(i))
    shift_feat.append('rate_label_' + str(i))
    # Shift features: bring the statistics observed i days earlier onto the current (aid, day).
    data['aid_day_' + str(i)] = data['aid_day'] + i
    data_last = data[~data.label.isnull()].set_index('aid_day_' + str(i))
    data['day_label_' + str(i)] = data['aid_day'].map(data_last['n_label'])
    data['rate_label_' + str(i)] = data['aid_day'].map(data_last['n_rate_label'])
    data['day_not_' + str(i)] = data['aid_day'].map(data_last['not_count'])
    data['aid_day_label_' + str(i)] = data['aid_day_count'] * data['aid_day'].map(data_last['rate_label'])
data['adv_id_day'] = data['adv_id'] * 100 + data['day']
data['adv_id_day_count_sum'] = data['adv_id_day'].map(
    data.drop_duplicates(['aid_day']).groupby('adv_id_day')['aid_day_count'].sum())
data['adv_id_day_count'] = data['adv_id_day'].map(
    data.drop_duplicates(['aid_day']).groupby('adv_id_day')['aid_day_count'].mean())
adv_shift_feat = []
for i in range(2, 11):
    adv_shift_feat.append('adv_id_day_label_' + str(i))
    data['adv_id_day_' + str(i)] = data['adv_id_day'] + i
    data['adv_id_day_label_' + str(i)] = data['adv_id_day_count'] * data['adv_id_day'].map(
        data[~data.label.isnull()].groupby('adv_id_day_' + str(i))['rate_label'].mean())
data = data.sort_values(['sample_id', 'day']).reset_index(drop=True)
cate_feature = ['week', 'create_year'] + ad_static_fea
num_feature = shift_feat + adv_shift_feat + ['day_keep'] + [
'day_nunique', 'aid_day_count', 'rate_label_23', 'pred_23', 'label_23', 'aid_day_count_1',
'adv_id_day_count_sum', 'adv_id_day_count', 'aid_day_count_23',
] + cnt_feat
features = cate_feature + num_feature
lgb_model = lgb.LGBMRegressor(
num_leaves=150, reg_alpha=0., reg_lambda=0.01, objective='mae', metric='mae',
max_depth=-1, learning_rate=0.05, min_child_samples=100, n_jobs=-1,
n_estimators=1000, subsample=0.7, colsample_bytree=0.8, subsample_freq=1, random_state=2019
)
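# Model 1: a rule baseline (the average of the 2-4 day shifted exposure estimates)
# plus LightGBM fitted on the residual in log space; it produces the sub_control predictions.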
data['rule_pred'] = data[['aid_day_label_2', 'aid_day_label_3', 'aid_day_label_4']].mean(axis=1).fillna(0)
data['loss'] = data['label'] - np.log1p(data['rule_pred'])
lgb_model = lgb_model.fit(data[(data.day < 23)][features], data[(data.day < 23)]['loss'])
sub_control = data[data.sample_id > 0][['sample_id', 'rule_pred', 'aid', 'bid']].reset_index(drop=True)
sub_control['pred'] = lgb_model.predict(data[data.sample_id > 0][features])
sub_control['pred'] = np.expm1(sub_control['pred'] + np.log1p(sub_control['rule_pred']))
def get_predict_w(model, data, label='label', feature=[], random_state=2018, n_splits=5):
    # K-fold training: out-of-fold predictions for the rows with a label,
    # averaged fold predictions for the rows whose label is missing or -1.
    model.random_state = random_state
    predict_label = 'predict_' + label
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    data[predict_label] = 0
    test_index = (data[label].isnull()) | (data[label] == -1)
    train_data = data[~test_index].reset_index(drop=True)
    test_data = data[test_index].copy()
    for train_idx, val_idx in kfold.split(train_data):
        model.random_state = model.random_state + 1
        train_x = train_data.loc[train_idx][feature]
        train_y = train_data.loc[train_idx][label].values
        test_x = train_data.loc[val_idx][feature]
        test_y = train_data.loc[val_idx][label].values
        model.fit(train_x, train_y, eval_set=[(test_x, test_y)], verbose=None)
        train_data.loc[val_idx, predict_label] = model.predict(test_x)
        if len(test_data) != 0:
            test_data[predict_label] = test_data[predict_label] + model.predict(test_data[feature])
    test_data[predict_label] = test_data[predict_label] / n_splits
    return pd.concat([train_data, test_data], ignore_index=True), predict_label
lgb_model.n_estimators = 500
# Rescale the day-24 queue length by an empirical calibration factor.
data.loc[data.day == 24, 'aid_day_count'] = data[data.day == 24]['oaid_day_count'] / 1.9
data, pred_label = get_predict_w(lgb_model, data, 'label', [i for i in features if i not in shift_feat[:4]],
                                 random_state=2018, n_splits=5)
sub_norm = data[(data.day == 24)][['sample_id', 'aid', 'bid', pred_label]].reset_index(drop=True)
sub_norm['pred'] = sub_norm[pred_label]
sub_norm['pred'] = np.expm1(sub_norm['pred']) * 1.3
sub_merge = sub_norm.copy()
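# Blend the two models' predictions and append a tiny bid-rank term so that, for the
# same ad, the submitted values increase with the bid (the metric also scores bid monotonicity).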
sub_merge['rank_num'] = sub_merge.groupby('aid')['bid'].rank()
sub_merge['pred'] = sub_norm['pred'] * 0.4 + sub_control['pred'] * 0.6
print(sub_merge['pred'].min())
sub_merge.loc[sub_merge['pred'] < 0, 'pred'] = 0
sub_merge['pred'] = sub_merge['pred'].round() + sub_merge['rank_num'] * 0.0001
sub_merge['pred'] = sub_merge['pred'].round(4)
sub_merge[['sample_id', 'pred']].to_csv('submission.csv', index=False, header=None)
PS: About w2v:
This approach was very strong in the preliminary round, but in the final round it no longer worked: the large number of zero-exposure samples caused leakage, the sampling had to be adjusted, and the error introduced by adding random noise was too large. The code therefore does not include it. (This is also noted in the "shortcomings and reflections" part of the slides.)
Code performance analysis
Computation speed (i.e., running time): the first 100 lines of my code read the files one by one and simply count the labels and queue lengths. Because the data volume is large, loading into memory takes about 7 minutes and the counting takes roughly another 7 minutes, about 14 minutes in total.
Lines 100-170 are the feature extraction part. Since I use relatively few features and extract none from the raw logs, it finishes in about 30 seconds.
Lines 170 to 222 are the exciting model training and prediction stage, which contains two models: model 1 takes about 30 seconds on a 10-core machine, and model 2 is 5-fold and takes about 2 minutes.
About 20 minutes in total (you can run it yourself to verify).
As for computing resources, there is a slight regret here: because the label-extraction part does not use streaming reads, the memory peak occurs when a single day's log is loaded into memory. The maximum peak is the memory needed to read one log file (I certainly did not load all the logs at once), so about 4 GB of RAM is enough.
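In hindsight, that peak could be cut by reading each day's log in chunks. A minimal sketch of the idea (the helper name and chunksize are illustrative, and it assumes the same track_log / comp_info layout used in get_train_data above):

import pandas as pd
from collections import Counter

def count_labels_streaming(log_path, day, chunksize=500000):
    # Stream one day's track log in fixed-size chunks instead of loading the whole file at once.
    request_count, exposure_count = Counter(), Counter()
    reader = pd.read_csv(log_path, sep='\t', chunksize=chunksize,
                         names=['req_id', 'req_time', 'uid', 'loc_id', 'comp_info'])
    for chunk in reader:
        for comp_info in chunk['comp_info']:
            for ad in comp_info.split(';'):
                fields = ad.split(',')
                aid_day = int(fields[0]) * 100 + day
                request_count[aid_day] += 1
                if fields[6] == '1':  # same exposure flag as in get_train_data
                    exposure_count[aid_day] += 1
    return request_count, exposure_count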
Code walkthrough:
1. get_base_data(): reads and parses the static ad file and the test set.
2. get_train_data(): reads the logs day by day and computes each ad's daily statistics, which serve as the labels; this yields the training set.
3. get_test_expos(): recovers the exposure data for the April 23 test day.
4. Then comes the block of successive feature-extraction steps.
5. Finally, the LightGBM model definition, the label transformation, and the 5-fold training and prediction.