大家好, 作为一名机器学习初学者, 前段时间参加了2021科大讯飞AI开发者大赛, 队伍名称"写个故事".
自己首次参加大数据比赛, 主要选择了相对容易的结构化数据比赛, 在参加的5个比赛中有3个进入了前五名(广告点击率预估, 线下商店销量预测, 移动设备用户年龄和性别预测). 能取得这个名次, 除了运气成分(奖金不多, 高手都去了别的比赛), 还要感谢鱼佬, 阿水等几位大佬提供的baseline以及分享的很多资料.
科大讯飞2021广告点击率预估挑战赛
对于移动设备厂商而言,获取当前手机用户的人口属性信息是非常困难的。基于用户的手机及日常使用应用程序的偏好准确地预测其人口属性信息是提升个性化体验、构建精准用户画像的基础。
需要说明的是,本赛事数据已获得个人用户的充分认可和同意,并已进行适当的匿名处理以保护隐私。由于保密,我们不会提供有关如何获得性别和年龄数据的详细信息。
本次比赛有两个任务,分别对移动设备(device_id)进行性别和年龄的预测,这里包含二分类和回归两个问题,最终会将两个部分的分数结合起来进行排名。
代码都做了注释, 自己还不太擅长用定义函数的方式写, 相信大家都能看懂.
# =============================================================================
# # 导入工具包
# =============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns
from sklearn.model_selection import KFold
import lightgbm as lgb
from sklearn.metrics import f1_score, roc_auc_score
import warnings
# Silence library warnings for the rest of the run.
warnings.filterwarnings('ignore')
# Every relative file name below is resolved against this data folder.
os.chdir('C:/Users/yyz/Desktop/比赛/广告点击率/data/广告点击率预估挑战赛_数据集/')
# =============================================================================
# # Load the data and stack train + test into one frame
# =============================================================================
df_tr, df_te = (pd.read_csv(file_name) for file_name in ('train.csv', 'test.csv'))
df_tr_te = pd.concat([df_tr, df_te], axis=0, ignore_index=True)
# Rows without a label get isClick = -1; the modelling section later uses
# that value to separate test rows from training rows.
df_tr_te['isClick'] = df_tr_te['isClick'].fillna(-1)
# Template of the submission file.
df_sub = pd.read_csv('sample_submit.csv')


def _hour_of(stamp):
    """Return the integer before the first ':' in the last space-separated token."""
    return int(stamp.split(' ')[-1].split(':')[0])


def _day_of(stamp):
    """Return the second '-'-separated integer of the first space-separated token."""
    return int(stamp.split(' ')[0].split('-')[1])


# Split the raw date string into hour and day columns.
df_tr_te['hour'] = df_tr_te['date'].apply(_hour_of)
df_tr_te['day'] = df_tr_te['date'].apply(_day_of)
# Flag low-activity users: thr = 0 when the user_id occurs at most 3 times in
# the combined train + test frame, otherwise 1.
# The counts Series is indexed directly instead of going through
# value_counts().reset_index(): the column names produced by reset_index()
# differ between pandas 1.x ('index' / 'user_id') and pandas 2.x
# ('user_id' / 'count'), so the old lookup breaks on newer pandas.
user_id_counts = df_tr_te['user_id'].value_counts()
lis_thr = user_id_counts[user_id_counts <= 3].index.tolist()
df_tr_te['thr'] = np.where(df_tr_te['user_id'].isin(lis_thr), 0, 1)
# =============================================================================
# 特征工程
# =============================================================================
# 历史点击率
def _his_click_rate(df, f1, window_size = 2, days = range(3, 8)):
    """Add a historical click-rate feature for the key column ``f1``.

    For every day ``i`` in ``days``, rows with ``day == i`` receive the mean
    of ``isClick`` over the rows of the same ``f1`` value whose ``day`` lies
    in ``[i - window_size, i)``.

    Args:
        df: Frame with ``day``, ``isClick`` and ``f1`` columns. It is
            modified in place.
        f1: Name of the column whose values identify the entity.
        window_size: Number of preceding days that form the history window.
        days: Days to fill. Defaults to 3..7, the range the script
            originally hard-coded.

    Returns:
        The same ``df`` object with the new column
        ``'{f1}_his_{window_size}_clickrate'``. Rows whose day is not in
        ``days`` keep 0.0; rows whose key has no labeled history get NaN.
    """
    fea_name = '{}_his_{}_clickrate'.format(f1, window_size)
    # Start as float so the fractional rates assigned below do not clash with
    # an integer column dtype.
    df[fea_name] = 0.0
    # The script marks unlabeled (test) rows with isClick == -1; they must not
    # be averaged into a click rate.
    labeled = df['isClick'] != -1
    for i in days:
        in_window = (df['day'] >= i - window_size) & (df['day'] < i)
        rate_by_key = df.loc[in_window & labeled].groupby(f1)['isClick'].mean()
        inds = df['day'] == i
        df.loc[inds, fea_name] = df.loc[inds, f1].map(rate_by_key)
    return df
# Historical click rate per user over a 5-day window.
df_tr_te = _his_click_rate(df=df_tr_te, f1='user_id', window_size=5)
# Extra feature on top of the baseline: the same click rate per
# (user_id, webpage_id) pair, keyed by the two ids concatenated as strings.
user_page_pairs = zip(df_tr_te['user_id'], df_tr_te['webpage_id'])
df_tr_te['user_id_webpage_id'] = [str(uid) + str(page) for uid, page in user_page_pairs]
df_tr_te = _his_click_rate(df=df_tr_te, f1='user_id_webpage_id', window_size=5)


def _lagged_rolling_mean(clicks):
    """Return the 3-row rolling mean of clicks, shifted down by one row."""
    return clicks.rolling(3).mean().shift(1)


# Window feature within each (user_id, product, day) group.
# Note: the column name says "5mean" but the rolling window is 3 rows.
df_tr_te['user_product_day_5mean'] = (
    df_tr_te.groupby(['user_id', 'product', 'day'])['isClick']
    .transform(_lagged_rolling_mean)
)
# Encode gender as an integer; missing values become -1.
gender_codes = {'Female': 1, 'Male': 0, 'NAN': -1}
df_tr_te['gender'] = df_tr_te['gender'].fillna('NAN').map(gender_codes)
# Bucket the day number: 2 and 3 -> 2, 4 -> 1, 5/6/7 -> 0.
# The author's stated intent is to put Friday, Saturday and Sunday in one bucket.
day_values = [2, 3, 4, 5, 6, 7]
day_buckets = [2, 2, 1, 0, 0, 0]
df_tr_te['xingqi'] = df_tr_te['day'].replace(day_values, day_buckets)
import itertools

# Columns used for the frequency features.
lis_i = ['user_id', 'product', 'hour', 'campaign_id', 'webpage_id', 'user_group_id', 'age_level',
         'gender', 'day', 'product_category_id', 'user_depth']
# Single-column counts: number of non-null ids sharing the same value.
for key in lis_i:
    df_tr_te[key + '_cnt'] = df_tr_te.groupby(key)['id'].transform('count')
# Two-column counts, one per ordered pair of columns. The reversed pair of
# each combination is generated as well, under its own column name.
lis_i_re = list(itertools.permutations(lis_i, 2))
for first, second in lis_i_re:
    df_tr_te[first + second + '_cnt'] = df_tr_te.groupby([first, second])['id'].transform('count')
# The raw date has no year; the author guessed 2021 from the number of rows.
dated_strings = ['2021-' + stamp for stamp in df_tr_te['date']]
df_tr_te['date'] = pd.to_datetime(pd.Series(dated_strings, index=df_tr_te.index))


def _span_seconds(stamps):
    """Return the seconds between the earliest and the latest timestamp."""
    return (stamps.max() - stamps.min()).total_seconds()


def _gap_to_next(stamps):
    """Return each timestamp minus the one in the following row."""
    return stamps.diff(periods=-1)


def _inverse_rank_weight(values):
    """Return group_size / k for k = 1..group_size, in row order."""
    group_size = len(values)
    return group_size / np.array(range(1, group_size + 1))


# Time span covered by a user's rows within one (day, hour).
df_tr_te['user_time_hour'] = df_tr_te.groupby(['user_id', 'day', 'hour'])['date'].transform(_span_seconds)
# Time span covered by a user's rows within one day.
df_tr_te['user_time_day'] = df_tr_te.groupby(['user_id', 'day'])['date'].transform(_span_seconds)
# First-order difference to the user's next row, converted to seconds.
next_row_gap = df_tr_te.groupby(['user_id'])['date'].transform(_gap_to_next)
df_tr_te['user_time_del'] = next_row_gap.apply(lambda gap: gap.total_seconds())
# Number of rows per (user_id, product, webpage_id).
df_tr_te['user_id_webpage_id_product'] = (
    df_tr_te.groupby(['user_id', 'product', 'webpage_id'])['id'].transform('count')
)
# Position-based weights: within each group the k-th row gets group_size / k.
# Each entry is (output column, group keys, column handed to transform).
rank_weight_specs = [
    ('user_id_day_range', ['user_id', 'day'], 'product'),
    ('user_id_range', ['user_id'], 'product'),
    ('user_id_product_webpage_range', ['user_id', 'product'], 'webpage_id'),
    ('user_id_campaign_id_webpage_range', ['user_id', 'campaign_id'], 'webpage_id'),
]
for out_col, group_keys, value_col in rank_weight_specs:
    df_tr_te[out_col] = df_tr_te.groupby(group_keys)[value_col].transform(_inverse_rank_weight)
# Mean and sum of the per-hour and per-day time spans, grouped by each key.
# Fix: the "*_user_time_day_*" columns used to be computed from
# 'user_time_hour' (copy-paste slip), which made them exact duplicates of the
# "*_user_time_hour_*" columns. They now aggregate 'user_time_day'.
lis_i_1 = ['user_id', 'product', 'campaign_id', 'webpage_id', 'product_category_id',
           'user_group_id', 'age_level', 'gender', 'user_depth', 'var_1']
for c in lis_i_1:
    # Loop order keeps the original column order:
    # hour_mean, day_mean, hour_sum, day_sum.
    for stat in ('mean', 'sum'):
        for span_col in ('user_time_hour', 'user_time_day'):
            df_tr_te[str(c) + '_' + span_col + '_' + stat] = df_tr_te.groupby(c)[span_col].transform(stat)
# Mean per-hour time span for each (gender, age_level, product_category_id).
demographic_keys = ['gender', 'age_level', 'product_category_id']
df_tr_te['yong_time_gender_age_level_product_category_id_ave'] = (
    df_tr_te.groupby(demographic_keys)['user_time_hour'].transform('mean')
)
# Brute force: mean per-hour time span for every ordered pair of key columns.
lis_i_1 = ['user_id', 'product', 'campaign_id', 'webpage_id', 'product_category_id',
           'user_group_id', 'age_level', 'gender', 'user_depth', 'var_1']
lis_i_re_1 = list(itertools.permutations(lis_i_1, 2))
for first, second in lis_i_re_1:
    pair_col = first + second + '_user_time_hour_mean'
    df_tr_te[pair_col] = df_tr_te.groupby([first, second])['user_time_hour'].transform('mean')
# Distinct values seen per (user_id, day), plus the ratio of the user's
# row count for that day to that number of distinct values.
for target in ['product', 'campaign_id', 'webpage_id', 'product_category_id']:
    nunique_col = 'day_' + str(target) + '_nunique'
    df_tr_te[nunique_col] = df_tr_te.groupby(['user_id', 'day'])[target].transform('nunique')
    df_tr_te[nunique_col + '_p%'] = df_tr_te['user_idday_cnt'] / df_tr_te[nunique_col]
# Distinct webpages per (user_id, day, hour); note the grouping includes the
# hour even though the column name only mentions the day.
df_tr_te['day_web_nunique'] = (
    df_tr_te.groupby(['user_id', 'day', 'hour'])['webpage_id'].transform('nunique')
)
# =============================================================================
# 建模
# =============================================================================
# cate_features = ['user_id','product','hour','campaign_id','webpage_id','user_group_id','age_level']
# Every column is a model input except identifiers, the label, the raw
# timestamp and the concatenated string key.
non_feature_cols = {'id', 'isClick', 'date', 'user_id_webpage_id'}
features = [col for col in df_tr_te.columns if col not in non_feature_cols]
# Rows labelled -1 are the test set; everything else is training data.
is_test_row = df_tr_te['isClick'] == -1
test = df_tr_te[is_test_row]
train = df_tr_te[~is_test_row]
x_train, x_test = train[features], test[features]
y_train = train['isClick']
def cv_model(clf, train_x, train_y, test_x, clf_name='lgb'):
    """Train a binary classifier with 5-fold CV and average its test predictions.

    Args:
        clf: Module-like object exposing ``Dataset`` and ``train`` (the script
            passes ``lightgbm``).
        train_x: Training features; rows are selected positionally via ``.iloc``.
        train_y: Training labels, in the same row order as ``train_x``.
        test_x: Test features.
        clf_name: Label used as prefix in the printed score summary.

    Returns:
        A tuple ``(oof_pred, test_pred)``: the out-of-fold prediction for every
        training row, and the test prediction averaged over the folds.
    """
    folds = 5
    seed = 2021
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    # KFold yields positional indices, so labels are indexed through a plain
    # array; label-based ``train_y[train_index]`` is only correct when the
    # Series index happens to be 0..n-1.
    y_values = np.asarray(train_y)
    # Named oof_pred / test_pred_mean so they do not shadow the module-level
    # ``train`` / ``test`` frames.
    oof_pred = np.zeros(train_x.shape[0])
    test_pred_mean = np.zeros(test_x.shape[0])
    cv_scores = []
    # The parameters are identical for every fold, so build them once.
    # NOTE(review): 'nthread' and 'n_jobs' look like aliases of the same
    # LightGBM setting - confirm which one takes effect.
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'min_child_weight': 5,
        'num_leaves': 2**6,
        'lambda_l2': 10,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.9,
        'bagging_freq': 4,
        'learning_rate': 0.01,
        'seed': 2021,
        'nthread': 28,
        'n_jobs': -1,
        'silent': True,
        'verbose': -1,
    }
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        trn_y, val_y = y_values[train_index], y_values[valid_index]
        train_matrix = clf.Dataset(trn_x, label=trn_y)
        valid_matrix = clf.Dataset(val_x, label=val_y)
        # NOTE(review): verbose_eval / early_stopping_rounds are believed to
        # have been replaced by callbacks in LightGBM 4.0 - confirm against
        # the installed version.
        model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix],
                          #categorical_feature = categorical_feature,
                          verbose_eval=500, early_stopping_rounds=200)
        val_pred = model.predict(val_x, num_iteration=model.best_iteration)
        test_pred = model.predict(test_x, num_iteration=model.best_iteration)
        oof_pred[valid_index] = val_pred
        # Each fold contributes an equal share to the final test prediction.
        test_pred_mean += test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))
        print(cv_scores)
    # The list label used to print as the garbled "scotrainre_list".
    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return oof_pred, test_pred_mean
# Run the cross-validated LightGBM model.
lgb_train, lgb_test = cv_model(clf=lgb, train_x=x_train, train_y=y_train, test_x=x_test)
## Write the averaged test predictions into the submission template.
# NOTE(review): assumes the template rows are in the same order as the test
# rows - confirm against the data.
submission_path = 'C:/Users/yyz/Desktop/比赛/广告点击率/baseline55_5zhe_re.csv'
df_sub['isClick'] = lgb_test
df_sub.to_csv(submission_path, index=False)
现在两个月过去了, 回头看当初自己绞尽脑汁写的代码, 还有很多可以优化的地方:
以上便是对此赛题的一些总结, 希望对初入数据竞赛的朋友有帮助, 咱们赛场见!
如果你想学习更多, 鱼佬的新书《机器学习算法竞赛实战》可以了解下, 我是10月7日入手, 已经看了好几遍, 收获很多!