import numpy as np
import pandas as pd

# Per-column missing-value report: count, percentage and dtype.
def missing_data(data):
    total = data.isnull().sum()
    percent = data.isnull().sum() / data.isnull().count() * 100
    tt = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    types = []
    for col in data.columns:
        dtype = str(data[col].dtype)
        types.append(dtype)
    tt['Types'] = types
    return np.transpose(tt)
Mode: mode() | Mean: mean() | Median: median()
# Fill missing product_type values with the column's mode (avoids chained-assignment issues).
train['product_type'] = train['product_type'].fillna(train['product_type'].mode()[0])
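# Illustrative sketch of the three imputation strategies above; the toy data below is
# hypothetical, not from the original dataset.
toy = pd.DataFrame({'price': [1.0, np.nan, 3.0], 'product_type': ['A', None, 'A']})
toy['price_mean'] = toy['price'].fillna(toy['price'].mean())        # fill with the mean
toy['price_median'] = toy['price'].fillna(toy['price'].median())    # or with the median
toy['product_type'] = toy['product_type'].fillna(toy['product_type'].mode()[0])  # mode for categoricals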
import matplotlib.pyplot as plt
import seaborn as sns

# Compute the pairwise correlation matrix of all features and plot it.
corrmat = train.corr()  # correlation matrix
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True)  # heatmap
# Take the k features most correlated with SalePrice and plot them as a heatmap.
k = 10 # number of variables for heatmap
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(train[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()
import time

time_format = '%Y-%m-%d %H:%M:%S'

# Convert a Unix timestamp (in seconds) to a local-time string such as '2018-01-01 12:00:00'.
def get_date(timestamp):
    time_local = time.localtime(timestamp)
    dt = time.strftime(time_format, time_local)
    return dt
train_df['action_date'] = train_df['timestamp_of_behavior'].apply(get_date)
train_df['action_date'] = pd.to_datetime(train_df['action_date'])
train_df['action_day'] = train_df['action_date'].dt.day
train_df['action_hour'] = train_df['action_date'].dt.hour
train_df['action_minute'] = train_df['action_date'].dt.minute
train_df['action_week'] = train_df['action_date'].dt.weekday
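# Design note (a sketch, not part of the original pipeline): pandas can do the same
# conversion in one vectorized call, interpreting the integers as seconds. Unlike
# time.localtime above, unit='s' yields UTC, so hour/weekday features may shift by
# the local UTC offset.
utc_date = pd.to_datetime(train_df['timestamp_of_behavior'], unit='s')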
from sklearn import preprocessing

# Label-encode a single column in place (map category strings to integer codes).
def encode_count(df, column_name):
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(df[column_name].values))
    df[column_name] = lbl.transform(list(df[column_name].values))
    return df
train_df = encode_count(train_df,'gender')
# Sort the data by uid and timestamp_of_behavior in ascending order.
train_df.sort_values(['uid','timestamp_of_behavior'],ascending=True,inplace=True)
train_df.reset_index(drop=True,inplace=True)
# Group df_1 by `columns`, take the mean of `value` (ignoring NaNs), and merge the
# result into df_2 as a new column named `cname`.
def merge_mean(df_1, df_2, columns, value, cname):
    add = pd.DataFrame(df_1[df_1[value].notnull()].groupby(columns)[value].mean()).reset_index()
    add.columns = columns + [cname]
    df_2 = df_2.merge(add, on=columns, how="left")
    return df_2
final_data_df = merge_mean(train_df,final_data_df,['uid'],'action_time_delta','action_time_delta_mean')
# Same idea, but merge the group mean back onto the frame it was computed from.
def mean_self_merge(df, columns, value, cname):
    add = pd.DataFrame(df.groupby(columns)[value].mean()).reset_index()
    add.columns = columns + [cname]
    df = df.merge(add, on=columns, how="left")
    return df
train_df = mean_self_merge(train_df,['vid'],'video_play_per','one_video_play_per')
train_df['action_time_diff'] = train_df.groupby('uid')['timestamp_of_behavior'].diff(1)
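# Quick illustration (toy data, not from the original dataset) of what the per-user
# diff produces: the first event of each uid gets NaN, later events get the gap in
# seconds to that user's previous event.
demo = pd.DataFrame({'uid': [1, 1, 1, 2, 2],
                     'timestamp_of_behavior': [100, 160, 400, 50, 80]})
demo['action_time_diff'] = demo.groupby('uid')['timestamp_of_behavior'].diff()
# -> NaN, 60, 240, NaN, 30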
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split
from sklearn.metrics import roc_auc_score


class CVClassifier():
    def __init__(self, estimator, n_splits=5, stratified=True, num_round=77777, **params):
        self.n_splits_ = n_splits
        self.scores_ = []
        self.clf_list_ = []
        self.estimator_ = estimator
        self.stratified_ = stratified
        self.num_round_ = num_round
        self.params_ = params  # may be empty; always set so self.params_ exists later
    def cv(self, train_X, train_y):
        # `seed` is assumed to be a globally defined random seed.
        if self.stratified_:
            folds = StratifiedKFold(self.n_splits_, shuffle=True, random_state=seed)
        else:
            folds = KFold(self.n_splits_, shuffle=True, random_state=seed)
        oof = np.zeros(len(train_y))
        for fold, (train_idx, val_idx) in enumerate(folds.split(train_X, train_y)):
            print('fold %d' % fold)
            trn_data, trn_y = train_X.iloc[train_idx], train_y[train_idx]
            val_data, val_y = train_X.iloc[val_idx], train_y[val_idx]
            if self.estimator_ == 'lgbm':
                train_set = lgb.Dataset(data=trn_data, label=trn_y)
                val_set = lgb.Dataset(data=val_data, label=val_y)
                clf = lgb.train(params=self.params_, train_set=train_set, num_boost_round=self.num_round_,
                                valid_sets=[train_set, val_set], verbose_eval=100, early_stopping_rounds=200)
                oof[val_idx] = clf.predict(train_X.iloc[val_idx], num_iteration=clf.best_iteration)
            elif self.estimator_ == 'xgb':
                train_set = xgb.DMatrix(data=trn_data, label=trn_y)
                val_set = xgb.DMatrix(data=val_data, label=val_y)
                watchlist = [(train_set, 'train'), (val_set, 'valid')]
                clf = xgb.train(self.params_, train_set, self.num_round_, watchlist,
                                early_stopping_rounds=200, verbose_eval=100)
                oof[val_idx] = clf.predict(val_set, ntree_limit=clf.best_ntree_limit)
            elif self.estimator_ == 'cat':
                clf = CatBoostClassifier(self.num_round_, task_type='GPU', early_stopping_rounds=500, **self.params_)
                clf.fit(trn_data, trn_y, eval_set=(val_data, val_y), cat_features=[], use_best_model=True, verbose=500)
                oof[val_idx] = clf.predict_proba(val_data)[:, 1]
            # scikit-learn style estimator object
            else:
                clf = self.estimator_.fit(trn_data, trn_y)
                try:
                    oof[val_idx] = clf.predict_proba(val_data)[:, 1]
                except AttributeError:
                    oof[val_idx] = clf.decision_function(val_data)
            self.clf_list_.append(clf)
            fold_score = roc_auc_score(train_y[val_idx], oof[val_idx])
            self.scores_.append(fold_score)
            print('Fold score: {:<8.5f}'.format(fold_score))
        self.oof_ = oof
        self.score_ = roc_auc_score(train_y, oof)
        print("CV score: {:<8.5f}".format(self.score_))
    def predict(self, test_X):
        # Test predictions are a weighted average of the fold models, weighted by fold AUC.
        self.predictions_ = np.zeros(len(test_X))
        if self.estimator_ == 'lgbm':
            self.feature_importance_df_ = pd.DataFrame()
            for fold, clf in enumerate(self.clf_list_):
                fold_importance_df = pd.DataFrame()
                fold_importance_df["feature"] = clf.feature_name()
                fold_importance_df["importance"] = clf.feature_importance()
                fold_importance_df["fold"] = fold + 1
                self.feature_importance_df_ = pd.concat([self.feature_importance_df_, fold_importance_df], axis=0)
                self.predictions_ += clf.predict(test_X, num_iteration=clf.best_iteration) * (self.scores_[fold] / sum(self.scores_))
        elif self.estimator_ == 'xgb':
            for fold, clf in enumerate(self.clf_list_):
                self.predictions_ += clf.predict(xgb.DMatrix(test_X), ntree_limit=clf.best_ntree_limit) \
                    * (self.scores_[fold] / sum(self.scores_))
        # CatBoost and scikit-learn models share the same predict_proba interface.
        else:
            for fold, clf in enumerate(self.clf_list_):
                self.predictions_ += clf.predict_proba(test_X)[:, 1] * (self.scores_[fold] / sum(self.scores_))
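# Usage sketch for CVClassifier (parameter values are illustrative assumptions, not
# values from the original run); assumes `seed` is defined and `train_X`, `train_y`,
# `test_X` have been prepared as above.
lgb_params = {'objective': 'binary', 'metric': 'auc', 'learning_rate': 0.01, 'num_leaves': 70}
cv_clf = CVClassifier('lgbm', n_splits=5, num_round=20000, **lgb_params)
cv_clf.cv(train_X, train_y)       # prints per-fold AUC and the overall CV score
cv_clf.predict(test_X)            # score-weighted average of the fold models
test_predictions = cv_clf.predictions_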
# Class for Bayesian Optimisation: wraps CVClassifier so each hyper-parameter probe runs a full CV.
class CVForBO():
    def __init__(self, model, train_X, train_y, test_X, base_params, int_params=[], n_splits=5, num_round=77777):
        self.oofs_ = []
        self.params_ = []
        self.predictions_ = []
        self.cv_scores_ = []
        self.model_ = model
        self.train_X_ = train_X
        self.train_y_ = train_y
        self.test_X_ = test_X
        self.base_params_ = base_params
        self.int_params_ = int_params
        self.n_splits_ = n_splits
        self.num_round_ = num_round
    def cv(self, **opt_params):
        # The optimiser proposes floats; round the parameters that must be integers.
        for p in self.int_params_:
            if p in opt_params:
                opt_params[p] = int(np.round(opt_params[p]))
        self.base_params_.update(opt_params)
        cv_model = CVClassifier(self.model_, n_splits=self.n_splits_, num_round=self.num_round_, **self.base_params_)
        cv_model.cv(self.train_X_, self.train_y_)
        cv_model.predict(self.test_X_)
        self.oofs_.append(cv_model.oof_)
        self.predictions_.append(cv_model.predictions_)
        # Append a copy: base_params_ is mutated on every call, so appending the dict
        # itself would leave every stored entry pointing at the latest parameters.
        self.params_.append(self.base_params_.copy())
        self.cv_scores_.append(cv_model.score_)
        return cv_model.score_
    def post_process(self, model_type=None, oof_path='inter_oofs.csv', pred_path='inter_preds.csv', params_path='inter_params.csv'):
        if not model_type:
            model_type = self.model_
        cols = ['{}_{}_{}'.format(model_type, str(self.cv_scores_[k]).split('.')[-1][:5], k) for k in range(len(self.cv_scores_))]
        self.oof_df = pd.DataFrame(np.array(self.oofs_).T, columns=cols)
        self.pred_df = pd.DataFrame(np.array(self.predictions_).T, columns=cols)
        self.params_df = pd.DataFrame(self.params_).T.rename(columns={c_old: c_new for c_old, c_new in enumerate(cols)})
        self.oof_df.to_csv(oof_path)
        self.pred_df.to_csv(pred_path)
        self.params_df.to_csv(params_path)
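# Usage sketch with the bayes_opt package (the bounds and search budget below are
# illustrative assumptions, not values from the original experiments).
from bayes_opt import BayesianOptimization

base_params = {'objective': 'binary', 'metric': 'auc', 'learning_rate': 0.01}
cv_helper = CVForBO('lgbm', train_X, train_y, test_X, base_params,
                    int_params=['num_leaves', 'max_depth'], n_splits=5)
optimizer = BayesianOptimization(
    f=cv_helper.cv,
    pbounds={'num_leaves': (31, 127), 'max_depth': (4, 9), 'feature_fraction': (0.5, 0.9)},
    random_state=2018)
optimizer.maximize(init_points=5, n_iter=20)   # each probe runs a full CV and returns its AUC
cv_helper.post_process(model_type='lgbm')      # dump per-trial oofs / predictions / params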
def train_model(final_data_df, model_path):
    # Collect all feature columns (uid is an identifier, not a feature).
    print('Getting all the features...')
    lgb_feature_list = list(final_data_df.columns.drop(['uid']))
    lgb_df = final_data_df[lgb_feature_list].copy()
    target = 'whether_to_keep'
    # Split the data.
    print('Dividing dataset into trainset and valset...')
    train, val = train_test_split(lgb_df, test_size=0.2, random_state=2018)
    train_X = train.drop(target, axis=1)
    train_y = train[target]
    val_X = val.drop(target, axis=1)
    val_y = val[target]
    # Delete promptly to free memory.
    del final_data_df
    # list.remove() works in place and returns None, so remove the target first, then reuse the list.
    lgb_feature_list.remove(target)
    feature_name = lgb_feature_list
    lgb_train = lgb.Dataset(train_X, train_y, feature_name=feature_name)
    lgb_eval = lgb.Dataset(val_X, val_y, feature_name=feature_name, reference=lgb_train)
    # Saving the Datasets as LightGBM binary files makes reloading much faster.
    print('Saving trainset and valset...')
    lgb_train.save_binary('./train.bin')
    lgb_eval.save_binary('./val.bin')
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'num_thread': -1,
        'num_leaves': 70,
        'max_depth': 7,
        'learning_rate': 0.01,
        'bagging_freq': 4,
        'bagging_fraction': 0.6,
        'feature_fraction': 0.6,
        'lambda_l1': 1,
        'lambda_l2': 1,
        'num_boost_round': 20000,
        'data_random_seed': 2017
    }
    ## Train
    model = lgb.train(
        params,
        lgb_train,
        valid_sets=lgb_eval,
        early_stopping_rounds=100
    )
    ### Save the model
    model.save_model(model_path, num_iteration=model.best_iteration)
    # Save the feature importances
    importance = model.feature_importance()
    names = model.feature_name()
    with open('./feature_importance.txt', 'w+') as file:
        for index, im in enumerate(importance):
            string = names[index] + ', ' + str(im) + '\n'
            file.write(string)
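# Usage sketch (the path below is an illustrative assumption): train the model and
# then reload the saved booster for scoring new data.
model_path = './lgb_keep_model.txt'
train_model(final_data_df, model_path)
booster = lgb.Booster(model_file=model_path)   # load the saved LightGBM model
# preds = booster.predict(new_data[booster.feature_name()])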