(1) 2018 iFLYTEK AI Marketing Algorithm Competition summary (1st place): https://zhuanlan.zhihu.com/p/47807544
(2) Introduction to CTR prediction and common models: https://www.mayi888.com/archives/54482
(3) When GridSearch meets XGBoost: https://juejin.im/post/5b7669c4f265da281c1fbf96#comment
(4) XGBoost and LightGBM parameters and tuning: https://www.jianshu.com/p/1100e333fcab
(5) Visualizing decision tree, XGBoost, LightGBM, and CatBoost models: https://blog.csdn.net/l_xzmy/article/details/81532281
(6) Jupyter Notebook plugins -- have you tried them all? http://www.sohu.com/a/283161414_129720
(7) How to understand AUC in machine learning and statistics: https://www.zhihu.com/question/39840928?from=profile_question_card
(8) Data Competition Top Solution -- open-source collection of top competition solutions: https://github.com/Smilexuhc/Data-Competition-TopSolution
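Reference (3) covers tuning XGBoost with grid search; a minimal sketch of that workflow (the synthetic dataset and the parameter grid below are illustrative assumptions, not taken from the article):

# Hedged sketch of grid search over XGBoost, in the spirit of reference (3).
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

X_demo, y_demo = make_classification(n_samples=1000, n_features=20, random_state=2019)
param_grid = {'max_depth': [4, 6, 8], 'learning_rate': [0.05, 0.1]}
gs = GridSearchCV(XGBClassifier(n_estimators=200), param_grid, scoring='roc_auc', cv=3)
gs.fit(X_demo, y_demo)
print(gs.best_params_, gs.best_score_)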
import gc
import os
import time
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from tqdm import tqdm
import lightgbm as lgb
import catboost as cb
from scipy import stats
from scipy.signal import hann, hilbert, convolve
import matplotlib.pyplot as plt
from sklearn.svm import NuSVR, SVR
from catboost import CatBoostRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedKFold

plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
plt.rcParams['font.family'] = ['sans-serif']
warnings.filterwarnings('ignore')
%matplotlib inline
# LGB
def train_model_lgb(train_x, train_y, test_x, params=None):
    oof = np.zeros(len(train_x))
    prediction = np.zeros(len(test_x))
    scores = []
    # K-fold cross-validation (stratified, to keep the label ratio per fold)
    NFolds = 2
    # kf = KFold(n_splits=NFolds)
    kf = StratifiedKFold(n_splits=NFolds)
    for kf_n, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print("kf_n = {}".format(kf_n))
        if type(train_x) == np.ndarray:
            x_train, x_valid = train_x[train_index], train_x[valid_index]
            y_train, y_valid = train_y[train_index], train_y[valid_index]
        else:
            x_train, x_valid = train_x.iloc[train_index], train_x.iloc[valid_index]
            y_train, y_valid = train_y.iloc[train_index], train_y.iloc[valid_index]
        # train with early stopping on the held-out fold
        model = lgb.LGBMClassifier(**params, n_estimators=20000, n_jobs=-1)
        model.fit(x_train, y_train,
                  eval_set=[(x_train, y_train), (x_valid, y_valid)], eval_metric='auc',
                  verbose=100, early_stopping_rounds=200)
        # out-of-fold and test predictions
        oof[valid_index] = model.predict_proba(x_valid, num_iteration=model.best_iteration_)[:, 1]
        scores.append(roc_auc_score(y_valid, oof[valid_index]))
        prediction += model.predict_proba(test_x, num_iteration=model.best_iteration_)[:, 1]
    prediction /= NFolds
    print('CV mean AUC: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
    return model, prediction
params_0 = {
    'learning_rate': 0.02,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'feature_fraction': 0.66,
    'bagging_fraction': 0.8,
    'bagging_freq': 2,
    'num_leaves': 48,
    'verbose': -1,
    'max_depth': -1,
    'lambda_l2': 1.6,
    'lambda_l1': 3.7,
    'is_unbalance': True,
    'nthread': 8
}
model_lgb,prediction_lgb = train_model_lgb(train_x,train_y,test_x,params=params_0)
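Persisting the fold-averaged probabilities is usually the next step; a hedged sketch, assuming the test ids live in a column named 'id' (a hypothetical name) and the contest expects an (id, label) CSV:

# Assumes test['id'] exists; both the column name and the path are assumptions.
submission = pd.DataFrame({'id': test['id'], 'label': prediction_lgb})
submission.to_csv('submission_lgb.csv', index=False)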
# LGB-OTHER
def train_model(train_data, test_data, y, params=None, model_type='lgb', is_class=False,
                plot_feature_importance=False, model=None):
    NFOLDS = 2
    # useful seeds: 11, 42
    folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=2019)
    X = train_data
    X_test = test_data
    oof = np.zeros(len(X))
    prediction = np.zeros(len(X_test))
    prediction_all = np.zeros(NFOLDS * len(X_test))
    scores = []
    feature_importance = pd.DataFrame()
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
        print('Fold', fold_n, 'started at', time.ctime())
        if type(X) == np.ndarray:
            X_train, X_valid = X[train_index], X[valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        print('-----')
        if model_type == 'lgb':
            if is_class == False:
                model = lgb.LGBMRegressor(**params, n_estimators=20000, n_jobs=-1)
                model.fit(X_train, y_train,
                          eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric='mae',
                          verbose=1000, early_stopping_rounds=2000)
                y_pred_valid = model.predict(X_valid)
                y_pred = model.predict(X_test, num_iteration=model.best_iteration_)
            else:
                model = lgb.LGBMClassifier(**params, n_estimators=20000, n_jobs=-1)
                model.fit(X_train, y_train,
                          eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric='auc',
                          verbose=100, early_stopping_rounds=200)
                y_pred_valid = model.predict_proba(X_valid, num_iteration=model.best_iteration_)[:, 1]
                y_pred = model.predict_proba(X_test, num_iteration=model.best_iteration_)[:, 1]
        if model_type == 'xgb':
            if is_class == False:
                xgb_train = xgb.DMatrix(data=X_train, label=y_train, feature_names=X.columns)
                xgb_valid = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=X.columns)
                watchlist = [(xgb_train, 'train'), (xgb_valid, 'valid_data')]
                model = xgb.train(dtrain=xgb_train, num_boost_round=20000, evals=watchlist,
                                  early_stopping_rounds=200, verbose_eval=500, params=params)
                y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=X.columns),
                                             ntree_limit=model.best_ntree_limit)
                y_pred = model.predict(xgb.DMatrix(X_test, feature_names=X.columns),
                                       ntree_limit=model.best_ntree_limit)
            else:
                model = xgb.XGBClassifier(nthread=4, learning_rate=0.08, n_estimators=200,
                                          max_depth=5, gamma=0, subsample=0.9, colsample_bytree=0.5)
                model.fit(X_train, y_train)
                y_pred_valid = model.predict_proba(X_valid)[:, 1]
                print("XGBoost valid AUC: %.5f" % roc_auc_score(y_valid, y_pred_valid))
                y_pred = model.predict_proba(X_test)[:, 1]
        if model_type == 'sklearn':
            model.fit(X_train, y_train)
            y_pred_valid = model.predict(X_valid).reshape(-1,)
            score = mean_absolute_error(y_valid, y_pred_valid)
            print(f'Fold {fold_n}. MAE: {score:.4f}.')
            y_pred = model.predict(X_test).reshape(-1,)
        if model_type == 'cat':
            model = CatBoostRegressor(iterations=20000, eval_metric='MAE', **params)
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[],
                      use_best_model=True, verbose=False)
            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)
        if model_type == 'lgb':
            oof[valid_index] = y_pred_valid.reshape(-1,)
            if is_class == False:
                scores.append(mean_absolute_error(y_valid, y_pred_valid))
            else:
                scores.append(roc_auc_score(y_valid, y_pred_valid))
                print(model.best_score_['valid_1']['auc'])
        prediction += y_pred
        prediction_all[fold_n * len(X_test):(fold_n + 1) * len(X_test)] = y_pred
        if model_type == 'lgb':
            # per-fold feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = X.columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)
    prediction /= NFOLDS
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
    if model_type == 'lgb':
        feature_importance["importance"] /= NFOLDS
        if plot_feature_importance:
            feature_importance_gb = feature_importance[["feature", "importance"]].groupby("feature").mean()
            cols = feature_importance_gb.sort_values(by="importance", ascending=False)[:50].index
            best_features = feature_importance_gb.loc[feature_importance_gb.index.isin(cols)]
            best_features['feature'] = best_features.index
            plt.figure(figsize=(16, 12))
            sns.barplot(x="importance", y="feature",
                        data=best_features.sort_values(by="importance", ascending=False))
            plt.title('LGB Features (avg over folds)')
            return oof, prediction, feature_importance_gb, prediction_all
        return oof, prediction, scores, prediction_all
    else:
        return oof, prediction, scores, prediction_all
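A hedged usage sketch for train_model, assuming train/test/labels are the pandas objects used in the cells below and reusing the params_0 dict from above:

# LGB classification path; returns OOF preds, averaged test preds, fold scores.
oof_lgb, pred_lgb, scores_lgb, pred_all_lgb = train_model(
    train, test, labels, params=params_0, model_type='lgb', is_class=True)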
# XGB
%%time
NFOLDS = 2
folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=2019)
X = train
X_test = test
y = labels
oof = np.zeros(len(X))
prediction = np.zeros(len(X_test))
scores = []
feature_importance = pd.DataFrame()
prediction_all = np.zeros(NFOLDS * len(X_test))
for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
    print('Fold', fold_n, 'started at', time.ctime())
    if type(X) == np.ndarray:
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]
    else:
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    print('-----')
    model = xgb.XGBClassifier(max_depth=10,
                              learning_rate=0.01,
                              n_estimators=20000,
                              silent=True,
                              objective='binary:logistic',
                              nthread=-1,
                              gamma=0,
                              min_child_weight=1,
                              max_delta_step=0,
                              subsample=0.85,
                              colsample_bytree=0.7,
                              colsample_bylevel=1,
                              reg_alpha=0,
                              reg_lambda=1,
                              scale_pos_weight=1,
                              seed=1440,
                              missing=None)
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric='auc',
              verbose=100, early_stopping_rounds=500)
    y_pred_valid = model.predict_proba(X_valid, ntree_limit=model.best_ntree_limit)[:, 1]
    y_pred = model.predict_proba(X_test, ntree_limit=model.best_ntree_limit)[:, 1]
    oof[valid_index] = y_pred_valid.reshape(-1,)
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    scores.append(auc_score)
    print("auc_score:", auc_score)
    prediction += y_pred
    prediction_all[fold_n * len(X_test):(fold_n + 1) * len(X_test)] = y_pred
    # per-fold feature importance
    fold_importance = pd.DataFrame()
    fold_importance["feature"] = X.columns
    fold_importance["importance"] = model.feature_importances_
    fold_importance["fold"] = fold_n + 1
    feature_importance = pd.concat([feature_importance, fold_importance], axis=0)
prediction /= NFOLDS
print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
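The per-fold AUCs can be noisy at NFOLDS=2; the out-of-fold vector built above gives one overall estimate:

# Overall CV AUC from the out-of-fold predictions (y holds the full labels).
print('OOF AUC: {:.4f}'.format(roc_auc_score(y, oof)))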
# CATBOOST
%%time
NFOLDS = 2
folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=2019)
X = train
X_test = test
y = labels
oof = np.zeros(len(X))
prediction = np.zeros(len(X_test))
scores = []
feature_importance = pd.DataFrame()
prediction_all = np.zeros(NFOLDS * len(X_test))
for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
    print('Fold', fold_n, 'started at', time.ctime())
    if type(X) == np.ndarray:
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]
    else:
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    print('-----')
    model = cb.CatBoostClassifier(
        learning_rate=0.01,
        max_depth=10,
        # l2_leaf_reg=1,
        reg_lambda=1,
        n_estimators=20000,
        loss_function='Logloss',
        custom_metric='AUC',
        eval_metric='AUC',
        leaf_estimation_method='Gradient',
        thread_count=-1,
        scale_pos_weight=3,
        random_seed=1995)
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              verbose=100, early_stopping_rounds=500)
    y_pred_valid = model.predict_proba(X_valid)[:, 1]
    y_pred = model.predict_proba(X_test)[:, 1]
    oof[valid_index] = y_pred_valid.reshape(-1,)
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    scores.append(auc_score)
    print("auc_score:", auc_score)
    prediction += y_pred
    prediction_all[fold_n * len(X_test):(fold_n + 1) * len(X_test)] = y_pred
    # per-fold feature importance
    fold_importance = pd.DataFrame()
    fold_importance["feature"] = X.columns
    fold_importance["importance"] = model.feature_importances_
    fold_importance["fold"] = fold_n + 1
    feature_importance = pd.concat([feature_importance, fold_importance], axis=0)
prediction /= NFOLDS
print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
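With all three models trained, a simple blend often beats any single one. A hedged sketch, assuming each model's fold-averaged `prediction` was saved under its own name (prediction_lgb / prediction_xgb / prediction_cat are assumed copies; the equal weights are illustrative, not tuned):

# Arithmetic-mean blend of the three models' test probabilities.
prediction_blend = (prediction_lgb + prediction_xgb + prediction_cat) / 3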
def id_encode(df):
    columns = ['siteid', 'slotid',
               'uid', 'city', 'province', 'phonetype',
               'adid', 'billid', 'primid', 'creativetype', 'spreadappid',
               'contentid', 'firstclass', 'secondclass']
    for feature in columns:
        # per-column label encoding, fitted on this frame only
        df[feature] = LabelEncoder().fit_transform(df[feature])
    return df
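Note that id_encode fits a fresh encoder per call, so encoding train and test separately would give inconsistent integer codes. A common fix, sketched here under the assumption that both frames share the listed columns, is to fit each encoder on their concatenation:

# Hedged variant: fit on train+test together so codes line up across frames.
def id_encode_joint(train_df, test_df, columns):
    for feature in columns:
        le = LabelEncoder().fit(pd.concat([train_df[feature], test_df[feature]]))
        train_df[feature] = le.transform(train_df[feature])
        test_df[feature] = le.transform(test_df[feature])
    return train_df, test_df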
# Inspect the label distribution
plt.figure(figsize=(8, 6))
# for regression targets:
# sns.distplot(train['label'])
# for classification targets:
sns.countplot(train['label'], palette='Set3')
print(train['label'].describe())
# skewness and kurtosis
print("Skewness: %f" % train['label'].skew())
print("Kurtosis: %f" % train['label'].kurt())
print("1: %d" % train[train['label'] == 1].shape[0])
print("0: %d" % train[train['label'] == 0].shape[0])
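The class counts printed above can be turned into a scale_pos_weight value for the boosters; a small sketch (this is an alternative to is_unbalance=True in params_0, not something to combine with it):

# Suggested scale_pos_weight = negatives / positives.
n_pos = train[train['label'] == 1].shape[0]
n_neg = train[train['label'] == 0].shape[0]
print('suggested scale_pos_weight: %.2f' % (n_neg / n_pos))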
# Feature distributions: train-vs-test scatter per feature
def plot_feature_scatter(df1, df2, features):
    i = 0
    sns.set_style('whitegrid')
    features_l = len(features)
    plt_num = int(np.ceil(np.sqrt(features_l)))
    fig, ax = plt.subplots(plt_num, plt_num - 1, figsize=(14, 14))
    for feature in features:
        i += 1
        plt.subplot(plt_num, plt_num - 1, i)
        plt.scatter(df1[feature], df2[feature], marker='+')
        plt.xlabel(feature, fontsize=9)
    plt.show()
ft = ['uid', 'adid', 'siteid', 'slotid', 'contentid', 'nettype',
'age', 'gender', 'city', 'province', 'phonetype', 'carrier', 'billid',
'primid', 'creativetype', 'intertype', 'spreadappid', 'firstclass',
'secondclass'
]
# plot_feature_scatter(train[::2000], test[::2000], ft)
train_sp = train.sample(n=10000, random_state=2019, axis=0)
test_sp = test.sample(n=10000, random_state=2019, axis=0)
plot_feature_scatter(train_sp,test_sp, ft)
del train_sp,test_sp
gc.collect()
# Correlation with the label
x_cols = [col for col in train.columns if col not in ['label'] and train[col].dtype != 'object']
# note: use a name other than `labels`, which already holds the target above
col_labels = []
values = []
# correlation of each numeric column with the target
for col in x_cols:
    col_labels.append(col)
    values.append(np.corrcoef(train[col].values, train['label'].values)[0, 1])
corr_df = pd.DataFrame({'col_labels': col_labels, 'corr_values': values})
corr_df = corr_df.sort_values(by='corr_values')
ind = np.arange(len(col_labels))
width = 0.5
fig, ax = plt.subplots(figsize=(6, 15))
rects = ax.barh(ind, np.array(corr_df.corr_values.values), color='y')
ax.set_yticks(ind)
ax.set_yticklabels(corr_df.col_labels.values, rotation='horizontal')
ax.set_xlabel('Correlation coefficient')
ax.set_title('Correlation coefficient of the variables')
# Heatmap
corrmat = train.corr()
f, ax = plt.subplots(figsize=(30, 12))
sns.heatmap(corrmat, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, vmax=0.8)
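The heatmap is hard to read at this many features; a hedged follow-up that lists pairs whose absolute correlation exceeds a cutoff (0.95 here is illustrative) as candidates for dropping:

# Keep only the upper triangle so each pair is reported once.
upper = corrmat.where(np.triu(np.ones(corrmat.shape), k=1).astype(bool))
high_corr = [(r, c) for c in upper.columns for r in upper.index
             if abs(upper.loc[r, c]) > 0.95]
print(high_corr)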
# Train/test distribution comparison
def plot_kde(train, test, col, values=True):
    fig, ax = plt.subplots(1, 3, figsize=(15, 5))
    sns.distplot(train[col], color='tab:red', ax=ax[0], label='train')
    sns.distplot(test[col], color='tab:blue', ax=ax[1], label='test')
    sns.distplot(train[col], color='tab:red', ax=ax[2], label='train')
    sns.distplot(test[col], color='tab:blue', ax=ax[2], label='test')
    # sns.kdeplot(train[col], color='y', ax=ax[2])
    plt.show()
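Alongside the visual comparison, a two-sample Kolmogorov-Smirnov test gives a numeric measure of train/test drift per column (scipy.stats is already imported above; small p-values flag a mismatch):

# Hedged sketch: KS statistic and p-value per column.
def ks_report(train_df, test_df, cols):
    for col in cols:
        stat, p = stats.ks_2samp(train_df[col], test_df[col])
        print('%s: KS=%.4f, p=%.4g' % (col, stat, p))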
data_path = 'path/'
os.listdir(data_path)
data = pd.read_excel(data_path + 'data.xlsx', sheet_name='Sheet2')
data.describe()
sns.boxplot(data=data)
# plt.grid()
plt.show()
sns.distplot(data['name7'],kde=True,hist=False)
# plt.grid()
plt.show()