Machine Learning Reference Notes

References

(1) 2018 iFLYTEK AI Marketing Algorithm Competition review (1st place): https://zhuanlan.zhihu.com/p/47807544

(2) Introduction to CTR prediction and common models: https://www.mayi888.com/archives/54482

(3) When GridSearch meets XGBoost: https://juejin.im/post/5b7669c4f265da281c1fbf96#comment

(4) XGBoost and LightGBM parameters and tuning: https://www.jianshu.com/p/1100e333fcab

(5) Visualizing decision tree, XGBoost, LightGBM, and CatBoost models: https://blog.csdn.net/l_xzmy/article/details/81532281

(6) Jupyter Notebook plugins worth knowing: http://www.sohu.com/a/283161414_129720

(7) How to understand AUC in machine learning and statistics (a minimal computation sketch follows this list): https://www.zhihu.com/question/39840928?from=profile_question_card

(8) Data Competition Top Solutions, an open-source collection: https://github.com/Smilexuhc/Data-Competition-TopSolution
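Since reference (7) covers AUC, here is a minimal sketch of the quantity it measures: the probability that a randomly chosen positive example receives a higher score than a randomly chosen negative one (ties counted as one half). The labels and scores below are toy data, and the pairwise computation is checked against sklearn's roc_auc_score.

import numpy as np
from sklearn.metrics import roc_auc_score

y_true = np.array([0, 0, 1, 1, 0, 1])                 # toy labels
y_score = np.array([0.1, 0.4, 0.35, 0.8, 0.2, 0.7])   # toy model scores

pos = y_score[y_true == 1]
neg = y_score[y_true == 0]
# AUC = P(score_pos > score_neg), counting ties as 0.5
wins = (pos[:, None] > neg[None, :]).sum() + 0.5 * (pos[:, None] == neg[None, :]).sum()
auc_manual = wins / (len(pos) * len(neg))

print(auc_manual, roc_auc_score(y_true, y_score))     # both print 0.888...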


Reference Code

# Shared imports (deduplicated)
import gc
import os
import time
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
import catboost as cb                      # used by the CatBoost snippet below
import matplotlib.pyplot as plt
from tqdm import tqdm
from scipy import stats
from scipy.signal import hann, hilbert, convolve
from sklearn.svm import NuSVR, SVR
from catboost import CatBoostRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedKFold

plt.rcParams['font.sans-serif'] = ['SimHei']  # display Chinese labels correctly
plt.rcParams['font.family'] = ['sans-serif']

warnings.filterwarnings('ignore')

%matplotlib inline
# LGB
def train_model_lgb(train_x, train_y, test_x, params=None):
    oof = np.zeros(len(train_x))        # out-of-fold predictions on the training set
    prediction = np.zeros(len(test_x))  # test predictions, averaged over folds
    scores = []
    # K-fold cross-validation (stratified, to preserve the class ratio in each fold)
    NFolds = 2
    kf = StratifiedKFold(n_splits=NFolds)
    for kf_n, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print("kf_n = {}".format(kf_n))
        if type(train_x) == np.ndarray:
            x_train, x_valid = train_x[train_index], train_x[valid_index]
            y_train, y_valid = train_y[train_index], train_y[valid_index]
        else:
            x_train, x_valid = train_x.iloc[train_index], train_x.iloc[valid_index]
            y_train, y_valid = train_y.iloc[train_index], train_y.iloc[valid_index]
        # train
        model = lgb.LGBMClassifier(**params, n_estimators=20000, n_jobs=-1)
        model.fit(x_train, y_train,
                  eval_set=[(x_train, y_train), (x_valid, y_valid)], eval_metric='auc',
                  verbose=100, early_stopping_rounds=200)

        # out-of-fold score on the hold-out fold
        oof[valid_index] = model.predict_proba(x_valid, num_iteration=model.best_iteration_)[:, 1]
        scores.append(roc_auc_score(y_valid, oof[valid_index]))

        # accumulate test predictions across folds
        y_pred = model.predict_proba(test_x, num_iteration=model.best_iteration_)[:, 1]
        prediction += y_pred
    prediction /= NFolds
    print('CV mean AUC: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
    return model, prediction

params_0 = {
        'learning_rate': 0.02,
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'feature_fraction': 0.66,
        'bagging_fraction': 0.8,
        'bagging_freq': 2,
        'num_leaves': 48,
        'verbose': -1,
        'max_depth': -1,
        'lambda_l2': 1.6,
        'lambda_l1': 3.7,
        'is_unbalance': True, 
        'nthread': 8    
    }
model_lgb, prediction_lgb = train_model_lgb(train_x, train_y, test_x, params=params_0)
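A minimal sketch of writing the averaged fold predictions to a submission file; the id column name instance_id and the file name are assumptions, not part of the original pipeline:

submission = pd.DataFrame({'instance_id': test['instance_id'],   # hypothetical id column
                           'predicted_score': prediction_lgb})
submission.to_csv('submission_lgb.csv', index=False)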

# LGB-OTHER: generic K-fold trainer for lgb / xgb / sklearn / cat models
def train_model(train_data, test_data, y, params=None, model_type='lgb', is_class=False, plot_feature_importance=False, model=None):
    NFOLDS = 2
    # useful seeds: 11, 42
    folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=2019)  # random_state only takes effect with shuffle=True
    X = train_data
    X_test = test_data

    oof = np.zeros(len(X))                            # out-of-fold predictions on the training set
    prediction = np.zeros(len(X_test))                # averaged test-set predictions
    prediction_all = np.zeros(NFOLDS * len(X_test))   # per-fold test predictions, concatenated
    scores = []
    feature_importance = pd.DataFrame()
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
        print('Fold', fold_n, 'started at', time.ctime())
        if type(X) == np.ndarray:
            X_train, X_valid = X[train_index], X[valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        if model_type == 'lgb':
            if is_class == False:
                model = lgb.LGBMRegressor(**params, n_estimators=20000, n_jobs=-1)
                model.fit(X_train, y_train,
                          eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric='mae',
                          verbose=1000, early_stopping_rounds=2000)

                y_pred_valid = model.predict(X_valid, num_iteration=model.best_iteration_)
                y_pred = model.predict(X_test, num_iteration=model.best_iteration_)
            else:
                model = lgb.LGBMClassifier(**params, n_estimators=20000, n_jobs=-1)
                model.fit(X_train, y_train,
                          eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric='auc',
                          verbose=100, early_stopping_rounds=200)

                y_pred_valid = model.predict_proba(X_valid, num_iteration=model.best_iteration_)[:, 1]
                y_pred = model.predict_proba(X_test, num_iteration=model.best_iteration_)[:, 1]

        if model_type == 'xgb':
            if is_class == False:
                # keep the DMatrix names distinct from the train_data argument
                train_dmatrix = xgb.DMatrix(data=X_train, label=y_train, feature_names=X.columns)
                valid_dmatrix = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=X.columns)

                watchlist = [(train_dmatrix, 'train'), (valid_dmatrix, 'valid_data')]
                model = xgb.train(dtrain=train_dmatrix, num_boost_round=20000, evals=watchlist,
                                  early_stopping_rounds=200, verbose_eval=500, params=params)
                y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=X.columns), ntree_limit=model.best_ntree_limit)
                y_pred = model.predict(xgb.DMatrix(X_test, feature_names=X.columns), ntree_limit=model.best_ntree_limit)
            else:
                model = xgb.XGBClassifier(nthread=4, learning_rate=0.08, n_estimators=200, max_depth=5,
                                          gamma=0, subsample=0.9, colsample_bytree=0.5)
                model.fit(X_train, y_train)
                # validate on the hold-out fold, predict on the test set
                y_pred_valid = model.predict_proba(X_valid)[:, 1]
                print("XGBoost valid AUC: %.5f" % roc_auc_score(y_valid, y_pred_valid))
                y_pred = model.predict_proba(X_test)[:, 1]

        if model_type == 'sklearn':
            model.fit(X_train, y_train)

            y_pred_valid = model.predict(X_valid).reshape(-1,)
            print(f'Fold {fold_n}. MAE: {mean_absolute_error(y_valid, y_pred_valid):.4f}.')

            y_pred = model.predict(X_test).reshape(-1,)

        if model_type == 'cat':
            model = CatBoostRegressor(iterations=20000, eval_metric='MAE', **params)
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[], use_best_model=True, verbose=False)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)

        # out-of-fold bookkeeping, shared by all model types
        oof[valid_index] = y_pred_valid.reshape(-1,)
        if is_class:
            scores.append(roc_auc_score(y_valid, y_pred_valid))
            if model_type == 'lgb':
                print(model.best_score_['valid_1']['auc'])
        else:
            scores.append(mean_absolute_error(y_valid, y_pred_valid))

        prediction += y_pred
        prediction_all[fold_n * len(X_test):(fold_n + 1) * len(X_test)] = y_pred
        if model_type == 'lgb':
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = X.columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)
    prediction /= NFOLDS
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
    if model_type == 'lgb':
        feature_importance["importance"] /= NFOLDS
        if plot_feature_importance:
            feature_importance_gb = feature_importance[["feature", "importance"]].groupby("feature").mean()
            cols = feature_importance_gb.sort_values(by="importance", ascending=False)[:50].index
            best_features = feature_importance_gb.loc[feature_importance_gb.index.isin(cols)]
            best_features['feature'] = best_features.index
            plt.figure(figsize=(16, 12))
            sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
            plt.title('LGB Features (avg over folds)')

            return oof, prediction, feature_importance_gb, prediction_all
        return oof, prediction, scores, prediction_all
    else:
        return oof, prediction, scores, prediction_all
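A usage sketch for the generic trainer above, assuming train, test, and labels are the feature frames and label series prepared elsewhere (they are not defined in this snippet):

oof_lgb, pred_lgb, scores_lgb, pred_all_lgb = train_model(
    train, test, labels, params=params_0, model_type='lgb', is_class=True)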


%%time
# XGB
NFOLDS = 2
folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=2019)
X = train
X_test = test
y = labels

oof = np.zeros(len(X))
prediction = np.zeros(len(X_test))
scores = []
feature_importance = pd.DataFrame()
prediction_all = np.zeros(NFOLDS*len(X_test))
for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
    print('Fold', fold_n, 'started at', time.ctime())
    if type(X) == np.ndarray:
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]
    else:
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        print('-----')
    model = xgb.XGBClassifier(max_depth=10,
                              learning_rate=0.01,
                              n_estimators=20000,
                              silent=True,
                              objective='binary:logistic',
                              nthread=-1,
                              gamma=0,
                              min_child_weight=1,
                              max_delta_step=0,
                              subsample=0.85,
                              colsample_bytree=0.7,
                              colsample_bylevel=1,
                              reg_alpha=0,
                              reg_lambda=1,
                              scale_pos_weight=1,
                              seed=1440,
                              missing=None)
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric='auc',
              verbose=100, early_stopping_rounds=500)
    y_pred_valid = model.predict_proba(X_valid, ntree_limit=model.best_ntree_limit)[:,1]
    y_pred = model.predict_proba(X_test, ntree_limit=model.best_ntree_limit)[:,1]

    oof[valid_index] = y_pred_valid.reshape(-1,)
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    scores.append(auc_score)
    print("auc_score : ",auc_score)

    prediction += y_pred    
    prediction_all[fold_n*len(X_test):(fold_n+1)*len(X_test)] = y_pred
    
    # feature importance
    fold_importance = pd.DataFrame()
    fold_importance["feature"] = X.columns
    fold_importance["importance"] = model.feature_importances_
    fold_importance["fold"] = fold_n + 1
    feature_importance = pd.concat([feature_importance, fold_importance], axis=0)
prediction /= NFOLDS
print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

%%time
# CatBoost
NFOLDS = 2
folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=2019)  # random_state only takes effect with shuffle=True
X = train
X_test = test
y = labels

oof = np.zeros(len(X))
prediction = np.zeros(len(X_test))
scores = []
feature_importance = pd.DataFrame()
prediction_all = np.zeros(NFOLDS*len(X_test))

for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
    print('Fold', fold_n, 'started at', time.ctime())
    if type(X) == np.ndarray:
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]
    else:
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        print('-----')
    model = cb.CatBoostClassifier(
        learning_rate=0.01,
        max_depth=10,
        reg_lambda=1,            # alias of l2_leaf_reg
        n_estimators=20000,
        loss_function='Logloss',
        custom_metric='AUC',
        eval_metric='AUC',
        leaf_estimation_method='Gradient',
        thread_count=-1,
        scale_pos_weight=3,
        random_seed=1995)        # verbosity is set once, via verbose=100 in fit() below
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],verbose=100, early_stopping_rounds=500)
    y_pred_valid = model.predict_proba(X_valid)[:,1]
    y_pred = model.predict_proba(X_test)[:,1]

    oof[valid_index] = y_pred_valid.reshape(-1,)
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    scores.append(auc_score)
    print("auc_score : ",auc_score)

    prediction += y_pred    
    prediction_all[fold_n*len(X_test):(fold_n+1)*len(X_test)] = y_pred
    
    # feature importance
    fold_importance = pd.DataFrame()
    fold_importance["feature"] = X.columns
    fold_importance["importance"] = model.feature_importances_
    fold_importance["fold"] = fold_n + 1
    feature_importance = pd.concat([feature_importance, fold_importance], axis=0)
prediction /= NFOLDS
print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
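With LightGBM, XGBoost, and CatBoost test predictions available, a simple blend is the usual next step. A minimal sketch, assuming the XGBoost and CatBoost loops above saved their averaged arrays as prediction_xgb and prediction_cat (hypothetical names; both loops actually write to prediction):

blend_mean = (prediction_lgb + prediction_xgb + prediction_cat) / 3

# Rank-averaging is often more robust than a raw mean when score scales differ
from scipy.stats import rankdata
blend_rank = (rankdata(prediction_lgb) + rankdata(prediction_xgb)
              + rankdata(prediction_cat)) / (3 * len(prediction_lgb))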
def id_encode(df):
    columns = ['siteid', 'slotid',
               'uid', 'city', 'province', 'phonetype',
               'adid', 'billid', 'primid', 'creativetype', 'spreadappid',
               'contentid', 'firstclass', 'secondclass']
    for feature in columns:
        df[feature] = LabelEncoder().fit_transform(df[feature])
    return df
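Note that id_encode fits a fresh LabelEncoder on whichever DataFrame it receives, so encoding train and test separately would assign inconsistent codes to the same id. A minimal sketch of one common fix, fitting each encoder on the union of both frames (assumes train and test share these columns):

def id_encode_consistent(train_df, test_df, columns):
    for feature in columns:
        le = LabelEncoder()
        # fit on the combined values so train and test share one code book
        le.fit(pd.concat([train_df[feature], test_df[feature]]).astype(str))
        train_df[feature] = le.transform(train_df[feature].astype(str))
        test_df[feature] = le.transform(test_df[feature].astype(str))
    return train_df, test_df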
# Inspect the label distribution
plt.figure(figsize=(8, 6))
# regression target:
# sns.distplot(train['label'])
# classification target:
sns.countplot(train['label'], palette='Set3')
print(train['label'].describe())
# skewness and kurtosis
print("Skewness: %f" % train['label'].skew())
print("Kurtosis: %f" % train['label'].kurt())
print("1: %d" % train[train['label'] == 1].shape[0])
print("0: %d" % train[train['label'] == 0].shape[0])

# Feature distributions: train vs test scatter for each feature
def plot_feature_scatter(df1, df2, features):
    i = 0
    sns.set_style('whitegrid')
    features_l = len(features)
    plt_num = int(np.ceil(np.sqrt(features_l)))
    # a plt_num x plt_num grid guarantees enough panels for every feature
    fig, ax = plt.subplots(plt_num, plt_num, figsize=(14, 14))

    for feature in features:
        i += 1
        plt.subplot(plt_num, plt_num, i)
        plt.scatter(df1[feature], df2[feature], marker='+')
        plt.xlabel(feature, fontsize=9)
    plt.show()

ft = ['uid', 'adid', 'siteid', 'slotid', 'contentid', 'nettype',
       'age', 'gender', 'city', 'province', 'phonetype', 'carrier', 'billid',
       'primid', 'creativetype', 'intertype', 'spreadappid', 'firstclass',
       'secondclass'
           ]
# plot_feature_scatter(train[::2000],test[::2000], features)
train_sp = train.sample(n=10000, random_state=2019, axis=0)
test_sp  = test.sample(n=10000, random_state=2019, axis=0)
plot_feature_scatter(train_sp,test_sp, ft)
del train_sp, test_sp
gc.collect()

# Correlation with the label
x_cols = [col for col in train.columns if col not in ['label'] and train[col].dtype != 'object']
labels = []
values = []

# correlation coefficient between each column and the label
for col in x_cols:
    labels.append(col)
    values.append(np.corrcoef(train[col].values, train['label'].values)[0, 1])

corr_df = pd.DataFrame({'col_labels': labels, 'corr_values': values})
corr_df = corr_df.sort_values(by='corr_values')

ind = np.arange(len(labels))
width = 0.5
fig, ax = plt.subplots(figsize=(6, 15))
rects = ax.barh(ind, np.array(corr_df.corr_values.values), color='y')
ax.set_yticks(ind)
ax.set_yticklabels(corr_df.col_labels.values, rotation='horizontal')
ax.set_xlabel('Correlation coefficient')
ax.set_title('Correlation coefficient of the variables')

# Correlation heatmap
corrmat = train.corr()
f, ax = plt.subplots(figsize=(30, 12))
sns.heatmap(corrmat, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, vmax=0.8)
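To turn the heatmap into a ranked list of the most correlated feature pairs (handy for spotting redundant columns), a minimal sketch using the corrmat computed above:

# keep only the upper triangle so each pair appears once, then rank by |corr|
upper = corrmat.where(np.triu(np.ones(corrmat.shape), k=1).astype(bool))
print(upper.stack().abs().sort_values(ascending=False).head(10))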


# Distribution comparison: train vs test for a single column
def plot_kde(train, test, col, values=True):
    fig, ax = plt.subplots(1, 3, figsize=(15, 5))

    sns.distplot(train[col], color='tab:red', ax=ax[0], label='train')
    sns.distplot(test[col], color='tab:blue', ax=ax[1], label='test')
    sns.distplot(train[col], color='tab:red', ax=ax[2], label='train')   # overlay both on the third panel
    sns.distplot(test[col], color='tab:blue', ax=ax[2], label='test')

    plt.show()
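A usage sketch, sweeping plot_kde over the shared feature list ft defined in the scatter-plot section above:

for col in ft:
    plot_kde(train, test, col)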

# Standalone snippet: quick EDA on an Excel sheet
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt

data_path = 'path/'
os.listdir(data_path)

data = pd.read_excel(data_path + 'data.xlsx', sheet_name='Sheet2')  # sheet_name replaces the deprecated sheetname

data.describe()

sns.boxplot(data=data)
# plt.grid()
plt.show()

sns.distplot(data['name7'], kde=True, hist=False)
# plt.grid()
plt.show()
