GBDT+LR: Principle and Code Implementation

Table of contents

  • In one sentence
  • Model structure
  • GBDT mind map
  • GBDT feature transformation
  • GBDT+LR code implementation 1
  • GBDT+LR code implementation 2

In one sentence

GBDT+LR uses a GBDT to perform feature selection and feature combination automatically, producing a new sparse one-hot feature vector for each sample; that vector is then fed into an LR model as input to make the final prediction.

Model structure

[Figures 1 and 2: GBDT+LR model structure]

GBDT mind map

[Figure 3: GBDT mind map]

GBDT feature transformation

[Figure 4: GBDT feature transformation]
For example, as shown in the figure below, suppose the GBDT consists of three subtrees, each with 4 leaf nodes. After a training sample is fed in, it falls into the third leaf of subtree 1, so that subtree's feature vector is [0, 0, 1, 0]; it falls into the first leaf of subtree 2, giving [1, 0, 0, 0], and into the fourth leaf of subtree 3, giving [0, 0, 0, 1]. Concatenating the three vectors yields the final feature vector [0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1]; in other words, the vector contains exactly as many 1s as there are trees.
[Figure 5: worked example of the leaf-to-one-hot mapping]
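To make the mapping concrete, here is a minimal NumPy sketch with the leaf positions from the example above hard-coded as input; it reproduces the concatenated one-hot vector:

import numpy as np

n_leaves = 4              # leaves per subtree in the example above
leaf_hits = [2, 0, 3]     # 0-based leaf reached in subtree 1, 2, 3

# build one n_leaves-dim one-hot block per subtree, then concatenate
blocks = []
for idx in leaf_hits:
    block = np.zeros(n_leaves, dtype=int)
    block[idx] = 1
    blocks.append(block)
feature_vector = np.concatenate(blocks)
print(feature_vector)     # [0 0 1 0 1 0 0 0 0 0 0 1]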

GBDT+LR code implementation 1

'''
@Time : 2021/1/15 11:32 
@Author : WGS
@remarks : GBDT+LR
'''

import numpy as np

np.random.seed(10)

import matplotlib.pyplot as plt

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,
                              GradientBoostingClassifier)
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.pipeline import make_pipeline

# number of trees in the GBDT (10 here)
n_estimator = 10

# generate a synthetic binary classification dataset
X, y = make_classification(n_samples=80000)

# first split off a test set, then split the training half again so that
# the GBDT and the LR are trained on disjoint subsets (this avoids fitting
# the LR on leaf encodings of samples the GBDT has already memorized)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train, y_train, test_size=0.5)



grd = GradientBoostingClassifier(n_estimators=n_estimator)   # GBDT model

grd_enc = OneHotEncoder()   # one-hot encoder for the leaf indices

grd_lm = LogisticRegression()   # LR model

grd.fit(X_train, y_train)

# grd.apply(X) returns leaf indices with shape (n_samples, n_estimators, n_classes);
# [:, :, 0] drops the last axis, leaving a 2-D (n_samples, n_estimators) array
grd_enc.fit(grd.apply(X_train)[:, :, 0])

# train the LR on the one-hot encoded leaf indices of the held-out LR split
grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)

# GBDT+LR: encode the test set's leaf indices, then score with the LR
y_pred_grd_lm = grd_lm.predict_proba(grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]
fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)

# raw GBDT baseline for comparison
y_pred_grd = grd.predict_proba(X_test)[:, 1]
fpr_grd, tpr_grd, _ = roc_curve(y_test, y_pred_grd)



plt.figure()
plt.xlim(0, 0.2)
plt.ylim(0.8, 1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_grd, tpr_grd, label='GBDT')
plt.plot(fpr_grd_lm, tpr_grd_lm, label='GBDT + LR')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve (zoomed in at top left)')
plt.legend(loc='best')
plt.show()
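Incidentally, RandomTreesEmbedding and make_pipeline are imported above but never used; they appear to come from the scikit-learn example this code is adapted from ("Feature transformations with ensembles of trees"), where an unsupervised tree embedding plays the GBDT's role. A minimal sketch of that variant, reusing the splits from above (the hyperparameters and variable names here are illustrative assumptions, not part of the original):

# RandomTreesEmbedding is already a transformer that outputs sparse one-hot
# leaf encodings, so it can be chained with LR directly via make_pipeline
rt = RandomTreesEmbedding(n_estimators=n_estimator, max_depth=3, random_state=0)
rt_lm = LogisticRegression()
pipeline = make_pipeline(rt, rt_lm)
pipeline.fit(X_train, y_train)

y_pred_rt_lm = pipeline.predict_proba(X_test)[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt_lm)

Its ROC curve could be added to the same plot with plt.plot(fpr_rt_lm, tpr_rt_lm, label='RT + LR').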


GBDT+LR code implementation 2

'''
@Time : 2021/1/15 11:32 
@Author : WGS
@remarks : GBDT+LR
'''
import pandas as pd 
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import lightgbm as lgb 
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import log_loss
import warnings
warnings.filterwarnings('ignore')
import gc
from scipy import sparse

def preProcess():
    path = 'data/'
    print('Reading data...')
    df_train = pd.read_csv(path + 'train.csv')
    df_test = pd.read_csv(path + 'test.csv')
    print('Done reading')
    df_train.drop(['Id'], axis = 1, inplace = True)
    df_test.drop(['Id'], axis = 1, inplace = True)

    # mark test rows with Label = -1 so train/test can be separated again later
    df_test['Label'] = -1

    data = pd.concat([df_train, df_test])
    data = data.fillna(-1)
    data.to_csv('data/data.csv', index = False)
    return data

def lr_predict(data, category_feature, continuous_feature): # 0.47181
    # min-max normalize the continuous features
    print('Normalizing...')
    scaler = MinMaxScaler()
    for col in continuous_feature:
        data[col] = scaler.fit_transform(data[col].values.reshape(-1, 1))
    print('Normalization done')

    # one-hot encode the categorical features
    print('One-hot encoding...')
    for col in category_feature:
        onehot_feats = pd.get_dummies(data[col], prefix = col)
        data.drop([col], axis = 1, inplace = True)
        data = pd.concat([data, onehot_feats], axis = 1)
    print('One-hot done')

    train = data[data['Label'] != -1].copy()
    target = train.pop('Label')
    test = data[data['Label'] == -1].copy()
    test.drop(['Label'], axis = 1, inplace = True)

    # train/validation split
    print('Splitting data...')
    x_train, x_val, y_train, y_val = train_test_split(train, target, test_size = 0.2, random_state = 2018)
    print('Training LR...')
    lr = LogisticRegression()
    lr.fit(x_train, y_train)
    tr_logloss = log_loss(y_train, lr.predict_proba(x_train)[:, 1])
    print('tr-logloss: ', tr_logloss)
    val_logloss = log_loss(y_val, lr.predict_proba(x_val)[:, 1])
    print('val-logloss: ', val_logloss)
    print('Predicting...')
    y_pred = lr.predict_proba(test)[:, 1]
    print('Writing submission...')
    res = pd.read_csv('data/test.csv')
    submission = pd.DataFrame({'Id': res['Id'], 'Label': y_pred})
    submission.to_csv('submission/submission_lr_trlogloss_%s_vallogloss_%s.csv' % (tr_logloss, val_logloss), index = False)
    print('Done')

def gbdt_predict(data, category_feature, continuous_feature): # 0.44548
    # one-hot encode the categorical features
    print('One-hot encoding...')
    for col in category_feature:
        onehot_feats = pd.get_dummies(data[col], prefix = col)
        data.drop([col], axis = 1, inplace = True)
        data = pd.concat([data, onehot_feats], axis = 1)
    print('One-hot done')

    train = data[data['Label'] != -1].copy()
    target = train.pop('Label')
    test = data[data['Label'] == -1].copy()
    test.drop(['Label'], axis = 1, inplace = True)

    # train/validation split
    print('Splitting data...')
    x_train, x_val, y_train, y_val = train_test_split(train, target, test_size = 0.2, random_state = 2018)

    print('Training GBDT...')
    gbm = lgb.LGBMClassifier(objective='binary',
                             subsample=0.8,
                             min_child_weight=0.5,
                             colsample_bytree=0.7,
                             num_leaves=100,
                             max_depth=12,
                             learning_rate=0.01,
                             n_estimators=10000,
                             )

    gbm.fit(x_train, y_train,
            eval_set=[(x_train, y_train), (x_val, y_val)],
            eval_names=['train', 'val'],
            eval_metric='binary_logloss',
            # note: lightgbm >= 4.0 removed this fit argument; use
            # callbacks=[lgb.early_stopping(100)] there instead
            early_stopping_rounds=100,
            )
    tr_logloss = log_loss(y_train, gbm.predict_proba(x_train)[:, 1])
    val_logloss = log_loss(y_val, gbm.predict_proba(x_val)[:, 1])
    y_pred = gbm.predict_proba(test)[:, 1]
    print('Writing submission...')
    res = pd.read_csv('data/test.csv')
    submission = pd.DataFrame({'Id': res['Id'], 'Label': y_pred})
    submission.to_csv('submission/submission_gbdt_trlogloss_%s_vallogloss_%s.csv' % (tr_logloss, val_logloss), index = False)
    print('Done')

def gbdt_lr_predict(data, category_feature, continuous_feature): # 0.43616
    # one-hot encode the categorical features
    print('One-hot encoding...')
    for col in category_feature:
        onehot_feats = pd.get_dummies(data[col], prefix = col)
        data.drop([col], axis = 1, inplace = True)
        data = pd.concat([data, onehot_feats], axis = 1)
    print('One-hot done')

    train = data[data['Label'] != -1].copy()
    target = train.pop('Label')
    test = data[data['Label'] == -1].copy()
    test.drop(['Label'], axis = 1, inplace = True)

    # train/validation split
    print('Splitting data...')
    x_train, x_val, y_train, y_val = train_test_split(train, target, test_size = 0.2, random_state = 2018)

    print('Training GBDT...')
    # the sklearn regressor wrapper with a binary objective is used here only
    # to obtain the underlying booster; only its leaf indices are consumed below
    gbm = lgb.LGBMRegressor(objective='binary',
                            subsample=0.8,
                            min_child_weight=0.5,
                            colsample_bytree=0.7,
                            num_leaves=100,
                            max_depth=12,
                            learning_rate=0.05,
                            n_estimators=10,
                            )

    gbm.fit(x_train, y_train,
            eval_set=[(x_train, y_train), (x_val, y_val)],
            eval_names=['train', 'val'],
            eval_metric='binary_logloss',
            # early_stopping_rounds=100,
            )
    model = gbm.booster_
    print('Extracting leaf indices...')
    # pred_leaf=True returns, for every sample, the index of the leaf it
    # falls into in each tree: shape (n_samples, n_trees)
    gbdt_feats_train = model.predict(train, pred_leaf = True)
    gbdt_feats_test = model.predict(test, pred_leaf = True)
    gbdt_feats_name = ['gbdt_leaf_' + str(i) for i in range(gbdt_feats_train.shape[1])]
    df_train_gbdt_feats = pd.DataFrame(gbdt_feats_train, columns = gbdt_feats_name)
    df_test_gbdt_feats = pd.DataFrame(gbdt_feats_test, columns = gbdt_feats_name)

    print('Building the new dataset...')
    train = pd.concat([train, df_train_gbdt_feats], axis = 1)
    test = pd.concat([test, df_test_gbdt_feats], axis = 1)
    train_len = train.shape[0]
    data = pd.concat([train, test])
    del train
    del test
    gc.collect()

    # # min-max normalize the continuous features
    # print('Normalizing...')
    # scaler = MinMaxScaler()
    # for col in continuous_feature:
    #     data[col] = scaler.fit_transform(data[col].values.reshape(-1, 1))
    # print('Normalization done')

    # one-hot encode the leaf indices
    print('One-hot encoding the leaf features...')
    for col in gbdt_feats_name:
        print('encoding feature:', col)
        onehot_feats = pd.get_dummies(data[col], prefix = col)
        data.drop([col], axis = 1, inplace = True)
        data = pd.concat([data, onehot_feats], axis = 1)
    print('One-hot done')

    train = data[: train_len]
    test = data[train_len:]
    del data
    gc.collect()

    x_train, x_val, y_train, y_val = train_test_split(train, target, test_size = 0.3, random_state = 2018)
    # LR on the raw features plus the one-hot leaf features
    print('Training LR...')
    lr = LogisticRegression()
    lr.fit(x_train, y_train)
    tr_logloss = log_loss(y_train, lr.predict_proba(x_train)[:, 1])
    print('tr-logloss: ', tr_logloss)
    val_logloss = log_loss(y_val, lr.predict_proba(x_val)[:, 1])
    print('val-logloss: ', val_logloss)
    print('Predicting...')
    y_pred = lr.predict_proba(test)[:, 1]
    print('Writing submission...')
    res = pd.read_csv('data/test.csv')
    submission = pd.DataFrame({'Id': res['Id'], 'Label': y_pred})
    submission.to_csv('submission/submission_gbdt+lr_trlogloss_%s_vallogloss_%s.csv' % (tr_logloss, val_logloss), index = False)
    print('Done')
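# Caveat: unlike implementation one, which trains the GBDT and the LR on
# disjoint halves of the training data, gbdt_lr_predict trains the LR on
# rows the GBDT has already seen, so the logloss it reports can be
# somewhat optimistic.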

def gbdt_ffm_predict(data, category_feature, continuous_feature):
    # one-hot encode the categorical features; the raw categorical columns are
    # kept as well, since they are needed again for the FFM features below
    print('One-hot encoding...')
    for col in category_feature:
        onehot_feats = pd.get_dummies(data[col], prefix = col)
        data = pd.concat([data, onehot_feats], axis = 1)
    print('One-hot done')

    feats = [col for col in data if col not in category_feature] # one-hot + continuous features + Label
    tmp = data[feats]
    train = tmp[tmp['Label'] != -1].copy()
    target = train.pop('Label')
    test = tmp[tmp['Label'] == -1].copy()
    test.drop(['Label'], axis = 1, inplace = True)

    # train/validation split
    print('Splitting data...')
    x_train, x_val, y_train, y_val = train_test_split(train, target, test_size = 0.2, random_state = 2018)

    print('Training GBDT...')
    gbm = lgb.LGBMRegressor(objective='binary',
                            subsample=0.8,
                            min_child_weight=0.5,
                            colsample_bytree=0.7,
                            num_leaves=100,
                            max_depth=12,
                            learning_rate=0.05,
                            n_estimators=10,
                            )

    gbm.fit(x_train, y_train,
            eval_set=[(x_train, y_train), (x_val, y_val)],
            eval_names=['train', 'val'],
            eval_metric='binary_logloss',
            # early_stopping_rounds=100,
            )
    model = gbm.booster_
    print('Extracting leaf indices...')
    gbdt_feats_train = model.predict(train, pred_leaf = True)
    gbdt_feats_test = model.predict(test, pred_leaf = True)
    gbdt_feats_name = ['gbdt_leaf_' + str(i) for i in range(gbdt_feats_train.shape[1])]
    df_train_gbdt_feats = pd.DataFrame(gbdt_feats_train, columns = gbdt_feats_name)
    df_test_gbdt_feats = pd.DataFrame(gbdt_feats_test, columns = gbdt_feats_name)

    print('Building the new dataset...')
    tmp = data[category_feature + continuous_feature + ['Label']]
    train = tmp[tmp['Label'] != -1]
    test = tmp[tmp['Label'] == -1]
    train = pd.concat([train, df_train_gbdt_feats], axis = 1)
    test = pd.concat([test, df_test_gbdt_feats], axis = 1)
    data = pd.concat([train, test])
    del train
    del test
    gc.collect()

    # min-max normalize the continuous features
    print('Normalizing...')
    scaler = MinMaxScaler()
    for col in continuous_feature:
        data[col] = scaler.fit_transform(data[col].values.reshape(-1, 1))
    print('Normalization done')

    data.to_csv('data/data.csv', index = False)
    # the leaf-index columns are treated as categorical features for the FFM
    return category_feature + gbdt_feats_name

def FFMFormat(df, label, path, train_len, category_feature = [], continuous_feature = []):
    # write the data in libffm text format: each line is
    # "label field:feature_index:value field:feature_index:value ..."
    n_rows = df.shape[0]
    train = open(path + 'train.ffm', 'w')
    test = open(path + 'test.ffm', 'w')
    feature_index = 0
    feat_index = {}
    for i in range(n_rows):
        feats = []
        field_index = 0
        # categorical features: one field per column, one feature index per
        # distinct (column, value) pair, value fixed to 1
        for j, feat in enumerate(category_feature):
            t = feat + '_' + str(df[feat][i])
            if t not in feat_index:
                feat_index[t] = feature_index
                feature_index = feature_index + 1
            feats.append('%s:%s:%s' % (field_index, feat_index[t], 1))
            field_index = field_index + 1

        # continuous features: one field and one fixed feature index per
        # column, with the (normalized) value carried through
        for j, feat in enumerate(continuous_feature):
            if feat not in feat_index:
                feat_index[feat] = feature_index
                feature_index = feature_index + 1
            feats.append('%s:%s:%s' % (field_index, feat_index[feat], df[feat][i]))
            field_index = field_index + 1

        # the first train_len rows are training data, the rest are test data
        if i < train_len:
            train.write('%s %s\n' % (df[label][i], ' '.join(feats)))
        else:
            test.write('%s\n' % (' '.join(feats)))
    train.close()
    test.close()
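# FFMFormat writes one sample per line in the libffm text format, where each
# entry is field:feature_index:value. An illustrative line (indices made up
# for this example) might look like:
#
#   1 0:42:1 1:1337:1 26:20000:0.5
#
# i.e. label 1, two categorical features (fields 0 and 1, value fixed to 1)
# and one continuous feature (field 26) carrying its normalized value.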
   
    
if __name__ == '__main__':
    data = preProcess()
    # Criteo-style column names: I1-I13 are continuous, C1-C26 are categorical
    continuous_feature = ['I' + str(i + 1) for i in range(13)]
    category_feature = ['C' + str(i + 1) for i in range(26)]
    # lr_predict(data, category_feature, continuous_feature)
    # gbdt_predict(data, category_feature, continuous_feature)
    # gbdt_lr_predict(data, category_feature, continuous_feature)
    category_feature = gbdt_ffm_predict(data, category_feature, continuous_feature)

    data = pd.read_csv('data/data.csv')
    df_train = pd.read_csv('data/train.csv')
    FFMFormat(data, 'Label', 'data/', df_train.shape[0], category_feature, continuous_feature)
