A Complete Practical Summary of GBDT Classification (Part 2)

Part 2: sklearn Classification Examples

Example 1: Feature transformations with ensembles of trees

import numpy as np
np.random.seed(10)
# seed() specifies the integer that initializes the random number generator.
# 1. With the same seed() value, the same random numbers are generated on every run;
# 2. if no value is set, the system picks one itself (e.g. from the current time), so the numbers differ from run to run;
# 3. a seed() value takes effect only once.
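# A quick illustration of point 3 (shown as comments so it does not disturb
# the RNG state this script depends on):
#   np.random.seed(10)
#   np.random.rand()   # always the same first value for seed 10
#   np.random.rand()   # a different value: the seed was applied only once
#   np.random.seed(10)
#   np.random.rand()   # re-seeding reproduces the first value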

import matplotlib.pyplot as plt
import time
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,
                              GradientBoostingClassifier)
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.pipeline import make_pipeline

startTime = time.time()
print('Step 1.Preparing data...')
n_estimator = 10         # number of trees in each ensemble (boosting stages for GBDT)
X, y = make_classification(n_samples=80000)   # generate a synthetic binary classification dataset of 80000 samples
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)   # split the 80000 samples into equal train and test halves
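# (With its default arguments, make_classification produces 20 features and 2 classes.)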
# It is important to train the ensemble of trees on a different subset
# of the training data than the logistic regression model, to avoid
# overfitting, in particular if the total number of leaves is
# similar to the number of training samples.
X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train,
                                                            y_train,
                                                            test_size=0.5)
print('Step 2.RT+LR...')
# Unsupervised transformation based on totally random trees
rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator,
                          random_state=0)
# Logistic regression
rt_lm = LogisticRegression()
pipeline = make_pipeline(rt, rt_lm)   # chain the two steps into a single estimator: RT + LR
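# Note: RandomTreesEmbedding.transform already returns a sparse one-hot
# encoding of the leaf each sample falls into, so unlike the RF and GBT
# variants below, no explicit OneHotEncoder is needed in this pipeline.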
pipeline.fit(X_train, y_train)
y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)
print('Step 3.RF+LR...')
# Supervised transformation based on random forests
# Random forest
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf_enc = OneHotEncoder()
rf_lm = LogisticRegression()
rf.fit(X_train, y_train)
rf_enc.fit(rf.apply(X_train))
rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)
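# rf.apply(X) returns an array of shape (n_samples, n_estimators) holding
# the index of the leaf each sample lands in for every tree; OneHotEncoder
# turns these leaf indices into sparse binary features, on which the
# logistic regression is then trained.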

y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]    # RF+LR
fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)

print('Step 4.GBT+LR...')
# Gradient boosted decision tree classifier
grd = GradientBoostingClassifier(n_estimators=n_estimator)
grd_enc = OneHotEncoder()    # one-hot encoder for the leaf indices
grd_lm = LogisticRegression()
grd.fit(X_train, y_train)    # train the GBDT model
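# grd.apply(X) returns leaf indices with shape
# (n_samples, n_estimators, n_classes); in the binary case the last axis
# has length 1, hence the [:, :, 0] below to reduce it to a 2-D array.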
grd_enc.fit(grd.apply(X_train)[:, :, 0])
grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)

y_pred_grd_lm = grd_lm.predict_proba(
    grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]
fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)   # compute the ROC points with sklearn's roc_curve: false positive rate on the x-axis, true positive rate on the y-axis
# The ROC curve reflects the trade-off between the classifier's coverage of positives and its coverage of negatives.

print('Step 5.GBT...')
# The gradient boosted model by itself
y_pred_grd = grd.predict_proba(X_test)[:, 1]
fpr_grd, tpr_grd, _ = roc_curve(y_test, y_pred_grd)
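The script computes four sets of ROC points but never draws them; the upstream sklearn example closes by plotting all the curves on one figure. A minimal sketch of that final plotting step, reusing the fpr/tpr arrays computed above (plus a print that uses the startTime recorded earlier):

plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')                  # diagonal = random guessing
plt.plot(fpr_rt_lm, tpr_rt_lm, label='RT + LR')
plt.plot(fpr_rf_lm, tpr_rf_lm, label='RF + LR')
plt.plot(fpr_grd_lm, tpr_grd_lm, label='GBT + LR')
plt.plot(fpr_grd, tpr_grd, label='GBT')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()
print('Done. Total time: %.2fs' % (time.time() - startTime))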
