Implementing Machine Learning Algorithms (3): AdaBoost and GBDT from Scratch

[Series intro] I started this series because I had been studying these algorithms on paper for too long: no matter how well you can recite an algorithm's steps, you never really understand its details until you have implemented it. The planned order is logistic regression, decision trees (CART), AdaBoost, and then GBDT; other algorithms will be added as my studies progress.

AdaBoost is a binary classification method. It is remarkably simple to implement, yet it can greatly improve the performance of weak decision trees. The proofs of why this works can be found in the standard textbooks; here I only give the implementation.

The code boils down to two computations: updating the sample weights and computing each base classifier's weight.

  1. Initialization
import numpy as np
from sklearn.tree import DecisionTreeClassifier

class AdaBoostClassifier:
    def __init__(self, n_estimators=5):
        self.n_estimators = n_estimators
        # Base learners: depth-1 decision stumps, the classic weak learner for AdaBoost
        self.estimators_ = [DecisionTreeClassifier(max_depth=1) for _ in range(self.n_estimators)]
        # alpha_i: the vote weight of each base classifier, filled in during fit
        self.estimators_weights_ = [None] * self.n_estimators
  2. Updating the weights
    def __update_w(self, w, y_true, y_pred):
        # Weighted error rate of the current base classifier
        weight_err = np.sum(w * (y_true != y_pred)) / np.sum(w)
        # Classifier weight: alpha = ln((1 - e) / e); smaller error -> larger vote
        alpha = np.log(1 / weight_err - 1)
        # Increase the weights of misclassified samples, then renormalize
        w = w * np.exp(alpha * (y_true != y_pred))
        w = w / np.sum(w)

        return w, alpha
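
To make the update concrete, here is a tiny worked example with made-up numbers: with four uniformly weighted samples and one misclassification, the error is e = 0.25, alpha = ln(3) ≈ 1.10, and the misclassified sample's weight triples before renormalization.

import numpy as np

w = np.full(4, 0.25)                         # uniform initial weights
mis = np.array([False, False, False, True])  # only the last sample is wrong
e = np.sum(w * mis)                          # weighted error = 0.25
alpha = np.log(1 / e - 1)                    # ln(3) ≈ 1.0986
w = w * np.exp(alpha * mis)                  # misclassified weight: 0.25 -> 0.75
w = w / np.sum(w)                            # [1/6, 1/6, 1/6, 1/2]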
  3. Fit and predict
    def fit(self, X_train, y_train):
        # Map labels from {0, 1} to {-1, +1} without mutating the caller's array
        y = np.where(y_train == 0, -1, y_train)
        n_samples, n_features = X_train.shape
        w = np.full(n_samples, 1 / n_samples)  # start with uniform sample weights
        for i in range(self.n_estimators):
            self.estimators_[i].fit(X_train, y, sample_weight=w)
            cur_pred = self.estimators_[i].predict(X_train)
            w, self.estimators_weights_[i] = self.__update_w(w, y, cur_pred)

    def predict(self, X_test):
        # Weighted vote: sum alpha_i * h_i(x) over all base classifiers, then take the sign
        scores = sum(alpha * est.predict(X_test)
                     for alpha, est in zip(self.estimators_weights_, self.estimators_))
        y_pred = np.sign(scores)
        y_pred[y_pred == -1] = 0  # map back to the original {0, 1} labels
        return y_pred
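
A quick sanity check of the classifier (a hypothetical usage example; the breast-cancer dataset and the train/test split are just illustrative choices):

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

clf = AdaBoostClassifier(n_estimators=5)
clf.fit(X_train, y_train)
print(accuracy_score(y_test, clf.predict(X_test)))  # should beat a single stump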

GBDT looks similar to AdaBoost, but they differ. Both fit the data with an additive model; AdaBoost boosts simple classifiers by reweighting samples, while GBDT fits a regression tree to the negative gradient of the loss at each step. For square loss the negative gradient is just the residual y - F(x), so each new tree fits what the current ensemble still gets wrong; different tasks simply plug in different loss functions.

  1. Initialization and the GBDT base class
import numpy as np

from sklearn.tree import DecisionTreeRegressor
from utils import SquareLoss, SoftMaxLoss  # the author's own loss classes; a sketch is given below

class GBDT:
    def __init__(self, n_estimators, learning_rate, min_samples_split, min_impurity, max_depth, regression):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.min_samples_split = min_samples_split
        self.min_impurity = min_impurity
        self.max_depth = max_depth
        self.regression = regression

        # Square loss for regression, softmax cross-entropy for classification
        self.loss = SquareLoss()
        if not self.regression:
            self.loss = SoftMaxLoss()

        self.trees = []
        for i in range(self.n_estimators):
            self.trees.append(DecisionTreeRegressor(min_samples_split=self.min_samples_split,
                                                    min_impurity_decrease=self.min_impurity,
                                                    max_depth=self.max_depth))

    def fit(self, X, y):
        # The first tree fits the targets directly; later trees fit pseudo-residuals
        self.trees[0].fit(X, y)
        y_pred = self.trees[0].predict(X)
        for i in range(1, self.n_estimators):
            # Negative gradient of the loss w.r.t. the current prediction (the pseudo-residual)
            gradient = -self.loss.gradient(y, y_pred)
            self.trees[i].fit(X, gradient)
            # Step along the fitted negative gradient (note: +=, not -=)
            y_pred += np.multiply(self.learning_rate, self.trees[i].predict(X))

    def predict(self, X):
        y_pred = self.trees[0].predict(X)
        for i in range(1, self.n_estimators):
            y_pred += np.multiply(self.learning_rate, self.trees[i].predict(X))
        if not self.regression:
            # Turn the raw scores into class probabilities with softmax, then pick the argmax
            y_pred = np.exp(y_pred) / np.sum(np.exp(y_pred), axis=1).reshape(-1, 1)
            y_pred = np.argmax(y_pred, axis=1)
        return y_pred
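
SquareLoss and SoftMaxLoss come from the author's own utils module, which is not shown in the post. A minimal sketch of what they might look like, assuming gradient means the true gradient of the loss with respect to the current prediction F (so that -gradient in fit is the pseudo-residual):

import numpy as np

class SquareLoss:
    def gradient(self, y, y_pred):
        # L = 0.5 * (y - F)^2  =>  dL/dF = -(y - F)
        return -(y - y_pred)

class SoftMaxLoss:
    def gradient(self, y, y_pred):
        # Softmax cross-entropy on raw scores F: dL/dF = softmax(F) - y_onehot
        # (the row max is subtracted for numerical stability)
        e = np.exp(y_pred - np.max(y_pred, axis=1, keepdims=True))
        return e / np.sum(e, axis=1, keepdims=True) - y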
  2. GBDT regression
class GBDTRegressor(GBDT):
    def __init__(self, n_estimators=200, learning_rate=0.5, min_samples_split=2,
                 min_var_red=1e-7, max_depth=4, debug=False):
        super(GBDTRegressor, self).__init__(n_estimators=n_estimators,
                                            learning_rate=learning_rate,
                                            min_samples_split=min_samples_split,
                                            min_impurity=min_var_red,
                                            max_depth=max_depth,
                                            regression=True)
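
A hypothetical smoke test on a toy 1-D regression problem (the noisy sine data is made up purely for illustration):

import numpy as np

rng = np.random.RandomState(0)
X = np.sort(rng.uniform(0, 6, size=(200, 1)), axis=0)
y = np.sin(X).ravel() + rng.normal(0, 0.1, size=200)

reg = GBDTRegressor()
reg.fit(X, y)
print(np.mean((reg.predict(X) - y) ** 2))  # training MSE; should approach the noise level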
  3. GBDT classification
class GBDTClassifier(GBDT):
    def __init__(self, n_estimators=200, learning_rate=0.8, min_samples_split=2,
                 min_info_gain=1e-7, max_depth=2, debug=False):
        super(GBDTClassifier, self).__init__(n_estimators=n_estimators,
                                             learning_rate=learning_rate,
                                             min_samples_split=min_samples_split,
                                             min_impurity=min_info_gain,
                                             max_depth=max_depth,
                                             regression=False)
    def __to_categorical(self, y, n_col=None):
        # One-hot encode integer labels, e.g. [0, 2, 1] -> [[1,0,0], [0,0,1], [0,1,0]]
        if not n_col:
            n_col = np.amax(y) + 1
        one_hot = np.zeros((y.shape[0], n_col))
        one_hot[np.arange(y.shape[0]), y] = 1
        return one_hot

    def fit(self, X, y):
        # Fit one regression-tree output per class on the one-hot targets
        y = self.__to_categorical(y)
        super(GBDTClassifier, self).fit(X, y)
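
And a quick end-to-end check of the classifier (hypothetical usage; the iris dataset is just an illustrative choice, and it assumes loss classes like the sketch above):

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

model = GBDTClassifier()
model.fit(X_train, y_train)
print(accuracy_score(y_test, model.predict(X_test)))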
