《统计学习方法》第8章_提升方法

import numpy as np
import pandas as pd

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split


# 参考《统计学习方法》 P138
class AdaBoost:
    def __init__(self, n_estimators=50, learning_rate=1.0):
        self.clf_num = n_estimators
        self.learning_rate = learning_rate

    def init_args(self, datasets, labels):
        self.X = datasets  # 特征空间
        self.Y = labels    # 标记空间
        self.M, self.N = datasets.shape  # 获取数据集的维度

        # 弱分类器数目和集合
        # 使用具有权值分布(weights)的训练数据集学习,得到基本的分类器
        self.clf_sets = []

        # 初始化训练数据集的权值分布(weights)
        self.weights = [1.0 / self.M] * self.M

        # 基本分类器G(x)的系数alpha
        self.alpha = []

    # 参考《统计学习方法》P140-141
    # 寻找合适的阈值v,使得G(x)在训练数据集上的误差率最小
    def _G(self, features, labels, weights):
        m = len(features)
        error = 100000.0  # 无穷大
        best_v = 0.0
        # 单维features
        features_min = min(features)
        features_max = max(features)
        n_step = (features_max - features_min + self.learning_rate) // self.learning_rate
        direct, compare_array = None, None
        for i in range(1, int(n_step)):
            # 假设弱分类器由xv产生
            # 阈值v使该分类器在训练数据集上分类误差率最低
            v = features_min + self.learning_rate * i

            if v not in features:
                # 误分类计算
                # 基于基本分类器G(x)后产生新的标记
                compare_array_positive = np.array([1 if features[k] > v else -1 for k in range(m)])
                # 误差率
                weight_error_positive = sum([weights[k] for k in range(m) if compare_array_positive[k] != labels[k]])
                # 基于基本分类器G(x)后产生新的标记
                compare_array_nagetive = np.array([-1 if features[k] > v else 1 for k in range(m)])
                # 误差率
                weight_error_nagetive = sum([weights[k] for k in range(m) if compare_array_nagetive[k] != labels[k]])

                # 第一种方式产生的误差率小
                if weight_error_positive < weight_error_nagetive:
                    weight_error = weight_error_positive
                    _compare_array = compare_array_positive
                    direct = "positive"
                # 第二种方式产生的误差率小
                else:
                    weight_error = weight_error_nagetive
                    _compare_array = compare_array_nagetive
                    direct = "nagetive"

                if weight_error < error:
                    error = weight_error
                    compare_array = _compare_array
                    best_v = v
        # 返回最好的阈值、差生误差率的方式、误差率、新的标记空间
        return best_v, direct, error, compare_array

    # 计算alpha
    def _alpha(self, error):
        return 0.5 * np.log((1 - error) / error)

    # 规范化因子
    # 参考《统计学习方法》P139 公式(8.5)
    def _Z(self, weights, a, clf):
        return sum([weights[i] * np.exp(-1 * a * self.Y[i] * clf[i]) for i in range(self.M)])

    # 权值更新
    # 参考《统计学习方法》P139 公式(8.4)
    def _w(self, a, clf, Z):
        for i in range(self.M):
            self.weights[i] = self.weights[i] * np.exp(-1 * a * self.Y[i] * clf[i]) / Z

    # 基本分类器
    def G(self, x, v, direct):
        if direct == "positive":
            return 1 if x > v else -1
        else:
            return -1 if x > v else 1

    def fit(self, X, y):
        self.init_args(X, y)

        for epoch in range(self.clf_num):
            best_clf_error, best_v, clf_result = 100000, None, None
            # 根据特征维度, 选择误差最小的
            for j in range(self.N):
                features = self.X[:, j]
                # 分类阈值,分类误差,分类结果
                v, direct, error, compare_array = self._G(features, self.Y, self.weights)

                if error < best_clf_error:
                    best_clf_error = error
                    best_v = v
                    final_direct = direct
                    clf_result = compare_array
                    axis = j

                if best_clf_error == 0:
                    break

            # 计算G(x)系数α
            a = self._alpha(best_clf_error)
            self.alpha.append(a)
            # 记录分类器
            self.clf_sets.append((axis, best_v, final_direct))
            # 规范化因子
            Z = self._Z(self.weights, a, clf_result)
            # 更新权值
            self._w(a, clf_result, Z)

    def predict(self, feature):
        result = 0.0
        for i in range(len(self.clf_sets)):
            axis, clf_v, direct = self.clf_sets[i]
            f_input = feature[axis]
            # 参考《统计学习方法》P139 公式(8.6)
            result += self.alpha[i] * self.G(f_input, clf_v, direct)

            return 1 if result > 0 else -1

    def score(self, X_test, y_test):
        right_count = 0
        for i in range(len(X_test)):
            feature = X_test[i]
            if self.predict(feature) == y_test[i]:
                right_count += 1
        return right_count / len(X_test)


# 数据集
def create_data():
    iris = load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    df['label'] = iris.target
    df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
    data = np.array(df.iloc[:100, [0, 1, -1]])
    for i in range(len(data)):
        if data[i, -1] == 0:
            data[i, -1] = -1
    return data[:, :2], data[:, -1]

X, y= create_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.33)
clf = AdaBoost()
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

 

你可能感兴趣的:(统计学习方法)