《统计学习方法》第6章_逻辑斯蒂回归与最大熵模型

  • 逻辑斯蒂回归
# encoding:utf-8
from math import exp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split


class LogisticReressionClassifier:
    """Binary logistic-regression classifier trained one sample at a time.

    A bias term is modelled by prepending a constant ``1.0`` to every sample,
    so after ``fit`` the ``weights`` attribute has shape
    ``(n_features + 1, 1)`` with the bias weight in row 0. Labels are
    compared against 0 and 1 in ``score``.
    """

    def __init__(self, max_iter=200, learning_rate=0.01):
        """Store the training hyper-parameters.

        Args:
            max_iter: Number of full passes over the training samples.
            learning_rate: Step size applied to every per-sample update.
        """
        self.max_iter = max_iter
        self.learning_rate = learning_rate

    def sigmoid(self, x):
        """Return the logistic function ``1 / (1 + e**-x)`` as a float.

        Args:
            x: A scalar or a single-element array (e.g. the ``(1,)`` result
                of ``np.dot(sample, weights)``).

        Returns:
            The sigmoid of ``x``.
        """
        # Unwrap single-element arrays explicitly instead of relying on the
        # implicit array-to-scalar conversion inside math.exp.
        x = np.asarray(x, dtype=np.float64).item()
        # Pick the algebraically equivalent form whose exponent is never
        # positive, so large |x| cannot raise OverflowError.
        if x >= 0:
            return 1.0 / (1.0 + exp(-x))
        z = exp(x)
        return z / (1.0 + z)

    def data_matrix(self, x):
        """Return ``x`` as a list of rows, each prefixed with a ``1.0`` bias term."""
        return [[1.0, *row] for row in x]

    def fit(self, x, y):
        """Fit ``self.weights`` with per-sample gradient updates.

        Args:
            x: Iterable of feature rows.
            y: Iterable of labels aligned with ``x``.

        Prints a one-line summary of the hyper-parameters when done.
        """
        data_mat = np.asarray(self.data_matrix(x), dtype=np.float64)
        self.weights = np.zeros((data_mat.shape[1], 1), dtype=np.float32)
        for _ in range(self.max_iter):
            for row, target in zip(data_mat, y):
                result = self.sigmoid(np.dot(row, self.weights))
                error = target - result
                # Column-vector update so it matches the (n + 1, 1) weights.
                self.weights += self.learning_rate * error * row.reshape(-1, 1)
        print('LogisticRegression Model(learning_rate={}, max_iter={})'.format(self.learning_rate, self.max_iter))

    def score(self, x_test, y_test):
        """Return the fraction of test samples classified correctly.

        A sample counts as correct when the linear score is strictly positive
        and the label is 1, or strictly negative and the label is 0; a score
        of exactly zero is never counted as correct.

        Raises:
            ZeroDivisionError: If ``x_test`` is empty.
        """
        x_test = self.data_matrix(x_test)
        right = 0
        for x, y in zip(x_test, y_test):
            result = np.dot(x, self.weights).item()
            if (result > 0 and y == 1) or (result < 0 and y == 0):
                right += 1
        return right / len(x_test)

"""数据集"""

def create_data():
    """Return features and labels for the first 100 iris rows.

    Returns:
        A pair ``(features, labels)``: ``features`` holds the sepal length
        and sepal width columns, ``labels`` holds the label column.
    """
    iris = load_iris()
    column_names = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']

    frame = pd.DataFrame(iris.data, columns=iris.feature_names)
    frame['label'] = iris.target
    frame.columns = column_names

    # Columns 0 and 1 are the two sepal measurements, -1 is the label.
    subset = np.array(frame.iloc[:100, [0, 1, -1]])
    features = subset[:, :2]
    labels = subset[:, -1]
    return features, labels

# Train the hand-written classifier on a random 70/30 split of the data.
x, y = create_data()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

lr_clf = LogisticReressionClassifier()
lr_clf.fit(x_train, y_train)
# Report the accuracy: as a bare expression it was computed and discarded
# when this code runs as a script.
print('test accuracy: {}'.format(lr_clf.score(x_test, y_test)))

# Decision boundary: w0 + w1 * x1 + w2 * x2 = 0  =>  x2 = -(w1 * x1 + w0) / w2.
# Kept in its own name so the label array `y` is not overwritten.
x_points = np.arange(4, 8)
boundary = -(lr_clf.weights[1] * x_points + lr_clf.weights[0]) / lr_clf.weights[2]
plt.plot(x_points, boundary)
# The first 50 rows are plotted as class '0', the remaining rows as class '1'.
plt.scatter(x[:50, 0], x[:50, 1], label='0', color="blue")
plt.scatter(x[50:, 0], x[50:, 1], label='1', color="orange")
plt.xlabel("sepal length")
plt.ylabel("sepal width")
plt.legend()
plt.show()
  • scikit-learn实例
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

def create_data():
    """Build the feature matrix and label vector from the first 100 iris rows.

    Returns:
        ``(features, labels)`` where ``features`` contains the sepal length
        and sepal width columns and ``labels`` contains the label column.
    """
    iris = load_iris()
    table = pd.DataFrame(iris.data, columns=iris.feature_names)
    table['label'] = iris.target
    table.columns = [
        'sepal length',
        'sepal width',
        'petal length',
        'petal width',
        'label',
    ]
    # Keep the two sepal columns (positions 0 and 1) and the label (last).
    selected = np.array(table.iloc[:100, [0, 1, -1]])
    return selected[:, :2], selected[:, -1]

# Train scikit-learn's LogisticRegression on a random 70/30 split of the data.
x, y = create_data()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

clf = LogisticRegression(max_iter=200)
clf.fit(x_train, y_train)
# Report the accuracy: as a bare expression it was computed and discarded
# when this code runs as a script.
print('test accuracy: {}'.format(clf.score(x_test, y_test)))
print(clf.coef_, clf.intercept_)

# Decision boundary: b + c0 * x1 + c1 * x2 = 0  =>  x2 = -(c0 * x1 + b) / c1.
# Kept in its own name so the label array `y` is not overwritten.
x_points = np.arange(4, 8)
boundary = -(clf.coef_[0][0] * x_points + clf.intercept_) / clf.coef_[0][1]
plt.plot(x_points, boundary)
# Use a marker-only format string: 'bo' also encodes a colour, which clashes
# with the explicit color= keyword and makes matplotlib emit a warning.
plt.plot(x[:50, 0], x[:50, 1], 'o', color='blue', label='0')
plt.plot(x[50:, 0], x[50:, 1], 'o', color='orange', label='1')
plt.xlabel("sepal length")
plt.ylabel("sepal width")
plt.legend()
plt.show()
  • 最大熵模型
# encoding:utf-8
import math
from copy import deepcopy

class MaxEntropy:
    """Maximum-entropy classifier over categorical feature values.

    Every training sample is a sequence ``[label, value, value, ...]``. Each
    distinct ``(value, label)`` pair seen in training is one feature with one
    weight. ``train`` repeatedly applies
    ``w[i] += log(Ep[i] / model_ep(i)) / C`` until no weight changes by
    ``EPS`` or more.
    """

    def __init__(self, EPS=0.005):
        """Create an empty model.

        Args:
            EPS: Convergence threshold on the per-weight change between two
                consecutive training iterations.
        """
        self.samples = []  # training rows, each [label, value, value, ...]
        self.Y = set()  # distinct labels
        self.numXY = {}  # (x, y) -> number of occurrences
        self.N = 0  # number of samples
        self.Ep = []  # empirical expectation of each feature, by feature id
        self.xyID = {}  # (x, y) -> feature id
        self.n = 0  # number of distinct (x, y) pairs
        self.C = 0  # largest number of feature values in any sample
        self.IDxy = {}  # feature id -> (x, y)
        self.w = []  # one weight per feature id
        self.EPS = EPS  # convergence threshold
        self.lastw = []  # weights from the previous iteration

    def loadData(self, dataset):
        """Load training samples and rebuild all derived statistics.

        Any state left by an earlier call is discarded, so counts are never
        accumulated across calls.

        Args:
            dataset: Non-empty sequence of samples; element 0 of each sample
                is the label and the remaining elements are feature values.

        Raises:
            ValueError: If ``dataset`` is empty.
        """
        samples = deepcopy(dataset)
        if not samples:
            raise ValueError("dataset must contain at least one sample")

        self.samples = samples
        self.Y = set()
        self.numXY = {}
        self.xyID = {}
        self.IDxy = {}

        for items in self.samples:
            y = items[0]
            self.Y.add(y)
            for x in items[1:]:
                self.numXY[(x, y)] = self.numXY.get((x, y), 0) + 1

        self.N = len(self.samples)
        self.n = len(self.numXY)
        self.C = max(len(sample) - 1 for sample in self.samples)
        self.w = [0.0] * self.n
        self.lastw = self.w[:]

        # Empirical expectation of each feature, plus the id <-> pair maps.
        self.Ep = [0.0] * self.n
        for i, (xy, count) in enumerate(self.numXY.items()):
            self.Ep[i] = count / self.N
            self.xyID[xy] = i
            self.IDxy[i] = xy

    def _weight_sum(self, X, y):
        """Return the summed weights of the known features ``(x, y)`` for x in X."""
        return sum(self.w[self.xyID[(x, y)]] for x in X if (x, y) in self.xyID)

    def Zx(self, X):
        """Return the normaliser Z(X): the sum over labels of exp(weight sum)."""
        return sum(math.exp(self._weight_sum(X, y)) for y in self.Y)

    def model_pyx(self, y, X):
        """Return the model probability P(y | X) for feature values ``X``."""
        return math.exp(self._weight_sum(X, y)) / self.Zx(X)

    def model_ep(self, index):
        """Return the model expectation of the feature with id ``index``.

        Only samples whose feature values contain the feature's ``x``
        contribute, each with weight ``P(y | X) / N``.
        """
        x, y = self.IDxy[index]
        ep = 0
        for sample in self.samples:
            # Element 0 is the label, not a feature value: exclude it from
            # both the membership test and the probability computation.
            X = sample[1:]
            if x not in X:
                continue
            ep += self.model_pyx(y, X) / self.N
        return ep

    def convergence(self):
        """Return True when every weight changed by less than ``EPS``."""
        return all(abs(last - now) < self.EPS
                   for last, now in zip(self.lastw, self.w))

    def predict(self, X):
        """Return a dict mapping every known label to P(label | X)."""
        Z = self.Zx(X)
        return {y: math.exp(self._weight_sum(X, y)) / Z for y in self.Y}

    def train(self, maxiter=1000):
        """Update the weights until convergence or ``maxiter`` iterations.

        Prints the iteration number and the current weights on every pass.
        """
        for loop in range(maxiter):
            print("iter:%d" % loop)
            self.lastw = self.w[:]
            for i in range(self.n):
                ep = self.model_ep(i)  # model expectation of feature i
                self.w[i] += math.log(self.Ep[i] / ep) / self.C
            print("w:", self.w)
            if self.convergence():
                break


# Training samples: the first token of every row is the label, the remaining
# tokens are that sample's feature values.
dataset = [
    row.split()
    for row in (
        'no sunny hot high FALSE',
        'no sunny hot high TRUE',
        'yes overcast hot high FALSE',
        'yes rainy mild high FALSE',
        'yes rainy cool normal FALSE',
        'no rainy cool normal TRUE',
        'yes overcast cool normal TRUE',
        'no sunny mild high FALSE',
        'yes sunny cool normal FALSE',
        'yes rainy mild normal FALSE',
        'yes sunny mild normal TRUE',
        'yes overcast mild high TRUE',
        'yes overcast hot normal FALSE',
        'no rainy mild high TRUE',
    )
]

# Fit the model on the samples above.
maxent = MaxEntropy()
maxent.loadData(dataset)
maxent.train()

# Query: feature values only, no label.
x = ['overcast', 'mild', 'high', 'FALSE']

print("+++++++++++++++++++++++")
print('predict:', maxent.predict(x))

你可能感兴趣的:(统计学习方法,Vanish)