# encoding:utf-8
from math import exp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
class LogisticReressionClassifier:
    """Binary logistic-regression classifier trained one sample at a time.

    A constant 1.0 is prepended to every sample, so ``weights[0]`` is the
    bias term and ``weights[1:]`` are the per-feature coefficients.
    ``weights`` has shape ``(n_features + 1, 1)`` and is created by ``fit``.
    """

    def __init__(self, max_iter=200, learning_rate=0.01):
        """Store the number of passes over the data and the update step size."""
        self.max_iter = max_iter
        self.learning_rate = learning_rate

    def sigmoid(self, x):
        """Return ``1 / (1 + e**-x)`` as a Python float.

        ``x`` may be a plain number or a size-1 NumPy array. The computation
        is split by sign so that large-magnitude negative inputs do not
        overflow ``exp``.
        """
        # Explicitly pull out the scalar; implicit conversion of a size-1
        # array to a scalar is deprecated in NumPy.
        x = np.asarray(x, dtype=np.float64).item()
        if x >= 0:
            return 1 / (1 + exp(-x))
        # For x < 0, exp(-x) may overflow while exp(x) only underflows to 0.
        z = exp(x)
        return z / (1 + z)

    def data_matrix(self, x):
        """Return the samples as a list of rows, each prefixed with 1.0 (bias input)."""
        return [[1.0, *row] for row in x]

    def fit(self, x, y):
        """Fit the weights on samples ``x`` with 0/1 labels ``y``.

        Every sample triggers its own update
        ``weights += learning_rate * (label - prediction) * sample``,
        and the whole data set is visited ``max_iter`` times.
        """
        data_mat = self.data_matrix(x)
        self.weights = np.zeros((len(data_mat[0]), 1), dtype=np.float32)
        for _ in range(self.max_iter):
            for row, target in zip(data_mat, y):
                result = self.sigmoid(np.dot(row, self.weights))
                error = target - result
                # np.transpose([row]) turns the row into a column matching weights.
                self.weights += self.learning_rate * error * np.transpose([row])
        print('LogisticRegression Model(learning_rate={}, max_iter={})'.format(self.learning_rate, self.max_iter))

    def score(self, x_test, y_test):
        """Return the fraction of test samples classified correctly.

        A sample counts as correct when the linear score is positive and the
        label is 1, or the score is negative and the label is 0. A score of
        exactly 0 is never counted as correct.
        """
        right = 0
        x_test = self.data_matrix(x_test)
        for row, label in zip(x_test, y_test):
            result = np.dot(row, self.weights).item()
            if (result > 0 and label == 1) or (result < 0 and label == 0):
                right += 1
        return right / len(x_test)
"""数据集"""
def create_data():
    """Load iris and return (features, labels) for its first 100 rows.

    The features are the first two columns (sepal length, sepal width);
    the labels are the target column, taken from the same NumPy array.
    """
    iris = load_iris()
    frame = pd.DataFrame(iris.data, columns=iris.feature_names)
    frame['label'] = iris.target
    frame.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
    # Keep the two sepal columns plus the label for the first 100 rows.
    subset = np.array(frame.iloc[:100, [0, 1, -1]])
    features = subset[:, :2]
    labels = subset[:, -1]
    return features, labels
# Train the hand-written classifier on a random 70/30 split of the data.
x, y = create_data()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
lr_clf = LogisticReressionClassifier()
lr_clf.fit(x_train, y_train)
# NOTE: the returned accuracy is not stored or printed here.
lr_clf.score(x_test, y_test)
# Decision boundary: weights[0] + weights[1] * x1 + weights[2] * x2 = 0,
# solved for x2 (weights[0] is the bias added by data_matrix).
x_ponits = np.arange(4, 8)
# NOTE: this rebinds y (previously the labels) to the boundary's y-values.
y = -(lr_clf.weights[1] * x_ponits + lr_clf.weights[0]) / lr_clf.weights[2]
plt.plot(x_ponits, y)
# Rows 0-49 are drawn under legend label '0' and rows 50+ under '1'
# (assumes the rows returned by create_data are ordered by class - TODO confirm).
plt.scatter(x[:50, 0], x[:50, 1], label='0', color="blue")
plt.scatter(x[50:, 0], x[50:, 1], label='1', color="orange")
plt.legend()
plt.show()
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
def create_data():
    """Return the two sepal features and the labels of the first 100 iris rows.

    Returns:
        A tuple ``(x, y)``: ``x`` holds columns 0 and 1 of the selection and
        ``y`` holds its last column (the label).
    """
    column_names = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
    iris = load_iris()
    table = pd.DataFrame(iris.data, columns=iris.feature_names)
    table['label'] = iris.target
    table.columns = column_names
    selected = table.iloc[:100, [0, 1, -1]]
    arr = np.array(selected)
    return arr[:, :2], arr[:, -1]
# Fit scikit-learn's LogisticRegression on a random 70/30 split of the data.
x, y = create_data()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
clf = LogisticRegression(max_iter=200)
clf.fit(x_train, y_train)
# NOTE: the returned accuracy is not stored or printed here.
clf.score(x_test, y_test)
print(clf.coef_, clf.intercept_)
# Decision boundary: intercept + coef[0] * x1 + coef[1] * x2 = 0, solved for x2.
x_ponits = np.arange(4, 8)
# NOTE: this rebinds y (previously the labels) to the boundary's y-values.
y = -(clf.coef_[0][0] * x_ponits + clf.intercept_) / clf.coef_[0][1]
plt.plot(x_ponits, y)
# Use a marker-only format string: 'bo' also encodes the colour blue, which
# duplicates (and for the second series contradicts) the color keyword and
# makes matplotlib emit a "color is redundantly defined" warning.
plt.plot(x[:50, 0], x[:50, 1], 'o', color='blue', label='0')
plt.plot(x[50:, 0], x[50:, 1], 'o', color='orange', label='1')
plt.xlabel("sepal length")
plt.ylabel("sepal width")
plt.legend()
plt.show()
# encoding:utf-8
import math
from copy import deepcopy
class MaxEntropy:
    """Maximum-entropy classifier over categorical feature values.

    Each training sample is a sequence whose first element is the label and
    whose remaining elements are feature values. Every distinct
    ``(feature value, label)`` pair seen in training is one binary feature
    with its own weight. Training repeatedly applies
    ``w[i] += log(empirical_expectation / model_expectation) / C``.
    """

    def __init__(self, EPS=0.005):
        self.samples = []
        self.Y = set()  # distinct labels
        self.numXY = {}  # (x, y) -> number of occurrences in the samples
        self.N = 0  # number of samples
        self.Ep = []  # empirical expectation of each feature, indexed by id
        self.xyID = {}  # (x, y) -> feature id
        self.n = 0  # number of distinct (x, y) features
        self.C = 0  # largest number of feature values in any one sample
        self.IDxy = {}  # feature id -> (x, y)
        self.w = []  # current weight of each feature, indexed by id
        self.EPS = EPS  # convergence threshold on per-weight change
        self.lastw = []  # weights from the previous training iteration

    def loadData(self, dataset):
        """Load ``dataset`` (a deep copy is stored) and compute its statistics.

        All counts, id maps and weights are rebuilt from scratch, so calling
        this again replaces the previous data instead of adding to it.
        """
        self.samples = deepcopy(dataset)
        self.Y = set()
        self.numXY = {}
        self.xyID = {}
        self.IDxy = {}
        for items in self.samples:
            y = items[0]
            self.Y.add(y)
            for x in items[1:]:
                self.numXY[(x, y)] = self.numXY.get((x, y), 0) + 1
        self.N = len(self.samples)
        self.n = len(self.numXY)
        # default=0 keeps an empty dataset from raising in max().
        self.C = max((len(sample) - 1 for sample in self.samples), default=0)
        self.w = [0] * self.n
        self.lastw = self.w[:]
        self.Ep = [0] * self.n
        # Empirical expectation of each feature: its count divided by N.
        for i, (xy, count) in enumerate(self.numXY.items()):
            self.Ep[i] = count / self.N
            self.xyID[xy] = i
            self.IDxy[i] = xy

    def _weight_sum(self, X, y):
        """Return the summed weights of the known features ``(x, y)`` for x in ``X``."""
        return sum(self.w[self.xyID[(x, y)]] for x in X if (x, y) in self.numXY)

    def Zx(self, X):
        """Return the normaliser Z(X): the sum over labels of exp(weight sum)."""
        return sum(math.exp(self._weight_sum(X, y)) for y in self.Y)

    def model_pyx(self, y, X):
        """Return the model probability P(y | X)."""
        return math.exp(self._weight_sum(X, y)) / self.Zx(X)

    def model_ep(self, index):
        """Return the model expectation of the feature with id ``index``."""
        x, y = self.IDxy[index]
        ep = 0
        for sample in self.samples:
            # Drop the label at position 0 so that it is never mistaken for
            # a feature value.
            features = sample[1:]
            if x not in features:
                continue
            ep += self.model_pyx(y, features) / self.N
        return ep

    def convergence(self):
        """Return True when every weight changed by less than ``EPS`` in the last iteration."""
        return all(abs(last - now) < self.EPS for last, now in zip(self.lastw, self.w))

    def predict(self, X):
        """Return a dict mapping each label to P(label | X) for feature values ``X``."""
        Z = self.Zx(X)
        return {y: math.exp(self._weight_sum(X, y)) / Z for y in self.Y}

    def train(self, maxiter=1000):
        """Update the weights for at most ``maxiter`` iterations, stopping early on convergence.

        The iteration number and the current weights are printed every iteration.
        """
        for loop in range(maxiter):
            print("iter:%d" % loop)
            self.lastw = self.w[:]
            for i in range(self.n):
                ep = self.model_ep(i)  # model expectation of feature i
                self.w[i] += math.log(self.Ep[i] / ep) / self.C
            print("w:", self.w)
            if self.convergence():
                break
# Training samples: the first element of each row is the label ('yes'/'no'),
# the remaining four elements are the feature values.
dataset = [['no', 'sunny', 'hot', 'high', 'FALSE'],
['no', 'sunny', 'hot', 'high', 'TRUE'],
['yes', 'overcast', 'hot', 'high', 'FALSE'],
['yes', 'rainy', 'mild', 'high', 'FALSE'],
['yes', 'rainy', 'cool', 'normal', 'FALSE'],
['no', 'rainy', 'cool', 'normal', 'TRUE'],
['yes', 'overcast', 'cool', 'normal', 'TRUE'],
['no', 'sunny', 'mild', 'high', 'FALSE'],
['yes', 'sunny', 'cool', 'normal', 'FALSE'],
['yes', 'rainy', 'mild', 'normal', 'FALSE'],
['yes', 'sunny', 'mild', 'normal', 'TRUE'],
['yes', 'overcast', 'mild', 'high', 'TRUE'],
['yes', 'overcast', 'hot', 'normal', 'FALSE'],
['no', 'rainy', 'mild', 'high', 'TRUE']]
maxent = MaxEntropy()
# Query to classify after training: feature values only, no label.
x = ['overcast', 'mild', 'high', 'FALSE']
maxent.loadData(dataset)
# train() prints the iteration number and the weights on every iteration.
maxent.train()
print("+++++++++++++++++++++++")
# predict() returns a dict mapping each label to its probability.
print('predict:', maxent.predict(x))