最小二次法是均方误差最小化进行模型求解,极大似然估计则是利用了联合分布及似然函数 得到公式
$ E_{(w,b)}= $
利用最优化的思路当$ E_{(w,b)}= $最小时w,b的值
求解方法包括梯度下降法,根据推到公式直接解
损失函数:逐点计算平方损失误差,然后求平均数
导数: 西瓜书上 公式3.30,
这几个方法都没有用牛顿法,海森矩阵计算比较复杂,梯度下降法基本上就是乘了一个学习率,如果用海森矩阵不需要用学习率,相当于一个渐变步长的学习率,下降速度更快。
数据载入
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
X,y = datasets.load_iris(return_X_y=True)
X= X[:,0]
X.shape
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)
def formula_calculate(X,y):
m=len(X)
X_mean=np.mean(X)
y_mean=np.mean(y)
w=np.sum(np.multiply(y,X-X_mean)/(np.sum(np.multiply(X,X))-np.sum(X)*np.sum(X)/m))
b=y_mean-w*X_mean
return w,b
formula_calculate(X_train,y_train)
plt.scatter(X_train,y_train)
# plt.plot(X_test,0.76*X_test-3.47)
plt.plot(X_test,0.37*X_test-1.15)
损失函数是系数的函数,另外还要传入数据的x,y
参考链接
def compute_cost(w, b, points):
total_cost = 0
M = len(points)
# 逐点计算平方损失误差,然后求平均数
for i in range(M):
x = points[i, 0]
y = points[i, 1]
total_cost += ( y - w * x - b ) ** 2
return total_cost/M
def grad_desc(points, initial_w, initial_b, alpha, num_iter,cost_limi):
w = initial_w
b = initial_b
# 定义一个list保存所有的损失函数值,用来显示下降的过程
cost_list = []
for i in range(num_iter):
cost = compute_cost(w, b, points)
cost_list.append( cost )
w, b = step_grad_desc( w, b, alpha, points )
if cost<=cost_limi:
break
return [w, b, cost_list]
def step_grad_desc( current_w, current_b, alpha, points ):
sum_grad_w = 0
sum_grad_b = 0
M = len(points)
# 对每个点,代入公式求和
for i in range(M):
x = points[i, 0]
y = points[i, 1]
# w 和b 的导数
sum_grad_w += ( current_w * x + current_b - y ) * x
sum_grad_b += current_w * x + current_b - y
# 用公式求当前梯度
grad_w = 2/M * sum_grad_w
grad_b = 2/M * sum_grad_b
# 梯度下降,更新当前的w和b
updated_w = current_w - alpha * grad_w
updated_b = current_b - alpha * grad_b
return updated_w, updated_b
initial_w=0
initial_b=0
alpha=0.01
num_iter=10000
points = np.array([X_train,y_train]).T
cost_limi=0.1
li_x=grad_desc(points, initial_w, initial_b, alpha, num_iter,cost_limi)
X,y = datasets.load_iris(return_X_y=True)
def Multi_calculate(X,y):
num_feature=X.shape[1]
beta = np.ones((num_feature+1))
X = np.c_[X,np.ones(len(X))]
beta=np.dot(np.dot(np.linalg.pinv(np.dot(X.T,X)),X.T),y)
return beta
Multi_calculate(X_train,y_train)
参考链接
def compute_cost(beta,X,Y):
'''计算当前损失函数值 ,平方损失误差的平均
input: X:
beta : w 和 b
Y: 实际值 label
output: err/m(float): 错误率
'''
X = np.c_[X,np.ones(len(X))]
total_cost= np.mean(np.power( Y - np.dot(X,beta),2 ))
return total_cost
def grad_desc(X,Y, alpha, num_iter,cost_limi=0):
'''梯度下降
input: X,Y
alpha:学习率
num_iter: 最大迭代次数
cost_limi:最小损失值
output: list,list[1]:更新后的beta,损失函数的list
'''
num_feature=X.shape[1]
beta = np.ones((num_feature+1))
X = np.array(X)
Y = np.array(Y)
# 定义一个list保存所有的损失函数值,用来显示下降的过程
cost_list = []
for i in range(num_iter):
cost = compute_cost(beta ,X,Y )
if i==0:
cost_between=cost_limi+1
else:
cost_between=np.abs(cost_list[-1]-cost)
cost_list.append(cost)
beta = step_grad_desc( beta, alpha, X,Y )
if cost_between<=cost_limi:
break
return [beta, cost_list]
def step_grad_desc( current_beta, alpha, X,Y ):
'''梯度更新
input: X,Y
current_beta :现在的 w 和 b
alpha:学习率
output: updated_beta: 更新后的 w 和 b'''
# 求导数,南瓜书14页
M = len(X)
X = np.c_[X,np.ones(len(X))]
# /M 是因为用M个样本更新的参数,要得到的是每个样本的参数,所以需要除以样本数
# 可以不除以M但是相对学习率需要下降几个数量级否则可能不收敛
grad_beta= 2*np.dot(X.T,np.dot(X,current_beta)-Y)/M
# print(grad_beta)
# 梯度下降,更新当前的w和b
updated_beta = current_beta - alpha * grad_beta
return updated_beta
initial_w=0
initial_b=0
alpha=0.01
num_iter=40000
X=X_train
Y=y_train
cost_limi=0
li_x=grad_desc(X,Y, alpha, num_iter,cost_limi)
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
from sklearn.linear_model import LinearRegression
regressor = LinearRegression() # 创建回归器
regressor.fit(X_train, y_train) # 用训练集去拟合回归器
# Predicting the Test set results
y_pred = regressor.predict(X_test) #获得预测结果
迭代次数增加之后结果基本没差别
公式直接解 | 梯度下降 | sklearn |
---|---|---|
array([-0.10627533, -0.0397204 , 0.22894234, 0.61123074, 0.16149541]) | array([-0.10646638, -0.03981776, 0.22896909, 0.61130829, 0.16272824]) | [array([-0.10627533, -0.0397204 , 0.22894234, 0.61123074]), 0.16149541375178667] |
梯度下降明显不如直接解公式和sklearn 的运行速度快
逻辑回归西瓜树上有详细的公式可以参考这里不再列出
# 非线性可以用梯度下降,二分类问题
# https://blog.csdn.net/Weary_PJ/article/details/107380387
# 公式可以参考 :https://blog.csdn.net/Weary_PJ/article/details/107589709
def sigmoid(x):
''' Sigmoid函数
input: x(mat): feature * w
output: sigmoid(x)(mat): Sigmoid值
'''
return 1/(1+np.exp(-x))
def gradient_descent_batch(current_w, alpha,h, X,Y):
'''梯度更新和损失值
input: X,Y
current_w :现在的 w 和 b
alpha:学习率
output: updated_beta: 更新后的 w 和 b
err: 错误率'''
h = sigmoid(np.dot(X,current_w))
# 损失函数
cost= -np.mean(label_data*np.log(h) +\
(1 - label_data) * np.log(1 - h))
error = Y - h
w = current_w + alpha *np.dot(X.T,error) # 权值修正 batch gradient descent
return w,cost
def optimal_model_batch(feature_data, label_data,cost_limi, epoch, alpha):
'''利用梯度下降法训练LR模型
input: feature_data(mat)样例数据
label_data(mat)标签数据
epoch(int)训练的轮数
alpha(float)学习率
output: w(mat):权重
'''
X = np.c_[feature_data,np.ones(len(feature_data))]
y = label_data
n = np.shape(feature_data)[1] # 特征个数
w = np.ones((n+1)) # 初始化权重
i = 0
li_cost=[]
while i <= epoch:
i += 1
# sigma函数
w,cost = gradient_descent_batch(w, alpha,h, X,y) #梯度更新
if i==1:
cost_between=cost_limi+1
else:
cost_between= np.abs(li_cost[-1]-cost)
li_cost.append(cost)
if i % 100 == 0:
print("\t-----iter = " + str(i) + \
", train error rate = " + str(errorl))
elif cost_between<=cost_limi:
print("\t-----iter = " + str(i) + \
", train error rate = " + str(errorl))
break
return w,li_cost
y1 = y_train[(y_train==1)|(y_train==0)]
X1= X_train[(y_train==1)|(y_train==0),:]
alpha=0.01
num_iter=40000
X=X1
Y=y1
cost_limi=0.0001
li_x=lr_train_bgd(X,Y,cost_limi, num_iter, alpha)
plt.plot(li_x[1][:150])
# sklearn
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(penalty='none',solver="sag",tol=cost_limi)
clf.fit(X,Y)
[clf.coef_,clf.intercept_]
结果相差比较大,主要是逻辑回归方法太多了找不到针对的方法
J ( W ) = − 1 m [ ∑ i = 1 m ∑ j = 1 k I { y ( i ) = j } log e W j T X ( i ) ∑ l = 1 k e W l T X ( i ) ] J(W) = -\frac{1}{m} \left [\sum_{i=1}^{m} \sum_{j=1}^{k} I \{ y^{(i)}=j \} \log \frac{e^{W_j^TX^{(i)}}}{\sum _{l=1}^{k}e^{W_l^TX^{(i)}}} \right ] J(W)=−m1[i=1∑mj=1∑kI{y(i)=j}log∑l=1keWlTX(i)eWjTX(i)]
∂ J ( θ ) ∂ θ j = − 1 m [ ∑ i = 1 m ▽ θ j { ∑ j = 1 k I ( y ( i ) = j ) log e θ j T X ( i ) ∑ l = 1 k e θ l T X ( i ) } ] \frac{\partial J(\theta )}{\partial \theta _j} = -\frac{1}{m}\left [ \sum_{i=1}^{m}\triangledown _{\theta_j}\left \{ \sum_{j=1}^{k}I(y^{(i)}=j) \log\frac{e^{\theta_j^TX^{(i)}}}{\sum _{l=1}^{k}e^{\theta_l^TX^{(i)}}} \right \} \right ] ∂θj∂J(θ)=−m1[i=1∑m▽θj{j=1∑kI(y(i)=j)log∑l=1keθlTX(i)eθjTX(i)}]
如果 y ( i ) = j y^{(i)}=j y(i)=j
∂ J ( θ ) ∂ θ j = − 1 m ∑ i = 1 m [ ( 1 − e θ j T X ( i ) ∑ l = 1 k e θ l T X ( i ) ) X ( i ) ] \frac{\partial J(\theta )}{\partial \theta _j} = -\frac{1}{m}\sum_{i=1}^{m}\left [\left ( 1-\frac{e^{\theta_j^TX^{(i)}}}{\sum _{l=1}^{k}e^{\theta_l^TX^{(i)}}} \right )X^{(i)} \right ] ∂θj∂J(θ)=−m1i=1∑m[(1−∑l=1keθlTX(i)eθjTX(i))X(i)]
如果 y ( i ) ≠ j y^{(i)}\neq j y(i)=j
∂ J ( θ ) ∂ θ j = − 1 m ∑ i = 1 m [ ( − e θ j T X ( i ) ∑ l = 1 k e θ l T X ( i ) ) X ( i ) ] \frac{\partial J(\theta )}{\partial \theta _j} = -\frac{1}{m}\sum_{i=1}^{m}\left [\left ( -\frac{e^{\theta_j^TX^{(i)}}}{\sum _{l=1}^{k}e^{\theta_l^TX^{(i)}}} \right )X^{(i)} \right ] ∂θj∂J(θ)=−m1i=1∑m[(−∑l=1keθlTX(i)eθjTX(i))X(i)]
参考链接1,链接2
def softmax(a,k):
c = np.max(a,axis=1)
exp_a = np.exp(a - (np.ones([k,1])*c).T) # 防止溢出的对策
sum_exp_a = np.sum(exp_a,axis=1)
y = exp_a / (np.ones([k,1])*sum_exp_a).T
return y
def gradient_descent_batch(current_w, alpha,k, X,Y):
'''梯度更新和损失值
input: X,Y
current_w :现在的 w 和 b
alpha:学习率
output: updated_beta: 更新后的 w 和 b
err: 错误率'''
h = softmax(np.dot(X,current_w),k)
# 损失函数
cost= -np.mean(np.eye(k)[Y]*np.log(h))
error = np.eye(k)[Y] - h
w = current_w + (alpha/len(X))*np.dot(X.T,error) # 权值修正 batch gradient descent
return w,cost
def optimal_model_batch(feature_data, label_data,cost_limi, epoch, alpha):
'''利用梯度下降法训练LR模型
input: feature_data(mat)样例数据
label_data(mat)标签数据
epoch(int)训练的轮数
alpha(float)学习率
output: w(mat):权重
'''
k=np.unique(label_data).shape[0]
X = np.c_[feature_data,np.ones(len(feature_data))]
y = label_data
n = np.shape(feature_data)[1] # 特征个数
w = np.ones([n+1,k]) # 初始化权重
i = 0
li_cost=[]
while i <= epoch:
i += 1
# sigma函数
w,cost = gradient_descent_batch(w, alpha,k, X,y) #梯度更新
if i==1:
cost_between=cost_limi+1
else:
cost_between= np.abs(li_cost[-1]-cost)
li_cost.append(cost)
if i % 100 == 0:
print("\t-----iter = " + str(i) + \
", train error rate = " + str(cost))
elif cost_between<=cost_limi:
print("\t-----iter = " + str(i) + \
", train error rate = " + str(cost))
break
return w,li_cost
X,y = datasets.load_iris(return_X_y=True)
alpha=0.1
num_iter=40000
X=X_train
Y=y_train
cost_limi=0.000001
li_x=optimal_model_batch(X,Y,cost_limi, num_iter, alpha)
# sklearn
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(penalty='none',solver="sag",multi_class="multinomial",tol=cost_limi)
clf.fit(X,Y)
[clf.coef_,clf.intercept_]
其实线性判别和PCA有很大的相似之处,尤其都是针对某一个矩阵求解方程,但是在线性判别分析中特征向量有明确的意义是针对d维数据的d’投影,尤其是使类与类之间差别最大的结果,所以是一种监督降维的方法
对于多分类问题LDA:
W是 S w − 1 S b S_{w}^{-1}S_{b} Sw−1Sb的d’个最大非零广义特征值多对应的特征向量组成的矩阵 d’<=N-1
引用链接 基本没有改动
import numpy as np
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
def LDA_dimensionality(X, y, k):
'''
X为数据集,y为label,k为目标维数
'''
label_ = list(set(y))
X_classify = {}
# 每一类对应的样本
for label in label_:
Xi = np.array([X[i] for i in range(len(X)) if y[i] == label])
X_classify[label] = Xi
mju = np.mean(X, axis=0)
mju_classify = {}
# 每一类的mju
for label in label_:
mjui = np.mean(X_classify[label], axis=0)
mju_classify[label] = mjui
# 计算类内散度矩阵
# Sw = np.dot((X - mju).T, X - mju)
Sw = np.zeros((len(mju), len(mju)))
for i in label_:
Sw += np.dot((X_classify[i] - mju_classify[i]).T,
X_classify[i] - mju_classify[i])
# 计算类间散度矩阵
# Sb=St-Sw
Sb = np.zeros((len(mju), len(mju)))
for i in label_:
Sb += len(X_classify[i]) * np.dot((mju_classify[i] - mju).reshape(
(len(mju), 1)), (mju_classify[i] - mju).reshape((1, len(mju))))
eig_vals, eig_vecs = np.linalg.eig(
np.linalg.inv(Sw).dot(Sb)) # 计算Sw-1*Sb的特征值和特征矩阵
# 按从小到大排序,输出排序指示值
sorted_indices = np.argsort(eig_vals)
# 反转
sorted_indices = sorted_indices[::-1]
# 提取前k个特征向量
topk_eig_vecs = eig_vecs[:, sorted_indices[0:k:1]]
"""https://blog.csdn.net/qq_41973536/article/details/82690242
s[i:j:k],i起始位置,j终止位置,k表示步长,默认为1
s[::-1]是从最后一个元素到第一个元素复制一遍(反向)
"""
"""也可使用如下方法"""
# # 按从小到大排序,输出排序指示值
# sorted_indices = np.argsort(eig_vals)
# # 提取前k个特征向量
# topk_eig_vecs = eig_vecs[:, sorted_indices[:-k - 1:-1]]
return topk_eig_vecs
if '__main__' == __name__:
iris = load_iris()
X = iris.data
y = iris.target
W = LDA_dimensionality(X, y, 2)
X_new = np.dot((X), W) # 估计值
plt.figure(1)
plt.scatter(X_new[:, 0], X_new[:, 1], marker='o', c=y)
plt.title("LDA reducing dimension - our method")
plt.xlabel("x1")
plt.ylabel("x2")
# 与sklearn中的LDA函数对比
lda = LinearDiscriminantAnalysis(n_components=2)
lda.fit(X, y)
X_new = lda.transform(X)
X_new = - X_new # 为了对比方便,取个相反数,并不影响分类结果
print(X_new)
plt.figure(2)
plt.scatter(X_new[:, 0], X_new[:, 1], marker='o', c=y)
plt.title("LDA reducing dimension - sklearn method")
plt.xlabel("x1")
plt.ylabel("x2")
plt.show()