(手写推导的公式传了好几次都传不上来)
以下是南非男性心脏病的一个逻辑回归例子:
传统的逻辑回归代码通常会在X中加入一列全为1的常数列,并把偏置放在weights的首位,方便矩阵运算。我这里是把偏置和权重分开算的
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
"""
强烈建议一步一步来,每一步看看输出的结果是不是自己想要的。
"""
# Load the South African male heart-disease data set.
SAHeart_df = pd.read_csv('SAHeart.csv')
# Encode the categorical column famhist as a single 0/1 indicator.
# get_dummies returns one column per category, and recent pandas refuses to
# assign that multi-column frame to a single column (older releases appear to
# have silently kept the first one), so select the first dummy column
# explicitly and cast it to int so the feature matrix stays purely numeric.
SAHeart_df['famhist'] = pd.get_dummies(SAHeart_df['famhist']).iloc[:, 0].astype(int)
# Build X (selected features) and Y (the chd label, as an (n, 1) array).
X = SAHeart_df[['famhist', 'sbp', 'tobacco', 'ldl', 'adiposity', 'typea', 'obesity', 'alcohol', 'age']]
Y = np.array(SAHeart_df[['chd']])
# Standardise the continuous features column by column: (value - mean) / std.
cols_to_Normal = ['sbp', 'tobacco', 'ldl', 'adiposity', 'typea', 'obesity', 'alcohol', 'age']
X_new = X[cols_to_Normal].apply(lambda rec: (rec - rec.mean()) / rec.std(), axis=0)
# The 0/1 indicator is appended unscaled, as the last column.
X_new['famhist'] = SAHeart_df['famhist']
X_new = np.array(X_new)  # plain ndarray for the matrix arithmetic below
def initialize_betas(dim):
    """Return a random starting point for gradient descent.

    Args:
        dim: Number of weights to draw (one per feature column).

    Returns:
        A tuple ``(b, w)`` where ``b`` is a float bias drawn with
        ``random.random()`` and ``w`` is a 1-D array of ``dim`` values drawn
        with ``np.random.rand``.
    """
    b = random.random()  # bias
    w = np.random.rand(dim)
    return b, w
# Draw the initial bias and one weight per column of X_new.
b,w = initialize_betas(X_new.shape[1])
"""
这个地方千万注意y_hat 的预测值是不能大于1或者小于0。如果出现了大于1 的数则说明公式打错了
"""
def sigmoid(b,w,X_new):
    """Return the logistic-model probabilities for every row of ``X_new``.

    Args:
        b: Scalar bias.
        w: Weight vector with one entry per column of ``X_new``.
        X_new: Feature matrix.

    Returns:
        ``1 / (1 + exp(-(b + X_new @ w)))``, evaluated element-wise.
    """
    Z = b + np.matmul(X_new, w)
    # 1 / (1 + exp(-Z)) == exp(-log(1 + exp(-Z))).  logaddexp evaluates the
    # inner log without overflowing, so very negative Z yields 0.0 instead of
    # an overflow warning from exp(-Z).
    return np.exp(-np.logaddexp(0, -Z))
# Predicted values for the initial parameters.
y_hat = sigmoid(b,w,X_new)
# Loss function
def get_cost(y,y_hat):
    """Return the mean binary cross-entropy between labels and predictions.

    cost = -(1/n) * sum(y * log(y_hat) + (1 - y) * log(1 - y_hat))

    Args:
        y: 0/1 labels, either 1-D or a single column; flattened internally.
        y_hat: Predicted probabilities, one per label.

    Returns:
        The average negative log-likelihood as a scalar.
    """
    y = np.ravel(y)
    # Keep probabilities strictly inside (0, 1) so log() never sees 0, which
    # would otherwise produce -inf / nan for saturated predictions.
    eps = 1e-15
    y_hat = np.clip(np.ravel(y_hat), eps, 1 - eps)
    log_likelihood = np.dot(y, np.log(y_hat)) + np.dot(1 - y, np.log(1 - y_hat))
    return -log_likelihood / len(y)
# Flatten Y into a 1-D array of length len(Y).
Y=Y.reshape(len(Y), )
# Cost of the initial parameters.
current_cost= get_cost(Y,y_hat)
def update_beta(b_0, w_0, y, y_hat, X_new, alpha):
    """Perform one gradient-descent step on the bias and the weights.

    Args:
        b_0: Current bias.
        w_0: Current weight vector.
        y: 1-D array of labels.
        y_hat: 1-D array of predictions for the current parameters.
        X_new: Feature matrix with one row per label.
        alpha: Learning rate.

    Returns:
        A tuple ``(b, w)`` holding the updated bias and weight vector.
    """
    residual = y_hat - y
    n_samples = len(y)
    # Bias update: the gradient is the mean residual.
    db = np.sum(residual) / n_samples
    b_0 = b_0 - alpha * db
    # Weight update: dw is the mean gradient; the minus sign of the formula is
    # already folded into (y_hat - y).
    dw = np.dot(residual, X_new) / n_samples
    w_0 = w_0 - alpha * dw  # step along the negative gradient
    return b_0, w_0
if __name__ == '__main__':
    # Batch gradient descent: fixed number of iterations and learning rate.
    num_iterations = 2500
    alpha = 0.01
    all_costs = []
    b, w = initialize_betas(X_new.shape[1])
    print("initial guess of b and w: ", b, w)
    for each_iter in range(num_iterations):
        # Evaluate the current parameters, record the cost, then take a step.
        y_hat = sigmoid(b, w, X_new)
        current_cost = get_cost(Y, y_hat)
        b, w = update_beta(b, w, Y, y_hat, X_new, alpha)
        all_costs.append(current_cost)
        if each_iter % 10 == 0:
            print('Iteration: ', each_iter, 'Cost: ', current_cost)
    print("Final estimates of b and w are: ", b, w)
    # Plot the cost recorded at every iteration.
    plt.scatter(range(num_iterations), all_costs)
    plt.show()
下面是使用sklearn调包算的:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
# Load the South African male heart-disease data set.
SAHeart_df = pd.read_csv('SAHeart.csv')
# Encode the categorical column famhist as a single 0/1 indicator.
# get_dummies returns one column per category, and recent pandas refuses to
# assign that multi-column frame to a single column (older releases appear to
# have silently kept the first one), so select the first dummy column
# explicitly and cast it to int.
SAHeart_df['famhist'] = pd.get_dummies(SAHeart_df['famhist']).iloc[:, 0].astype(int)
# Build X (selected features) and Y (labels).
X = SAHeart_df[['famhist', 'sbp', 'tobacco', 'ldl', 'adiposity', 'typea', 'obesity', 'alcohol', 'age']]
# scikit-learn expects a 1-D label vector; a column vector triggers a
# DataConversionWarning.
Y = np.array(SAHeart_df['chd'])
# Split into training and test sets before any fitting, so that nothing
# learned from the test rows leaks into the model or the scaler.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)
# Standardise the features using statistics from the training rows only.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
clf = LogisticRegression(C=1, penalty='l2', fit_intercept=True, solver='newton-cg')  # instantiate
# Fit on the training set only; the held-out score below is meaningless if
# the model has already seen the test rows.
lmodel = clf.fit(x_train, y_train)
print(lmodel.intercept_)
print(lmodel.coef_)
score = clf.score(x_test, y_test)
print(score)