【学习笔记】西瓜书机器学习之第三章:逻辑回归 南非男性心脏病分类预测(二)

(手写推导的公式传了好几次都传不上来)
以下是南非男性心脏病的一个逻辑回归例子:


传统的逻辑回归代码通常会在X中加入一列全为1的常数列,并把偏置b放在weights的首位,这样偏置就能并入矩阵运算。我这里是把偏置和权重分开算的。

import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
# Tip: run this script one step at a time and check that every intermediate
# result looks the way you expect before moving on.

# Load the South African heart disease data set.
SAHeart_df = pd.read_csv('SAHeart.csv')
# Encode the categorical `famhist` column as one numeric 0/1 indicator.
# pd.get_dummies returns one column per category, and recent pandas versions
# reject assigning a multi-column frame to a single column. drop_first keeps
# a single indicator column and astype(float) keeps the feature matrix numeric.
# NOTE(review): assumes famhist has exactly two levels - confirm against the CSV.
SAHeart_df['famhist'] = (
    pd.get_dummies(SAHeart_df['famhist'], drop_first=True).iloc[:, 0].astype(float)
)


# Build the feature table X and the label array Y.
X = SAHeart_df[['famhist', 'sbp', 'tobacco', 'ldl', 'adiposity',
                'typea', 'obesity', 'alcohol', 'age']]
Y = np.array(SAHeart_df[['chd']])

# Standardise the continuous features: subtract the column mean and divide by
# the column standard deviation. The 0/1 famhist indicator is left unscaled.
cols_to_Normal = ['sbp', 'tobacco', 'ldl', 'adiposity', 'typea', 'obesity', 'alcohol', 'age']
X_new = X[cols_to_Normal].apply(lambda rec: (rec - rec.mean()) / rec.std(), axis=0)
# famhist is appended after the scaled columns, so it is the last feature.
X_new['famhist'] = SAHeart_df['famhist']
# Convert to a float ndarray so the matrix arithmetic below works directly.
X_new = np.array(X_new, dtype=float)

def initialize_betas(dim):
    """Draw random starting parameters for the model.

    Args:
        dim: number of weights to generate (one per feature column).

    Returns:
        A ``(b, w)`` tuple: ``b`` is a Python float bias drawn from [0, 1)
        and ``w`` is a NumPy array of ``dim`` weights drawn from [0, 1).
    """
    weights = np.random.rand(dim)
    bias = random.random()
    return bias, weights
# Draw a random starting bias and one starting weight per column of X_new.
b, w = initialize_betas(dim=X_new.shape[1])

def sigmoid(b,w,X_new):
    """Return the logistic-regression prediction for every row of X_new.

    Args:
        b: scalar bias.
        w: weight vector with one entry per column of X_new.
        X_new: feature matrix whose rows are samples.

    Returns:
        Array holding sigmoid(b + X_new @ w) for each row. Every value lies
        in [0, 1]; a prediction outside that range means the formula was
        typed incorrectly.
    """
    Z = b + np.matmul(X_new, w)
    # 1 / (1 + exp(-Z)) rewritten as exp(-log(1 + exp(-Z))). logaddexp
    # evaluates log(1 + exp(-Z)) without overflowing for large negative Z.
    return np.exp(-np.logaddexp(0, -Z))

# Predicted values produced by the current b and w.
y_hat = sigmoid(b=b, w=w, X_new=X_new)


# Loss function
def get_cost(y,y_hat):
    """Return the mean binary cross-entropy between labels and predictions.

    Computes ``-(1/n) * sum(y*log(y_hat) + (1-y)*log(1-y_hat))``.

    Args:
        y: array of 0/1 labels; flattened to 1-D before use.
        y_hat: array of predicted probabilities, same length as ``y``.

    Returns:
        The average cross-entropy as a float scalar.
    """
    y = np.asarray(y, dtype=float).ravel()
    y_hat = np.asarray(y_hat, dtype=float).ravel()
    # Keep probabilities strictly inside (0, 1) so log() never sees 0.
    eps = np.finfo(float).eps
    p = np.clip(y_hat, eps, 1 - eps)
    log_likelihood = np.dot(y, np.log(p)) + np.dot(1 - y, np.log(1 - p))
    return -log_likelihood / len(y)

# Flatten Y to a 1-D vector of length len(Y) before computing the cost.
Y = Y.reshape((Y.shape[0],))
current_cost = get_cost(y=Y, y_hat=y_hat)

def update_beta(b_0, w_0, y, y_hat, X_new, alpha):
    """Take one gradient-descent step on the bias and the weights.

    Args:
        b_0: current bias.
        w_0: current weight vector.
        y: true labels.
        y_hat: current predictions for the rows of X_new.
        X_new: feature matrix.
        alpha: learning rate.

    Returns:
        ``(b, w)``: the bias and weights after the step.
    """
    n_samples = len(y)
    residual = y_hat - y

    # Average gradients; the minus sign of the formula is already folded
    # into (y_hat - y).
    grad_b = np.sum(residual) / n_samples
    grad_w = np.dot(residual, X_new) / n_samples

    # Move against the gradient direction.
    new_b = b_0 - alpha * grad_b
    new_w = w_0 - alpha * grad_w
    return new_b, new_w

if __name__ == '__main__':
    # Batch gradient descent: fixed step size, fixed number of passes.
    num_iterations = 2500
    alpha = 0.01
    all_costs = []
    b, w = initialize_betas(X_new.shape[1])
    print("initial guess of b and w: ", b, w)

    for each_iter in range(num_iterations):
        # The cost is evaluated with the parameters from before this
        # iteration's update.
        y_hat = sigmoid(b, w, X_new)
        current_cost = get_cost(Y, y_hat)
        b, w = update_beta(b, w, Y, y_hat, X_new, alpha)
        all_costs.append(current_cost)
        if each_iter % 10 == 0:
            print('Iteration: ', each_iter, 'Cost: ', current_cost)

    print("Final estimates of b and w are: ", b, w)

    # Plot the cost recorded at every iteration.
    plt.scatter(range(num_iterations), all_costs)
    plt.xlabel('Iteration')
    plt.ylabel('Cost')
    plt.show()


下面是使用sklearn调包算的:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
# Load the South African heart disease data set.
SAHeart_df = pd.read_csv('SAHeart.csv')
# Encode the categorical `famhist` column as one numeric 0/1 indicator.
# pd.get_dummies returns one column per category, and recent pandas versions
# reject assigning a multi-column frame to a single column.
# NOTE(review): assumes famhist has exactly two levels - confirm against the CSV.
SAHeart_df['famhist'] = (
    pd.get_dummies(SAHeart_df['famhist'], drop_first=True).iloc[:, 0].astype(float)
)


# Build the feature table X and the label vector Y.
X = SAHeart_df[['famhist', 'sbp', 'tobacco', 'ldl', 'adiposity',
                'typea', 'obesity', 'alcohol', 'age']]
# Keep Y one-dimensional: scikit-learn expects y of shape (n_samples,).
Y = np.array(SAHeart_df['chd'])

# Training set / test set: hold out 30% of the rows for evaluation.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

# Standardise the features. The scaler is fitted on the training rows only
# and then applied to the test rows, so test statistics do not leak into
# the preprocessing.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

clf = LogisticRegression(C=1, penalty='l2', fit_intercept=True, solver='newton-cg')  # instantiate
# Fit on the training split only; fitting on the full data would make the
# test score below an evaluation on rows the model has already seen.
lmodel = clf.fit(x_train, y_train)
print(lmodel.intercept_)
print(lmodel.coef_)
score = clf.score(x_test, y_test)
print(score)

你可能感兴趣的:(机器学习,python,机器学习,逻辑回归,数据分析)