For the derivation of the formulas, see the link below; next, let's implement logistic (logit) regression.
https://blog.csdn.net/Willen_/article/details/88867986
The implementation below is commented throughout.
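As a quick reference, these are the standard quantities the code below computes: the model and sigmoid (sig), the cross-entropy loss (error_rate), and its gradients (propagate), with X of shape (m, n), w of shape (n, 1) and labels y in {0, 1}:

$$\hat{y} = \sigma(Xw + b), \qquad \sigma(z) = \frac{1}{1 + e^{-z}}$$

$$J(w, b) = -\frac{1}{m}\sum_{i=1}^{m}\Big[y^{(i)}\log \hat{y}^{(i)} + \big(1 - y^{(i)}\big)\log\big(1 - \hat{y}^{(i)}\big)\Big]$$

$$\frac{\partial J}{\partial w} = \frac{1}{m}X^{T}(\hat{y} - y), \qquad \frac{\partial J}{\partial b} = \frac{1}{m}\sum_{i=1}^{m}\big(\hat{y}^{(i)} - y^{(i)}\big)$$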
import numpy as np
import h5py
def load_data(file_name):
    '''
    Load the data.
    :param file_name: (string) path to the data file
    :return: feature_mat (ndarray) features,
             label_mat (ndarray) labels
    '''
    feature_data = []
    label_data = []
    with open(file_name) as fr:
        for line in fr.readlines():
            lineArr = line.strip().split('\t')
            # the first two columns are the features
            curLine = []
            for i in range(0, 2):
                curLine.append(float(lineArr[i]))
            feature_data.append(curLine)
            # lines without a third column carry no label (e.g. unlabeled data)
            if len(lineArr) < 3:
                continue
            label_data.append([int(lineArr[2])])
    feature_mat = np.array(feature_data, dtype=float)
    label_mat = np.array(label_data, dtype=int)
    return feature_mat, label_mat
def propagate(w, b, X, Y):
    '''
    One forward/backward pass: compute the cost and the gradients.
    :param w: (ndarray) weights, shape (n_features, 1)
    :param b: (float) bias
    :param X: (ndarray) input features, shape (m, n_features)
    :param Y: (ndarray) labels, shape (m, 1)
    :return: grads (dict) gradients dw and db, cost (float) cross-entropy loss
    '''
    m = Y.shape[0]
    A = sig(np.dot(X, w) + b)              # predicted probabilities
    cost = error_rate(A, Y)                # cross-entropy loss
    dW = 1.0 / m * np.dot(X.T, (A - Y))    # gradient w.r.t. w
    db = 1.0 / m * np.sum(A - Y)           # gradient w.r.t. b
    assert (dW.shape == w.shape)
    assert (db.dtype == float)
    assert (cost.shape == ())
    grads = {
        "dw": dW,
        "db": db
    }
    return grads, cost
def error_rate(h, label_data):
    '''
    Compute the current value of the loss function (cross-entropy).
    :param h: (ndarray) predicted probabilities
    :param label_data: (ndarray) true labels
    :return: cost (float) cross-entropy loss
    '''
    m = label_data.shape[0]
    cost = -1.0 / m * np.sum(label_data * np.log(h) + (1 - label_data) * np.log(1 - h))
    cost = np.squeeze(cost)
    return cost
def sig(x):
    '''
    Sigmoid function.
    :param x: (ndarray) feature_data * w + b
    :return: (ndarray) sigmoid of x
    '''
    z = 1.0 / (1 + np.exp(-x))
    return z
def shuffle(feature, label):
    '''
    Shuffle the features and labels with the same random permutation.
    :param feature: (ndarray) input data, shape (num, n_features)
    :param label: (ndarray) labels, shape (num, 1)
    :return: shuffled feature (num, n_features) and label (num, 1)
    '''
    m = feature.shape[0]
    permutation = list(np.random.permutation(m))
    shuffle_feature = feature[permutation, :]
    shuffle_label = label[permutation, :]
    return shuffle_feature, shuffle_label
def lr_train_bgd(feature_data, label_data, maxCycle, alpha):
    '''
    Train the logistic regression model with batch gradient descent.
    :param feature_data: (ndarray) training samples
    :param label_data: (ndarray) labels
    :param maxCycle: (int) maximum number of iterations
    :param alpha: (float) learning rate
    :return: w (ndarray) weights, b (float) bias
    '''
    w = np.random.randn(feature_data.shape[1], 1)
    b = 0
    for i in range(maxCycle):
        grads, cost = propagate(w, b, feature_data, label_data)
        dW = grads["dw"]
        db = grads["db"]
        if i % 50 == 0:
            print("cost after iteration %d: %f" % (i, cost))
        # gradient descent update
        w = w - alpha * dW
        b = b - alpha * db
    return w, b
def save_model(name, w, b):
    '''
    Save the learned parameters w and b to an HDF5 file.
    '''
    f = h5py.File(name, 'w')
    f.create_dataset('w', data=w)
    f.create_dataset('b', data=b)
    f.close()
if __name__ == "__main__":
# 1. 导入数据
print("------1. load data------")
feature_data, lable_data = load_data("train_data.txt")
feature_data, lable_data = shuffle(feature_data, lable_data)
# 2. 训练模型
print("------2. train------")
w,b = lr_train_bgd(feature_data, lable_data, 1000000, 0.0001)
# 保存最终模型
print("------3. save model------")
save_model("weight", w,b)
Test code:
import numpy as np
import h5py
import train as at
def loadParameter(name):
    '''
    Load the saved parameters from an HDF5 file.
    :param name: (string) h5py file name
    :return: w (ndarray) weights, b (ndarray) bias
    '''
    W_file = h5py.File(name, 'r')
    w = np.array(W_file['w'])
    b = np.array(W_file['b'])
    W_file.close()
    return w, b
def predict(w, b, feature_data):
    '''
    Predict class labels for the given features.
    :param w: (ndarray) weights
    :param b: (float) bias
    :param feature_data: (ndarray) input features
    :return: (ndarray of bool) True where the predicted probability > 0.5
    '''
    out = at.sig(np.dot(feature_data, w) + b)
    out = out > 0.5
    return out
if __name__ == "__main__":
# 1. 导入数据
print("------1. load data------")
feature_data, lable_data = at.load_data("test_data.txt")
print("------2. load model------")
w,b = loadParameter("weight")
x = np.arange(0,10)
y = at.sig(x)
print("------3. predict------")
out = predict(w, b, feature_data)
print(out)
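The script only prints the raw boolean predictions. If the test file also carries a label column (an assumption; load_data tolerates files without one, in which case label_data is empty), a quick accuracy check inside the __main__ block would be:

# Assumes test_data.txt includes the label column, so label_data is non-empty.
accuracy = np.mean(out.astype(int) == label_data)
print("accuracy: %f" % accuracy)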
Dataset: https://download.csdn.net/download/willen_/11110641