Logistic Regression (Log-Odds Regression): Dataset + Code Implementation

For the derivation of the formulas, see the post linked below; here we turn to implementing logistic (log-odds) regression.

https://blog.csdn.net/Willen_/article/details/88867986

Every step of the implementation is annotated with comments.
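
For reference, these are the model and gradient formulas that the propagate and error_rate functions below compute (see the linked post for the full derivation):

$$\sigma(z) = \frac{1}{1 + e^{-z}}, \qquad A = \sigma(Xw + b)$$

$$J(w, b) = -\frac{1}{m}\sum_{i=1}^{m}\bigl[y_i \log a_i + (1 - y_i)\log(1 - a_i)\bigr]$$

$$\frac{\partial J}{\partial w} = \frac{1}{m}X^{T}(A - Y), \qquad \frac{\partial J}{\partial b} = \frac{1}{m}\sum_{i=1}^{m}(a_i - y_i)$$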

import numpy as np
import h5py


def load_data(file_name):
    '''
    Load the data set.
    :param file_name: (string) path to the training data
    :return: feature_mat (ndarray) features,
             label_mat (ndarray) labels
    '''
    fr = open(file_name)
    feature_data = []
    lable_data = []
    for line in fr.readlines():
        curLine = []
        lineArr = line.split('\t')
        # the first two columns are the feature values
        for i in range(0, 2):
            curLine.append(float(lineArr[i]))
        feature_data.append(curLine)
        # lines without a third column carry no label (e.g. unlabeled test data)
        if len(lineArr) < 3:
            continue
        tempLine = []
        # the third column is the 0/1 label
        for i in range(2, 3):
            tempLine.append(int(lineArr[i]))
        lable_data.append(tempLine)
    feature_mat = np.array(feature_data, dtype=float)
    label_mat = np.array(lable_data, dtype=int)
    fr.close()
    return feature_mat, label_mat
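
# Note on the data format (inferred from load_data above; the actual file is not shown):
# each line of train_data.txt is expected to hold two tab-separated feature values,
# optionally followed by a 0/1 label in a third column, for example:
#     1.2\t3.4\t1
#     0.7\t2.1\t0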


def propagate(w, b, X, Y):
    '''
    One forward/backward pass: compute the cost and the gradients of w and b.
    :param w: (ndarray) weights, shape (n_features, 1)
    :param b: (float) bias
    :param X: (ndarray) input features, shape (m, n_features)
    :param Y: (ndarray) labels, shape (m, 1)
    :return: grads (dict) gradients dw and db, cost (float) cross-entropy loss
    '''
    m = Y.shape[0]
    A = sig(np.dot(X, w) + b)             # predicted probabilities
    cost = error_rate(A, Y)               # cross-entropy loss
    dW = 1.0 / m * np.dot(X.T, (A - Y))   # gradient w.r.t. w
    db = 1.0 / m * np.sum(A - Y)          # gradient w.r.t. b
    assert (dW.shape == w.shape)
    assert (db.dtype == float)
    assert (cost.shape == ())
    grads = {
        "dw": dW,
        "db": db
    }
    return grads, cost
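
# Optional sanity check (not part of the original code): the analytic gradient dW can be
# verified against a finite-difference approximation of the cost, e.g. for w[0, 0]:
#     eps = 1e-7
#     w_plus = w.copy();  w_plus[0, 0] += eps
#     w_minus = w.copy(); w_minus[0, 0] -= eps
#     num_grad = (error_rate(sig(np.dot(X, w_plus) + b), Y)
#                 - error_rate(sig(np.dot(X, w_minus) + b), Y)) / (2 * eps)
#     # num_grad should be close to grads["dw"][0, 0]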

def error_rate(h, lable_data):
    '''
    Compute the current value of the loss function (cross-entropy).
    :param h: (ndarray) predicted probabilities
    :param lable_data: (ndarray) true labels
    :return: cost (float) cross-entropy loss
    '''
    m = lable_data.shape[0]
    cost = -1.0 / m * np.sum(lable_data * np.log(h) + (1 - lable_data) * np.log(1 - h))
    cost = np.squeeze(cost)
    return cost
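
# Note (not in the original code): if any prediction h reaches exactly 0 or 1,
# np.log produces -inf and the cost becomes nan. A common safeguard is to clip
# the predictions before evaluating the cross-entropy above, e.g.
#     h = np.clip(h, 1e-12, 1 - 1e-12)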

def sig(x):
    '''
    Sigmoid function.
    :param x: (ndarray) feature_data * w + b
    :return: (ndarray) sigmoid of x
    '''
    z = 1.0 / (1 + np.exp(-x))
    return z

def shuffle(feature, label):
    '''
    Shuffle the features and labels with the same permutation.
    :param feature: the input data, shape (num, feature)
    :param label: the labels, shape (num, 1)
    :return: shuffle_feature (num, feature), shuffle_label (num, 1)
    '''
    m = feature.shape[0]
    permutation = list(np.random.permutation(m))
    shuffle_feature = feature[permutation, :]
    shuffle_label = label[permutation, :]
    return shuffle_feature, shuffle_label

def lr_train_bgd(feature_data, lable_data, maxCycle, alpha):
    '''
    Train a logistic regression model with batch gradient descent.
    :param feature_data: (ndarray) sample features
    :param lable_data: (ndarray) labels
    :param maxCycle: (int) maximum number of iterations
    :param alpha: (float) learning rate
    :return: w (ndarray) weights, b (float) bias
    '''
    w = np.random.randn(feature_data.shape[1], 1)
    b = 0
    for i in range(maxCycle):
        grads, cost = propagate(w, b, feature_data, lable_data)
        dW = grads["dw"]
        db = grads["db"]
        if i % 50 == 0:
            print("iteration %d, cost: %f" % (i, cost))
        w = w - alpha * dW
        b = b - alpha * db
    return w, b

def save_model(name, w, b):
    '''
    Save the trained parameters w and b to an HDF5 file.
    '''
    f = h5py.File(name, 'w')
    f.create_dataset('w', data=w)
    f.create_dataset('b', data=b)
    f.close()


if __name__ == "__main__":
    # 1. load the data
    print("------1. load data------")
    feature_data, lable_data = load_data("train_data.txt")
    feature_data, lable_data = shuffle(feature_data, lable_data)
    # 2. train the model
    print("------2. train------")
    w, b = lr_train_bgd(feature_data, lable_data, 1000000, 0.0001)
    # 3. save the final model
    print("------3. save model------")
    save_model("weight", w, b)


Test code:

import numpy as np
import h5py
import train as at
import matplotlib.pyplot as plt
def loadParameter(name):
    '''
    Load the trained parameters from an HDF5 file.
    :param name: (string) h5py file name
    :return: w (ndarray) weights, b (ndarray) bias
    '''
    W_file = h5py.File(name, 'r')
    w = np.array(W_file['w'])
    b = np.array(W_file['b'])
    W_file.close()
    return w, b

def predict(w, b, feature_data):
    '''
    Predict 0/1 labels for the given features.
    :param w: (ndarray) weights
    :param b: (ndarray) bias
    :param feature_data: (ndarray) input features
    :return: (ndarray of bool) True where the predicted probability exceeds 0.5
    '''
    out = at.sig(np.dot(feature_data, w) + b)
    out = out > 0.5
    return out

if __name__ == "__main__":
    # 1. load the data
    print("------1. load data------")
    feature_data, lable_data = at.load_data("test_data.txt")
    # 2. load the trained model
    print("------2. load model------")
    w, b = loadParameter("weight")

    # plot the sigmoid curve as a quick sanity check
    x = np.arange(0, 10)
    y = at.sig(x)
    plt.plot(x, y)
    plt.show()

    # 3. predict on the test data
    print("------3. predict------")
    out = predict(w, b, feature_data)
    print(out)
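
If the test file also carries 0/1 labels in its third column, load_data returns them as well, and a small accuracy check can be appended to the script above. This is a sketch that assumes lable_data is non-empty and row-aligned with feature_data:

    # optional: compare the 0/1 predictions with the test labels, if they are present
    if lable_data.size > 0:
        accuracy = np.mean(out.astype(int) == lable_data)
        print("test accuracy: %.4f" % accuracy)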

 

Dataset: https://download.csdn.net/download/willen_/11110641
