Logistic Regression

A hand-written mini-batch gradient-descent optimizer for solving logistic regression.
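
The model is the standard one: predict y_hat = sigmoid(Xw + b) and minimize the binary cross-entropy loss with mini-batch gradient descent. Below is a minimal, self-contained sketch of a single update step on a made-up toy batch (the numbers are purely illustrative); the full script that follows applies the same update over many epochs and batches.

# Minimal sketch of one mini-batch gradient-descent step for logistic
# regression; the toy batch below is invented purely for illustration.
import numpy as np

X = np.array([[0.5, 1.2], [1.0, -0.3], [0.2, 0.8]])   # 3 samples, 2 features
Y = np.array([1.0, 0.0, 1.0])                          # binary labels
w, b, l_rate = np.zeros(2), 0.0, 0.1

y_hat = 1.0 / (1.0 + np.exp(-(np.dot(X, w) + b)))                  # sigmoid(Xw + b)
loss = -np.sum(Y * np.log(y_hat) + (1 - Y) * np.log(1 - y_hat))    # cross-entropy
w_grad = np.mean(-X * (Y - y_hat).reshape(-1, 1), axis=0)          # d(loss)/dw, averaged over the batch
b_grad = np.mean(-(Y - y_hat))                                     # d(loss)/db, averaged over the batch
w, b = w - l_rate * w_grad, b - l_rate * b_grad                    # gradient-descent update
print(loss, w, b)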

Sample Code

# import libraries
import os
import numpy as np
import argparse
from math import floor
import pandas as pd


# IO File
def load_data(train_data_path, train_label_path, test_data_path):
    X_train = pd.read_csv(train_data_path, sep = ',', header = 0)
    X_train = np.array(X_train.values)
    Y_train = pd.read_csv(train_label_path, sep = ',', header = 0)
    Y_train = np.array(Y_train.values)
    X_test = pd.read_csv(test_data_path, sep = ',', header = 0)
    X_test = np.array(X_test.values)
    return (X_train, Y_train, X_test)



# define shuffle
def _shuffle(X, Y):
    randomize = np.arange(len(X))
    np.random.shuffle(randomize)
    return (X[randomize], Y[randomize])


# define normalize
def normalize(X_all, X_test):
    # Feature normalization with train and test X
    X_train_test = np.concatenate((X_all, X_test))
    mu = np.mean(X_train_test, axis=0)     # mean of each column
    sigma = np.std(X_train_test, axis=0)   # axis=0: standard deviation of each column
    # print(mu.shape)      # (106,)
    # print(sigma.shape)   # (106,)
    mu = np.tile(mu, (X_train_test.shape[0], 1))
    sigma = np.tile(sigma, (X_train_test.shape[0], 1))
    # print(mu.shape)      # (48842, 106)
    # print(sigma.shape)   # (48842, 106)
    X_train_test_normed = (X_train_test - mu) / sigma  # z-score standardization

    # Split to train, test again
    X_all = X_train_test_normed[0:X_all.shape[0]]
    X_test = X_train_test_normed[X_all.shape[0]:]
    return X_all, X_test



# define split valid
def split_valid_set(X_all, Y_all, percentage):
    # 'percentage' is the fraction of the data kept for training;
    # the remainder becomes the validation set
    all_data_size = len(X_all)
    train_data_size = int(floor(all_data_size * percentage))

    X_all, Y_all = _shuffle(X_all, Y_all)
    X_train, Y_train = X_all[0:train_data_size], Y_all[0:train_data_size]
    X_valid, Y_valid = X_all[train_data_size:], Y_all[train_data_size:]

    return X_train, Y_train, X_valid, Y_valid


# define sigmoid
def sigmoid(z):
    res = 1 / (1.0 + np.exp(-z))
    return np.clip(res, 1e-8, 1-(1e-8))



# get valid score
def valid(w, b, X_valid, Y_valid):
    valid_data_size = len(X_valid)

    z = (np.dot(X_valid, np.transpose(w)) + b)
    y = sigmoid(z)
    y_ = np.around(y)
    result = (np.squeeze(Y_valid) == y_)
    print('Validation acc = %f ' % (float (result.sum()) / valid_data_size))
    return


# train model
def train(X_all, Y_all, save_dir):
    # split a 10% validation set from the training set
    train_set_percentage = 0.9  # first 90% for training, last 10% for validation
    X_train, Y_train, X_valid, Y_valid = split_valid_set(X_all, Y_all, train_set_percentage)
    # print(X_train.shape)  # (29304, 106)
    # print(X_valid.shape)  # (3257, 106)

    # Initialize parameters and hyperparameters
    w = np.zeros((X_train.shape[1],))  # one weight per feature (106 for this dataset)
    b = np.zeros((1,))
    l_rate = 0.1
    batch_size = 32
    train_data_size = len(X_train)
    step_num = int(floor(train_data_size / batch_size))
    epoch_num = 1000      # number of training epochs
    save_param_iter = 50  # save w and b every 50 epochs

    # Start training
    total_loss = 0.0
    for epoch in range(0, epoch_num): 
        # Do validation and parameter saving  
        if epoch % save_param_iter == 0:
            print('===== Saving Param at epoch %d =====' % epoch)
            if not os.path.exists(save_dir):
                os.mkdir(save_dir)

            np.savetxt(os.path.join(save_dir, 'w'), w)
            np.savetxt(os.path.join(save_dir, 'b'), [b, ])
            # average loss per sample over the last save_param_iter epochs
            # (at epoch 0 nothing has been trained yet, so this prints 0)
            print('epoch avg loss = %f ' % (total_loss / (float(save_param_iter) * step_num * batch_size)))

            total_loss = 0.0
            valid(w, b, X_valid, Y_valid)

        # Random shuffle: reshuffle the training data every epoch
        X_train, Y_train = _shuffle(X_train, Y_train)
        # print(X_train.shape)  # (29304, 106)
        # print(Y_train.shape)  # (29304, 1)
        
        # Train with batch
        for idx in range(step_num):  
            X = X_train[idx * batch_size : (idx + 1) * batch_size]
            Y = Y_train[idx * batch_size : (idx + 1) * batch_size]
            # print(X.shape) # (32, 106)
            # print(Y.shape) # (32, 1)

            z = np.dot(X, np.transpose(w)) + b  # transposing changes nothing here: w is 1-D, so w and np.transpose(w) have the same shape
            y = sigmoid(z)
            # print(w.shape)   # (106,)
            # print(np.transpose(w).shape)  # (106,)
            # print(z.shape)  # (32,)
            # print(y.shape)  # (32,)
            
            # cross-entropy loss for this batch
            # np.squeeze cannot be dropped here: (32, 1) and (32,) behave differently
            cross_entropy = -1 * (np.dot(np.squeeze(Y), np.log(y)) + np.dot((1 - np.squeeze(Y)), np.log(1 - y)))
            total_loss += cross_entropy
            # print(cross_entropy) # a single number
            # print(total_loss)    # a single number

            # gradients of w and b
            w_grad = np.mean( -1 * X * (np.squeeze(Y) - y).reshape((batch_size, 1)), axis = 0)  # axis=0: average over the batch (rows) for each feature (column)
            # print((np.squeeze(Y) - y).shape)   # (32,)
            # print((np.squeeze(Y) - y).reshape((batch_size, 1)).shape)  # (32,1)
            b_grad = np.mean( -1 * (np.squeeze(Y) - y))
            # print(w_grad.shape)  # (106,)
            # print(b_grad)        # a single number

            # SGD updating parameters
            w = w - l_rate * w_grad
            b = b - l_rate * b_grad
            # print(w.shape)         # (106,)
            # print(b.shape)         # (1,)
    return


# infer & output log_prediction.csv
def infer(X_test, save_dir, output_dir):
    test_data_size = len(X_test)

    # load parameters
    print('=====Loading Param from %s =====' % save_dir)
    w = np.loadtxt(os.path.join(save_dir, 'w'))
    b = np.loadtxt(os.path.join(save_dir, 'b'))

    # predict
    z = (np.dot(X_test, np.transpose(w)) + b)
    y = sigmoid(z)
    y_ = np.around(y)

    print('=====Write output to %s =====' % output_dir)
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    output_path = os.path.join(output_dir, 'log_prediction.csv')
    with open(output_path, 'w') as f:
        f.write('id,label\n')
        for i, v in enumerate(y_):
            f.write('%d,%d\n' % (i + 1, v)) 
    return


# main block
def main(opts):
    # Load feature and label
    X_all, Y_all, X_test = load_data(opts.train_data_path, opts.train_label_path, opts.test_data_path)
    # print(X_all.shape)   # (32561, 106)
    # print(Y_all.shape)   # (32561, 1)
    # print(X_test.shape)  # (16281, 106)

    # Normalization
    X_all, X_test = normalize(X_all, X_test)
    # print(X_all.shape)     # (32561, 106) 
    # print(X_test.shape)    # (16281, 106)

    # Train or infer ("infer" here simply means predicting on the test set)
    if opts.train:
        train(X_all, Y_all, opts.save_dir)
    elif opts.infer:
        infer(X_test, opts.save_dir, opts.output_dir)
    else:
        print("Error: Argument --train or --infer not found")
    return


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description = 'Logistic Regression with Gradient Descent Method')

    group = parser.add_mutually_exclusive_group()  # --train and --infer are mutually exclusive
    group.add_argument('--train', action = 'store_true', default = False, dest = 'train', help = 'Input --train to Train')
    group.add_argument('--infer', action = 'store_true', default = False, dest = 'infer', help = 'Input --infer to Infer')

    parser.add_argument('--train_data_path', type=str, default=r'E:\李宏毅\机器学习\hw2\X_train', dest='train_data_path', help='Path to training data')
    parser.add_argument('--train_label_path', type=str, default=r'E:\李宏毅\机器学习\hw2\Y_train', dest='train_label_path', help='Path to training data\'s label')
    parser.add_argument('--test_data_path', type=str, default=r'E:\李宏毅\机器学习\hw2\X_test', dest='test_data_path', help='Path to testing data')
    parser.add_argument('--save_dir', type=str, default=r'E:\李宏毅\机器学习\hw2\save', dest='save_dir', help='Path to save the model parameters')
    parser.add_argument('--output_dir', type=str, default=r'E:\李宏毅\机器学习\hw2\predict', dest='output_dir', help='Path to save the prediction output')
    opts = parser.parse_args()
    main(opts)
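
Usage

Assuming the script is saved as, say, logistic_regression.py (the filename is an example, not given above), training and inference are run separately:

python logistic_regression.py --train --train_data_path X_train --train_label_path Y_train --save_dir save
python logistic_regression.py --infer --test_data_path X_test --save_dir save --output_dir predict

Training writes w and b to save_dir every 50 epochs; inference loads them and writes log_prediction.csv into output_dir. Note that main() calls load_data() before branching, so all three data paths must point to valid files in either mode.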
