This post records an implementation of logistic regression together with the use of k-fold cross validation, using the breast cancer dataset from sklearn.
The implementation differs somewhat from sklearn's.
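As a quick standalone illustration of how KFold partitions the sample indices (a minimal sketch on six samples; the actual script below uses 5 folds on the full dataset):

import numpy as np
from sklearn.model_selection import KFold

# with shuffle=False, KFold assigns consecutive index blocks to the test fold
for index_train, index_test in KFold(n_splits=3).split(np.arange(6)):
    print(index_train, index_test)
# [2 3 4 5] [0 1]
# [0 1 4 5] [2 3]
# [0 1 2 3] [4 5]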
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, recall_score, precision_score # evaluation metrics
class logreg_sgd_clf:
    def __init__(self, eta, n_iter_max=10):
        """
        :param eta: learning rate
        :param n_iter_max: maximum number of passes over the training data
        """
        self.eta = eta
        self.n_iter_max = n_iter_max
        self.w = None  # learned weights
        self.margin_max = 0  # maximum functional margin seen during training

    def fit(self, X, y):
        """
        Solve the logistic regression problem by stochastic gradient descent.
        :param X: 2d array of input examples in the rows
        :param y: 1d vector of +1/-1 labels
        """
        m, n = X.shape
        # initialize the weights
        w = np.zeros(n)
        self.margin_max = 0
        # passes over the full data
        for t in range(self.n_iter_max):
            # iterate over the examples
            for i in range(m):
                xymargin = y[i] * np.dot(w, X[i])  # functional margin
                if xymargin > self.margin_max:
                    self.margin_max = xymargin
                # stochastic gradient of the logistic loss log(1 + exp(-margin))
                phi_logistic = 1 / (1 + np.exp(xymargin))  # sigmoid(-margin)
                delta_J = -phi_logistic * y[i] * X[i]
                w = w - self.eta * delta_J
        self.w = w

    def predict(self, X):
        """
        :param X: 2d array of input examples in the rows
        :return: predicted +1/-1 labels
        """
        xw = np.dot(X, self.w)
        # probability of the +1 class
        y_positive_prob = 1 / (1 + np.exp(-xw))
        # probability of the -1 class
        y_negative_prob = 1 / (1 + np.exp(xw))
        # pick the more probable class, mapping True/False to +1/-1
        y = 2 * (y_positive_prob > y_negative_prob) - 1
        return y
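For reference, the inner update in fit is plain SGD on the per-example logistic loss. With labels $y_i \in \{-1, +1\}$,

$$\ell_i(w) = \log\left(1 + e^{-y_i w^\top x_i}\right), \qquad \nabla_w \ell_i(w) = -\frac{y_i x_i}{1 + e^{y_i w^\top x_i}},$$

so phi_logistic in the code is the scalar factor $1/(1 + e^{y_i w^\top x_i})$ and delta_J is the full gradient.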
The data is normalized before training: the script supports normalizing each row to unit $L_2$ norm (scaling = 1) or, as used here, dividing each column by its maximum absolute value (scaling = 2).
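As a side note (not part of the original script), both scaling options match standard sklearn preprocessing transformers; a minimal sanity check, assuming the same dataset:

import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import Normalizer, MaxAbsScaler

X, _ = load_breast_cancer(return_X_y=True)

# scaling == 1: divide each row by its L2 norm (Normalizer)
X_rows = X / np.linalg.norm(X, axis=1, keepdims=True)
assert np.allclose(X_rows, Normalizer(norm="l2").fit_transform(X))

# scaling == 2: divide each column by its maximum absolute value (MaxAbsScaler)
X_cols = X / np.max(np.abs(X), axis=0)
assert np.allclose(X_cols, MaxAbsScaler().fit_transform(X))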
if __name__ == "__main__":
    # load the data
    X, y = load_breast_cancer(return_X_y=True)  # X input, y output
    # convert the {0, 1} labels into {-1, +1}
    y = 2 * y - 1
    # learning parameters
    n_iter_max = 50  # maximum number of passes
    eta = 0.1  # learning rate
    n_fold = 5  # number of folds
    # splitter for 5-fold cross-validation
    cselection = KFold(n_splits=n_fold, random_state=None, shuffle=False)
    scaling = 2
    # 0: no scaling
    # 1: scale each row by its L2 norm
    # 2: scale each column by its maximum absolute value (column-wise L-infinity norm)
    if scaling == 1:
        x_norm = np.sqrt(np.sum(X ** 2, 1))
        x_norm += (x_norm == 0)  # avoid division by zero
        X /= np.outer(x_norm, np.ones(X.shape[1]))
    elif scaling == 2:
        X /= np.outer(np.ones(len(X)), np.max(np.abs(X), 0))
    # construct a learning object
    clf_logistic = logreg_sgd_clf(eta, n_iter_max)
    # initialize the learning results for all folds
    x_f1 = np.zeros(n_fold)
    x_precision = np.zeros(n_fold)
    x_recall = np.zeros(n_fold)
    x_margin = np.zeros(n_fold)
    i_fold = 0
    # cross-validation
    for index_train, index_test in cselection.split(X):
        X_train = X[index_train]
        y_train = y[index_train]
        X_test = X[index_test]
        y_test = y[index_test]
        m_train = X_train.shape[0]
        m_test = X_test.shape[0]
        print('Training size: {}'.format(m_train))
        print('Test size: {}'.format(m_test))
        clf_logistic.fit(X_train, y_train)  # training
        y_pred = clf_logistic.predict(X_test)  # predicting
        x_precision[i_fold] = precision_score(y_test, y_pred)
        x_recall[i_fold] = recall_score(y_test, y_pred)
        x_f1[i_fold] = f1_score(y_test, y_pred)
        x_margin[i_fold] = clf_logistic.margin_max
        print("Fold: {}, f1: {}, precision: {}, recall: {}".format(
            i_fold, x_f1[i_fold], x_precision[i_fold], x_recall[i_fold]))
        print("Maximum margin: {}".format(x_margin[i_fold]))
        i_fold += 1
    print("The average f1: {}".format(np.mean(x_f1)))
    print("The average maximum margin: {}".format(np.mean(x_margin)))
Sample output:

Training size: 455
Test size: 114
Fold: 0, f1: 0.8823529411764706, precision: 0.8035714285714286, recall: 0.9782608695652174
Maximum margin: 23.610358317642522
Training size: 455
Test size: 114
Fold: 1, f1: 0.9285714285714286, precision: 0.8666666666666667, recall: 1.0
Maximum margin: 23.944116535275672
Training size: 455
Test size: 114
Fold: 2, f1: 0.9605263157894737, precision: 0.9358974358974359, recall: 0.9864864864864865
Maximum margin: 23.970679419264144
Training size: 455
Test size: 114
Fold: 3, f1: 0.9941520467836257, precision: 0.9883720930232558, recall: 1.0
Maximum margin: 25.323800784515893
Training size: 456
Test size: 113
Fold: 4, f1: 0.983050847457627, precision: 0.9666666666666667, recall: 1.0
Maximum margin: 19.15645360608217
The average f1: 0.949730715955725
The average maximum margin: 23.20108173255608
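For comparison with sklearn's own implementation (a hedged sketch: sklearn's LogisticRegression uses a regularized batch solver rather than plain SGD, so the scores will not match exactly):

import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

X, y = load_breast_cancer(return_X_y=True)
X = X / np.max(np.abs(X), axis=0)  # same column-wise max-abs scaling as above
# 5-fold cross-validated f1 scores with sklearn's solver
scores = cross_val_score(LogisticRegression(max_iter=1000), X, y, cv=5, scoring="f1")
print("sklearn 5-fold f1 scores: {}".format(scores))
print("The average f1: {}".format(np.mean(scores)))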