Linear regression is a poor fit for classification: its output ranges over the entire real line, and it is equally sensitive everywhere on that line.
Logistic regression narrows the prediction range by restricting the predicted value to [0,1]. It can be understood simply as linear regression with a logistic function applied on top, and it is typically used for classification: for example, an output y > 0.5 is taken as the positive class, otherwise as the negative class. Because the output is confined to [0,1], it can be interpreted as a probability.
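For reference, a minimal sketch of the model in formulas (the notation here is ours, not taken from the original post): the hypothesis is h(x) = sigmoid(w·x + b), where sigmoid(z) = 1 / (1 + exp(-z)). Maximizing the log-likelihood of the labels by gradient ascent gives the updates w := w + η · Σ_i (y_i - h(x_i)) · x_i and b := b + η · Σ_i (y_i - h(x_i)), which is the same update rule the implementation below applies.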
Implementation of logistic regression:
We implement the process above in Python. Reading the data files and evaluating the results rely on utilities from the sklearn package, and the array operations use numpy.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) 2017 - xiongjiezk
from sklearn import datasets, metrics
import math
import numpy as np


class LogitRegression:
    def __init__(self, grad, learn_rate, epoch):
        self.weights = []
        self.bias = 0.0
        self.grad = grad            # 'grad_decent' or 'stoc_grad_decent'
        self.learn_rate = learn_rate
        self.epoch = epoch

    def sigmoid(self, x):
        return 1.0 / (1 + math.exp(-x))

    def activate(self, x):
        # linear combination followed by the logistic function
        poly_value = np.dot(self.weights, x) + self.bias
        return self.sigmoid(poly_value)

    def rmse(self, y, y_pred):
        # note: this is actually the sum of squared errors; it is only used
        # to monitor training progress
        return np.sum(np.square(np.subtract(y, y_pred)))

    def fit(self, X, y):
        self.weights = np.random.normal(size=np.shape(X[0]))
        self.bias = 1.0
        for it in range(0, self.epoch):
            y_pred = []
            if self.grad == 'grad_decent':
                # batch update: accumulate the gradient over all samples,
                # then adjust the weights once per epoch
                for i in range(0, len(X)):
                    y_ = self.activate(X[i])
                    y_pred.append(y_)
                delta = np.zeros(np.shape(self.weights))
                for i in range(0, len(X)):
                    delta += np.array(y[i] - y_pred[i]) * np.array(X[i])
                self.weights += self.learn_rate * delta
                self.bias += self.learn_rate * np.sum(np.subtract(y, y_pred))
            elif self.grad == 'stoc_grad_decent':
                # stochastic update: adjust the weights after every sample
                for i in range(0, len(X)):
                    y_ = self.activate(X[i])
                    self.weights += self.learn_rate * (y[i] - y_) * np.array(X[i])
                    self.bias += self.learn_rate * (y[i] - y_)
                    y_pred.append(y_)
            else:
                pass
            loss = self.rmse(y, y_pred)
            if it % 100 == 0:
                print('current epoch: %s, loss: %s' % (it, loss))

    def predict(self, X):
        scores = []
        for i in range(0, len(X)):
            scores.append(self.activate(X[i]))
        class_ = np.array([0, 1])
        # threshold the predicted probability at 0.5
        indices = (np.array(scores) > 0.5).astype(int)
        return np.array(class_[indices])


if __name__ == '__main__':
    data_and_labels = datasets.load_svmlight_file('E:/data/logit/train.txt')
    X_train = data_and_labels[0].toarray()
    y_train = data_and_labels[1]
    test_and_labels = datasets.load_svmlight_file('E:/data/logit/test.txt')
    X_test = test_and_labels[0].toarray()
    y_test = test_and_labels[1]
    logit = LogitRegression('grad_decent', 0.01, 1000)
    logit.fit(X_train, y_train)
    y_pred = logit.predict(X_test)
    print("Classification report for classifier %s:\n%s\n"
          % ([logit.weights, logit.bias], metrics.classification_report(y_test, y_pred)))
    print("Confusion matrix:\n%s" % metrics.confusion_matrix(y_test, np.array(y_pred)))
The output of a run is as follows:
current epoch: 800, loss: 2.45524645114
current epoch: 900, loss: 2.45569467863
Classification report for classifier [array([ 0.70676676, -1.68913992]), 12.64359503469959]:
             precision    recall  f1-score   support

        0.0       1.00      0.88      0.93         8
        1.0       0.92      1.00      0.96        12

avg / total       0.95      0.95      0.95        20

Confusion matrix:
[[ 7  1]
 [ 0 12]]
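As a sanity check (not part of the original code), the same data can also be fit with sklearn's built-in LogisticRegression. The following is only a rough sketch that reuses the same file paths as above; its coefficients and report will not match the hand-written version exactly:

# Sketch: cross-check with sklearn's built-in LogisticRegression (same assumed paths as above).
from sklearn import datasets, metrics
from sklearn.linear_model import LogisticRegression

X_train, y_train = datasets.load_svmlight_file('E:/data/logit/train.txt')
X_test, y_test = datasets.load_svmlight_file('E:/data/logit/test.txt', n_features=X_train.shape[1])
clf = LogisticRegression()
clf.fit(X_train, y_train)
print(metrics.classification_report(y_test, clf.predict(X_test)))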
Training data file (libsvm format, one 'label index:value index:value' record per line):
0 1:-0.017612 2:14.053064
1 1:-1.395634 2:4.662541
0 1:-0.752157 2:6.538620
0 1:-1.322371 2:7.152853
0 1:0.423363 2:11.054677
1 1:0.406704 2:7.067335
0 1:0.667394 2:12.741452
1 1:-2.460150 2:6.866805
0 1:0.569411 2:9.548755
0 1:-0.026632 2:10.427743
1 1:0.850433 2:6.920334
0 1:1.347183 2:13.175500
1 1:1.176813 2:3.167020
0 1:-1.781871 2:9.097953
1 1:-0.566606 2:5.749003
1 1:0.931635 2:1.589505
1 1:-0.024205 2:6.151823
1 1:-0.036453 2:2.690988
1 1:-0.196949 2:0.444165
1 1:1.014459 2:5.754399
1 1:1.985298 2:3.230619
1 1:-1.693453 2:-0.557540
0 1:-0.576525 2:11.778922
1 1:-0.346811 2:-1.678730
1 1:-2.124484 2:2.672471
0 1:1.217916 2:9.597015
0 1:-0.733928 2:9.098687
1 1:-3.642001 2:-1.618087
1 1:0.315985 2:3.523953
0 1:1.416614 2:9.619232
1 1:-0.386323 2:3.989286
1 1:0.556921 2:8.294984
0 1:1.224863 2:11.587360
1 1:-1.347803 2:-2.406051
1 1:1.196604 2:4.951851
0 1:0.275221 2:9.543647
0 1:0.470575 2:9.332488
0 1:-1.889567 2:9.542662
0 1:-1.527893 2:12.150579
0 1:-1.185247 2:11.309318
1 1:-0.445678 2:3.297303
1 1:1.042222 2:6.105155
0 1:-0.618787 2:10.320986
1 1:1.152083 2:0.548467
1 1:0.828534 2:2.676045
0 1:-1.237728 2:10.549033
1 1:-0.683565 2:-2.166125
1 1:0.229456 2:5.921938
0 1:-0.959885 2:11.555336
0 1:0.492911 2:10.993324
0 1:0.184992 2:8.721488
0 1:-0.355715 2:10.325976
0 1:-0.397822 2:8.058397
0 1:0.824839 2:13.730343
1 1:1.507278 2:5.027866
1 1:0.099671 2:6.835839
0 1:-0.344008 2:10.717485
1 1:1.785928 2:7.718645
0 1:-0.918801 2:11.560217
1 1:-0.364009 2:4.747300
1 1:-0.841722 2:4.119083
1 1:0.490426 2:1.960539
0 1:-0.007194 2:9.075792
0 1:0.356107 2:12.447863
0 1:0.342578 2:12.281162
1 1:-0.810823 2:-1.466018
1 1:2.530777 2:6.476801
0 1:1.296683 2:11.607559
0 1:0.475487 2:12.040035
0 1:-0.783277 2:11.009725
0 1:0.074798 2:11.023650
1 1:-1.337472 2:0.468339
0 1:-0.102781 2:13.763651
1 1:-0.147324 2:2.874846
0 1:0.518389 2:9.887035
0 1:1.015399 2:7.571882
1 1:-1.658086 2:-0.027255
1 1:1.319944 2:2.171228
1 1:2.056216 2:5.019981
1 1:-0.851633 2:4.375691
Test data file (same format):
0 1:-1.510047 2:6.061992
1 1:-1.076637 2:-3.181888
0 1:1.821096 2:10.283990
1 1:3.010150 2:8.401766
1 1:-1.099458 2:1.688274
1 1:-0.834872 2:-1.733869
1 1:-0.846637 2:3.849075
0 1:1.400102 2:12.628781
1 1:1.752842 2:5.468166
1 1:0.078557 2:0.059736
1 1:0.089392 2:-0.715300
0 1:1.825662 2:12.693808
0 1:0.197445 2:9.744638
1 1:0.126117 2:0.922311
1 1:-0.679797 2:1.220530
1 1:0.677983 2:2.556666
0 1:0.761349 2:10.693862
1 1:-2.168791 2:0.143632
0 1:1.388610 2:9.341997
0 1:0.317029 2:14.739025