说明
这里采用梯度下降法(gradient descent)学习logistic回归模型的参数,如果数据量比较大,速度会变慢,可以在迭代过程中改用其他学习方法。如果需要优化,请参考 @zouxy09 的文章
http://blog.csdn.net/zouxy09/article/details/20319673,不明白logistic回归原理的也可以参考这篇文章。
logistic回归最终需要求解的参数是w,即回归系数。且回归问题最终转化为无约束的优化问题,求解方法为梯度下降法,用来求最终的对数似然函数L(w)的最大值。
参考资料
@zouxy09 的“logistic回归”写的特别好,主要细节都说清楚了,链接地址: http://blog.csdn.net/zouxy09/article/details/20319673 。
代码
LR.py
from numpy import *
import matplotlib.pyplot as plt
def loadDataset(filename):
    """Load a whitespace-separated dataset where each row is 'x1 x2 label'.

    A constant 1.0 is prepended to every feature row so that weights[0]
    acts as the intercept (bias) term of the logistic model.

    Args:
        filename: path to the text file to read.

    Returns:
        (train_x, train_y): train_x is a list of [1.0, x1, x2] float rows,
        train_y the matching list of float labels.
    """
    train_x = []
    train_y = []
    # 'with' guarantees the handle is closed even on a parse error
    # (the original left the file open).
    with open(filename) as fileIn:
        for line in fileIn:
            lineArr = line.strip().split()
            # Skip blank lines (e.g. a trailing newline at end of file),
            # which previously raised IndexError.
            if not lineArr:
                continue
            train_x.append([1.0, float(lineArr[0]), float(lineArr[1])])
            train_y.append(float(lineArr[2]))
    return train_x, train_y
def sigmoid(inX):
    """Logistic function 1 / (1 + e^-x); accepts scalars or numpy matrices."""
    denom = 1.0 + exp(-inX)
    return 1.0 / denom


def trainLR(train_x, train_y, alpha=0.001, iternum=1000):
    """Fit logistic-regression weights by full-batch gradient ascent.

    Maximizes the log-likelihood L(w): each step moves w along
    X^T (y - sigmoid(Xw)) scaled by the learning rate.

    Args:
        train_x: list of feature rows (first column usually the 1.0 bias).
        train_y: list of 0/1 labels, one per row.
        alpha: learning-rate step size.
        iternum: number of full-batch iterations.

    Returns:
        A (n_features, 1) numpy array of learned weights.
    """
    X = mat(train_x)
    y = mat(train_y).transpose()
    _, n_features = shape(X)
    w = ones((n_features, 1))
    for _ in range(iternum):
        residual = y - sigmoid(X * w)
        w = w + alpha * X.transpose() * residual
    return w.getA()  # mat -> plain ndarray
def testLR(test_x,test_y,weights):
test_size = len(test_y)
matchCount = 0
for i in range(test_size):
predict_y = 0
if sum([test_x[i][j]*weights[j] for j in range(len(weights))]) > 0:
predict_y = 1
if predict_y == int(test_y[i]):
matchCount += 1
return matchCount / float(test_size)
def plotLR(cur_x, cur_y, weights, figIndex):
    """Scatter the two classes and draw the learned decision boundary.

    Expects the caller to have set `plotLR.fig` to a matplotlib figure
    beforehand; the subplot is added at position `figIndex` (e.g. 121).
    Positive samples (label 1) are red squares, negatives blue triangles.
    The boundary is the line w0 + w1*x1 + w2*x2 = 0 solved for x2.

    Args:
        cur_x: feature rows shaped [1.0, x1, x2].
        cur_y: 0/1 labels, one per row.
        weights: learned weight vector indexable as weights[j].
        figIndex: matplotlib subplot position code.
    """
    ax = plotLR.fig.add_subplot(figIndex)
    posx, posy = [], []
    negx, negy = [], []
    for i, label in enumerate(cur_y):
        bucket_x, bucket_y = (posx, posy) if int(label) == 1 else (negx, negy)
        bucket_x.append(cur_x[i][1])
        bucket_y.append(cur_x[i][2])
    ax.scatter(posx, posy, 30, c='r', marker='s')
    ax.scatter(negx, negy, 30, c='b', marker='^')
    # Extend the boundary line slightly past the data's x1 range.
    lo = min(min(posx), min(negx))
    hi = max(max(posx), max(negx))
    xs = arange(lo - 1.0, hi + 1.0, 0.1)
    ys = (-weights[0] - weights[1] * xs) / weights[2]
    ax.plot(xs, ys)
    plt.xlabel('X1')
    plt.ylabel('X2')
if __name__ == '__main__':
    # Parenthesized print works in both Python 2 and Python 3 for a single
    # argument; the original bare `print` statements were Python-2-only.
    print("Step 1. load dataset")
    train_x, train_y = loadDataset("trainset.txt")
    test_x, test_y = loadDataset("testset.txt")

    print("\nStep 2. train")
    weights = trainLR(train_x, train_y)

    print("\nStep 3. test")
    accuracy = testLR(test_x, test_y, weights)
    print('The classify accuracy is: %.3f%%' % (accuracy * 100))

    print("\nStep 4. show the result")
    # plotLR reads its target figure from this function attribute.
    plotLR.fig = plt.figure()
    plotLR(train_x, train_y, weights, 121)
    plotLR(test_x, test_y, weights, 122)
    plt.show()
注:在testLR中,当w·x > 0 时,预测为正例,否则为负例。
运行结果