This code largely follows the logistic regression implementation from zouxy09's blog column; his write-ups of the classic ML algorithms are excellent, and I learned a great deal from them.
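A quick refresher on the math the code implements (my own summary; the derivation is standard and not from the original post): logistic regression models the positive-class probability as $P(y=1 \mid x) = \sigma(w^{\top}x)$, where

$$\sigma(z) = \frac{1}{1 + e^{-z}}$$

Gradient ascent on the log-likelihood gives the batch weight update used in trainLogRegres,

$$w \leftarrow w + \alpha \, X^{\top}\bigl(y - \sigma(Xw)\bigr)$$

and the two stochastic variants apply the same update one sample at a time.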
Straight to the code:
# -*- coding: utf-8 -*-
'''
Logistic Regression 1.0
Author: Stephen
2016.10.6
'''
import time

import numpy as np
import matplotlib.pyplot as plt


def sigmoid(inX):
    return 1.0 / (1 + np.exp(-inX))
# input: train_x is an np.matrix, each row is one sample (first column is the bias term 1.0)
#        train_y is an np.matrix too, each row is the corresponding 0/1 label
#        opts is a dict of optimizer options: step size ('alpha'), maximum number of
#        iterations ('maxIter'), and the update rule ('optimizeType')
def trainLogRegres(train_x, train_y, opts):
    '''Train a logistic regression classifier.'''
    startTime = time.time()  # time the training run
    numSamples, numFeatures = np.shape(train_x)
    weights = np.ones((numFeatures, 1))
    alpha = opts['alpha']
    maxIter = opts['maxIter']
    # optimize via gradient ascent on the log-likelihood
    for k in range(maxIter):
        if opts['optimizeType'] == 'GradDescent':  # batch update over all samples at once
            output = sigmoid(train_x * weights)
            error = train_y - output
            weights = weights + alpha * train_x.transpose() * error
        elif opts['optimizeType'] == 'StoGradDescent':  # stochastic: one sample at a time
            for i in range(numSamples):
                output = sigmoid(train_x[i, :] * weights)
                error = train_y[i, 0] - output
                weights = weights + alpha * train_x[i, :].transpose() * error
        elif opts['optimizeType'] == 'SmoothStoGradDescent':  # improved stochastic version
            # 1. alpha shrinks as the iterations proceed  2. samples are drawn in random order without replacement
            randIndex = list(range(numSamples))
            for i in range(numSamples):
                alpha = 4 / (1.0 + k + i) + 0.01
                temp = int(np.random.uniform(0, len(randIndex)))
                sampleIndex = randIndex[temp]  # map the draw to a not-yet-used sample (fixes a bias toward low row indices)
                output = sigmoid(train_x[sampleIndex, :] * weights)
                error = train_y[sampleIndex, 0] - output
                weights = weights + alpha * train_x[sampleIndex, :].transpose() * error
                del randIndex[temp]
        else:
            raise NameError('Unsupported optimize method type!')
    print('Congratulations, training complete! Took %fs!' % (time.time() - startTime))
    return weights
def testLogRegres(weights, test_x, test_y):
    '''Run the trained weights on a test set and return the accuracy.'''
    numSamples, numFeatures = np.shape(test_x)
    matchCount = 0
    for i in range(numSamples):
        predict = sigmoid(test_x[i, :] * weights)[0, 0] > 0.5
        if predict == bool(test_y[i, 0]):
            matchCount += 1
    accuracy = float(matchCount) / numSamples
    return accuracy
def showLogRegres(weights, train_x, train_y):
    '''Plot the samples and the fitted decision boundary (train_x and train_y are np.matrix).'''
    numSamples, numFeatures = np.shape(train_x)
    if numFeatures != 3:
        print("Sorry! I can not draw because the dimension of your data is not 2!")
        return 1
    # draw all samples
    for i in range(numSamples):
        if int(train_y[i, 0]) == 0:
            plt.plot(train_x[i, 1], train_x[i, 2], 'or')
        elif int(train_y[i, 0]) == 1:
            plt.plot(train_x[i, 1], train_x[i, 2], 'ob')
    # draw the decision boundary
    min_x = train_x[:, 1].min()  # smallest value in column 2 (column 1 is the bias, all 1s)
    max_x = train_x[:, 1].max()  # largest value in column 2
    weights = weights.getA()  # convert the weight matrix to a plain array
    x = np.arange(min_x, max_x, 0.1)
    y = (-weights[0] - weights[1] * x) / weights[2]  # w0 + w1*x1 + w2*x2 = 0 solved for x2
    plt.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.axis([-4, 4, -5, 15])
    plt.legend(('0', '1'), loc='upper left')
    plt.title('LogisticRegression Algorithm')
    plt.show()
def loadData():
    train_x = []
    train_y = []
    fileIn = open(r'E:/PythonTest/ML/LogisticRegression/Example1/testSet.txt')
    for line in fileIn.readlines():
        lineArr = line.strip().split()
        train_x.append([1.0, float(lineArr[0]), float(lineArr[1])])  # prepend the bias term
        train_y.append(float(lineArr[2]))
    fileIn.close()
    return np.mat(train_x), np.mat(train_y).transpose()
## step 1: load data
print("step 1: load data...")
train_x, train_y = loadData()
test_x = train_x
test_y = train_y

## step 2: training
# optimizeType can be: GradDescent, StoGradDescent, SmoothStoGradDescent
print("step 2: training...")
opts = {'alpha': 0.01, 'maxIter': 20, 'optimizeType': 'SmoothStoGradDescent'}
optimalWeights = trainLogRegres(train_x, train_y, opts)

## step 3: testing
print("step 3: testing...")
accuracy = testLogRegres(optimalWeights, test_x, test_y)

## step 4: show the result
print("step 4: show the result...")
print('The classify accuracy is: %.2f%%' % (accuracy * 100))
showLogRegres(optimalWeights, train_x, train_y)
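As a quick sanity check (my addition, not part of the original post), the same data can be fit with scikit-learn, assuming it is installed; the accuracy should land in the same ballpark as the hand-rolled trainer:

from sklearn.linear_model import LogisticRegression

X = np.asarray(train_x)[:, 1:]  # drop the hand-added bias column; sklearn fits its own intercept
y = np.asarray(train_y).ravel()
clf = LogisticRegression().fit(X, y)
print('sklearn accuracy: %.2f%%' % (clf.score(X, y) * 100))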
Training data (the contents of testSet.txt):
-0.017612 14.053064 0
-1.395634 4.662541 1
-0.752157 6.538620 0
-1.322371 7.152853 0
0.423363 11.054677 0
0.406704 7.067335 1
0.667394 12.741452 0
-2.460150 6.866805 1
0.569411 9.548755 0
-0.026632 10.427743 0
0.850433 6.920334 1
1.347183 13.175500 0
1.176813 3.167020 1
-1.781871 9.097953 0
-0.566606 5.749003 1
0.931635 1.589505 1
-0.024205 6.151823 1
-0.036453 2.690988 1
-0.196949 0.444165 1
1.014459 5.754399 1
1.985298 3.230619 1
-1.693453 -0.557540 1
-0.576525 11.778922 0
-0.346811 -1.678730 1
-2.124484 2.672471 1
1.217916 9.597015 0
-0.733928 9.098687 0
-3.642001 -1.618087 1
0.315985 3.523953 1
1.416614 9.619232 0
-0.386323 3.989286 1
0.556921 8.294984 1
1.224863 11.587360 0
-1.347803 -2.406051 1
1.196604 4.951851 1
0.275221 9.543647 0
0.470575 9.332488 0
-1.889567 9.542662 0
-1.527893 12.150579 0
-1.185247 11.309318 0
-0.445678 3.297303 1
1.042222 6.105155 1
-0.618787 10.320986 0
1.152083 0.548467 1
0.828534 2.676045 1
-1.237728 10.549033 0
-0.683565 -2.166125 1
0.229456 5.921938 1
-0.959885 11.555336 0
0.492911 10.993324 0
0.184992 8.721488 0
-0.355715 10.325976 0
-0.397822 8.058397 0
0.824839 13.730343 0
1.507278 5.027866 1
0.099671 6.835839 1
-0.344008 10.717485 0
1.785928 7.718645 1
-0.918801 11.560217 0
-0.364009 4.747300 1
-0.841722 4.119083 1
0.490426 1.960539 1
-0.007194 9.075792 0
0.356107 12.447863 0
0.342578 12.281162 0
-0.810823 -1.466018 1
2.530777 6.476801 1
1.296683 11.607559 0
0.475487 12.040035 0
-0.783277 11.009725 0
0.074798 11.023650 0
-1.337472 0.468339 1
-0.102781 13.763651 0
-0.147324 2.874846 1
0.518389 9.887035 0
1.015399 7.571882 0
-1.658086 -0.027255 1
1.319944 2.171228 1
2.056216 5.019981 1
-0.851633 4.375691 1
-1.510047 6.061992 0
-1.076637 -3.181888 1
1.821096 10.283990 0
3.010150 8.401766 1
-1.099458 1.688274 1
-0.834872 -1.733869 1
-0.846637 3.849075 1
1.400102 12.628781 0
1.752842 5.468166 1
0.078557 0.059736 1
0.089392 -0.715300 1
1.825662 12.693808 0
0.197445 9.744638 0
0.126117 0.922311 1
-0.679797 1.220530 1
0.677983 2.556666 1
0.761349 10.693862 0
-2.168791 0.143632 1
1.388610 9.341997 0
0.317029 14.739025 0
With the improved stochastic gradient descent in place, the classification result is the accuracy printed in step 4 together with the decision-boundary plot drawn by showLogRegres.
The road of Data Mining is still a long one; keep at it.