今天完成了马疝病的预测,结果符合书上预期,原理还是不太懂。有机会问问大神们吧。
实在没有精力贴过程了,贴个代码吧,操作和书上一样。
注意矩阵相乘用numpy的操作,mat。
注意在线算法和离线算法的区别。
这个数据集有数据缺失用0补齐,注意为什么用0,书上有介绍。
在数据缺失30%的情况下,这个正确率是可以接受的,改变参数之后能降低错误率20%左右。
代码如下:
#!/usr/bin/python #coding:utf-8 #logistic.py from numpy import * import numpy import operator #读取数据函数 def loadDataSet(): datamat = [] lavels = [] f = open('testSet.txt') for line in f.readlines(): line = line.strip().split() datamat.append([1.0, float(line[0]), float(line[1])]) lavels.append(int(line[2])) return datamat, lavels #阶跃函数 def sigmoid(inX): return 1.0 / (1 + exp(-inX)) #logistic回归梯度上升算法 def gradUp(dataset, lavels): datamat = numpy.mat(dataset) lavelsmat = numpy.mat(lavels).transpose() m, n = datamat.shape weights = ones((n, 1)) alpha = 0.001 cycles = 500 for i in range(cycles): h = sigmoid(datamat * weights) error = (lavelsmat - h) weights = weights + alpha * datamat.transpose() * error return numpy.array(weights) #logistic随机梯度上升算法(梯度上升算法改进) def newGradUp(data, lavels, cycles = 150): dataset = numpy.array(data) m, n = numpy.shape(dataset) weights = ones(n) for i in range(cycles): dataindex = range(m) for j in range(m): x = int(random.uniform(0, len(dataindex))) #这个地方的alpha不知道如何选择 alpha = 4.0 / (1.0 + i + j) + 0.01 h = sigmoid(sum(dataset[dataindex[x]] * weights)) error = lavels[dataindex[x]] - h weights = weights + alpha * error * dataset[dataindex[x]] del(dataindex[x]) return weights #画出最佳拟合直线 def plotBestFit(weights): import matplotlib.pyplot as plt datamat, lavels = loadDataSet() dataarr = numpy.array(datamat) n = dataarr.shape[0] x1 = [] x0 = [] y1 = [] y0 = [] for i in range(n): if lavels[i] == 1: x1.append(dataarr[i][1]) y1.append(dataarr[i][2]) else: x0.append(dataarr[i][1]) y0.append(dataarr[i][2]) fig = plt.figure() ax = fig.add_subplot(111) ax.scatter(x1, y1, s = 30, c = 'red', marker = 's') ax.scatter(x0, y0, s = 30, c = 'green') x = arange(-3.0, 3.0, 0.1) y = (-weights[0] - weights[1] * x) / weights[2] ax.plot(x, y) plt.xlabel('X1') plt.ylabel('X2') plt.show() #分类函数 def logisticType(inX, weights): p = sigmoid(sum(inX * weights)) if p > 0.5: return 1.0 else: return 0.0 #测试函数 def testLogistic(): ftrain = open('horseColicTraining.txt') ftest = 
open('horseColicTest.txt') trainset = [] trainlavels = [] for i in ftrain.readlines(): line = i.strip().split('\t') linearr = [] for i in range(21): linearr.append(float(line[i])) trainset.append(linearr) trainlavels.append(float(line[21])) trainwei = newGradUp(numpy.array(trainset), trainlavels, 500) errorcount = 0 numtest = 0 for linet in ftest.readlines(): numtest += 1 linetest = linet.strip().split('\t') larr = [] for i in range(21): larr.append(float(linetest[i])) if int(logisticType(larr, trainwei)) != int(linetest[21]): errorcount += 1 errorrate = (float(errorcount) / numtest) print "error rate is %f"%errorrate return errorrate #计算平均错误率 def aveErrorRate(): numtest = 10 errorsum = 0 for i in range(numtest): errorsum += testLogistic() print "the average error rate is %f"%(float(errorsum) / numtest)