机器学习day9 机器学习实战logistic回归和疝马病的预测

今天完成了疝马病的预测,结果符合书上预期,原理还是不太懂。有机会问问大神们吧。

实在没有精力贴过程了,贴个代码吧,操作和书上一样。

注意矩阵相乘要用numpy的矩阵类型(numpy.mat):转换成mat之后,`*` 才表示矩阵乘法,而不是数组的逐元素相乘。

注意在线算法和离线算法的区别。

这个数据集有缺失值,统一用0补齐。之所以用0,书上有介绍:特征值为0时,权重更新量 alpha * error * x 也为0,该特征对应的权重不会被更新;同时 sigmoid(0) = 0.5,对预测结果没有任何倾向性。


机器学习day9 机器学习实战logistic回归和疝马病的预测_第1张图片

在数据缺失30%的情况下,这个正确率是可以接受的;调整参数(迭代次数和步长)之后,错误率能降低到20%左右。

代码如下:

#!/usr/bin/python
#coding:utf-8

#logistic.py

from numpy import *
import numpy
import operator

# Load the two-feature sample data set
def loadDataSet():
    """Read ``testSet.txt`` and return ``(datamat, lavels)``.

    Each non-blank line is split on whitespace: the first two fields are
    parsed as floats (the features) and the third as an int (the class
    label). A constant 1.0 is prepended to every feature row so that the
    first weight can serve as the constant term of the decision boundary.

    Returns:
        A tuple ``(datamat, lavels)`` where ``datamat`` is a list of
        ``[1.0, x1, x2]`` rows and ``lavels`` is the list of int labels.
    """
    datamat = []
    lavels = []
    # The context manager closes the file even if a line fails to parse.
    with open('testSet.txt') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing empty line) carry no sample.
                continue
            datamat.append([1.0, float(fields[0]), float(fields[1])])
            lavels.append(int(fields[2]))
    return datamat, lavels

# Sigmoid (logistic) function
def sigmoid(inX):
    """Return ``1 / (1 + exp(-inX))``, element-wise for array/matrix input.

    The value is computed as ``exp(-log(1 + exp(-inX)))`` using
    ``numpy.logaddexp`` so that a large negative ``inX`` does not overflow
    inside ``exp`` (the direct formula emits a RuntimeWarning there).
    """
    # logaddexp(0, -inX) == log(exp(0) + exp(-inX)) == log(1 + exp(-inX))
    return numpy.exp(-numpy.logaddexp(0, -inX))

# Logistic regression: batch gradient ascent
def gradUp(dataset, lavels, alpha = 0.001, cycles = 500):
    """Fit logistic-regression weights with batch gradient ascent.

    Args:
        dataset: 2-D sequence of feature rows (one sample per row).
        lavels: 1-D sequence of class labels, one per sample.
        alpha: step size of each update (default 0.001).
        cycles: number of full passes over the data (default 500).

    Returns:
        A numpy array of shape ``(n, 1)`` holding one weight per feature
        column of ``dataset``.
    """
    # numpy.mat makes `*` below a matrix product rather than element-wise.
    datamat = numpy.mat(dataset)
    lavelsmat = numpy.mat(lavels).transpose()  # column vector of labels
    n = datamat.shape[1]
    weights = numpy.ones((n, 1))
    for _ in range(cycles):
        h = sigmoid(datamat * weights)
        error = lavelsmat - h
        # Every sample contributes to each update (offline / batch rule).
        weights = weights + alpha * datamat.transpose() * error
    return numpy.array(weights)

# Logistic regression: stochastic gradient ascent (improved gradient ascent)
def newGradUp(data, lavels, cycles = 150):
    """Fit logistic-regression weights with stochastic gradient ascent.

    In every cycle each sample is used exactly once, in random order, and
    the weights are updated after every single sample.

    Args:
        data: 2-D sequence of feature rows (one sample per row).
        lavels: sequence of class labels, indexable by sample position.
        cycles: number of passes over the data (default 150).

    Returns:
        A 1-D numpy array with one weight per feature column.
    """
    dataset = numpy.array(data)
    m, n = numpy.shape(dataset)
    weights = numpy.ones(n)
    for i in range(cycles):
        # A real list is required: entries are deleted below, which a
        # Python 3 range object does not support.
        dataindex = list(range(m))
        for j in range(m):
            # Draw a position among the samples not yet used this cycle.
            x = int(numpy.random.uniform(0, len(dataindex)))
            # The step size shrinks as i and j grow; the constant 0.01
            # keeps it from ever reaching zero.
            alpha = 4.0 / (1.0 + i + j) + 0.01
            row = dataindex[x]
            h = sigmoid(numpy.sum(dataset[row] * weights))
            error = lavels[row] - h
            weights = weights + alpha * error * dataset[row]
            # Remove the sample so it is not picked again in this cycle.
            del dataindex[x]
    return weights

# Plot the samples together with the fitted decision line
def plotBestFit(weights):
    """Scatter the ``loadDataSet`` samples and draw the boundary line.

    Samples labelled 1 are drawn as red squares, all others as green
    markers. The line plotted is ``weights[0] + weights[1]*x + weights[2]*y
    = 0`` solved for ``y`` over ``x`` from -3.0 up to (not including) 3.0
    in steps of 0.1.
    """
    import matplotlib.pyplot as plt
    samples, classes = loadDataSet()
    points = numpy.array(samples)

    # Split the two feature columns by class label.
    pos_x, pos_y = [], []
    neg_x, neg_y = [], []
    for point, cls in zip(points, classes):
        if cls == 1:
            pos_x.append(point[1])
            pos_y.append(point[2])
        else:
            neg_x.append(point[1])
            neg_y.append(point[2])

    figure = plt.figure()
    axes = figure.add_subplot(111)
    axes.scatter(pos_x, pos_y, s = 30, c = 'red', marker = 's')
    axes.scatter(neg_x, neg_y, s = 30, c = 'green')

    line_x = arange(-3.0, 3.0, 0.1)
    line_y = (-weights[0] - weights[1] * line_x) / weights[2]
    axes.plot(line_x, line_y)

    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()

# Classify one sample
def logisticType(inX, weights):
    """Return 1.0 if the sigmoid of the weighted feature sum exceeds 0.5, else 0.0."""
    activation = sum(inX * weights)
    return 1.0 if sigmoid(activation) > 0.5 else 0.0

# Parse one horse-colic data file
def _readHorseColic(path, numfeat = 21):
    """Return ``(rows, rawlabels)`` parsed from a tab-separated file.

    ``rows`` holds the first ``numfeat`` fields of every non-blank line as
    floats; ``rawlabels`` holds field ``numfeat`` of each line as the
    original string.
    """
    rows = []
    rawlabels = []
    # The context manager closes the file even if parsing raises.
    with open(path) as f:
        for line in f:
            fields = line.strip().split('\t')
            if fields == ['']:
                # Blank line: nothing to parse.
                continue
            rawlabels.append(fields[numfeat])
            rows.append([float(value) for value in fields[:numfeat]])
    return rows, rawlabels

# Train on the horse-colic training set and measure the test error
def testLogistic():
    """Train with ``newGradUp`` and return the error rate on the test file.

    Reads ``horseColicTraining.txt`` and ``horseColicTest.txt`` (tab
    separated, 21 feature columns followed by the label), trains for 500
    cycles, classifies every test row with ``logisticType`` and prints the
    fraction of misclassified rows.

    Returns:
        The error rate as a float (misclassified rows / test rows).
    """
    trainset, trainraw = _readHorseColic('horseColicTraining.txt')
    testset, testraw = _readHorseColic('horseColicTest.txt')
    trainlavels = [float(raw) for raw in trainraw]
    trainwei = newGradUp(numpy.array(trainset), trainlavels, 500)
    errorcount = 0
    for larr, raw in zip(testset, testraw):
        # int(float(...)) accepts labels written as "1" as well as "1.0".
        if int(logisticType(larr, trainwei)) != int(float(raw)):
            errorcount += 1
    errorrate = float(errorcount) / len(testset)
    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print("error rate is %f" % errorrate)
    return errorrate

# Average the test error over several training runs
def aveErrorRate(numtest = 10):
    """Run ``testLogistic`` ``numtest`` times and print the mean error rate.

    Args:
        numtest: number of independent train/test runs (default 10).
    """
    errorsum = 0
    for _ in range(numtest):
        errorsum += testLogistic()
    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print("the average error rate is %f" % (float(errorsum) / numtest))


你可能感兴趣的:(python,机器学习)