Logistic Regression

Principle

  • Fit a regression formula to the classification boundary and solve for the best-fit parameters, then classify new points with the resulting model (see the formulas just below).
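
Concretely, the model feeds a linear combination of the features through the sigmoid function and classifies by thresholding the output at 0.5. In the two-feature case used throughout this post:

$$\sigma(z) = \frac{1}{1+e^{-z}}, \qquad h(x) = \sigma(w_0 + w_1 x_1 + w_2 x_2)$$

The decision boundary is the set of points where $h(x) = 0.5$, i.e. $w_0 + w_1 x_1 + w_2 x_2 = 0$, which is exactly the line drawn later by plotBestFit.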

Advantages:

  • Computationally cheap; easy to understand and implement.

Disadvantages:

  • Prone to underfitting, so classification accuracy may be low. When the data are not perfectly linearly separable, fixed-step (stochastic) gradient ascent never fully converges; the weights keep oscillating, as the trajectory plots later in this post show.

Applicable data types:

  • Nominal and numeric data
#Load the dataset
from numpy import mat
def loadDataSet():
    dataMat = []
    labelMat = []
    fr = open('../../Reference Code/Ch05/testSet.txt')
    #type(fr.readlines()): list
    #type(fr.read()): str
    for line in fr.readlines():
        #line = '-0.017612\t14.053064\t0\n'
        #lineList = ['-0.017612', '14.053064', '0']
        lineList = line.strip().split() #split the raw line into a list of fields
        dataMat.append([1.0,float(lineList[0]),float(lineList[1])]) #prepend 1.0 as the constant x0 term
        labelMat.append(int(lineList[2]))
    fr.close()
    return mat(dataMat),mat(labelMat) #(m,3) data matrix and (1,m) label matrix
dataMat,labelMat = loadDataSet()

from numpy import *
#Define the sigmoid function
def sigmoid(inX):
    return 1.0/(1+exp(-inX))
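
Gradient ascent maximizes the log-likelihood of the training labels. The standard derivation (not spelled out in the original post) yields the update used in the code below, with X the (m,n) data matrix, y the (m,1) label column, and alpha the step size:

$$w \leftarrow w + \alpha \, X^{T}\big(y - \sigma(Xw)\big)$$

Each iteration touches all m samples at once; that is precisely what the matrix product dataMat.T*err computes.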

#Define the gradient ascent algorithm
def gradAscent(dataMat,labelMat):
    labelMat = labelMat.T #transpose into an m x 1 column vector
    m,n = shape(dataMat) #(m,n)
    weight = ones((n,1)) #(n,1)
    maxCycle = 500 #maximum number of iterations
    a = 0.001 #step size alpha
    for k in range(maxCycle):
        h = sigmoid(dataMat*weight) #(m,1)
        err = labelMat - h #(m,1)
        weight = weight + a*dataMat.T*err #(n,1)
    return weight
weight = gradAscent(dataMat,labelMat)
weight
matrix([[ 4.12414349],
        [ 0.48007329],
        [-0.6168482 ]])
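
A caveat on the sigmoid above: exp(-inX) overflows for large negative inputs and NumPy emits RuntimeWarnings. A minimal numerically stable variant (my own sketch, not from the original code) clips the argument so exp() stays far below the float64 overflow threshold:

#Numerically stable sigmoid: clipping keeps exp() within float64 range
def sigmoidStable(inX):
    return 1.0/(1+exp(-clip(inX,-500,500)))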
#Plot the decision boundary
import matplotlib.pyplot as plt
def plotBestFit(weight):
    #Load the data
    dataMat,labelMat = loadDataSet()
    labelList = labelMat.tolist()[0] #flatten the (1,m) label matrix into a plain list

    #Split the samples into the y=1 and y=0 classes
    x1_1 = [];x2_1 = []
    x1_0 = [];x2_0 = []
    m = shape(dataMat)[0] #m rows
    for i in range(m):
        if labelList[i] == 1:
            x1_1.append(dataMat[i,1]);x2_1.append(dataMat[i,2])
        else:
            x1_0.append(dataMat[i,1]);x2_0.append(dataMat[i,2])

    #Fitted line: w0+w1*x1+w2*x2=0
    weight = array(weight).flatten() #convert matrix to a flat array
    x1 = arange(-4,4,0.001) #array
    x2 = (-weight[0]-weight[1]*x1)/weight[2] #array

    #Draw the plot
    plt.figure()
    plt.scatter(x1_1,x2_1,c='r',label='Class 1') #plot the class-1 samples
    plt.scatter(x1_0,x2_0,c='g',label='Class 0') #plot the class-0 samples
    plt.plot(x1,x2) #plot() cannot draw x1 or x2 if they are in matrix format
    plt.legend()
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
plotBestFit(weight)
(Figure: the two sample classes and the decision boundary fitted by gradAscent)

Gradient Ascent vs. Stochastic Gradient Ascent

  • Gradient ascent: every weight update recomputes over all the samples, so the computational cost is high; this is batch processing.
  • Code:
for k in range(maxCycle):
        h = sigmoid(dataMat*weight) #(m,1)
        err = labelMat - h #(m,1)
        weight = weight + a*dataMat.transpose()*err
  • Stochastic gradient ascent: the weights are updated once for each incoming sample; this is online learning (a mini-batch compromise between the two is sketched after this list):
    for i in range(m):
        h = sigmoid(dataMat[i]*weight) #(1,1)
        err = labelMat[i] - h #(1,1)
        weight = weight + a*dataMat[i].transpose()*err #(n,1)
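
Between the two extremes sits mini-batch gradient ascent, which updates on small groups of samples. The sketch below is my own addition (not in the original post); it reuses sigmoid and the matrix conventions above, and batchSize is an illustrative parameter:

#Mini-batch gradient ascent: update on batchSize samples at a time (a sketch, assuming the conventions above)
def miniBatchGradAscent(dataMat,labelMat,batchSize=10,maxCycle=200):
    labelMat = labelMat.T #(m,1)
    m,n = shape(dataMat)
    weight = ones((n,1))
    a = 0.01
    for k in range(maxCycle):
        for start in range(0,m,batchSize):
            batchX = dataMat[start:start+batchSize] #(b,n) slice of samples
            batchY = labelMat[start:start+batchSize] #(b,1) matching labels
            err = batchY - sigmoid(batchX*weight) #(b,1)
            weight = weight + a*batchX.T*err #(n,1)
    return weight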


#Stochastic gradient ascent algorithm
from numpy import *
def stocGradAscent(dataMat,labelMat):
    labelMat = labelMat.transpose() #transpose into an m x 1 column vector
    m,n = shape(dataMat)
    a = 0.01
    weight = ones((n,1)) #(n,1)
    for i in range(m): #a single pass: one update per sample
        h = sigmoid(dataMat[i]*weight) #(1,1)
        err = labelMat[i] - h #(1,1)
        weight = weight + a*dataMat[i].transpose()*err #(n,1)
    return weight
weight = stocGradAscent(dataMat,labelMat)
weight
matrix([[ 1.01702007],
        [ 0.85914348],
        [-0.36579921]])
#Decision boundary from stochastic gradient ascent
plotBestFit(weight)
(Figure: decision boundary after a single pass of stochastic gradient ascent)
#Improved stochastic gradient ascent algorithm
import random #stdlib random; note it shadows numpy's random pulled in by 'from numpy import *'
def uptatestocGradAscent(dataMat,labelMat,numIter = 150):
    labelMat = labelMat.T
    m,n = shape(dataMat)
    weight = ones((n,1))

    randomIndex = random.sample(range(m),m) #a random permutation of the sample indices
    for numiter in range(numIter):
        for i in range(m):
            a = 4/(1.0+numiter+i)+0.01 #dynamic alpha that shrinks with iterations, damping the fluctuation of the weights
            index = randomIndex[i] #visit the samples in random order to reduce periodic fluctuations in the weights
            h = sigmoid(dataMat[index]*weight)
            err = labelMat[index] - h
            weight = weight + a*dataMat[index].T*err
    return weight
weight = uptatestocGradAscent(dataMat,labelMat,numIter = 20)
weight
matrix([[11.53934818],
        [ 1.37987182],
        [-1.50178942]])
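
The step-size schedule used in the loop above is

$$\alpha = \frac{4}{1 + j + i} + 0.01$$

where j is the epoch index and i the within-epoch index. Alpha decays toward the constant 0.01 floor, so early updates move the weights quickly while later updates only fine-tune; the floor keeps later samples from losing all influence.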
#Decision boundary from the improved stochastic gradient ascent
plotBestFit(weight)
(Figure: decision boundary fitted by the improved stochastic gradient ascent)

Stochastic Gradient Ascent vs. Improved Stochastic Gradient Ascent

#Stochastic gradient ascent, recording the weights after every pass
from numpy import *
import matplotlib.pyplot as plt
def stocGradAscentPlot(dataMat,labelMat):
    labelMat = labelMat.transpose() #transpose into an m x 1 column vector
    m,n = shape(dataMat)
    a = 0.01
    weight = ones((n,1)) #(n,1)
    x0=[];x1=[];x2=[]

    numIter = 500
    for it in range(numIter): #numIter passes over the data (the outer loop gets its own index)
        for i in range(m):
            h = sigmoid(dataMat[i]*weight) #(1,1)
            err = labelMat[i] - h #(1,1)
            weight = weight + a*dataMat[i].transpose()*err #(n,1)
        x0.append(float(weight[0]))
        x1.append(float(weight[1]))
        x2.append(float(weight[2]))

    #Plot the weight trajectories
    plt.figure(figsize=(8,10))
    plt.subplot(311)
    plt.plot(range(numIter),x0)
    plt.ylabel('X0')

    plt.subplot(312)
    plt.plot(range(numIter),x1)
    plt.ylabel('X1')

    plt.subplot(313)
    plt.plot(range(numIter),x2)
    plt.ylabel('X2')

    plt.tight_layout()
    plt.show()
stocGradAscentPlot(dataMat,labelMat)

(Figure: trajectories of the three weights over 500 passes of fixed-step stochastic gradient ascent; the weights oscillate and never fully settle)
#Improved stochastic gradient ascent, recording the weights after every pass
import random
def uptatestocGradAscentPlot(dataMat,labelMat,numIter = 150):
    labelMat = labelMat.T
    m,n = shape(dataMat)
    weight = ones((n,1))

    randomIndex = random.sample(range(m),m) #a random permutation of the sample indices
    x0=[];x1=[];x2=[]

    for numiter in range(numIter):
        for i in range(m):
            a = 4/(1.0+numiter+i)+0.01 #dynamic alpha that shrinks with iterations, damping the fluctuation of the weights
            index = randomIndex[i] #visit the samples in random order to reduce periodic fluctuations in the weights
            h = sigmoid(dataMat[index]*weight)
            err = labelMat[index] - h
            weight = weight + a*dataMat[index].T*err
        x0.append(float(weight[0]))
        x1.append(float(weight[1]))
        x2.append(float(weight[2]))

    #Plot the weight trajectories
    plt.figure(figsize=(8,10))
    plt.subplot(311)
    plt.plot(range(numIter),x0)
    plt.ylabel('X0')

    plt.subplot(312)
    plt.plot(range(numIter),x1)
    plt.ylabel('X1')

    plt.subplot(313)
    plt.plot(range(numIter),x2)
    plt.ylabel('X2')

    plt.tight_layout()
    plt.show()
uptatestocGradAscentPlot(dataMat,labelMat)
(Figure: trajectories of the three weights under the improved algorithm; convergence is faster and the oscillation is strongly damped)

Predicting Horse Mortality from Colic Symptoms

#Classification function
def classify(inX,weight): #inX and weight are both matrices
    res = sigmoid(float(inX*weight))
    # inX*weight is a 1x1 matrix, so sigmoid of it would also be a matrix
    # float(inX*weight) is a scalar, so res is a plain number
    if res > 0.5:
        return 1
    else:
        return 0
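
classify handles one sample at a time. As a convenience (my own sketch, not part of the original code), the whole test matrix can be classified in one shot using the same matrix conventions:

#Vectorized variant: classify every row of xMat at once
def classifyAll(xMat,weight):
    probs = sigmoid(xMat*weight) #(m,1) column of probabilities
    return (probs > 0.5).astype(int) #(m,1) column of 0/1 predictions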

#Read the data
from numpy import mat
def createData():
    #Training data
    fr = open('../../Reference Code/Ch05/horseColicTraining.txt')
    xTrain = [];yTrain = []
    for line in fr.readlines():
        currentLine = line.strip().split()
        lineList = [1.0] #prepend 1.0 as the constant x0 term
        for i in range(len(currentLine)-1):
            lineList.append(float(currentLine[i]))
        xTrain.append(lineList) #build the training sample set
        yTrain.append(float(currentLine[-1])) #build the training label set
    fr.close()

    #Test data
    fr = open('../../Reference Code/Ch05/horseColicTest.txt')
    xTest = [];yTest = []
    for line in fr.readlines():
        currentLine = line.strip().split()
        lineList = [1.0]
        for i in range(len(currentLine)-1):
            lineList.append(float(currentLine[i]))
        xTest.append(lineList) #build the test sample set
        yTest.append(float(currentLine[-1])) #build the test label set
    fr.close()
    return mat(xTrain),mat(yTrain),mat(xTest),mat(yTest)

#Fit the best weight on the training data, then evaluate on the test data
def colicTest():
    #Create the data
    xTrainMat,yTrainMat,xTestMat,yTestMat = createData()

    #Fit the best weight
    weight = uptatestocGradAscent(xTrainMat,yTrainMat,150)

    #Evaluate on the test set
    errCount = 0
    yTestMat = yTestMat.T #after transposing, yTestMat[i] picks out each individual label
    for i in range(len(xTestMat)):
        res = classify(xTestMat[i],weight)
        if res != int(yTestMat[i]):
            errCount += 1
    errRate = errCount/float(len(yTestMat)) #float division, so this also works under Python 2
    print('The error rate of this test is: %f' %errRate)
    return errRate
xTrainMat,yTrainMat,xTestMat,yTestMat = createData()
xTrainMat
matrix([[ 1. ,  2. ,  1. , ...,  8.4,  0. ,  0. ],
        [ 1. ,  1. ,  1. , ..., 85. ,  2. ,  2. ],
        [ 1. ,  2. ,  1. , ...,  6.7,  0. ,  0. ],
        ...,
        [ 1. ,  1. ,  1. , ...,  6.8,  0. ,  0. ],
        [ 1. ,  1. ,  1. , ...,  6. ,  3. ,  3.4],
        [ 1. ,  1. ,  1. , ..., 62. ,  1. ,  1. ]])
errRate = colicTest()
errRate
The error rate of this test is: 0.402985
0.40298507462686567
def multiTest(numTests = 10):
    errRate = 0.0
    for k in range(numTests):
        errRate += colicTest()
    print('after %d iterations the average error rate is:%f' %(numTests,errRate/numTests))
multiTest(numTests = 10)   
The error rate of this test is: 0.238806
The error rate of this test is: 0.402985
The error rate of this test is: 0.208955
The error rate of this test is: 0.238806
The error rate of this test is: 0.567164
The error rate of this test is: 0.611940
The error rate of this test is: 0.298507
The error rate of this test is: 0.373134
The error rate of this test is: 0.298507
The error rate of this test is: 0.343284
after 10 iterations the average error rate is:0.358209
multiTest(numTests = 10)   
The error rate of this test is: 0.432836
The error rate of this test is: 0.373134
The error rate of this test is: 0.298507
The error rate of this test is: 0.328358
The error rate of this test is: 0.701493
The error rate of this test is: 0.432836
The error rate of this test is: 0.343284
The error rate of this test is: 0.328358
The error rate of this test is: 0.268657
The error rate of this test is: 0.552239
after 10 iterations the average error rate is:0.405970
