机器学习-logistic回归训练数据集

类别:机器学习个人笔记
参考书籍:《统计学习》、《机器学习实战》、周志华大佬的西瓜书
相关数学公式推导见我上传的手写PDF
任务:
学习《机器学习实战》第78、79页的程序清单5-1和5-2,完成以下问题:

1)导入数据集'testSet.txt',用logistic回归训练数据集,并画出决策边界。

2)导入数据集'data.txt',用logistic回归训练数据集,并画出决策边界。

# 导包
from numpy import *
import pandas as pd
import numpy as np
from math import exp
import matplotlib.pyplot as plt
def loadDataSet(fileName):
    """Read a whitespace-separated sample file into feature rows and labels.

    Each non-blank line must hold at least three fields: two float features
    followed by an integer class label. A constant 1.0 is prepended to every
    feature row so that the first weight acts as the intercept term.

    Args:
        fileName: path of the text file to read.

    Returns:
        (dataMat, labelMat): ``dataMat`` is a list of ``[1.0, x1, x2]`` rows and
        ``labelMat`` is the list of integer labels, in file order.

    Raises:
        OSError: if the file cannot be opened.
        ValueError / IndexError: if a non-blank line is malformed.
    """
    dataMat = []
    labelMat = []
    # The context manager guarantees the handle is closed even if parsing fails.
    with open(fileName) as fr:
        for line in fr:
            lineArr = line.strip().split()
            if not lineArr:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelMat.append(int(lineArr[2]))
    return dataMat, labelMat

def sigmoid(inX):
    """Return the logistic function 1 / (1 + e^(-inX)).

    The exponential is evaluated with ``np.exp``, so scalars and NumPy
    arrays/matrices are handled element-wise.
    """
    denominator = 1 + np.exp(-inX)
    return 1.0 / denominator
def gradAscent(dataMatIn, classLabels, alpha=0.001, maxCycles=500):
    """Fit logistic-regression weights with batch gradient ascent.

    Every iteration uses the whole data set:
    ``weights += alpha * X^T * (labels - sigmoid(X * weights))``.

    Args:
        dataMatIn: 2-D array-like of samples, one row per sample (m rows,
            n columns).
        classLabels: sequence of m class labels. The update compares them with
            sigmoid outputs, so they are presumably encoded as 0/1 - verify
            against the data file.
        alpha: step size of each update.
        maxCycles: number of iterations to run.

    Returns:
        An (n, 1) ``np.matrix`` of weights, initialised to all ones.
    """
    # np.asmatrix is used instead of the ``mat`` alias, which is no longer part
    # of the NumPy namespace from NumPy 2.0 on.
    dataMatrix = np.asmatrix(dataMatIn)
    labelMat = np.asmatrix(classLabels).transpose()
    n = dataMatrix.shape[1]
    # Start from a matrix so the return type (and .getA()) is the same even
    # when maxCycles is 0.
    weights = np.asmatrix(np.ones((n, 1)))
    dataTransposed = dataMatrix.transpose()  # loop-invariant
    for _ in range(maxCycles):
        h = sigmoid(dataMatrix * weights)
        error = labelMat - h
        weights = weights + alpha * dataTransposed * error
    return weights
def plotBestFit(weights,filename):
    """Scatter the samples stored in *filename* and draw the linear boundary.

    The file is read with ``loadDataSet``. Samples whose label equals 1 are
    drawn as red squares, all other samples as green dots. The plotted line is
    ``w0 + w1*x1 + w2*x2 = 0`` for x1 in [-3.0, 3.0).

    Args:
        weights: the three coefficients (w0, w1, w2). Any array-like holding
            exactly three values is accepted, e.g. the (3, 1) array obtained
            from ``gradAscent(...).getA()``.
        filename: path of the data file to plot.
    """
    dataMat, labelMat = loadDataSet(filename)
    xcord1, ycord1 = [], []
    xcord2, ycord2 = [], []
    for row, label in zip(dataMat, labelMat):
        if int(label) == 1:
            xcord1.append(row[1])
            ycord1.append(row[2])
        else:
            xcord2.append(row[1])
            ycord2.append(row[2])

    # Flatten so column vectors, matrices and plain lists all work alike.
    w0, w1, w2 = np.asarray(weights, dtype=float).ravel()

    fig = plt.figure()
    ax = fig.add_subplot(111)
    positives = ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    negatives = ax.scatter(xcord2, ycord2, s=30, c='green')
    x = np.arange(-3.0, 3.0, 0.1)
    y = (-w0 - w1 * x) / w2
    boundary, = ax.plot(x, y)
    ax.set_xlabel('X1')
    ax.set_ylabel('X2')
    # Pair every label with its artist explicitly; a bare list of labels is
    # matched by position and can end up attached to the wrong artists.
    ax.legend([boundary, positives, negatives],
              ["decision boundary", "1", "0"], loc=2)
    ax.set_title("(1)")
    plt.show()

1)导入数据集'testSet.txt',用logistic回归训练数据集,并画出决策边界。

# Experiment (1): load testSet.txt, fit the weights with gradAscent and plot
# the samples together with the fitted linear decision boundary.
filename = r"testSet.txt"
dataMat,labelMat = loadDataSet(filename)
weights=gradAscent(dataMat,labelMat)
# getA() hands the weights to the plotting code as a plain ndarray.
plotBestFit(weights.getA(),filename)

机器学习-logistic回归训练数据集_第1张图片

2)导入数据集'data.txt',用logistic回归训练数据集,并画出决策边界。

def gradAscent2(dataMatIn, classLabels, alpha=0.001, maxCycles=10000):
    """Fit logistic-regression weights on squared features (curved boundary).

    Every entry of the sample matrix is squared before training, so the fitted
    boundary is quadratic in the original features instead of a straight line.
    Labels are re-encoded to 0/1 and the weights are then found by batch
    gradient ascent:
    ``weights += alpha * X^T * (labels - sigmoid(X * weights))``.

    Args:
        dataMatIn: 2-D array-like of samples, one row per sample (m rows,
            n columns). The input itself is not modified.
        classLabels: sequence of m labels. Values greater than 0 become class 1
            and everything else class 0; for -1/+1 labels this equals
            ``(1 + y) / 2``, and 0/1 labels are accepted as they are.
        alpha: step size of each update.
        maxCycles: number of iterations to run.

    Returns:
        An (n, 1) ``np.matrix`` of weights, initialised to all ones.
    """
    # np.asmatrix is used instead of the ``mat`` alias, which is no longer part
    # of the NumPy namespace from NumPy 2.0 on. np.square is element-wise and
    # returns a new matrix (``**`` on a matrix would be a matrix power).
    dataMatrix = np.square(np.asmatrix(dataMatIn))

    # The labels in the data are -1 and 1, so map them onto 0 and 1.
    labelMat = (np.asmatrix(classLabels).transpose() > 0).astype(float)

    n = dataMatrix.shape[1]
    # Start from a matrix so the return type (and .getA()) is the same even
    # when maxCycles is 0.
    weights = np.asmatrix(np.ones((n, 1)))
    dataTransposed = dataMatrix.transpose()  # loop-invariant
    for _ in range(maxCycles):
        h = sigmoid(dataMatrix * weights)
        error = labelMat - h
        weights = weights + alpha * dataTransposed * error
    return weights
def plotBestFit2(weights,filename):
    """Scatter the samples stored in *filename* and draw the quadratic boundary.

    The file is read with ``loadDataSet``. Samples whose label equals 1 are
    drawn as red squares, all other samples as green dots. The plotted curve
    is ``w0 + w1*x1**2 + w2*x2**2 = 0`` for x1 in [-3.0, 3.0), drawn as an
    upper and a lower branch ``x2 = +/- sqrt((-w0 - w1*x1**2) / w2)``.

    Args:
        weights: the three coefficients (w0, w1, w2). Any array-like holding
            exactly three values is accepted, e.g. the (3, 1) array obtained
            from ``gradAscent2(...).getA()``.
        filename: path of the data file to plot.
    """
    dataMat, labelMat = loadDataSet(filename)  # read the samples
    xcord1, ycord1 = [], []
    xcord2, ycord2 = [], []
    for row, label in zip(dataMat, labelMat):  # split the points by class
        if int(label) == 1:
            xcord1.append(row[1])
            ycord1.append(row[2])
        else:
            xcord2.append(row[1])
            ycord2.append(row[2])

    # Flatten so column vectors, matrices and plain lists all work alike.
    w0, w1, w2 = np.asarray(weights, dtype=float).ravel()

    fig = plt.figure()
    ax = fig.add_subplot(111)
    positives = ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    negatives = ax.scatter(xcord2, ycord2, s=30, c='green')

    # Draw the closed curve as two branches.
    x = np.arange(-3.0, 3.0, 0.1)
    radicand = (-w0 - w1 * x ** 2) / w2
    # Where x1 lies outside the curve the radicand is negative; those points
    # become NaN (not drawn), without emitting a RuntimeWarning.
    with np.errstate(invalid="ignore"):
        y = np.sqrt(radicand)
    y1 = -y
    upper, lower = ax.plot(x, y, "b", x, y1, "b")
    ax.set_xlabel('X1')
    ax.set_ylabel('X2')
    # Pair every label with its artist explicitly; a bare list of labels is
    # matched by position and can end up attached to the wrong artists.
    ax.legend([upper, lower, positives, negatives],
              ["decision boundary1", "decision boundary2", "1", "0"], loc=2)
    ax.set_title("(2)")
    plt.show()
# Experiment (2): load data.txt, fit the weights with gradAscent2 and plot the
# samples together with the fitted curved decision boundary.
filename2 = r"data.txt"
dataMat,labelMat = loadDataSet(filename2)
weights=gradAscent2(dataMat,labelMat)
# getA() hands the weights to the plotting code as a plain ndarray.
plotBestFit2(weights.getA(),filename2)

机器学习-logistic回归训练数据集_第2张图片

def plotBestFit2(weights,dataMat,labelMat):
    """Scatter already-loaded samples and draw the quadratic decision boundary.

    Samples whose label equals 1 are drawn as red squares, all other samples
    as green dots. The plotted curve is ``w0 + w1*x1**2 + w2*x2**2 = 0`` for
    x1 in [-3.0, 3.0), drawn as an upper and a lower branch
    ``x2 = +/- sqrt((-w0 - w1*x1**2) / w2)``.

    Args:
        weights: the three coefficients (w0, w1, w2). Any array-like holding
            exactly three values is accepted, e.g. the (3, 1) array obtained
            from ``gradAscent2(...).getA()``.
        dataMat: 2-D array-like of samples; columns 1 and 2 of each row are
            used as the plot coordinates.
        labelMat: sequence with one label per row of ``dataMat``.
    """
    dataArr = np.array(dataMat)
    xcord1, ycord1 = [], []
    xcord2, ycord2 = [], []
    for row, label in zip(dataArr, labelMat):  # split the points by class
        if int(label) == 1:
            xcord1.append(row[1])
            ycord1.append(row[2])
        else:
            xcord2.append(row[1])
            ycord2.append(row[2])

    # Flatten so column vectors, matrices and plain lists all work alike.
    w0, w1, w2 = np.asarray(weights, dtype=float).ravel()

    fig = plt.figure()
    ax = fig.add_subplot(111)
    positives = ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    negatives = ax.scatter(xcord2, ycord2, s=30, c='green')

    # Draw the closed curve as two branches.
    x = np.arange(-3.0, 3.0, 0.1)
    radicand = (-w0 - w1 * x ** 2) / w2
    # Where x1 lies outside the curve the radicand is negative; those points
    # become NaN (not drawn), without emitting a RuntimeWarning.
    with np.errstate(invalid="ignore"):
        y = np.sqrt(radicand)
    y1 = -y
    upper, lower = ax.plot(x, y, "b", x, y1, "b")
    ax.set_xlabel('X1')
    ax.set_ylabel('X2')
    # Pair every label with its artist explicitly; a bare list of labels is
    # matched by position and can end up attached to the wrong artists.
    ax.legend([upper, lower, positives, negatives],
              ["decision boundary1", "decision boundary2", "1", "0"], loc=2)
    ax.set_title("(2)")
    plt.show()
# Experiment (2), repeated with the in-memory plotBestFit2(weights, dataMat, labelMat).
# NOTE(review): newlabelMat is not defined anywhere in the visible source, so
# unless it is created outside this view these lines raise NameError.
# Presumably it is a re-encoded copy of labelMat - confirm before running.
weights=gradAscent2(dataMat,newlabelMat)
plotBestFit2(weights.getA(),dataMat,newlabelMat)

机器学习-logistic回归训练数据集_第3张图片

你可能感兴趣的:(Algorithm,机器学习,python,算法)