类别:机器学习个人笔记
参考书籍:《统计学习方法》、《机器学习实战》、周志华《机器学习》(西瓜书)
相关数学公式推导见我上传的手写PDF
任务:
学习《机器学习实战》第78页及第79页的程序清单5-1和5-2,完成以下问题:
1)导入数据集'testSet.txt',用logistic回归训练数据集,并画出决策边界。
2)导入数据集'data.txt',用logistic回归训练数据集,并画出决策边界。
# 导包
from numpy import *
import pandas as pd
import numpy as np
from math import exp
import matplotlib.pyplot as plt
def loadDataSet(fileName):
    """Load a whitespace-delimited, two-feature data set from a text file.

    Each non-blank line must hold at least three whitespace-separated
    fields: feature x1, feature x2 and an integer class label.

    Args:
        fileName: Path of the text file to read.

    Returns:
        A tuple ``(dataMat, labelMat)``. ``dataMat`` is a list of
        ``[1.0, x1, x2]`` rows (the constant 1.0 is the intercept term) and
        ``labelMat`` is the list of integer labels, both in file order.

    Raises:
        ValueError: If a field cannot be parsed as a number.
        IndexError: If a non-blank line has fewer than three fields.
    """
    dataMat = []
    labelMat = []
    # The context manager closes the file even when parsing raises.
    with open(fileName) as fr:
        for line in fr:
            lineArr = line.split()
            if not lineArr:
                # Tolerate blank lines, e.g. a trailing newline at end of file.
                continue
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelMat.append(int(lineArr[2]))
    return dataMat, labelMat
def sigmoid(inX):
    """Evaluate the logistic function 1 / (1 + e^(-inX)).

    Args:
        inX: A scalar or NumPy array/matrix; NumPy inputs are processed
            element-wise by ``np.exp``.

    Returns:
        The logistic function of ``inX``, with the same shape as the input.
    """
    denominator = 1 + np.exp(-inX)
    return 1.0 / denominator
def gradAscent(dataMatIn, classLabels, alpha=0.001, maxCycles=500):
    """Fit logistic-regression weights by batch gradient ascent.

    Every iteration uses the whole data set:
    ``weights += alpha * X^T * (labels - sigmoid(X * weights))``.

    Args:
        dataMatIn: 2-D array-like of shape (m, n), one sample per row.
        classLabels: Sequence of m labels. They are subtracted directly
            from the sigmoid output, so they should be encoded as 0/1.
        alpha: Step size of each update.
        maxCycles: Number of update iterations to run.

    Returns:
        An (n, 1) ``numpy.matrix`` of fitted weights.
    """
    # np.asmatrix is used instead of the bare `mat` alias, which NumPy 2.0
    # removed from the top-level namespace.
    dataMatrix = np.asmatrix(dataMatIn)
    labelMat = np.asmatrix(classLabels).transpose()
    n = dataMatrix.shape[1]
    # Loop-invariant, so transpose once up front.
    dataMatrixT = dataMatrix.transpose()
    # Start from a matrix so the return type is a matrix even if
    # maxCycles is 0 (callers invoke .getA() on the result).
    weights = np.asmatrix(np.ones((n, 1)))
    for _ in range(maxCycles):
        h = sigmoid(dataMatrix * weights)
        error = labelMat - h
        weights = weights + alpha * dataMatrixT * error
    return weights
def plotBestFit(weights, filename):
    """Plot the samples in *filename* with a linear decision boundary.

    The boundary is the line ``w0 + w1*x1 + w2*x2 = 0``, drawn for x1 in
    [-3, 3). Samples labelled 1 are red squares, all others green dots.

    Args:
        weights: Array-like whose first three elements are (w0, w1, w2).
            Any shape is accepted (e.g. the (3, 1) array returned by
            ``gradAscent(...).getA()``) because it is flattened first.
        filename: Path of the data file, read with ``loadDataSet``.
    """
    dataMat, labelMat = loadDataSet(filename)
    dataArr = np.asarray(dataMat, dtype=float)
    if dataArr.size == 0:
        # Keep column indexing valid for an empty data set.
        dataArr = dataArr.reshape(0, 3)
    isPositive = np.asarray(labelMat).ravel().astype(int) == 1

    # Flatten so (3,), (3, 1) arrays and matrices all yield scalar weights.
    w0, w1, w2 = np.asarray(weights, dtype=float).ravel()[:3]

    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Each artist carries its own label: a labels-only legend call pairs
    # labels with artists by implicit order and can mislabel them.
    ax.scatter(dataArr[isPositive, 1], dataArr[isPositive, 2],
               s=30, c='red', marker='s', label="1")
    ax.scatter(dataArr[~isPositive, 1], dataArr[~isPositive, 2],
               s=30, c='green', label="0")
    x = np.arange(-3.0, 3.0, 0.1)
    y = (-w0 - w1 * x) / w2
    ax.plot(x, y, label="decision boundary")
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.legend(loc=2)
    plt.title("(1)")
    plt.show()
# Task (1): train on testSet.txt and plot the linear decision boundary.
filename = r"testSet.txt"
dataMat,labelMat = loadDataSet(filename)
weights=gradAscent(dataMat,labelMat)
# getA() turns the returned weight matrix into a plain ndarray for plotting.
plotBestFit(weights.getA(),filename)
def gradAscent2(dataMatIn, classLabels, alpha=0.001, maxCycles=10000):
    """Fit logistic regression on squared features by batch gradient ascent.

    Every feature is squared before training because the boundary being
    fitted is a curve, not a straight line. The model is therefore
    ``w0*x0^2 + w1*x1^2 + w2*x2^2`` for each row ``[x0, x1, x2]``.

    Args:
        dataMatIn: 2-D array-like of shape (m, n), one sample per row.
            The input is copied, never modified.
        classLabels: Sequence of m labels. Values greater than 0 are
            treated as class 1 and all others as class 0, so both the
            {-1, 1} and the {0, 1} encodings are accepted.
        alpha: Step size of each update.
        maxCycles: Number of update iterations to run.

    Returns:
        An (n, 1) ``numpy.matrix`` of fitted weights for the squared
        features.
    """
    # np.square is element-wise (unlike `**` on a matrix, which is a matrix
    # power) and works on a fresh array, so the caller's data is untouched.
    squared = np.square(np.asarray(dataMatIn, dtype=float))
    # np.asmatrix replaces the bare `mat` alias that NumPy 2.0 removed.
    dataMatrix = np.asmatrix(squared)
    # Explicit re-encoding to 0/1; equivalent to (1 + y) / 2 for y in {-1, 1}.
    rawLabels = np.asarray(classLabels).reshape(-1, 1)
    labelMat = np.asmatrix((rawLabels > 0).astype(float))

    n = dataMatrix.shape[1]
    # Loop-invariant, so transpose once up front.
    dataMatrixT = dataMatrix.transpose()
    # Start from a matrix so the return type is a matrix even if
    # maxCycles is 0 (callers invoke .getA() on the result).
    weights = np.asmatrix(np.ones((n, 1)))
    for _ in range(maxCycles):
        h = sigmoid(dataMatrix * weights)
        error = labelMat - h
        weights = weights + alpha * dataMatrixT * error
    return weights
def plotBestFit2(weights, filename):
    """Plot the samples in *filename* with a quadratic decision boundary.

    The boundary is the curve ``w0 + w1*x1^2 + w2*x2^2 = 0``, drawn as an
    upper and a lower branch for x1 in [-3, 3). Samples labelled 1 are red
    squares, all others green dots.

    Args:
        weights: Array-like whose first three elements are (w0, w1, w2).
            Any shape is accepted (e.g. the (3, 1) array returned by
            ``gradAscent2(...).getA()``) because it is flattened first.
        filename: Path of the data file, read with ``loadDataSet``.
    """
    dataMat, labelMat = loadDataSet(filename)
    dataArr = np.asarray(dataMat, dtype=float)
    if dataArr.size == 0:
        # Keep column indexing valid for an empty data set.
        dataArr = dataArr.reshape(0, 3)
    isPositive = np.asarray(labelMat).ravel().astype(int) == 1

    # Flatten so (3,), (3, 1) arrays and matrices all yield scalar weights.
    w0, w1, w2 = np.asarray(weights, dtype=float).ravel()[:3]

    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Each artist carries its own label: a labels-only legend call pairs
    # labels with artists by implicit order and can mislabel them.
    ax.scatter(dataArr[isPositive, 1], dataArr[isPositive, 2],
               s=30, c='red', marker='s', label="1")
    ax.scatter(dataArr[~isPositive, 1], dataArr[~isPositive, 2],
               s=30, c='green', label="0")

    x = np.arange(-3.0, 3.0, 0.1)
    radicand = (-w0 - w1 * x ** 2) / w2
    # Where the curve has no real solution the radicand is negative. Using
    # NaN there avoids sqrt's "invalid value" warning; NaN points are simply
    # left out of the drawn line, so the curve itself is unchanged.
    y = np.sqrt(np.where(radicand >= 0, radicand, np.nan))
    ax.plot(x, y, "b", label="decision boundary1")
    ax.plot(x, -y, "b", label="decision boundary2")
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.legend(loc=2)
    plt.title("(2)")
    plt.show()
# Task (2): train on data.txt with squared features and plot the boundary.
filename2 = r"data.txt"
dataMat,labelMat = loadDataSet(filename2)
weights=gradAscent2(dataMat,labelMat)
# getA() turns the returned weight matrix into a plain ndarray for plotting.
plotBestFit2(weights.getA(),filename2)
def plotBestFit2(weights, dataMat, labelMat):
    """Plot in-memory samples with a quadratic decision boundary.

    The boundary is the curve ``w0 + w1*x1^2 + w2*x2^2 = 0``, drawn as an
    upper and a lower branch for x1 in [-3, 3). Samples labelled 1 are red
    squares, all others green dots.

    Args:
        weights: Array-like whose first three elements are (w0, w1, w2).
            Any shape is accepted (e.g. the (3, 1) array returned by
            ``gradAscent2(...).getA()``) because it is flattened first.
        dataMat: 2-D array-like of samples; columns 1 and 2 are plotted
            as X1 and X2.
        labelMat: One label per row of ``dataMat``; flattened and
            truncated to int before being compared with 1.
    """
    dataArr = np.asarray(dataMat, dtype=float)
    if dataArr.size == 0:
        # Keep column indexing valid for an empty data set.
        dataArr = dataArr.reshape(0, 3)
    isPositive = np.asarray(labelMat).ravel().astype(int) == 1

    # Flatten so (3,), (3, 1) arrays and matrices all yield scalar weights.
    w0, w1, w2 = np.asarray(weights, dtype=float).ravel()[:3]

    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Each artist carries its own label: a labels-only legend call pairs
    # labels with artists by implicit order and can mislabel them.
    ax.scatter(dataArr[isPositive, 1], dataArr[isPositive, 2],
               s=30, c='red', marker='s', label="1")
    ax.scatter(dataArr[~isPositive, 1], dataArr[~isPositive, 2],
               s=30, c='green', label="0")

    x = np.arange(-3.0, 3.0, 0.1)
    radicand = (-w0 - w1 * x ** 2) / w2
    # Where the curve has no real solution the radicand is negative. Using
    # NaN there avoids sqrt's "invalid value" warning; NaN points are simply
    # left out of the drawn line, so the curve itself is unchanged.
    y = np.sqrt(np.where(radicand >= 0, radicand, np.nan))
    ax.plot(x, y, "b", label="decision boundary1")
    ax.plot(x, -y, "b", label="decision boundary2")
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.legend(loc=2)
    plt.title("(2)")
    plt.show()
# Re-run task (2) using the in-memory variant of plotBestFit2 defined above.
# NOTE(review): newlabelMat is not defined anywhere in the visible source --
# presumably a re-encoded copy of labelMat; confirm it is assigned before this
# point, otherwise the next line raises NameError.
weights=gradAscent2(dataMat,newlabelMat)
plotBestFit2(weights.getA(),dataMat,newlabelMat)