The Iris classification problem:
http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data
For the dataset given above, train a model that yields a good classifier. (The assignment requires a linear regression model.)
A first look at the data tells us a few things: the inputs are four-dimensional and there are three classes, so this is a multi-class classification problem. Since the assignment requires a linear model, the approaches that come to mind are plain linear regression used as a classifier, polynomial regression, a multi-class treatment with logistic regression, and a softmax classifier.
The instructor's requirement for this class was "linear regression to train", and the lecture also touched on how to probe different numeric values for the three class labels, so I suspect the intent was to use plain linear regression. On the other hand, logistic regression and softmax are linear models at heart, only wrapped in a sigmoid function and a probabilistic interpretation, so they should be acceptable as well. I started with some experiments using plain linear regression. The code is as follows:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np

def liner_Regression(data_x, data_y, learningRate, Loopnum):
    Weight = np.ones(shape=(1, data_x.shape[1]))  # the shape of Weight follows the number of features in data_x
    baise = np.array([[1]])
    for num in range(Loopnum):
        WXPlusB = np.dot(data_x, Weight.T) + baise
        loss = np.dot((data_y - WXPlusB).T, data_y - WXPlusB) / data_y.shape[0]
        w_gradient = -(2 / data_x.shape[0]) * np.dot((data_y - WXPlusB).T, data_x)
        baise_gradient = -2 * np.dot((data_y - WXPlusB).T, np.ones(shape=[data_x.shape[0], 1])) / data_x.shape[0]
        Weight = Weight - learningRate * w_gradient
        baise = baise - learningRate * baise_gradient
        if num % 50 == 0:
            print('The loss is:', loss[0, 0])
    return (Weight, baise)
def test_square_error_computing(test_x, test_y, Weight, baise):
    test_xMat = np.mat(test_x)      # build the xMat matrix
    test_yMat = np.mat(test_y).T    # build the yMat matrix (column vector)
    y_predict = np.dot(test_xMat, Weight.T) + baise
    square_error = np.dot((test_yMat - y_predict).T, test_yMat - y_predict) / test_yMat.shape[0]
    return square_error

def loadDataSet(fileName):
    xArr = []
    yArr = []
    for line in open(fileName).readlines():
        curLine = line.strip().split()
        # curLine = line.strip().split('\t')  # fields are separated by spaces/tabs, so split() with no argument is enough
        xonerow = []  # one row of input features (the bias b in y = wx + b is handled separately by the model)
        for i in range(len(curLine) - 1):
            xonerow.append(float(curLine[i]))  # all columns except the last are the input values x
        xArr.append(xonerow)
        yArr.append(float(curLine[-1]))  # the last column is the target value y
    return xArr, yArr
if __name__ == "__main__":
    # np.seterr(divide='ignore', invalid='ignore')
    # Type an order (polynomial degree) to choose which dataset to load.
    print("Type the order you want")
    order = input()
    if order == '1':
        data_x, data_y = loadDataSet('C:/Users/Carzolar/Desktop/DM_regression2.txt')
        xMat = np.mat(data_x)    # build the xMat matrix
        yMat = np.mat(data_y).T  # build the yMat matrix (column vector)
    elif order == '2':
        data_x, data_y = loadDataSet('C:/Users/Carzolar/Desktop/2.txt')
        xMat = np.mat(data_x)
        yMat = np.mat(data_y).T
    elif order == '3':
        data_x, data_y = loadDataSet('C:/Users/Carzolar/Desktop/3.txt')
        xMat = np.mat(data_x)
        yMat = np.mat(data_y).T
    elif order == '4':
        data_x, data_y = loadDataSet('C:/Users/Carzolar/Desktop/4.txt')
        xMat = np.mat(data_x)
        yMat = np.mat(data_y).T

    # Some random data drawn from a Gaussian distribution (for sanity checks)
    # xMat = np.random.normal(0, 10, [5, 3])
    # Weights = np.array([[3, 4, 6]])
    # noise = np.random.normal(0, 0.05, [5, 1])
    # yMat = np.dot(xMat, Weights.T) + 5 + noise
    # print(xMat)
    # print(yMat)

    learningRate = 0.0001
    Loopnum = 10000
    res = liner_Regression(xMat, yMat, learningRate, Loopnum)
    print('The learning rate is', learningRate, ', and the number of loops is', Loopnum)
    print('The weights are', res[0], ', and the bias is', res[1])

    # Unpack the parameters
    biase = res[1]
    b = biase[0, 0]
    w = res[0]
    w0 = w[0, 0]
    w1 = w[0, 1]
    if order == '2':
        w2 = w[0, 2]
        w3 = w[0, 3]
    if order == '3':
        w2 = w[0, 2]
        w3 = w[0, 3]
        w4 = w[0, 4]
        w5 = w[0, 5]
    if order == '4':
        w2 = w[0, 2]
        w3 = w[0, 3]
        w4 = w[0, 4]
        w5 = w[0, 5]
        w6 = w[0, 6]
        w7 = w[0, 7]

    # Display the fitted function and scatter plot, and load the matching test file
    x0 = np.linspace(-1, 1, 50)
    x1 = np.linspace(-1, 1, 50)
    if order == '1':
        y = w0 * x0 + w1 * x1 + b
        # And load the test file for order == 1
        test_x, test_y = loadDataSet('C:/Users/Carzolar/Desktop/DM_regression2.txt')
    if order == '2':
        y = w0 * x0 + w1 * x1 + w2 * x0 ** 2 + w3 * x1 ** 2 + b
        test_x, test_y = loadDataSet('C:/Users/Carzolar/Desktop/test_2.txt')
    if order == '3':
        y = w0 * x0 + w1 * x1 + w2 * x0 ** 2 + w3 * x1 ** 2 + w4 * x0 ** 3 + w5 * x1 ** 3 + b
        test_x, test_y = loadDataSet('C:/Users/Carzolar/Desktop/test_3.txt')
    if order == '4':
        y = w0 * x0 + w1 * x1 + w2 * x0 ** 2 + w3 * x1 ** 2 + w4 * x0 ** 3 + w5 * x1 ** 3 + w6 * x0 ** 4 + w7 * x1 ** 4 + b
        test_x, test_y = loadDataSet('C:/Users/Carzolar/Desktop/test_4.txt')

    # 3D plot
    # fig = plt.figure()
    # ax = Axes3D(fig)
    # ax.plot(x0, x1, y)
    # plt.show()

    # Evaluate on the test dataset and compute the squared error
    standarderror = test_square_error_computing(test_x, test_y, w, biase)
    print('The standard error is', standarderror[0, 0], 'for order', order)
The code above implements basic linear regression and, to enlarge the search over the hypothesis space, it also supports polynomial regression from order 1 up to order 4, but the actual results were only mediocre. The dominant factor is the numeric values assigned to the class labels: there is no principled way to quantify this 'hyperparameter', so it can only be probed by trial and error. Assigning the labels the values 1, 2 and 3 gave a comparatively small standard error, so those label values were used to train the model and perform the classification.
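As a rough illustration of this label-value probing, here is a minimal, self-contained sketch (not the assignment code itself): it maps the three species to a few candidate numeric encodings and compares the resulting mean squared errors. It assumes a local copy of iris.data in the working directory, and it uses closed-form least squares (np.linalg.lstsq) instead of the gradient-descent trainer above, purely to keep the sketch short.

import numpy as np

# Assumed local copy of iris.data (comma-separated: 4 features + species name per row)
X, names = [], []
for line in open('iris.data'):
    parts = line.strip().split(',')
    if len(parts) == 5:
        X.append([float(v) for v in parts[:4]])
        names.append(parts[4])
X = np.array(X)

# Candidate numeric encodings for (setosa, versicolor, virginica); spacing is 1 in all cases,
# so the mean squared errors are comparable across encodings
candidates = {'1,2,3':  {'Iris-setosa': 1,  'Iris-versicolor': 2, 'Iris-virginica': 3},
              '0,1,2':  {'Iris-setosa': 0,  'Iris-versicolor': 1, 'Iris-virginica': 2},
              '-1,0,1': {'Iris-setosa': -1, 'Iris-versicolor': 0, 'Iris-virginica': 1}}

Xb = np.hstack([X, np.ones((X.shape[0], 1))])    # append a constant column for the bias term
for tag, mapping in candidates.items():
    y = np.array([mapping[n] for n in names], dtype=float)
    w, *_ = np.linalg.lstsq(Xb, y, rcond=None)   # closed-form least squares fit
    mse = np.mean((Xb @ w - y) ** 2)
    print(tag, 'MSE =', round(mse, 4))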
First, a brief word on the underlying principle; I followed a Zhihu answer that explains it in great detail.
The reason for using the logistic function is not simply that it squashes values into (0, 1): other monotonically increasing or decreasing, differentiable functions with range (0, 1) could do that as well. The real justification comes from the generalized linear model assumptions. A binary classification problem can be modeled as a Bernoulli distribution, i.e. for a fixed x, y is Bernoulli-distributed; the generalized linear model then derives the single parameter of that distribution, its expectation, which turns out to be the sigmoid function. I will not reproduce the full derivation here; it relies on the exponential family of distributions and can be found in many places online.
Consequently, binary logistic regression is a proper probability estimate: the value it outputs is the probability that a sample belongs to the positive class, and it should therefore be optimized with maximum likelihood estimation as its loss function.
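To make the link between this probabilistic view and the training procedure concrete, here is a small self-contained sketch (on made-up synthetic data, not the iris data or the assignment code) of the Bernoulli log-likelihood and its gradient. Maximizing it by gradient ascent yields the X^T (y - h) update direction, which is exactly what the gradAscent routine further below uses.

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def log_likelihood(X, y, w):
    # Bernoulli log-likelihood: sum_i [ y_i * log(h_i) + (1 - y_i) * log(1 - h_i) ]
    h = sigmoid(X @ w)
    return np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))

def log_likelihood_gradient(X, y, w):
    # The gradient simplifies to X^T (y - h)
    return X.T @ (y - sigmoid(X @ w))

# Tiny synthetic check (made-up numbers)
rng = np.random.default_rng(0)
X = rng.normal(size=(20, 3))
y = (X[:, 0] + 0.5 * X[:, 1] > 0).astype(float)
y[:2] = 1 - y[:2]          # flip two labels so the data is not perfectly separable and the weights stay bounded
w = np.zeros(3)
for _ in range(100):
    w += 0.05 * log_likelihood_gradient(X, y, w)   # gradient ascent on the log-likelihood
print('final log-likelihood:', round(log_likelihood(X, y, w), 3))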
However, this is a three-class problem. It could also be handled with softmax, but softmax is more commonly used in neural networks, where the second-to-last layer outputs a vector with one entry per class and softmax converts it into per-class probabilities. Here I stay with logistic regression, just applied in a different way.
The multi-class strategy for logistic regression can be framed as one-vs-rest: relabel one class as 1 and all other classes as 0, fit a parameter vector W1 that estimates the probability of that class, and do the same for each remaining class to obtain its W. At test time, evaluate every W on each test sample and assign the class whose probability is highest. The code is as follows:
from numpy import *
import numpy as np
import math
import matplotlib.pyplot as plt

# Helper function to load the data
def loadDataSet(fileName):
    xArr = []
    yArr = []
    for line in open(fileName).readlines():
        curLine = line.strip().split()
        # curLine = line.strip().split('\t')  # fields are separated by spaces/tabs, so split() with no argument is enough
        xonerow = []  # one row of input features
        for i in range(len(curLine) - 1):
            xonerow.append(float(curLine[i]))  # all columns except the last are the input values x
        xArr.append(xonerow)
        yArr.append(int(curLine[-1]))  # the last column is the class label y
    return xArr, yArr

# loadDataSet2: load a file that contains features only (no label column)
def loadDataSet2(fileName):
    xArr = []
    for line in open(fileName).readlines():
        curLine = line.strip().split()
        xonerow = []
        for i in range(len(curLine)):
            xonerow.append(float(curLine[i]))
        xArr.append(xonerow)
    return xArr

# def loadDataSet(fileName):
#     data_x = []; data_y = []
#     # fr = open('machinelearninginaction/Ch05/testSet.txt')
#     for line in open(fileName).readlines():
#         lineArr = line.strip().split()
#         data_x.append([1.0, float(lineArr[0]), float(lineArr[1])])  # feature set; the leading 1.0 is the constant term x0
#         data_y.append(int(lineArr[-1]))  # label set
#     return data_x, data_y

def sigmoid(X):
    return 1 / (1 + exp(-X))
# Logistic regression trained by gradient ascent
def gradAscent(data_x, data_y):
    data_xrix = mat(data_x)              # (m, n)
    data_y = mat(data_y).transpose()     # (m, 1)
    m, n = shape(data_xrix)
    Weights = ones((n, 1))               # initialization, (n, 1)
    alpha = 0.001                        # step size
    maxCycles = 500                      # number of iterations
    # We can also define a regularization parameter to shrink overly large weights
    reg_lambda = math.exp(-8)
    for i in range(maxCycles):
        h = sigmoid(data_xrix * Weights)  # f(theta^T x)
        # print(h.shape)
        error = data_y - h                # y - h, (m, 1)
        Weights = (1 - reg_lambda) * Weights + alpha * data_xrix.transpose() * error  # gradient ascent step
    return Weights
def Judgefunction(test_y, folder):
    # Turn predicted probabilities into positive/negative labels for the given folder
    val = []
    rel = []
    for i in range(test_y.shape[0]):
        val.append(test_y[i, 0])
        if val[i] >= 0.5:
            rel.append('P' + folder)
        else:
            rel.append('N' + folder)
    return rel
# Split the data into training and test folds for cross validation, then train the weights
def weights_calculate(data_x, data_y, times):  # times shifts the 10-sample test window inside each class
    # training data (120 items: 40 from each of the three classes)
    training_data_x1 = data_x[times:times + 40]
    training_data_x2 = data_x[times + 50:times + 90]
    training_data_x3 = data_x[times + 100:times + 140]
    training_data_x1 = np.vstack((training_data_x1, training_data_x2))
    training_data_x1 = np.vstack((training_data_x1, training_data_x3))
    training_data_x = training_data_x1
    training_data_y1 = data_y[times:times + 40]
    training_data_y2 = data_y[times + 50:times + 90]
    training_data_y3 = data_y[times + 100:times + 140]
    training_data_y1 = np.hstack((training_data_y1, training_data_y2))
    training_data_y1 = np.hstack((training_data_y1, training_data_y3))
    training_data_y = list(training_data_y1)
    # print(training_data_y.shape)
    # train the weights for this folder (one-vs-rest label file)
    Weights = gradAscent(training_data_x, training_data_y)
    # test data (30 items: 10 from each class)
    if times == 0:
        test_data_x1 = data_x[times + 40:times + 50]
        test_data_x2 = data_x[times + 90:times + 100]
        test_data_x3 = data_x[times + 140:times + 150]
        test_data_x1 = np.vstack((test_data_x1, test_data_x2))
        test_data_x1 = np.vstack((test_data_x1, test_data_x3))
        test_data_x = test_data_x1
        test_data_y1 = data_y[times + 40:times + 50]
        test_data_y2 = data_y[times + 90:times + 100]
        test_data_y3 = data_y[times + 140:times + 150]
        test_data_y1 = np.hstack((test_data_y1, test_data_y2))
        test_data_y1 = np.hstack((test_data_y1, test_data_y3))
        test_data_y = test_data_y1
    elif times == 10:
        test_data_x1 = data_x[50 - (times + 40):times]
        test_data_x2 = data_x[times + 50:times + 60]
        test_data_x3 = data_x[times + 100:times + 110]
        test_data_x1 = np.vstack((test_data_x1, test_data_x2))
        test_data_x1 = np.vstack((test_data_x1, test_data_x3))
        test_data_x = test_data_x1
        test_data_y1 = data_y[50 - (times + 40):times]
        test_data_y2 = data_y[times + 50:times + 60]
        test_data_y3 = data_y[times + 100:times + 110]
        test_data_y1 = np.hstack((test_data_y1, test_data_y2))
        test_data_y1 = np.hstack((test_data_y1, test_data_y3))
        test_data_y = test_data_y1
    predict_y = sigmoid(test_data_x * Weights)  # probability that each test sample belongs to the current folder, shape (30, 1)
    return predict_y, test_data_y, Weights
if __name__ == "__main__":
    data_x, data_y = loadDataSet('C:/Users/Carzolar/Desktop/DM_1.txt')
    data_x2, data_y2 = loadDataSet('C:/Users/Carzolar/Desktop/DM_2.txt')
    data_x3, data_y3 = loadDataSet('C:/Users/Carzolar/Desktop/DM_3.txt')
    train_folder = 40
    test_folder = 10
    # Get the prediction results and the test results
    predict_y, test_y, weights = weights_calculate(data_x, data_y, 0)
    predict_y2, test_y2, weights2 = weights_calculate(data_x2, data_y2, 0)
    predict_y3, test_y3, weights3 = weights_calculate(data_x3, data_y3, 0)
    # Compare the three probabilities and pick the highest one as the label for each test sample
    res = []
    for i in range(predict_y.shape[0]):
        if max(predict_y[i, 0], predict_y2[i, 0], predict_y3[i, 0]) == predict_y[i, 0]:
            res.append(1)
        elif max(predict_y[i, 0], predict_y2[i, 0], predict_y3[i, 0]) == predict_y2[i, 0]:
            res.append(2)
        elif max(predict_y[i, 0], predict_y2[i, 0], predict_y3[i, 0]) == predict_y3[i, 0]:
            res.append(3)
    test_y_rel = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
    count = 0
    for j in range(len(res)):
        if res[j] == test_y_rel[j]:
            count += 1
    CV_n = count / len(res)
    print('The accuracy with training folder', train_folder, 'and test folder', test_folder, 'in the current loop is', CV_n)
    print('Where the weights are:', '\n', weights, '\n', 'and', '\n', weights2, '\n', 'and', '\n', weights3)
    # Display the predicted class names
    display_res = []
    for i in range(len(res)):
        if res[i] == 1:
            display_res.append('Iris-setosa')
        elif res[i] == 2:
            display_res.append('Iris-versicolor')
        elif res[i] == 3:
            display_res.append('Iris-virginica')
    print('For this test data, the prediction is:')
    print(display_res)
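The script above reads pre-built files DM_1.txt, DM_2.txt and DM_3.txt whose construction is not shown. For reference, here is a minimal sketch of how such one-vs-rest label files could be generated from the raw iris.data file; the exact format of the author's files is an assumption (four features plus a 0/1 label per line, whitespace-separated), as is the local iris.data path.

species = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
rows = []
for line in open('iris.data'):
    parts = line.strip().split(',')
    if len(parts) == 5:
        rows.append(parts)

# For class k, write a file whose label column is 1 for that class and 0 otherwise
for k, name in enumerate(species, start=1):
    with open('DM_%d.txt' % k, 'w') as f:
        for parts in rows:
            label = 1 if parts[4] == name else 0
            f.write(' '.join(parts[:4]) + ' ' + str(label) + '\n')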
Validation was done with cross validation: 10 samples from each class serve as the validation set, the rest as the training set, rotating through the folds. The accuracy turned out to be quite high.
(0.9666667,0.9666667,0.9666667,1,1,1,1,1,1,1,1,1,1,1,1,1..........0.9666667...........) The validation-set accuracies all stayed at or above this level, which is much better than the plain linear regression approach.
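The fold selection in the script above is hard-coded for two positions of the 10-sample test window. A hedged, self-contained sketch of the full rotation (five folds of 10 samples per class, one-vs-rest logistic regression fitted on the remaining 40 per class) might look like the following; it rebuilds everything from a local iris.data file rather than the DM_*.txt files, so the numbers it prints are illustrative and not a reproduction of the accuracies quoted above.

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def fit_logistic(X, y, alpha=0.001, iters=500):
    # Plain gradient ascent on the Bernoulli log-likelihood, mirroring gradAscent above (no intercept term)
    w = np.ones(X.shape[1])
    for _ in range(iters):
        w += alpha * X.T @ (y - sigmoid(X @ w))
    return w

# Load iris.data: 4 features + species name per row, 50 rows per class
X, names = [], []
for line in open('iris.data'):
    parts = line.strip().split(',')
    if len(parts) == 5:
        X.append([float(v) for v in parts[:4]])
        names.append(parts[4])
X = np.array(X)
species = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
y = np.array([species.index(n) for n in names])

for fold in range(5):   # rotate the 10-sample test window within each class
    test_idx = np.concatenate([np.arange(c * 50 + fold * 10, c * 50 + fold * 10 + 10) for c in range(3)])
    train_idx = np.setdiff1d(np.arange(150), test_idx)
    # One weight vector per class (one-vs-rest)
    W = np.column_stack([fit_logistic(X[train_idx], (y[train_idx] == c).astype(float)) for c in range(3)])
    pred = np.argmax(sigmoid(X[test_idx] @ W), axis=1)
    print('fold', fold, 'accuracy:', np.mean(pred == y[test_idx]))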