The logistic regression we met earlier handles binary classification problems, such as telling spam from legitimate mail or judging whether a tumor is malignant. Logistic regression is a powerful machine learning algorithm, and it generalizes naturally to the multi-class setting; the problems we meet in practice are often not simple binary decisions but multi-class ones. In this post we introduce that generalization: Softmax Regression.
In the multi-class case the label is no longer binary: $y$ can take any of $k$ values, $y \in \{1, 2, \ldots, k\}$. Let $\phi_i$ denote the probability that a sample belongs to class $i$. Because these probabilities must sum to one, one of them is redundant, so to avoid that redundancy we parameterize only $k-1$ of them and let the last be determined by the rest:

$$\phi_i = P(y = i), \qquad \phi_k = 1 - \sum_{i=1}^{k-1} \phi_i.$$
At the same time we define $T(y)$, a $(k-1)$-dimensional vector (writing $T(1), \ldots, T(k)$ side by side gives a matrix with $k-1$ rows and $k$ columns):

$$T(1)=\begin{bmatrix}1\\0\\\vdots\\0\end{bmatrix},\quad T(2)=\begin{bmatrix}0\\1\\\vdots\\0\end{bmatrix},\quad \ldots,\quad T(k-1)=\begin{bmatrix}0\\0\\\vdots\\1\end{bmatrix},\quad T(k)=\begin{bmatrix}0\\0\\\vdots\\0\end{bmatrix}.$$
$(T(y))_i$ denotes the $i$-th element of the vector $T(y)$. We also introduce a very handy piece of notation, the indicator function $1\{\cdot\}$, which evaluates to 1 when its argument is true and 0 otherwise; for example $1\{2 = 3\} = 0$ while $1\{3 = 5 - 2\} = 1$. With it the two definitions connect neatly:

$$(T(y))_i = 1\{y = i\}.$$
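The derivation from here follows the same maximum-likelihood route as logistic regression; the following summary of where it lands is my own (it is the standard softmax result), and it is exactly what the code below implements. The model assigns class probabilities

$$P(y = j \mid x;\ \theta) = \frac{e^{\theta_j^T x}}{\sum_{l=1}^{k} e^{\theta_l^T x}},$$

we maximize the log-likelihood of the $M$ training samples,

$$\ell(\theta) = \sum_{m=1}^{M} \log P\!\left(y^{(m)} \mid x^{(m)};\ \theta\right),$$

and gradient ascent climbs $\ell$ using

$$\nabla_{\theta_j} \ell(\theta) = \sum_{m=1}^{M} x^{(m)} \left( 1\{y^{(m)} = j\} - P(y^{(m)} = j \mid x^{(m)};\ \theta) \right).$$

The bracketed term $1\{\cdot\} - P$ is the `error` matrix the code constructs: dividing by the negated row sum yields $-P$ for every class, and adding 1 at the true class turns it into the gradient above.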
The training data (the file `SoftInput.txt` loaded below), one sample per line in the format `x1 x2 label`:

```
-0.017612 14.053064 2
-1.395634 4.662541 3
-0.752157 6.53862 3
-1.322371 7.152853 3
0.423363 11.054677 2
0.406704 7.067335 3
0.667394 12.741452 2
-2.46015 6.866805 3
0.569411 9.548755 0
-0.026632 10.427743 2
0.850433 6.920334 3
1.347183 13.1755 2
1.176813 3.16702 3
-1.781871 9.097953 2
-0.566606 5.749003 3
0.931635 1.589505 1
-0.024205 6.151823 3
-0.036453 2.690988 1
-0.196949 0.444165 1
1.014459 5.754399 3
1.985298 3.230619 3
-1.693453 -0.55754 1
-0.576525 11.778922 2
-0.346811 -1.67873 1
-2.124484 2.672471 1
1.217916 9.597015 0
-0.733928 9.098687 0
-3.642001 -1.618087 1
0.315985 3.523953 3
1.416614 9.619232 0
-0.386323 3.989286 3
0.556921 8.294984 0
1.224863 11.58736 2
-1.347803 -2.406051 1
1.196604 4.951851 3
0.275221 9.543647 0
0.470575 9.332488 0
-1.889567 9.542662 2
-1.527893 12.150579 2
-1.185247 11.309318 2
-0.445678 3.297303 3
1.042222 6.105155 3
-0.618787 10.320986 2
1.152083 0.548467 1
0.828534 2.676045 3
-1.237728 10.549033 2
-0.683565 -2.166125 1
0.229456 5.921938 3
-0.959885 11.555336 2
0.492911 10.993324 2
0.184992 8.721488 0
-0.355715 10.325976 2
-0.397822 8.058397 0
0.824839 13.730343 2
1.507278 5.027866 3
0.099671 6.835839 3
-0.344008 10.717485 2
1.785928 7.718645 0
-0.918801 11.560217 2
-0.364009 4.7473 3
-0.841722 4.119083 3
0.490426 1.960539 1
-0.007194 9.075792 0
0.356107 12.447863 2
0.342578 12.281162 2
-0.810823 -1.466018 1
2.530777 6.476801 3
1.296683 11.607559 2
0.475487 12.040035 2
-0.783277 11.009725 2
0.074798 11.02365 2
-1.337472 0.468339 1
-0.102781 13.763651 2
-0.147324 2.874846 3
0.518389 9.887035 0
1.015399 7.571882 0
-1.658086 -0.027255 1
1.319944 2.171228 1
2.056216 5.019981 3
-0.851633 4.375691 3
-1.510047 6.061992 3
-1.076637 -3.181888 1
1.821096 10.28399 0
3.01015 8.401766 0
-1.099458 1.688274 1
-0.834872 -1.733869 1
-0.846637 3.849075 3
1.400102 12.628781 2
1.752842 5.468166 3
0.078557 0.059736 1
0.089392 -0.7153 1
1.825662 12.693808 2
0.197445 9.744638 0
0.126117 0.922311 1
-0.679797 1.22053 1
0.677983 2.556666 1
0.761349 10.693862 0
-2.168791 0.143632 1
1.38861 9.341997 0
0.317029 14.739025 2
-2.65887965178 0.658328066452 1
-2.30615885683 11.5036718065 2
-2.83005963556 7.30810428189 3
-2.30319006285 3.18958964564 1
-2.31349250532 4.41749905123 3
-2.71157223048 0.21599278192 1
-2.99935111344 14.5766538514 2
-2.50329272687 12.7274016382 2
-2.14191210185 9.75999136268 2
-2.21409612618 9.25234159289 2
-2.0503599261 1.87312594247 1
-2.99747377006 2.82404034943 1
-2.39019233623 1.88778487771 1
-2.00981101171 13.0015287952 2
-2.06105014551 7.26924117028 3
-2.94028883652 10.8418044558 2
-2.56811396636 1.31240093493 1
-2.89942462914 7.47932555859 3
-2.83349151782 0.292728283929 1
-2.16467022383 4.62184237142 3
2.02604290795 6.68200376515 3
2.3755881562 9.3838379637 0
2.48299208843 9.75753701005 0
2.65108044441 9.39059526201 0
2.49422603944 11.856131521 0
2.47215954581 4.83431641068 3
2.26731525725 5.64891602081 3
2.33628075296 10.4603294628 0
2.4548064459 9.90879879651 0
2.13147505967 8.99561368732 0
2.86925733903 4.26531919929 3
2.05715970133 4.97240425903 3
2.14839753847 8.91032469409 0
2.17630437606 5.76122354509 3
2.86205491781 11.630342945 0
```
```python
from numpy import *
import matplotlib.pyplot as plt


class SoftmaxRegression:
    def __init__(self):
        self.dataMat = []
        self.labelMat = []
        self.weights = []
        self.M = 0          # number of samples
        self.N = 0          # number of features (including the bias term)
        self.K = 0          # number of classes
        self.alpha = 0.001  # learning rate

    def loadDataSet(self, inputfile):
        for line in open(inputfile, 'r'):
            items = line.strip().split()
            # each training sample is stored as [1, x1, x2]; the leading 1 is the bias term
            self.dataMat.append([1.0, float(items[0]), float(items[1])])
            self.labelMat.append(int(items[2]))      # labels, still a plain list here
        self.K = len(set(self.labelMat))             # set() removes duplicates, leaving the class count
        self.dataMat = mat(self.dataMat)
        self.labelMat = mat(self.labelMat).transpose()  # column vector of labels
        self.M, self.N = shape(self.dataMat)
        self.weights = mat(ones((self.N, self.K)))   # N=3 features, K=4 classes
        # a previously trained weight matrix, kept for reference; classify() outputs
        # the four probabilities h1..h4 and picks the largest:
        # self.weights = [[-1.19792777,  6.05913226, -4.44164147,  3.58043698],
        #                 [ 1.78758743,  0.47379819,  0.63335518,  1.1052592 ],
        #                 [ 1.48741185, -0.18748907,  1.79339685,  0.90668037]]

    def likelihoodfunc(self):
        # log-likelihood of the training set; it should rise as the weights are adjusted
        likelihood = 0.0
        for i in range(self.M):
            t = exp(self.dataMat[i] * self.weights)  # unnormalized scores for the K classes
            # only the true class contributes: a sample belongs to exactly one of the
            # four classes, so the indicator zeroes out the other three terms
            likelihood += log(t[0, self.labelMat[i, 0]] / sum(t))
        print(likelihood)

    def gradientAscent(self):
        for l in range(10):
            error = exp(self.dataMat * self.weights)   # M x K unnormalized scores
            rowsum = -error.sum(axis=1)                # axis=1 sums each row
            rowsum = rowsum.repeat(self.K, axis=1)     # broadcast the row sums to M x K
            error = error / rowsum                     # now -P(y=j|x) for every class
            for m in range(self.M):
                # add 1 at the true class: error becomes 1{y=j} - P(y=j|x),
                # the gap between prediction and truth
                error[m, self.labelMat[m, 0]] += 1
            # (3 x M) * (M x 4) = 3 x 4: one matrix product updates all weights at once
            self.weights = self.weights + self.alpha * self.dataMat.transpose() * error
            self.likelihoodfunc()
        print(self.weights)

    def stochasticGradientAscent_V0(self):
        for l in range(500):                 # 500 passes over the data
            for i in range(self.M):          # stochastic version: update on one sample at a time
                error = exp(self.dataMat[i] * self.weights)
                rowsum = -error.sum(axis=1)
                rowsum = rowsum.repeat(self.K, axis=1)
                error = error / rowsum
                error[0, self.labelMat[i, 0]] += 1
                self.weights = self.weights + self.alpha * self.dataMat[i].transpose() * error
            self.likelihoodfunc()
        print(self.weights)

    def stochasticGradientAscent_V1(self):
        # improved stochastic gradient ascent: decaying step size, and each pass
        # visits every sample exactly once, in random order
        for l in range(500):
            idxs = arange(self.M)            # indices of the samples not yet visited this pass
            for i in range(self.M):
                alpha = 4.0 / (1.0 + l + i) + 0.01   # step size decays as training proceeds
                pos = int(random.uniform(0, len(idxs)))
                rdmidx = idxs[pos]           # pick one of the remaining samples at random
                error = exp(self.dataMat[rdmidx] * self.weights)
                rowsum = -error.sum(axis=1)
                rowsum = rowsum.repeat(self.K, axis=1)
                error = error / rowsum
                error[0, self.labelMat[rdmidx, 0]] += 1   # error has a single row here
                self.weights = self.weights + alpha * self.dataMat[rdmidx].transpose() * error
                # delete() returns a new array, so keep the result to actually avoid repeats
                idxs = delete(idxs, pos)
            self.likelihoodfunc()
        print(self.weights)

    def classify(self, X):
        p = X * self.weights
        return p.argmax(1)[0, 0]   # index of the largest probability = class label 0..3

    def test(self):
        # classify 4000 random points and scatter-plot them to visualize the learned regions
        xcord0 = []; ycord0 = []
        xcord1 = []; ycord1 = []
        xcord2 = []; ycord2 = []
        xcord3 = []; ycord3 = []
        for i in range(50):
            for j in range(80):              # 50 * 80 = 4000 random test points
                x = random.uniform(-3.0, 3.0)
                y = random.uniform(0.0, 15.0)
                c = self.classify(mat([[1.0, x, y]]))
                if c == 0: xcord0.append(x); ycord0.append(y)
                if c == 1: xcord1.append(x); ycord1.append(y)
                if c == 2: xcord2.append(x); ycord2.append(y)
                if c == 3: xcord3.append(x); ycord3.append(y)
        fig1 = plt.figure('fig1')
        ax = fig1.add_subplot(111)
        ax.scatter(xcord0, ycord0, s=20, c='yellow', marker='s')
        ax.scatter(xcord1, ycord1, s=20, c='blue')
        ax.scatter(xcord2, ycord2, s=20, c='red')
        ax.scatter(xcord3, ycord3, s=20, c='black')
        plt.title('inference')
        plt.xlabel('X1')
        plt.ylabel('X2')
        plt.show()

    def test0(self):
        # visualize the training set itself, whereas test() plots 4000 classified points
        xcord0 = []; ycord0 = []
        xcord1 = []; ycord1 = []
        xcord2 = []; ycord2 = []
        xcord3 = []; ycord3 = []
        for i in range(self.M):
            if self.labelMat[i, 0] == 0:
                xcord0.append(self.dataMat[i, 1]); ycord0.append(self.dataMat[i, 2])
            elif self.labelMat[i, 0] == 1:
                xcord1.append(self.dataMat[i, 1]); ycord1.append(self.dataMat[i, 2])
            elif self.labelMat[i, 0] == 2:
                xcord2.append(self.dataMat[i, 1]); ycord2.append(self.dataMat[i, 2])
            else:
                xcord3.append(self.dataMat[i, 1]); ycord3.append(self.dataMat[i, 2])
        fig2 = plt.figure('fig2')
        ax = fig2.add_subplot(111)
        ax.scatter(xcord0, ycord0, s=20, c='yellow', marker='s')
        ax.scatter(xcord1, ycord1, s=20, c='blue')
        ax.scatter(xcord2, ycord2, s=20, c='red')
        ax.scatter(xcord3, ycord3, s=20, c='black')
        plt.title('train data')
        plt.xlabel('X1')
        plt.ylabel('X2')
        plt.show()


if __name__ == '__main__':
    inputfile = 'C:\\Python34\\SoftInput.txt'
    myclassification = SoftmaxRegression()
    myclassification.loadDataSet(inputfile)
    # myclassification.gradientAscent()
    myclassification.stochasticGradientAscent_V1()
    # myclassification.stochasticGradientAscent_V0()
    myclassification.test()
    myclassification.test0()
```
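For comparison, here is a minimal vectorized sketch of the same batch update written with plain NumPy arrays rather than matrices. It is not from the class above: the function name, the one-hot encoding, and the max-subtraction trick for numerical stability are my own additions, but the update line is the same $\alpha \cdot X^T(Y - P)$ gradient step that `gradientAscent` performs.

```python
import numpy as np

def softmax_fit(X, y, K, alpha=0.001, iters=500):
    """Batch gradient ascent on the softmax log-likelihood.
    X: (M, N) design matrix whose first column is already 1 (bias term),
    y: (M,) integer labels in {0, ..., K-1}. Returns the (N, K) weights."""
    M, N = X.shape
    W = np.ones((N, K))
    Y = np.eye(K)[y]                                 # one-hot labels, (M, K)
    for _ in range(iters):
        scores = X @ W                               # (M, K) class scores
        scores -= scores.max(axis=1, keepdims=True)  # guard exp() against overflow
        P = np.exp(scores)
        P /= P.sum(axis=1, keepdims=True)            # softmax probabilities
        W += alpha * X.T @ (Y - P)                   # gradient of the log-likelihood
    return W

# quick smoke test on random data shaped like SoftInput.txt
rng = np.random.default_rng(0)
X = np.hstack([np.ones((135, 1)), rng.normal(size=(135, 2))])
y = rng.integers(0, 4, size=135)
print(softmax_fit(X, y, K=4).shape)                  # (3, 4)
```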