用L-BFGS法训练最大熵模型做分类

用L-BFGS法训练最大熵模型做分类_第1张图片

作者:金良([email protected]) csdn博客: http://blog.csdn.net/u012176591

def changelabels(labels):
    #对标签进行处理,使得第一个标签为0,第二个标签为1,依次递增
    index = 0
    labeldict = dict()
    for i in range(np.shape(labels)[0]):
        if labels[i] not in labeldict:#如果该标签未曾出现过
            labeldict[labels[i]] = index#在字典中加入该标签
            index += 1
        labels[i] = labeldict[labels[i]]   

def getprobs(data,weights):
    #计算当前参数下data属于各个类别的概率
    attrinum = np.shape(data)[0]
    labelnum = np.shape(weights)[0]/attrinum

    exps = np.zeros(labelnum)
    for label in range(labelnum):
        exps[label] = np.exp(np.dot(weights[int(label*attrinum):int((label+1)*attrinum)],data))
    exps /= np.sum(exps)
    return exps


def getpriorfeatureE(labels,datas):
    # 计算特征函数的先验分布的期望值
    labelnum = np.shape(list(set(labels)))[0]
    attrinum = np.shape(datas)[1]
    samplenum = np.shape(labels)[0]
    featurenum = labelnum*attrinum

    priorfeatureE = np.zeros(featurenum)
    for label in range(labelnum):
        #print datas[labels==label]
        priorfeatureE[int(label*attrinum):int((label+1)*attrinum)] = np.sum(datas[labels==label],axis=0)
    priorfeatureE /= 1.0*samplenum
    return priorfeatureE

def getpostfeatureE(datas,weights):
    #计算特征函数的后验分布的期望值
    attrinum = np.shape(datas[0])[0]
    labelnum = np.shape(weights)[0]/attrinum
    samplenum = np.shape(datas)[0]

    postfeatureE = np.zeros(featurenum)
    for i in range(samplenum):
        probs = getprobs(datas[i],weights)
        for label in range(labelnum):
            postfeatureE[int(label*attrinum):int((label+1)*attrinum)] += 1.0/samplenum*probs[label]*datas[i]
    return postfeatureE

def getgradients(datas,weights,priorfeatureE):
    #计算当前参数下的各参数的梯度向量

    postfeatureE = getpostfeatureE(datas,weights)

    return postfeatureE-priorfeatureE

def funcobject(labels,datas,weights):
    # 目标函数是对对数似然函数取负,故要使其最小化
    labelnum = np.shape(list(set(labels)))[0]
    samplenum = np.shape(labels)[0]

    part0 = 0
    part1 = 0
    for i in range(samplenum):
        part0 += 1.0/samplenum*np.log(np.sum([np.exp(np.dot(datas[i],weights[int(label*attrinum):int((label+1)*attrinum)])) for label in range(labelnum)]))
        part1 += 1.0/samplenum*np.dot(datas[i],weights[int(labels[i]*attrinum):int((labels[i]+1)*attrinum)])

    return part0 - part1  

def getpredictlabels(datas,weights):
    # 根据训练得到的参数预测样本的类别
    samplenum = np.shape(datas)[0]
    predictlabels = np.array([np.nan]*samplenum)
    for i in range(samplenum):
        probs = getprobs(datas[i],weights)
        label = probs.argmax()
        predictlabels[i] = label
    return predictlabels 

def twoloop(s, y, rho,gk):
    # 被lbfgs函数调用
    n = len(s) #向量序列的长度

    if np.shape(s)[0] >= 1:
        #h0是标量,而非矩阵
        h0 = 1.0*np.dot(s[-1],y[-1])/np.dot(y[-1],y[-1])
    else:
        h0 = 1

    a = np.empty((n,))

    q = gk.copy() 
    for i in range(n - 1, -1, -1): 
        a[i] = rho[i] * np.dot(s[i], q)
        q -= a[i] * y[i]
    z = h0*q

    for i in range(n):
        b = rho[i] * np.dot(y[i], z)
        z += s[i] * (a[i] - b)

    return z   

def lbfgs(fun,gfun,x0,datas,labels,priorfeatureE,m=5,maxk = 20):
    # fun和gfun分别是目标函数及其一阶导数,x0是初值,m为储存的序列的大小
    rou = 0.55
    sigma = 0.4
    epsilon = 1e-5
    k = 0
    n = np.shape(x0)[0] #自变量的维度

    s, y, rho = [], [], []

    while k < maxk :
        gk = gfun(datas,x0,priorfeatureE)
        if np.linalg.norm(gk) < epsilon:
            break

        dk = -1.0*twoloop(s, y, rho,gk)

        m0=0;
        mk=0
        while m0 < 20: # 用Armijo搜索求步长
            if fun(labels,datas,x0+rou**m0*dk) < fun(labels,datas,x0)+sigma*rou**m0*np.dot(gk,dk): 
                mk = m0
                break
            m0 += 1


        x = x0 + rou**mk*dk
        sk = x - x0
        yk = gfun(datas,x,priorfeatureE) - gk   

        if np.dot(sk,yk) > 0: #增加新的向量
            rho.append(1.0/np.dot(sk,yk))
            s.append(sk)
            y.append(yk)
        if np.shape(rho)[0] > m: #弃掉最旧向量
            rho.pop(0)
            s.pop(0)
            y.pop(0)

        k += 1
        x0 = x

    return x0,fun(labels,datas,x0)#,k#分别是最优点坐标,最优值,迭代次数

测试用例:

datasets = np.loadtxt('wine.txt',delimiter=',')
labels,datas= datasets[:,0],datasets[:,1:]
changelabels(labels) #对标签进行处理,处理后的标签为0,1,2等等
datas = (datas - datas.min(axis=0))/(datas.max(axis=0) - datas.min(axis=0)) #各个属性进行归一化处理


labelnum = np.shape(list(set(labels)))[0]
attrinum = np.shape(datas)[1]
samplenum = np.shape(labels)[0]
featurenum = labelnum*attrinum

#参数初始化
weights = np.zeros(featurenum)

#特征的先验期望
priorfeatureE = getpriorfeatureE(labels,datas)

#用L-BFGS优化算法求解最优参数
weights,likelyfuncvalue = lbfgs(fun = funcobject,gfun = getgradients,x0 = weights,datas = datas,labels = labels,priorfeatureE = priorfeatureE,m=10,maxk = 20)

#根据训练的参数预测各个样本的类别
predictlabels = getpredictlabels(datas,weights)
  • BFGS优化算法的理解以及LBFGS源码求解最优化问题
    http://blog.csdn.net/pizibing880909/article/details/21184715

你可能感兴趣的:(分类,最大熵,L-BFGS,拟牛顿)