Probabilistic Latent Semantic Analysis (PLSA): unsupervised learning method, probabilistic model, generative model, co-occurrence model, nonlinear model, parametric model, batch learning

Definition

Input: word set $W = \{\omega_1, \omega_2, \cdots, \omega_M\}$, text set $D = \{d_1, d_2, \cdots, d_N\}$, topic set $Z = \{z_1, z_2, \cdots, z_K\}$, and co-occurrence data $\{n(\omega_i, d_j)\}$, $i = 1, 2, \cdots, M$, $j = 1, 2, \cdots, N$;

Output: $P(\omega_i \mid z_k)$ and $P(z_k \mid d_j)$.

(1) Set initial values for the parameters $P(\omega_i \mid z_k)$ and $P(z_k \mid d_j)$.

(2) Iterate the following E-step and M-step until convergence.

    E-step:
$$P(z_k \mid \omega_i, d_j) = \frac{P(\omega_i \mid z_k) P(z_k \mid d_j)}{\sum_{k=1}^{K} P(\omega_i \mid z_k) P(z_k \mid d_j)}$$
    M-step:
$$P(\omega_i \mid z_k) = \frac{\sum_{j=1}^{N} n(\omega_i, d_j) P(z_k \mid \omega_i, d_j)}{\sum_{m=1}^{M} \sum_{j=1}^{N} n(\omega_m, d_j) P(z_k \mid \omega_m, d_j)}$$
$$P(z_k \mid d_j) = \frac{\sum_{i=1}^{M} n(\omega_i, d_j) P(z_k \mid \omega_i, d_j)}{n(d_j)}$$
where $n(d_j) = \sum_{i=1}^{M} n(\omega_i, d_j)$ is the total word count of text $d_j$.
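
These E and M steps are the EM updates for maximizing the log-likelihood of the co-occurrence data, which, up to a term in $P(d_j)$ that does not depend on the two parameter sets, is

$$L = \sum_{i=1}^{M} \sum_{j=1}^{N} n(\omega_i, d_j) \log \left[ \sum_{k=1}^{K} P(\omega_i \mid z_k) P(z_k \mid d_j) \right]$$

Each iteration does not decrease $L$, which is what makes the convergence test in step (2) well defined.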

Input Space

$$X = \left[ \begin{array}{cccc} x_{11} & x_{12} & \cdots & x_{1n} \\ x_{21} & x_{22} & \cdots & x_{2n} \\ \vdots & \vdots & & \vdots \\ x_{m1} & x_{m2} & \cdots & x_{mn} \end{array} \right]$$
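
As a concrete toy illustration (the words and counts below are made up for the example), two texts over a three-word vocabulary give:

import numpy as np
# rows: the words 'apple', 'stock', 'game'; columns: texts d1, d2
X_toy = np.array([[3, 0],   # 'apple' occurs 3 times in d1, never in d2
                  [1, 2],   # 'stock' occurs once in d1 and twice in d2
                  [0, 4]])  # 'game' occurs 4 times in d2

Entry $x_{ij}$ is exactly the co-occurrence count $n(\omega_i, d_j)$ from the definition above.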

import numpy as np
import pandas as pd
import string
# import nltk
# nltk.download('stopwords')  # offline download: https://download.csdn.net/download/nanxiaotao/89743735; note: place it under the corresponding environment's /nltk_data/corpora/stopwords directory
from nltk.corpus import stopwords
def load_data(file):
    '''
    Load the dataset (download: https://download.csdn.net/download/nanxiaotao/89743739)
    INPUT:
    file - (str) path to the data file
    
    OUTPUT:
    org_topics - (list) original topic labels
    text - (list) list of texts
    words - (list) list of distinct words
    
    '''
    df = pd.read_csv(file)
    org_topics = df['category'].unique().tolist()  # the original topic labels, kept for later comparison
    df.drop('category', axis=1, inplace=True)
    stop_words = set(stopwords.words('english'))  # build the stopword set once instead of once per text
    text = []
    words = []
    for i in df['text'].values:
        t = i.translate(str.maketrans('', '', string.punctuation))  # strip punctuation
        t = [j for j in t.split() if j not in stop_words]  # drop English stopwords
        t = [j for j in t if len(j) > 3]  # drop words of length <= 3
        text.append(t)
        words.extend(set(t))
    words = list(set(words))  # deduplicate into the global word list
    return org_topics, text, words
org_topics, text, words = load_data('bbc_text.csv')  # load the data
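As a quick sanity check (assuming bbc_text.csv has the 'category' and 'text' columns read above), the loaded structures can be inspected directly:

print(len(org_topics), 'topics,', len(text), 'texts,', len(words), 'distinct words')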
def frequency_counter(text, words):
    '''
    Build the word-text matrix
    INPUT:
    text - (list) list of texts
    words - (list) list of distinct words
    
    OUTPUT:
    words - (list) the 1000 most frequent words
    X - (array) word-text matrix
    
    '''
    words_cnt = np.zeros(len(words))  # holds each word's total frequency
    X = np.zeros((1000, len(text)))  # an m*n matrix; m would normally be the full vocabulary size, but to keep the running time manageable only the 1000 most frequent words are kept, so m = 1000 and n is the number of texts
    word_index = {w: ind for ind, w in enumerate(words)}  # word -> index lookup, avoids repeated O(n) list scans
    # count the frequency of each word in the words list
    for i in range(len(text)):
        t = text[i]  # the i-th text
        for w in t:
            words_cnt[word_index[w]] += 1  # increment the count of word w
    sort_inds = np.argsort(words_cnt)[::-1]  # indices that sort the word counts in descending order
    words = [words[ind] for ind in sort_inds[:1000]]  # keep the 1000 most frequent words
    word_index = {w: ind for ind, w in enumerate(words)}  # rebuild the lookup for the trimmed word list
    # build the word-text matrix
    for i in range(len(text)):
        t = text[i]  # the i-th text
        for w in t:
            if w in word_index:  # if word w survived the frequency cut, increment the corresponding entry of X
                X[word_index[w], i] += 1
    return words, X
print('Original Topics:')
print(org_topics)  # print the original topic labels
words, X = frequency_counter(text, words)  # rebuild the word list from the 1000 most frequent words and construct the word-text matrix
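Before running EM it is worth confirming the matrix shape and that no text column is all zeros, since an empty column would cause a division by zero in the M-step below; a minimal check:

print('X shape:', X.shape)  # expected: (1000, number of texts)
print('empty text columns:', int(np.sum(X.sum(axis=0) == 0)))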

Algorithm

    (1) E-step:
$$P(z_k \mid \omega_i, d_j) = \frac{P(\omega_i \mid z_k) P(z_k \mid d_j)}{\sum_{k=1}^{K} P(\omega_i \mid z_k) P(z_k \mid d_j)}$$
    (2) M-step:
$$P(\omega_i \mid z_k) = \frac{\sum_{j=1}^{N} n(\omega_i, d_j) P(z_k \mid \omega_i, d_j)}{\sum_{m=1}^{M} \sum_{j=1}^{N} n(\omega_m, d_j) P(z_k \mid \omega_m, d_j)}$$
$$P(z_k \mid d_j) = \frac{\sum_{i=1}^{M} n(\omega_i, d_j) P(z_k \mid \omega_i, d_j)}{n(d_j)}$$
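
Because each E/M pair does not decrease the log-likelihood, tracking it between iterations gives a principled stopping rule instead of the fixed iteration count used below. A minimal sketch (the helper name log_likelihood is mine, not part of the original code):

def log_likelihood(X, P_wi_zk, P_zk_dj):
    # L = sum_{i,j} n(wi,dj) * log sum_k P(wi|zk) P(zk|dj)
    P_wi_dj = P_wi_zk.T @ P_zk_dj.T  # (M,K) @ (K,N) -> (M,N) mixture probability of each cell
    return np.sum(X * np.log(P_wi_dj + 1e-12))  # small epsilon guards against log(0)

Calling this after each M-step and stopping once the increase falls below a tolerance would replace the fixed iters loop.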

def do_plsa(X, K, words, iters = 10):
    '''
    Probabilistic latent semantic analysis: estimate the PLSA parameters with the EM algorithm
    INPUT:
    X - (array) word-text matrix
    K - (int) number of topics
    words - (list) the 1000 most frequent words
    iters - (int) number of iterations
    
    OUTPUT:
    P_wi_zk - (array) probabilities P(wi|zk) of generating word wi given topic zk
    P_zk_dj - (array) probabilities P(zk|dj) of topic zk given text dj
    
    '''
    M, N = X.shape  # M is the number of words, N the number of texts
    P_wi_zk = np.random.rand(K, M)  # random initialization of P(wi|zk)
    for k in range(K):
        P_wi_zk[k] /= np.sum(P_wi_zk[k])  # each topic's word distribution sums to 1
    P_zk_dj = np.random.rand(N, K)  # random initialization of P(zk|dj)
    for n in range(N):
        P_zk_dj[n] /= np.sum(P_zk_dj[n])  # each text's topic distribution sums to 1
    P_zk_wi_dj = np.zeros((M, N, K))  # the posterior P(zk|wi,dj) computed in the E-step
    # alternate the E-step and the M-step
    for i in range(iters):
        print('{}/{}'.format(i+1, iters))
        # E-step: compute the posterior P(zk|wi,dj)
        for m in range(M):
            for n in range(N):
                sums = 0
                for k in range(K):
                    P_zk_wi_dj[m, n, k] = P_wi_zk[k, m] * P_zk_dj[n, k]
                    sums += P_zk_wi_dj[m, n, k]
                P_zk_wi_dj[m, n, :] = P_zk_wi_dj[m, n, :] / sums  # normalize over the K topics
        # M-step: update P(wi|zk)
        for k in range(K):
            s1 = 0
            for m in range(M):
                P_wi_zk[k, m] = 0
                for n in range(N):
                    P_wi_zk[k, m] += X[m, n] * P_zk_wi_dj[m, n, k]  # weight the posterior by the count n(wi,dj)
                s1 += P_wi_zk[k, m]
            P_wi_zk[k, :] = P_wi_zk[k, :] / s1  # normalize over the M words
        # M-step: update P(zk|dj)
        for n in range(N):
            for k in range(K):
                P_zk_dj[n, k] = 0
                for m in range(M):
                    P_zk_dj[n, k] += X[m, n] * P_zk_wi_dj[m, n, k]
                P_zk_dj[n, k] = P_zk_dj[n, k] / np.sum(X[:, n])  # divide by n(dj), the total word count of text dj
    return P_wi_zk, P_zk_dj
K = 5  # number of topics
P_wi_zk, P_zk_dj = do_plsa(X, K, words, iters = 10)  # estimate the PLSA parameters with the EM algorithm
# for each topic zk, print the 10 words with the largest P(wi|zk) as a textual description of the topic
for k in range(K):
    sort_inds = np.argsort(P_wi_zk[k])[::-1]  # indices of P(wi|zk) sorted in descending order
    topic = []  # the 10 most probable words of topic zk
    for i in range(10):
        topic.append(words[sort_inds[i]])
    topic = ' '.join(topic)  # join the 10 words with spaces to describe topic zk
    print('Topic {}: {}'.format(k+1, topic))  # print topic zk
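
The triple loops above stay close to the formulas but are slow in pure Python. A vectorized sketch of the same E and M steps with numpy broadcasting (an illustrative alternative, not the original implementation; note it materializes M*N*K arrays, roughly M*N*K*8 bytes each):

def plsa_vectorized(X, K, iters=10, seed=0):
    M, N = X.shape
    rng = np.random.default_rng(seed)
    P_wi_zk = rng.random((K, M))
    P_wi_zk /= P_wi_zk.sum(axis=1, keepdims=True)  # each topic's word distribution sums to 1
    P_zk_dj = rng.random((N, K))
    P_zk_dj /= P_zk_dj.sum(axis=1, keepdims=True)  # each text's topic distribution sums to 1
    for _ in range(iters):
        # E-step: P(zk|wi,dj) for all (i,j,k) at once, shape (M,N,K)
        P_zk_wi_dj = P_wi_zk.T[:, None, :] * P_zk_dj[None, :, :]
        P_zk_wi_dj /= P_zk_wi_dj.sum(axis=2, keepdims=True)
        # M-step: weight the posteriors by the counts n(wi,dj)
        weighted = X[:, :, None] * P_zk_wi_dj  # shape (M,N,K)
        P_wi_zk = weighted.sum(axis=1).T       # shape (K,M)
        P_wi_zk /= P_wi_zk.sum(axis=1, keepdims=True)  # normalize over words
        P_zk_dj = weighted.sum(axis=0)         # shape (N,K)
        P_zk_dj /= X.sum(axis=0)[:, None]      # divide by the text lengths n(dj)
    return P_wi_zk, P_zk_dj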
