输入:设单词集合为 $W = \{ \omega_1, \omega_2, \cdots, \omega_M \}$,文本集合为 $D = \{ d_1, d_2, \cdots, d_N \}$,话题集合为 $Z = \{ z_1, z_2, \cdots, z_K \}$,共现数据 $\{ n(\omega_i, d_j) \},\ i = 1, 2, \cdots, M,\ j = 1, 2, \cdots, N$;
输出:$P(\omega_i \mid z_k)$ 和 $P(z_k \mid d_j)$。
(1)设置参数 $P(\omega_i \mid z_k)$ 和 $P(z_k \mid d_j)$ 的初始值。
(2)迭代执行以下 E 步、M 步,直到收敛为止。
E步:
$$P(z_k \mid \omega_i, d_j) = \frac{P(\omega_i \mid z_k)\, P(z_k \mid d_j)}{\sum_{k=1}^{K} P(\omega_i \mid z_k)\, P(z_k \mid d_j)}$$
M步:
$$P(\omega_i \mid z_k) = \frac{\sum_{j=1}^{N} n(\omega_i, d_j)\, P(z_k \mid \omega_i, d_j)}{\sum_{m=1}^{M} \sum_{j=1}^{N} n(\omega_m, d_j)\, P(z_k \mid \omega_m, d_j)}$$
$$P(z_k \mid d_j) = \frac{\sum_{i=1}^{M} n(\omega_i, d_j)\, P(z_k \mid \omega_i, d_j)}{n(d_j)}$$
$$X = \begin{bmatrix} x_{11} & x_{12} & \cdots & x_{1n} \\ x_{21} & x_{22} & \cdots & x_{2n} \\ \vdots & \vdots & \ddots & \vdots \\ x_{m1} & x_{m2} & \cdots & x_{mn} \end{bmatrix}$$
import numpy as np
import pandas as pd
import string
# import nltk
# nltk.download('stopwords') #离线下载地址:https://download.csdn.net/download/nanxiaotao/89743735,注需放置对应ENV的/nltk_data/corpora/stopwords目录下
from nltk.corpus import stopwords
import time
def load_data(file):
    '''
    Load the BBC news dataset and preprocess each text.

    Dataset download: https://download.csdn.net/download/nanxiaotao/89743739
    INPUT:
    file - (str) path to the CSV data file
    OUTPUT:
    org_topics - (list) original topic labels
    text - (list) tokenized texts, one token list per document
    words - (list) vocabulary (unique words over all documents)
    '''
    df = pd.read_csv(file)
    org_topics = df['category'].unique().tolist()
    df.drop('category', axis=1, inplace=True)
    # Hoist loop invariants: stopwords.words('english') re-reads the corpus on
    # every call, and membership tests against a list are O(len(list)) — build
    # a set once for O(1) lookups. The punctuation table is likewise constant.
    stop_words = set(stopwords.words('english'))
    punct_table = str.maketrans('', '', string.punctuation)
    text = []
    vocab = set()
    for raw in df['text'].values:
        # Strip punctuation, then keep tokens that are not stopwords and are
        # longer than 3 characters (same filter as before, one pass).
        stripped = raw.translate(punct_table)
        tokens = [w for w in stripped.split()
                  if w not in stop_words and len(w) > 3]
        text.append(tokens)
        vocab.update(tokens)
    words = list(vocab)
    return org_topics, text, words
org_topics, text, words = load_data('bbc_text.csv') # load and preprocess the dataset
def frequency_counter(text, words):
'''
构建单词-文本矩阵
INPUT:
text - (list) 文本列表
words - (list) 单词列表
OUTPUT:
words - (list) 出现频次为前1000的单词列表
X - (array) 单词-文本矩阵
'''
words_cnt = np.zeros(len(words)) #用来保存单词的出现频次
X = np.zeros((1000, len(text))) #定义m*n的矩阵,其中m为单词列表中的单词个数,为避免运行时间过长,这里只取了出现频次为前1000的单词,因此m为1000,n为文本个数
#循环计算words列表中各单词出现的词频
for i in range(len(text)):
t = text[i] #取出第i条文本
for w in t:
ind = words.index(w) #取出第i条文本中的第t个单词在单词列表中的索引
words_cnt[ind] += 1 #对应位置的单词出现频次加一
sort_inds = np.argsort(words_cnt)[::-1] #对单词出现频次降序排列后取出其索引值
words = [words[ind] for ind in sort_inds[:1000]] #将出现频次前1000的单词保存到words列表
#构建单词-文本矩阵
for i in range(len(text)):
t = text[i] #取出第i条文本
for w in t:
if w in words: #如果文本t中的单词w在单词列表中,则将X矩阵中对应位置加一
ind = words.index(w)
X[ind, i] += 1
return words, X
print('Original Topics:')
print(org_topics) # print the original topic labels
words, X = frequency_counter(text, words) # keep the 1000 most frequent words and build the word-document matrix
(1)E步:
$$P(z_k \mid \omega_i, d_j) = \frac{P(\omega_i \mid z_k)\, P(z_k \mid d_j)}{\sum_{k=1}^{K} P(\omega_i \mid z_k)\, P(z_k \mid d_j)}$$
(2)M步:
$$P(\omega_i \mid z_k) = \frac{\sum_{j=1}^{N} n(\omega_i, d_j)\, P(z_k \mid \omega_i, d_j)}{\sum_{m=1}^{M} \sum_{j=1}^{N} n(\omega_m, d_j)\, P(z_k \mid \omega_m, d_j)}$$
$$P(z_k \mid d_j) = \frac{\sum_{i=1}^{M} n(\omega_i, d_j)\, P(z_k \mid \omega_i, d_j)}{n(d_j)}$$
def do_plsa(X, K, words, iters = 10):
    '''
    Probabilistic latent semantic analysis: estimate PLSA parameters by EM.

    The E and M steps are vectorized with numpy; per iteration they compute
    exactly the same update as the original triple-nested Python loops
    (O(M*N*K) scalar operations), just in native code.

    INPUT:
    X - (array) word-document matrix, shape (M, N)
    K - (int) number of topics
    words - (list) vocabulary (unused here; kept for interface compatibility)
    iters - (int) number of EM iterations
    OUTPUT:
    P_wi_zk - (array, shape (K, M)) P(wi|zk), rows sum to 1
    P_zk_dj - (array, shape (N, K)) P(zk|dj), rows sum to 1
    '''
    M, N = X.shape  # M = number of words, N = number of documents
    # Random initialization, each conditional distribution normalized.
    # (Same np.random.rand draws, in the same order, as before.)
    P_wi_zk = np.random.rand(K, M)
    P_wi_zk /= P_wi_zk.sum(axis=1, keepdims=True)
    P_zk_dj = np.random.rand(N, K)
    P_zk_dj /= P_zk_dj.sum(axis=1, keepdims=True)
    n_dj = X.sum(axis=0)  # n(d_j): total word count of each document
    for it in range(iters):
        print('{}/{}'.format(it + 1, iters))
        # E-step: P(zk|wi,dj) ∝ P(wi|zk) * P(zk|dj), normalized over k.
        joint = P_wi_zk.T[:, None, :] * P_zk_dj[None, :, :]  # (M, N, K)
        P_zk_wi_dj = joint / joint.sum(axis=2, keepdims=True)
        # M-step: P(wi|zk) ∝ sum_j n(wi,dj) * P(zk|wi,dj), normalized over i.
        P_wi_zk = np.einsum('mn,mnk->km', X, P_zk_wi_dj)
        P_wi_zk /= P_wi_zk.sum(axis=1, keepdims=True)
        # M-step: P(zk|dj) = sum_i n(wi,dj) * P(zk|wi,dj) / n(dj).
        P_zk_dj = np.einsum('mn,mnk->nk', X, P_zk_wi_dj) / n_dj[:, None]
    return P_wi_zk, P_zk_dj
K = 5  # number of topics
# Estimate the PLSA parameters with the EM algorithm.
P_wi_zk, P_zk_dj = do_plsa(X, K, words, iters = 10)
# Describe each topic zk by its 10 most probable words, i.e. the words with
# the largest P(wi|zk), joined by spaces.
for k in range(K):
    top10 = np.argsort(P_wi_zk[k])[::-1][:10]  # indices of the 10 largest P(wi|zk)
    print('Topic {}: {}'.format(k + 1, ' '.join(words[ind] for ind in top10)))