from pprint import pprint
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
from gensim import corpora

# Stop words to drop before building the bag-of-words vectors.
stopWordsList = set('for a of the and to in'.split())

# Tokenize the corpus file: one document per line, lowercased,
# with stop words removed.
# BUG FIX: the body of the `with` block was not indented in the
# original, which is a SyntaxError in Python.
with open('./Data/mycorpus.txt', encoding='utf-8') as f:
    texts = [[word for word in line.lower().split() if word not in stopWordsList]
             for line in f]

# Map tokens to integer ids using a previously built dictionary, then
# convert each document to its sparse bag-of-words representation.
dictionary = corpora.Dictionary.load('./Data/sampleDict.dict')
corpus = [dictionary.doc2bow(doc) for doc in texts]
pprint(corpus)
[[(0, 1), (1, 1), (2, 1)],
[(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
[(2, 1), (5, 1), (7, 1), (8, 1)],
[(1, 1), (5, 2), (8, 1)],
[(3, 1), (6, 1), (7, 1)],
[(9, 1)],
[(9, 1), (10, 1)],
[(9, 1), (10, 1), (11, 1)],
[(4, 1), (10, 1), (11, 1)]]
有了上一个教程的基础,我们现在获得了词袋表示的语料库corpus
。不过这个词袋模型的效用显然一般。在本教程中,我们将对原始的表示法进行变换,从而达到以下目的:
- 在语料库中找出隐藏的结构、发现单词之间的关系,并以一种更语义化的方式描述文档
- 使文档表示更紧凑(降维)。这既提高了效率(新的表示法消耗的资源更少),又提高了效力(忽略了边际数据趋势,减少了噪音)
对原始的词袋进行变换,往往需要用到一些模型,这些模型在models
模块中定义,形如models.xxxModel
,一般而言我们需要将旧有的语料库corpus
喂给这些模型进行训练从而达到变换的目的。
from gensim import models
# Train a Tf-Idf model on the bag-of-words corpus built earlier.
tfidf = models.TfidfModel(corpus)
有了这个训练好的模型,我们就可以进行向量空间的转换,比如我们现在有一个文档形如[([(0, 1), (1, 1)])]
,则我们可以这样来对其进行变换。这里,我们以Tf-Idf模型作为案例,该模型可以提取文档的关键词,并赋予高表征的词以较大的权值,从而让向量更好的表征文档的特性:
# Transform a single bag-of-words document into Tf-Idf space by
# indexing the trained model with it.
sampleBow = [(0, 1), (1, 1)]
print(tfidf[sampleBow])
[(0, 0.7071067811865476), (1, 0.7071067811865476)]
可以看到我们只需要使用[]
就可以很方便地通过训练好的模型完成变换,现在不妨试着对整个语料库进行变换:
# Transform every document of the corpus, one at a time.
# BUG FIX: the loop body was not indented in the original (SyntaxError).
for doc in corpus:
    print(tfidf[doc])
[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
[(0, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555), (6, 0.44424552527467476), (7, 0.3244870206138555)]
[(2, 0.5710059809418182), (5, 0.4170757362022777), (7, 0.4170757362022777), (8, 0.5710059809418182)]
[(1, 0.49182558987264147), (5, 0.7184811607083769), (8, 0.49182558987264147)]
[(3, 0.6282580468670046), (6, 0.6282580468670046), (7, 0.45889394536615247)]
[(9, 1.0)]
[(9, 0.7071067811865475), (10, 0.7071067811865475)]
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]
[(4, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]
或者也可以像下面这样:
# Wrap the whole corpus; the actual transformation happens lazily while
# iterating.  (The variable keeps its original "TdIdf" spelling because
# later cells reference it by that name.)
# BUG FIX: the loop body was not indented in the original (SyntaxError).
corpusTdIdf = tfidf[corpus]
for vec in corpusTdIdf:
    print(vec)
[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
[(0, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555), (6, 0.44424552527467476), (7, 0.3244870206138555)]
[(2, 0.5710059809418182), (5, 0.4170757362022777), (7, 0.4170757362022777), (8, 0.5710059809418182)]
[(1, 0.49182558987264147), (5, 0.7184811607083769), (8, 0.49182558987264147)]
[(3, 0.6282580468670046), (6, 0.6282580468670046), (7, 0.45889394536615247)]
[(9, 1.0)]
[(9, 0.7071067811865475), (10, 0.7071067811865475)]
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]
[(4, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]
需要注意的是,调用model[corpus]
只在旧的语料库文档流周围创建一个包装器——实际的转换是在文档迭代过程中实时完成的。在调用corpus_transform = model[corpus]
时,实际上并不能对整个语料库进行转换,因为这意味着将结果存储在主存中——而这与gensim以内存无关性(memory independence)为目标的设计相矛盾。
所以,如果你有可能会将对转换后的corpus_transform
进行多次迭代,并且转换代价高昂,那么最好首先将转换结果序列化到磁盘,然后继续使用它(否则每次迭代都会重复进行一次转换)。
对原始的语料库向量空间的变换也可以是一个接一个形成一条流水线一样的。比如在刚才的Tf-Idf模型的基础上,我们又可以训练一个LSI(隐式语义索引)模型,又称LSA(隐式语义分析)。利用它我们构造一个隐式的2-D空间(2-D是因为我们设置了num_topics=2)。
# Chain transformations: train a 2-topic LSI (a.k.a. LSA) model on top
# of the Tf-Idf corpus, then project the corpus into the latent space.
lsi = models.LsiModel(corpusTdIdf, num_topics=2, id2word=dictionary)
corpusLsi = lsi[corpusTdIdf]

# Print each raw document next to its 2-D LSI vector.
# BUG FIX: the bodies of the `with` and `for` statements were not
# indented in the original (SyntaxError).
with open('./Data/mycorpus.txt', encoding='utf-8') as f:
    rawDocs = f.readlines()
for i, v in enumerate(corpusLsi):
    print('-'.center(70, '-'))
    print(rawDocs[i])
    print(v)
----------------------------------------------------------------------
Human machine interface for lab abc computer applications
[(0, 0.06600783396090126), (1, 0.520070330636185)]
----------------------------------------------------------------------
A survey of user opinion of computer system response time
[(0, 0.19667592859142144), (1, 0.7609563167700049)]
----------------------------------------------------------------------
The EPS user interface management system
[(0, 0.08992639972446115), (1, 0.7241860626752514)]
----------------------------------------------------------------------
System and human system engineering testing of EPS
[(0, 0.07585847652177896), (1, 0.6320551586003434)]
----------------------------------------------------------------------
Relation of user perceived response time to error measurement
[(0, 0.10150299184979877), (1, 0.5737308483002953)]
----------------------------------------------------------------------
The generation of random binary unordered trees
[(0, 0.7032108939378318), (1, -0.16115180214025432)]
----------------------------------------------------------------------
The intersection graph of paths in trees
[(0, 0.8774787673119838), (1, -0.16758906864658976)]
----------------------------------------------------------------------
Graph minors IV Widths of trees and well quasi ordering
[(0, 0.9098624686818584), (1, -0.14086553628718593)]
----------------------------------------------------------------------
Graph minors A survey
[(0, 0.6165825350569278), (1, 0.05392907566389622)]
二维的隐式维度意味着什么?我们可以利用models.LsiModel.print_topics()
来检查一下这个过程到底产生了什么变化:
lsi.print_topics(2)
[(0,
'0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"time" + 0.060*"response" + 0.058*"user" + 0.049*"computer" + 0.035*"interface"'),
(1,
'0.460*"system" + 0.373*"user" + 0.332*"eps" + 0.328*"interface" + 0.320*"response" + 0.320*"time" + 0.293*"computer" + 0.280*"human" + 0.171*"survey" + -0.161*"trees"')]
结合上面两个运行结果进行分析。根据LSI来看,“tree”、“graph”、“minors”都是相关的词语(而且在第一主题的方向上贡献最多),而第二主题实际上与所有的词语都有关系。如我们所料,前五个文档与第二个主题的关联更强,而其他四个文档与第一个主题关联最强(从比较第一第二两个维度对应的值的大小得出):
像往常一样,我们亦可以使用save和load将模型存储于硬盘中:
# Persist the trained model to disk and load it back.
lsi.save(r'./Data/model.lsi')
lsi = models.LsiModel.load(r'./Data/model.lsi')
Gensim中可以使用的变换还有很多,它实现了几种常见的向量空间模型算法,在下面的说明中,bow_corpus
表示词袋表示法的语料库,tfidf_corpus
表示经过Tf-Idf变换的语料库。
Tf-Idf模型:可选择对结果进行归一化(normalize),使向量的 L2 范数满足 $\lVert \vec{v} \rVert_2 = 1$。
model = tfidfmodel.TfidfModel(bow_corpus, normalize=True)
model = lsimodel.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=300)
另外注意,LSI训练的独特之处是它支持所谓的在线学习——我们仅需提供更多的训练文本即可。我们只需要调用add_documents
方法就可以了:
model.add_documents(another_tfidf_corpus) # LSI has now been trained on tfidf_corpus + another_tfidf_corpus
lsi_vec = model[tfidf_vec] # mapping a new document into LSI space does not change the model
# ...
model.add_documents(more_documents) # tfidf_corpus + another_tfidf_corpus + more_documents
lsi_vec = model[tfidf_vec]
# ...
更多的内容,请看gensim.models.lsimodel
的帮助文档。
# Random Projections: dimensionality reduction over the Tf-Idf corpus.
model = rpmodel.RpModel(tfidf_corpus, num_topics=500)
# Latent Dirichlet Allocation: probabilistic topic model over bag-of-words input.
model = ldamodel.LdaModel(bow_corpus, id2word=dictionary, num_topics=100)
# Hierarchical Dirichlet Process: non-parametric (topic count inferred from data).
model = hdpmodel.HdpModel(bow_corpus, id2word=dictionary)