Implementation from the official documentation:
>>> from gensim.models import AuthorTopicModel
>>> from gensim.corpora import mmcorpus
>>> from gensim.test.utils import common_dictionary, datapath, temporary_file
>>> author2doc = {
...     'john': [0, 1, 2, 3, 4, 5, 6],
...     'jane': [2, 3, 4, 5, 6, 7, 8],
...     'jack': [0, 2, 4, 6, 8]
... }
>>>
>>> corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
>>>
>>> with temporary_file("serialized") as s_path:
...     model = AuthorTopicModel(
...         corpus, author2doc=author2doc, id2word=common_dictionary, num_topics=4,
...         serialized=True, serialization_path=s_path
...     )
...
...     model.update(corpus, author2doc)  # update the author-topic model with additional documents
>>>
>>> # construct vectors for authors
>>> author_vecs = [model.get_author_topics(author) for author in model.id2author.values()]
After various attempts, I could not figure out how testcorpus.mm is generated:
corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
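For reference, a Matrix Market (.mm) file like this can be generated by serializing any bag-of-words corpus. A minimal sketch, using gensim's bundled common_texts as stand-in data (the filename my_testcorpus.mm is just an example):

from gensim.corpora import Dictionary, MmCorpus
from gensim.test.utils import common_texts  # tiny tokenized sample documents bundled with gensim

# Build a bag-of-words corpus from the tokenized documents
dictionary = Dictionary(common_texts)
bow_corpus = [dictionary.doc2bow(doc) for doc in common_texts]

# Write the corpus to disk in Matrix Market format, then load it back
# the same way the official example does
MmCorpus.serialize('my_testcorpus.mm', bow_corpus)
corpus = MmCorpus('my_testcorpus.mm')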
After some experimenting, I dropped the serialization step and switched to gensim's bundled common_corpus (9 tiny documents, so the indices 0-8 used below are all valid):
from gensim.models import AuthorTopicModel
from gensim.corpora import mmcorpus, Dictionary
from gensim.test.utils import datapath, temporary_file, common_dictionary, common_corpus

author2doc = {
    'A john\n': [0, 1, 2, 3, 4, 5, 6],
    'B jane\n': [2, 3, 4, 5, 6, 7, 8],
    'C_jack': [0, 8, 4, 6, 8],
    'Reeves': [2, 2, 1, 2, 4],
    'abc': [1, 2, 2, 2, 4, 5, 5, 6, 7, 8],
    '12': [2, 2, 4],
    'chen': [1, 2, 3],
    'qian': [2, 6, 7],
    'li': [4, 6, 8],
}
print(len(common_corpus))
print(len(author2doc))

model = AuthorTopicModel(common_corpus, author2doc=author2doc, num_topics=4)
author_vecs = [model.get_author_topics(author) for author in model.id2author.values()]
print(author_vecs)
Note: if you substitute your own data, you may hit a "list index out of range" error:
File "/home/miachen/download/topicModelling-master/venv/lib/python3.7/site-packages/gensim/models/atmodel.py", line 830, in <listcomp>
chunk = [self.corpus[d] for d in chunk_doc_idx]
IndexError: list index out of range
The root cause is a misunderstanding of AuthorTopicModel's author2doc parameter. In the dictionary above, the entry

'A john\n': [0, 1, 2, 3, 4, 5, 6]

means that documents 0 through 6 were all written by the author 'A john\n'. The numbers 0-6 are document indices, not word ids. The vocabulary is normally much larger than the number of documents, so if word ids end up in these lists they will exceed the number of documents, and that is exactly what triggers the index error above. Keep every number in each author's list below the number of documents and the model runs normally. For example, with 5 documents the list 'A john\n': [0, 1, 2, 3, 4, 5, 6] is invalid: 'A john' cannot own a sixth or seventh document, so the 5 and 6 must be deleted or changed to smaller values.
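A quick sanity check before training catches this class of error early. A minimal sketch, using the common_corpus and author2doc variables from the code above:

# Ensure every document index referenced in author2doc exists in the corpus
num_docs = len(common_corpus)
for author, doc_ids in author2doc.items():
    bad = [d for d in doc_ids if not 0 <= d < num_docs]
    if bad:
        raise ValueError("author %r references out-of-range documents %s "
                         "(corpus has only %d documents)" % (author, bad, num_docs))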
Below is my implementation of the model, with the data stored in text files.
from gensim.corpora import Dictionary
from gensim.models import AuthorTopicModel
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string

# Read the documents, one per line
doc_complete = []
with open("../data/paperdata/test.txt") as f:
    for line in f:
        doc_complete.append(line)

stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def clean(doc):
    # Lower-case, drop stopwords, strip punctuation, lemmatize
    stop_free = " ".join(i for i in doc.lower().split() if i not in stop)
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

doc_clean = [clean(doc).split() for doc in doc_complete]
print(doc_clean)

# Create the corpus dictionary: every unique word is assigned an integer id
dictionary = Dictionary(doc_clean)
# Use the dictionary to turn the document list (corpus) into a document-term matrix
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

# Build author2doc from the parallel author file: the author on line N wrote document N
author2doc = {}
with open("../data/paperdata/author_name_test.txt") as f:
    for count, line in enumerate(f):
        author = line.strip()
        # create the author's document list on first appearance, then append the index
        author2doc.setdefault(author, []).append(count)

model = AuthorTopicModel(doc_term_matrix, author2doc=author2doc, num_topics=4)
model.update(doc_term_matrix, author2doc)
author_vecs = [model.get_author_topics(author) for author in model.id2author.values()]
print(author_vecs)
The test data is very simple. Create a file author_name_test.txt with one author name per line, for example:
Liskov B.
O_Toole
I could not find a Chinese dataset, so the test uses English only. Create test.txt with one paper title per line, for example:
Computers and Intractability: A Guide to the Theory of NP-Completeness, W.H.
A Study of Three Alternative Workstation-Server Architectures for Object-Oriented Database Systems
The authors in author_name_test.txt and the papers in test.txt correspond line by line: the author on line N wrote the paper on line N.
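To double-check the alignment before training, the two files can be printed side by side; a minimal sketch using the paths from the code above:

# Print each author next to the title of the paper on the same line number
with open("../data/paperdata/author_name_test.txt") as fa, \
        open("../data/paperdata/test.txt") as fd:
    for author, title in zip(fa, fd):
        print(author.strip(), "->", title.strip())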
Finally, the printed result:
[[(0, 0.022871068671247143), (1, 0.022894117627199218), (2, 0.9313992797777058), (3, 0.022835533923847944)],
 [(0, 0.017535824512193033), (1, 0.017555274089571166), (2, 0.01749425993473785), (3, 0.947414641463498)]]
The result contains two lists, one per author (two authors were trained here). Each list holds four (id, probability) pairs, one per topic, since num_topics=4: the first author is assigned to topic 0 with probability 0.022871068671247143, to topic 1 with probability 0.022894117627199218, and so on. Note that these ids are topic ids, not word ids; each topic can be mapped back to concrete words, as shown below.
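To map a topic id to concrete words, you can query the trained model directly; a minimal sketch (AuthorTopicModel inherits the topic-inspection methods of gensim's LDA implementation):

# Show the top 5 words and their weights for each of the 4 topics
for topic_id in range(model.num_topics):
    print(topic_id, model.show_topic(topic_id, topn=5))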
Suitable datasets are hard to find; one can be downloaded from the address below.
Original dataset source: https://people.cs.umass.edu/~mccallum/data.html