#coding:utf-8
texts = [['human', 'interface', 'computer'],
['survey', 'user', 'computer', 'system', 'response', 'time'],
['eps', 'user', 'interface', 'system'],
['system', 'human', 'system', 'eps'],
['user', 'response', 'time'],
['trees'],
['graph', 'trees'],
['graph', 'minors', 'trees'],
['graph', 'minors', 'survey']]
from gensim import corpora, models, similarities
# Convert the raw text features into bag-of-words vectors
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
# Each output vector records how many times each word occurs in that document
print(corpus)
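# For reference, dictionary.token2id maps each token to the integer id
# used in the (id, count) pairs above; the numbering depends on the order
# tokens were first seen, so treat the output as illustrative.
print(dictionary.token2id)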
# Streaming is supported: the corpus is exposed through a generator
class MyCorpus(object):
    def __init__(self, texts):
        self.texts = texts
    def __iter__(self):
        for text in self.texts:
            yield text
# Instantiate the streamed corpus and pass it to Dictionary
Corpus = MyCorpus(texts)
dictionary = corpora.Dictionary(Corpus)
# dictionary's doc2bow vectorizes a given text
corpus = [dictionary.doc2bow(text) for text in texts]
print(corpus)
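# A corpus built this way can also be persisted and streamed back from
# disk; a minimal sketch using gensim's Matrix Market serializer (the
# '/tmp/corpus.mm' path is only an example):
corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)
corpus_mm = corpora.MmCorpus('/tmp/corpus.mm')
print(corpus_mm)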
# Build a tf-idf model on the vectorized texts; this computes an IDF
# value for every feature that occurs in corpus
tfidf = models.TfidfModel(corpus)
# Given a document vector, compute the corresponding tf-idf vector
doc_bow = [(0, 1), (1, 1)]
print(tfidf[doc_bow])
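# The model can also wrap the entire corpus at once; tfidf[corpus]
# yields the tf-idf vector of each document lazily:
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
    print(doc)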
# Transforming a new document
doc_ = ["graph", "minors", "trees"]
doc_vec = dictionary.doc2bow(doc_)
print(tfidf)
print(tfidf[doc_vec])
# Note:
# gensim's built-in LDA, RP and HDP models take a corpus of bow vectors
# or tf-idf vectors as input and produce the corresponding topic vectors
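# As a minimal sketch of that note (num_topics chosen only for
# illustration), an LDA model is built the same way as the LSI model below:
lda_model = models.LdaModel(corpus, id2word=dictionary, num_topics=2)
print(lda_model.print_topics())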
# Computing document similarity with LSI
# Note that the corpus passed in here must consist of bow vectors
lsi_model = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
documents = lsi_model[corpus]
print(documents)
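# lsi_model[corpus] is a lazy wrapper; iterate over it to see each
# document's two-dimensional LSI vector:
for doc in documents:
    print(doc)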
query_vec = lsi_model[doc_vec]
# dictionary only turns the doc's natural-language text into a BOW
# vector, which is the input format the models expect
# lsi_model then turns the query's BOW vector into an LSI vector
# The initial corpus, once converted to bow vectors, was likewise
# transformed into LSI vectors by lsi_model
print(doc_vec)
print(query_vec)
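# To see what the two latent dimensions capture, LsiModel can print its
# topics as weighted combinations of words:
print(lsi_model.print_topics(num_topics=2))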
# Initialize a similarity index over the document vectors to be searched
index = similarities.MatrixSimilarity(documents)
# Query the index with the LSI vector of the text to compare;
# it returns the similarity of the query against every document
print(index[query_vec])
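# A common follow-up (a minimal sketch): rank all documents by their
# similarity to the query, best match first.
sims = sorted(enumerate(index[query_vec]), key=lambda item: -item[1])
print(sims)  # list of (document index, cosine similarity) pairs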
#coding:utf-8
from gensim.models.word2vec import Word2Vec
class MySentences(object):
    def __init__(self, filename):
        self.filename = filename
    def __iter__(self):
        with open(self.filename) as f:
            for line in f:
                yield line.strip().split(" ")
# Build an iterator over the file and pass it to Word2Vec to train a model
# By default min_count=5: words occurring fewer than 5 times are dropped
sentences = MySentences("E:\\nltkbase\\mycorpus.txt")
model = Word2Vec(sentences, min_count=1)
print(model)
print(model["computer"])
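# The trained model also supports similarity queries; a minimal sketch
# using the Word2Vec API of this era (on such a tiny corpus the results
# are essentially noise):
print(model.most_similar("computer", topn=3))
print(model.similarity("computer", "human"))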
# The LabeledSentence API tags one sentence at a time, so we write a
# class to convert an entire file in bulk
from gensim.models.doc2vec import LabeledSentence
class LabeledLineSentence(object):
    def __init__(self, filename):
        self.filename = filename
    def __iter__(self):
        labels = 0
        with open(self.filename) as f:
            for line in f:
                labels += 1
                yield LabeledSentence(line.strip().split(" "), ["label_" + str(labels)])
sents = LabeledLineSentence("E:\\nltkbase\\mycorpus.txt")
senLst = [sent for sent in sents]
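# Each element is a LabeledSentence pairing a word list with a tag list;
# inspecting the first one shows the structure (output is illustrative):
print(senLst[0])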
'''
build_vocab expects a list whose elements are LabeledSentence results
For randomness, the list is shuffled before every training pass
'''
import random
from gensim.models import Doc2Vec
model = Doc2Vec(dm=1, size=100, window=5, negative=5, hs=0, min_count=1, workers=4)
model.build_vocab(senLst)
for epoch in range(10):
    random.shuffle(senLst)
    model.train(senLst)
print(model.most_similar("computer"))
print(model["human"])
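# Document vectors live under the labels we assigned, and unseen
# documents can be embedded with infer_vector; a minimal sketch,
# assuming the Doc2Vec API of this era:
print(model.docvecs["label_1"])
print(model.infer_vector(["human", "interface", "computer"]))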
References:
On doc2vec:
http://linanqiu.github.io/2015/10/07/word2vec-sentiment/
On LSI and LDA:
http://www.cnblogs.com/iloveai/p/gensim_tutorial.html
On word2vec:
http://www.cnblogs.com/iloveai/p/gensim_tutorial2.html
Contents of mycorpus.txt:
Human machine interface for lab abc computer applications
A survey of user opinion of computer system response time
The EPS user interface management system
System and human system engineering testing of EPS
Relation of user perceived response time to error measurement
The generation of random binary unordered trees
The intersection graph of paths in trees
Graph minors IV Widths of trees and well quasi ordering
Graph minors A survey