gensim是基于python的自然语言处理库,可以自动地从文档中提取特征、语义信息等。它支持向量空间模型、word2vec、LSI、LDA以及各种向量转换操作,使用非常方便。下面总结一些基本用法,具体的教程参见gensim官方文档。
import logging
# loading configurations
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from gensim import corpora
from collections import defaultdict
from pprint import pprint ##pretty-printer
from six import iteritems
import logging ## 引入日志配置
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Toy corpus: nine short document titles, one string per document.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Stop words: very common words that are dropped before counting.
stoplist = set('for a of the and to in'.split())

# Tokenize each document: lowercase, split on whitespace, drop stop words.
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in documents
]

# Count how often every remaining token occurs across the whole corpus.
# defaultdict(int) makes unseen tokens start at 0.
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Keep only the tokens that occur more than once in the corpus.
texts = [
    [token for token in text if frequency[token] > 1]
    for text in texts
]
[['human', 'interface', 'computer'],
['survey', 'user', 'computer', 'system', 'response', 'time'],
['eps', 'user', 'interface', 'system'],
['system', 'human', 'system', 'eps'],
['user', 'response', 'time'],
['trees'],
['graph', 'trees'],
['graph', 'minors', 'trees'],
['graph', 'minors', 'survey']]
>>> dictionary = corpora.Dictionary(texts)
>>> dictionary.save(r'C:\Users\lenovo\Desktop\deerwester.dict')  ## 保存字典,方便以后直接加载调用
>>> print(dictionary)
Dictionary(12 unique tokens: [u'minors', u'graph', u'system', u'trees', u'eps']...)
>>> print(dictionary.token2id)
{'minors': 11, 'graph': 10, 'system': 5, 'trees': 9, 'eps': 8, 'computer': 0,
'survey': 4, 'user': 7, 'human': 1, 'time': 6, 'interface': 2, 'response': 3}
>>> new_doc = "Human computer interaction"
>>> new_vec = dictionary.doc2bow(new_doc.lower().split())
>>> print(new_vec) # the word "interaction" does not appear in the dictionary and is ignored
[(0, 1), (1, 1)]
dictionary.doc2bow()函数的作用是,统计文档new_doc中每个单词出现的次数,字典dictionary中没有的单词将被忽略。最后我们会得到一个稀疏向量,这个结果表示“computer”(id = 0)出现一次,“human”(id = 1)出现一次。接下来,我们将每个文档都转化成向量,结果如下:
>>> corpus = [dictionary.doc2bow(text) for text in texts]
>>> corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus) # store to disk, for later use
>>> print(corpus)
[(0, 1), (1, 1), (2, 1)]
[(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
[(2, 1), (5, 1), (7, 1), (8, 1)]
[(1, 1), (5, 2), (8, 1)]
[(3, 1), (6, 1), (7, 1)]
[(9, 1)]
[(9, 1), (10, 1)]
[(9, 1), (10, 1), (11, 1)]
[(4, 1), (10, 1), (11, 1)]
>>> class MyCorpus(object):
...     def __iter__(self):
...         for line in open('mycorpus.txt'):
...             # assume there's one document per line, tokens separated by whitespace
...             yield dictionary.doc2bow(line.lower().split())
...
>>> corpus_memory_friendly = MyCorpus() # doesn't load the corpus into memory!
>>> print(corpus_memory_friendly)
<__main__.MyCorpus object at 0x10d5690>
>>> for vector in corpus_memory_friendly: # load one vector into memory at a time
...     print(vector)
[(0, 1), (1, 1), (2, 1)]
[(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
[(2, 1), (5, 1), (7, 1), (8, 1)]
[(1, 1), (5, 2), (8, 1)]
[(3, 1), (6, 1), (7, 1)]
[(9, 1)]
[(9, 1), (10, 1)]
[(9, 1), (10, 1), (11, 1)]
[(4, 1), (10, 1), (11, 1)]
>>> from six import iteritems
>>> # collect statistics about all tokens
>>> dictionary = corpora.Dictionary(line.lower().split() for line in open('mycorpus.txt'))
>>> # remove stop words and words that appear only once
>>> stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
...             if stopword in dictionary.token2id]
>>> once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1]
>>> dictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once
>>> dictionary.compactify() # remove gaps in id sequence after words that were removed
>>> print(dictionary)
Dictionary(12 unique tokens)
最后我们再来谈谈corpus Formats.
There exist several file formats for serializing a Vector Space corpus (~sequence of vectors) to disk. Gensim implements them via the streaming corpus interface mentioned earlier: documents are read from (resp. stored to) disk in a lazy fashion, one document at a time, without the whole corpus being read into main memory at once.
>>> # create a toy corpus of 2 documents, as a plain Python list
>>> corpus = [[(1, 0.5)], []] # make one document empty, for the heck of it
>>> corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)
>>> corpora.SvmLightCorpus.serialize('/tmp/corpus.svmlight', corpus)
>>> corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)
>>> corpora.LowCorpus.serialize('/tmp/corpus.low', corpus)
>>> corpus = corpora.MmCorpus('/tmp/corpus.mm')
>>> # Corpus objects are streams, so typically you won't be able to print them directly:
>>> print(corpus)
MmCorpus(2 documents, 2 features, 1 non-zero entries)
>>> # one way of printing a corpus: load it entirely into memory
>>> print(list(corpus)) # calling list() will convert any sequence to a plain Python list
[[(1, 0.5)], []]
>>> # another way of doing it: print one document at a time, making use of the streaming interface
>>> for doc in corpus:
...     print(doc)
[(1, 0.5)]
[]
>>> import gensim
>>> import numpy as np
>>> numpy_matrix = np.random.randint(10, size=[5,2]) # random matrix as an example
>>> corpus = gensim.matutils.Dense2Corpus(numpy_matrix)
>>> numpy_matrix = gensim.matutils.corpus2dense(corpus, num_terms=number_of_corpus_features)
>>> import scipy.sparse
>>> scipy_sparse_matrix = scipy.sparse.random(5,2) # random sparse matrix as example
>>> corpus = gensim.matutils.Sparse2Corpus(scipy_sparse_matrix)
>>> scipy_csc_matrix = gensim.matutils.corpus2csc(corpus)