gensim的corpora.Dictionary

# Build the dictionary: every distinct token gets a unique integer id.
# NOTE(review): assumes try2 is an iterable of tokenized documents
# (each document a list of token strings) -- confirm against the caller.
df_train_dict = corpora.Dictionary(try2)

# Drop tokens whose document frequency is 1.
# dfs[token_id] is the number of documents that contain the token, NOT its
# total occurrence count, so a token repeated many times inside one single
# document is still removed by this filter.
ids = [token_id for token_id, doc_freq in df_train_dict.dfs.items() if doc_freq == 1]
df_train_dict.filter_tokens(bad_ids=ids)
print(len(df_train_dict))  # 66626 in the author's run

# Drop single-character tokens.
# The ids are collected from scratch here because filter_tokens() compacts the
# dictionary and reassigns token ids.  Collect the ids (keys), not the token
# strings (values): filter_tokens() expects ids.
ids3 = [token_id for token_id, token in df_train_dict.items() if len(token) == 1]
df_train_dict.filter_tokens(bad_ids=ids3)
print(len(df_train_dict))  # 64081 in the author's run


# Build the corpus: one bag-of-words vector per document.
df_train_corpus = [df_train_dict.doc2bow(doc) for doc in try2]

下面逐个解释一下

1.

# good_ids lists the token ids to keep, e.g. good_ids=[0,2] keeps only tokens 0 and 2;
# bad_ids lists the token ids to remove, e.g. bad_ids=[1,3] removes tokens 1 and 3.
dictionary.filter_tokens(good_ids=[0])

2.

 print dictionary.keys()

#返回所有词语的编号 【上篇文章讲到了每个分词在corpora.Dictionary这一步都建立了一个唯一的编号】

3.

print(dictionary.dfs[单词id])  # 单词id is a placeholder for the integer id of the token to look up

# dictionary.dfs maps {token id: number of documents the token appears in}

这里有个坑,看下面代码

# Sample data: the first sentence repeats "我" several times.
wordslist = ["我在玉龙雪山我我我我", "我喜欢玉龙雪山", "我还要去玉龙雪山"]
# Segment every sentence into a list of tokens.
textTest = [list(jieba.cut(sentence)) for sentence in wordslist]
# Build the dictionary from the tokenized sentences.
dictionary = corpora.Dictionary(textTest, prune_at=2000000)

# Print: token id, token, document frequency.
for token_id, token in dictionary.items():
    print(token_id, token, dictionary.dfs[token_id])
# Output reported by the author:
# 1 在 1
# 5 还要 1
# 0 我 3
# 2 玉龙雪山 3
# 3 喜欢 1
# 4 去 1


# Sample data: same sentences, but "我" appears only once in the first one.
wordslist = ["我在玉龙雪山", "我喜欢玉龙雪山", "我还要去玉龙雪山"]
# Segment every sentence into a list of tokens.
textTest = [list(jieba.cut(sentence)) for sentence in wordslist]
# Build the dictionary from the tokenized sentences.
dictionary = corpora.Dictionary(textTest, prune_at=2000000)

# Print: token id, token, document frequency.
for token_id, token in dictionary.items():
    print(token_id, token, dictionary.dfs[token_id])
# Output reported by the author:
# 1 在 1
# 5 还要 1
# 0 我 3
# 2 玉龙雪山 3
# 3 喜欢 1
# 4 去 1

所以 dfs 统计的是文档频率:一个词在多少篇文档中出现过。同一篇文档里重复出现多次也只计 1 次,它并不是这个词的总出现次数。
4.

for value in df_train_dict.itervalues():
    # Each value is a token string produced by word segmentation.
    pass
# Collect the ids of all single-character tokens.
ids3 = [token_id for token_id, token in df_train_dict.iteritems() if len(token) == 1]
 

因此如果分词过后的词是一个单字,就append对应的key

之后再调用 df_train_dict.filter_tokens(bad_ids=ids3) 把这些词从词典中删除

 

https://blog.csdn.net/u011311291/article/details/78836535

你可能感兴趣的:(nlp)