TfidfTransformer 输出

"""Compute per-document TF-IDF word weights for an MS MARCO style
collection and dump them as one JSON object per line (JSONL)."""
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import json
import os


def read_corpus(path, limit=10):
    """Read up to *limit* passages from a TSV file of ``id<TAB>text`` rows.

    Text is lowercased and stripped of newlines, matching the original
    preprocessing. *limit* replaces the original ``count % 10 == 0: break``
    debug cap (which stopped after 10 documents).
    """
    corpus = []
    with open(path, mode="r", encoding="utf-8") as f:
        for line in f:
            # maxsplit=1: passage text may itself contain tab characters.
            _doc_id, text = line.split("\t", 1)
            corpus.append(text.replace("\n", "").lower())
            if len(corpus) >= limit:
                break
    return corpus


def tfidf_word_weights(corpus, threshold=0.1):
    """Yield one ``{word: weight}`` dict per document.

    Only weights strictly greater than *threshold* are kept. The sparse
    rows are iterated via ``indices``/``data`` directly — calling
    ``.toarray()`` per row allocates a dense vocabulary-sized vector and
    raises MemoryError on large corpora.
    """
    vectorizer = CountVectorizer(analyzer="word")  # term-frequency matrix: a[i][j] = count of word j in doc i
    transformer = TfidfTransformer()               # converts counts to tf-idf weights
    counts = vectorizer.fit_transform(corpus)
    print("CountVectorizer.fit_transform done")
    tfidf = transformer.fit_transform(counts)
    print("TfidfTransformer.fit_transform done")
    # get_feature_names() was removed in scikit-learn 1.2; fall back for old versions.
    if hasattr(vectorizer, "get_feature_names_out"):
        words = vectorizer.get_feature_names_out()
    else:
        words = vectorizer.get_feature_names()
    for row in tfidf:
        # float(...) because numpy scalars are not always JSON-serializable.
        yield {
            words[idx]: float(weight)
            for idx, weight in zip(row.indices, row.data)
            if weight > threshold
        }


def main():
    corpus = read_corpus("data/msmarco/collection.tsv")
    print("read text done")
    print(len(corpus))

    outname = "word2weight.jsonl"
    if os.path.exists(outname):
        os.remove(outname)

    with open(outname, mode="w", encoding="utf-8") as out:
        for i, word2weight in enumerate(tfidf_word_weights(corpus), start=1):
            json.dump(word2weight, out)
            out.write("\n")
            if i % 100000 == 0:  # progress heartbeat for large corpora
                print(i)


if __name__ == "__main__":
    main()

注意:如果对 tfidf 稀疏矩阵调用 toarray() 将其转成 dense 矩阵,当数据量大时会导致 MemoryError;可以改为直接遍历稀疏行的 indices/data 来避免。

你可能感兴趣的:(Python)