import json
import os

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
corpus = []
count = 0
with open("data/msmarco/collection.tsv", mode="r", encoding="utf-8") as f:
    for line in f:
        # Each line of collection.tsv is "<passage id>\t<passage text>".
        pid, text = line.rstrip("\n").split("\t", 1)
        corpus.append(text.lower())
        count += 1
        if count == 10:  # debug limit: only read the first 10 passages
            break
print("read text done")
print(len(corpus))
# CountVectorizer builds the term-frequency matrix: entry [i, j] is the
# count of term j in document i.
vectorizer = CountVectorizer(analyzer="word")
# TfidfTransformer rescales those raw counts into tf-idf weights.
transformer = TfidfTransformer()
X = vectorizer.fit_transform(corpus)
print("CountVectorizer.fit_transform done")
tfidf = transformer.fit_transform(X)
print("TfidfTransformer.fit_transform done")
# All terms in the bag-of-words vocabulary, indexed by column.
# (Use get_feature_names() on scikit-learn < 1.0; it was removed in 1.2.)
word = vectorizer.get_feature_names_out()
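With default parameters, the CountVectorizer + TfidfTransformer pipeline is equivalent to scikit-learn's TfidfVectorizer, which does both steps in one pass. A minimal sketch of the one-step variant (the tfidf_alt name is just for illustration):

from sklearn.feature_extraction.text import TfidfVectorizer

# One-step equivalent of CountVectorizer followed by TfidfTransformer.
tfidf_alt = TfidfVectorizer(analyzer="word").fit_transform(corpus)
assert tfidf_alt.shape == tfidf.shape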
outname = "word2weight.jsonl"
if os.path.exists(outname):
    os.remove(outname)  # start from a clean output file
f2 = open(outname, mode="a", encoding="utf-8")
# Walk the tf-idf matrix row by row (the outer loop iterates over documents,
# the inner loop over that document's term weights) and dump each document's
# significant terms as one JSON line.
i = 0
for row in tfidf:
    word2weight = {}
    for idx, weight in enumerate(row.toarray()[0]):
        if weight > 0.1:  # keep only terms with a non-trivial weight
            word2weight[word[idx]] = weight
    json.dump(word2weight, f2)
    f2.write("\n")
    i += 1
    if i % 100000 == 0:
        print(i)
f2.close()
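Each line of word2weight.jsonl is one JSON object mapping a term to its tf-idf weight in the corresponding passage. A quick read-back sketch to sanity-check the output:

# Parse the output back: one {term: weight} dict per line.
with open("word2weight.jsonl", mode="r", encoding="utf-8") as fin:
    for line in fin:
        word2weight = json.loads(line)
        assert all(w > 0.1 for w in word2weight.values())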
Note that calling tfidf.toarray() on the full matrix converts it to a dense array, which raises a MemoryError once the collection is large; that is why the loop above densifies only one row at a time.
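Even the per-row row.toarray() call above still allocates a dense vector the size of the whole vocabulary. A sketch that instead reads the nonzero entries straight from the sparse CSR row (same tfidf, word, and 0.1 threshold as above), so nothing is ever densified:

# .indices holds the column ids of a row's nonzero entries,
# .data the matching tf-idf values; zipping them skips all zeros.
for row in tfidf:
    word2weight = {}
    for idx, weight in zip(row.indices, row.data):
        if weight > 0.1:
            word2weight[word[idx]] = float(weight)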