Using TF-IDF with gensim and sklearn
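This post runs the same small pipeline twice: clean and tokenize the Phrase column of a Kaggle train.tsv, then compute tf-idf weights first with gensim (Dictionary + TfidfModel) and then with sklearn's TfidfVectorizer, so the two APIs can be compared side by side.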

from nltk import word_tokenize
import pandas as pd
from nltk.corpus import stopwords
import re
from gensim import corpora
from gensim import models
from sklearn.feature_extraction.text import TfidfVectorizer

stoplist = stopwords.words('english')  # requires nltk.download('stopwords'); word_tokenize needs nltk.download('punkt')
data_train = pd.read_csv(r'D:\Kaggle\train.tsv', sep='\t')
def clean(data):
    all_word = []
    for sentence in data:
        sentence = re.sub('[^a-zA-Z]', ' ', sentence)  # keep letters only
        word_list = word_tokenize(sentence.lower())    # lowercase so stopword matching works
        word_result = [w for w in word_list if w not in stoplist]
        if word_result:
            all_word.append(word_result)
    return all_word
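# A quick illustration of the output shape (toy input, not from train.tsv):
#   clean(['This movie is good!'])  ->  [['movie', 'good']]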
all_word = clean(data_train.Phrase.values)
dictionary = corpora.Dictionary(all_word)  # assigns every unique token in the corpus an integer id
new_corpus = [dictionary.doc2bow(text) for text in all_word]  # each tuple holds (token id in the dictionary, count of that token in the document)
# print(dictionary.token2id)
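# A toy sketch of what Dictionary/doc2bow produce (not from train.tsv; gensim
# assigns ids alphabetically per new document, so your ids will differ):
#   d = corpora.Dictionary([['good', 'movie'], ['bad', 'movie']])
#   d.token2id                             ->  {'good': 0, 'movie': 1, 'bad': 2}
#   d.doc2bow(['good', 'movie', 'movie'])  ->  [(0, 1), (1, 2)]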
tfidf = models.TfidfModel(new_corpus)  # train the tf-idf model on the bag-of-words corpus
tfidf.save("my_model.tfidf")           # persist the trained model
tfidf = models.TfidfModel.load("my_model.tfidf")  # and load it back
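# The trained model keeps one idf value per dictionary id (gensim exposes
# these as TfidfModel.idfs); a quick peek at the idf of token 0, as a sketch:
#   print(tfidf.idfs[0])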
tfidf_vec = []
for doc in all_word:
    doc_bow = dictionary.doc2bow(doc)  # input is a list of tokens, output a list of (id, count) tuples
    doc_tfidf = tfidf[doc_bow]         # list of (id, tf-idf weight) tuples for this document
    tfidf_vec.append(doc_tfidf)
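# To make the weights readable, map each id back to its token; a minimal
# sketch that prints the weights of the first document:
for term_id, weight in tfidf_vec[0]:
    print(dictionary[term_id], weight)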
vectorizer = TfidfVectorizer()
# sklearn expects one string per document, so join each token list back into
# a sentence (feeding the flat word list in would give one row per word,
# not one row per sentence)
sentences = [' '.join(doc) for doc in all_word]
tfidf_matrix = vectorizer.fit_transform(sentences)  # fit on the corpus and transform it
print(vectorizer.get_feature_names_out())  # every unique token in the corpus (get_feature_names() on older sklearn)
print(vectorizer.vocabulary_)              # mapping from each token to its column id
# Each row of the matrix is the vector for one sentence; the columns are
# ordered by the token ids above.
print(tfidf_matrix)
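A quick way to inspect the sklearn result is to pair each nonzero weight in a row with its token. A minimal sketch, continuing from the script above (indices and data are attributes of the scipy CSR matrix that fit_transform returns):

feature_names = vectorizer.get_feature_names_out()
first_row = tfidf_matrix[0]  # 1 x vocabulary-size sparse row for the first sentence
for col, weight in zip(first_row.indices, first_row.data):
    print(feature_names[col], weight)

Note that the two libraries will not produce identical numbers even though both L2-normalize by default: gensim's TfidfModel uses idf = log2(total_docs / doc_freq), while sklearn's TfidfVectorizer uses a smoothed natural-log idf, ln((1 + n) / (1 + df)) + 1, so compare rankings rather than raw values.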

 
