下面这段代码是用来计算文本的词频、TF-IDF值
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
# Load data: read the four CSV files with identical parser settings and bind
# the resulting frames to comment1..comment4, in file order.
comment1, comment2, comment3, comment4 = (
    pd.read_csv(csv_path, header=0, index_col=False, engine='python', encoding='utf-8')
    for csv_path in (r"good_1.csv", r"good_2.csv", r"good_3.csv", r"good_4.csv")
)
# Build the corpus: one document per loaded frame, made by concatenating the
# text found in the column at position 8 of every row whose string form is
# longer than 5 characters.
corpus = []
# Iterate the frames directly instead of resolving their names through eval().
for file in (comment1, comment2, comment3, comment4):
    print(file.shape)
    pieces = []
    # Walk the column once rather than doing a positional lookup per row.
    for line, cell in enumerate(file.iloc[:, 8]):
        text = str(cell)
        if len(text) > 5:
            # Append the string form so a non-text cell (e.g. a long number)
            # cannot raise TypeError during concatenation.
            pieces.append(text)
        # Progress indicator: echo the row index every 2000 rows.
        if line % 2000 == 0:
            print(line, end=' ')
    # NOTE(review): comments are joined with no separator, so the last token
    # of one comment can fuse with the first token of the next before
    # vectorization. Confirm whether a space separator is wanted.
    comment_txt = ''.join(pieces)
    corpus.append(comment_txt)
# Empty accumulator with four positional columns for the export step below.
a = pd.DataFrame(columns = [0, 1, 2, 3])
# Term counter and TF-IDF transformer.
cv = CountVectorizer()
# Document-term count matrix: one row per entry of `corpus`.
cv_fit = cv.fit_transform(corpus)
# Re-weight the raw counts into TF-IDF scores.
transformer = TfidfTransformer()
tfidf_fit = transformer.fit_transform(cv_fit)
# Vocabulary in column order. get_feature_names() was removed in
# scikit-learn 1.2, so prefer get_feature_names_out() (available since 1.0)
# and fall back only on older releases. Keep `word` a plain list either way.
if hasattr(cv, 'get_feature_names_out'):
    word = list(cv.get_feature_names_out())
else:
    word = list(cv.get_feature_names())
# Dense copies of both matrices for element-wise access.
weight = cv_fit.toarray()
weight_tfidf = tfidf_fit.toarray()
# Flatten the count and TF-IDF matrices into one row per (document, word)
# pair. Rows are collected in a list and the frame is built once at the end:
# concatenating onto a DataFrame inside the loop re-copies the whole frame on
# every iteration, which is quadratic in the number of rows.
rows = []
for i, (counts, scores) in enumerate(zip(weight, weight_tfidf)):
    print("-------第", i+1, "段文本的词语------")
    for j, (term, count, score) in enumerate(zip(word, counts, scores)):
        # Progress indicator: echo the word index every 2000 words.
        if j % 2000 == 0:
            print(j, end=' ')
        # Every cell is stored as a string: document index, word, raw count,
        # TF-IDF score.
        rows.append([str(i), str(term), str(count), str(score)])
    # Terminate the progress line for this document.
    print(' ')
a = pd.DataFrame(rows, columns=['type', 'name', 'number', 'tfidf'])
# Write the result to a CSV file with a header row and no index column.
a.to_csv(r"tfidf.csv",header=True,index=False,encoding='utf-8')