# 1、TfidfVectorizer生成的矩阵,要使用每一行的时候,必须重新存入新的数组,否则无法进行相似度计算
# 2、矩阵超出255*255,xlwt不适用,使用xlsxwriter
# 3、导出词袋的时候,按一列N行保存,不要按一行N列,xlwt会出错
# 4、xlwt保存格式用xls,用xlsx也会出错
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 8 10:39:20 2020
@author: 娴娴
"""
import numpy as np
import xlwt
import xlsxwriter
from sklearn.feature_extraction.text import TfidfVectorizer
# Compute the cosine of the angle between vectors a and b
def getCos(vec_a, vec_b):
    """Return the cosine similarity of two vectors, rounded to 2 decimals.

    Args:
        vec_a: 1-D sequence (list, tuple or ndarray) of numbers.
        vec_b: 1-D sequence of numbers with the same length as ``vec_a``.

    Returns:
        float: ``dot(a, b) / (|a| * |b|)`` rounded to two decimal places,
        or ``0.0`` when either vector is empty or has zero norm.

    Raises:
        ValueError: if the two vectors do not have the same shape.
    """
    a = np.asarray(vec_a, dtype=float)
    b = np.asarray(vec_b, dtype=float)
    if a.shape != b.shape:
        raise ValueError(
            "getCos expects vectors of equal shape, got %s and %s"
            % (a.shape, b.shape)
        )

    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    # Dividing by a NumPy zero yields nan/inf with a RuntimeWarning instead of
    # raising ZeroDivisionError, so the zero case must be tested explicitly.
    if denominator == 0:
        return 0.0
    return round(float(np.dot(a, b) / denominator), 2)
if __name__ == "__main__":
    # Script entry point: read 82 text files, build their TF-IDF matrix, and
    # export the matrix, the vocabulary and the pairwise cosine similarities
    # to three Excel files.

    # Read the 82 input files txt1.txt .. txt82.txt (relative paths).
    corpus = []
    for i in range(82):
        # 'ANSI' is the Windows-only alias of the 'mbcs' codec (system code
        # page). The `with` block guarantees each file handle is closed.
        with open("txt%d" % (i + 1) + ".txt", "r", encoding='ANSI') as f:
            corpus.append(f.read())

    # Compute the TF-IDF values (tokenizes and drops English stop words).
    cv = TfidfVectorizer(binary=False, decode_error='ignore', stop_words='english')
    vec = cv.fit_transform(corpus)  # input is a list of document strings
    arr = vec.toarray()  # dense TF-IDF matrix, one row per document

    # Export the matrix to Excel.
    # The matrix is too wide for the .xls format, so xlsxwriter is used here.
    h, l = arr.shape  # h = number of rows, l = number of columns
    with xlsxwriter.Workbook(r'C:\Users\娴娴\Desktop\tfidf_array.xlsx') as f1:
        sheet1 = f1.add_worksheet('sheet1')
        for i, row in enumerate(arr):
            for j, value in enumerate(row):
                sheet1.write(i, j, value)

    # Get the vocabulary (bag of words).
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on old versions that lack it.
    if hasattr(cv, "get_feature_names_out"):
        cd = cv.get_feature_names_out()
    else:
        cd = cv.get_feature_names()

    # Write the vocabulary to Excel, one term per row in column index 1.
    f2 = xlwt.Workbook()
    sheet1 = f2.add_sheet(u'sheet1', cell_overwrite_ok=True)
    for i, term in enumerate(cd):
        sheet1.write(i, 1, str(term))
    f2.save(r'C:\Users\娴娴\Desktop\cidai.xls')

    # Compute the pairwise cosine similarities and export them to Excel.
    # Rows of the dense array are 1-D vectors and can be passed to getCos
    # directly, so no per-element copy into temporary lists is needed.
    f3 = xlwt.Workbook()
    sheet1 = f3.add_sheet(u'sheet1', cell_overwrite_ok=True)
    for i in range(h):
        for j in range(h):
            sheet1.write(i, j, getCos(arr[i], arr[j]))
    f3.save(r'C:\Users\娴娴\Desktop\cos_value2.xls')