tfidf+余弦相似度

1、TfidfVectorizer 的 fit_transform 生成的是稀疏矩阵,要逐行使用时,必须先用 toarray() 转成普通数组(或把每一行重新存入新的数组),否则无法直接进行相似度计算

2、矩阵超出 xls 格式的上限(65536 行 × 256 列)时,xlwt 不适用,改用 xlsxwriter

3、导出词袋的时候,按一列N行保存,不要按一行N列,xlwt会出错

4、xlwt保存格式用xls,用xlsx也会出错

# -*- coding: utf-8 -*-
"""
Created on Wed Apr  8 10:39:20 2020

@author: 娴娴
"""

import numpy as np
import xlwt
import xlsxwriter
from sklearn.feature_extraction.text import  TfidfVectorizer

def getCos(vec_a, vec_b):
    """Return the cosine similarity of two vectors, rounded to 2 decimals.

    Args:
        vec_a: Sequence (list or 1-D array) of numbers.
        vec_b: Sequence of numbers with at least ``len(vec_a)`` elements;
            only the first ``len(vec_a)`` elements are used.

    Returns:
        The cosine of the angle between the vectors as a Python ``float``
        rounded to two decimal places, or ``0.0`` when either vector has
        zero length/norm (the cosine is undefined in that case).

    Raises:
        ValueError: If ``vec_b`` is shorter than ``vec_a``.
    """
    a = np.asarray(vec_a, dtype=float)
    # Only the leading len(vec_a) components of vec_b take part.
    b = np.asarray(vec_b, dtype=float)[:a.size]

    denominator = np.sqrt(np.dot(a, a)) * np.sqrt(np.dot(b, b))
    # NumPy float division by zero yields nan/inf plus a RuntimeWarning
    # instead of raising ZeroDivisionError, so the zero-norm case has to be
    # tested explicitly rather than caught.
    if denominator == 0:
        return 0.0
    return round(float(np.dot(a, b) / denominator), 2)

if __name__ == "__main__":
    # Read the 82 input documents txt1.txt .. txt82.txt from the working
    # directory. The context manager closes every handle after reading.
    corpus = []
    for i in range(82):
        # NOTE(review): 'ANSI' appears to be a Windows-only codec alias;
        # confirm before running this script on another platform.
        with open("txt%d" % (i + 1) + ".txt", "r", encoding='ANSI') as f:
            corpus.append(f.read())

    # Tokenize, drop English stop words and compute the TF-IDF weights.
    cv = TfidfVectorizer(binary=False, decode_error='ignore', stop_words='english')
    vec = cv.fit_transform(corpus)  # one row per document
    arr = vec.toarray()  # dense TF-IDF matrix
    h, l = arr.shape  # h = number of rows, l = number of columns

    # Export the TF-IDF matrix. xlsxwriter is used here because the matrix
    # can exceed the size a legacy .xls sheet written by xlwt can hold.
    with xlsxwriter.Workbook(r'C:\Users\娴娴\Desktop\tfidf_array.xlsx') as tfidf_book:
        tfidf_sheet = tfidf_book.add_worksheet('sheet1')
        for i in range(h):
            for j in range(l):
                tfidf_sheet.write(i, j, arr[i, j])

    # Fetch the vocabulary (bag of words). get_feature_names() was removed
    # in newer scikit-learn releases in favour of get_feature_names_out(),
    # so prefer the new name and fall back to the old one.
    if hasattr(cv, "get_feature_names_out"):
        cd = cv.get_feature_names_out()
    else:
        cd = cv.get_feature_names()

    # Write the vocabulary as a single column (one word per row, in the
    # second column, index 1).
    vocab_book = xlwt.Workbook()
    vocab_sheet = vocab_book.add_sheet(u'sheet1', cell_overwrite_ok=True)
    for i, word in enumerate(cd):
        vocab_sheet.write(i, 1, str(word))
    vocab_book.save(r'C:\Users\娴娴\Desktop\cidai.xls')

    # Compute the pairwise cosine similarity between all documents and
    # export the h x h result matrix.
    cos_book = xlwt.Workbook()
    cos_sheet = cos_book.add_sheet(u'sheet1', cell_overwrite_ok=True)
    for i in range(h):
        row_i = arr[i]  # a dense row can be indexed directly; no copy needed
        for j in range(h):
            cos_sheet.write(i, j, getCos(row_i, arr[j]))
    cos_book.save(r'C:\Users\娴娴\Desktop\cos_value2.xls')




    
    
    

 

你可能感兴趣的:(python)