基于词频统计的文本相似度

基于词频统计的文本相似度

// A highlighted block
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 26 14:29:01 2018

@author: 呜啦吧哈
"""

import pymssql
import pandas as pd
import jieba
#import jieba.analyse as ana
import numpy as np
#import nltk
#from nltk.corpus import stopwords
##import re
#import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer 
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.metrics import confusion_matrix
#from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import pairwise_distances
# Connect to SQL Server and load the whole `test` table into a DataFrame.
# NOTE(review): host and credentials are hard-coded here; consider reading
# them from environment variables or a config file kept out of version control.
conn = pymssql.connect('WIN-**********', 'sa', '34546555', 'test3')
# The cursor is kept for the UPDATE statements issued at the end of the script.
cursor = conn.cursor()
sql = 'select * from test'
# pd.read_sql executes the query itself, so the statement is not run on
# `cursor` beforehand: that would query the table twice and leave an unread
# result set pending on the cursor.
df0 = pd.read_sql(sql, conn)
# Separate DataFrame wrapper so `df0` keeps referring to the raw query result.
chapter = pd.DataFrame(df0)
# Alternative data source (CSV export) kept for reference:
#chapter = pd.read_csv("C:\\Users\\test\\Chinese_rss.csv",index_col='RSSID',encoding = 'gbk')

# Append the query text as one extra row (labelled "rank") at the end of the
# corpus so that it is segmented and vectorized together with the documents.
test =" China 中国  python 自然语言处理 张三 李四"
row = {"RSSID":"rank","EntryDescription":test}

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# with a one-row frame gives the same result (columns missing from `row` are
# filled with NaN, and the index is renumbered).
chapter = pd.concat([chapter, pd.DataFrame([row])], ignore_index=True)

# Word segmentation setup: load the stop-word list (one word per line).
stopwords=pd.read_csv("C:\\Users\\test\\stopword.txt",names=['stopword'], encoding='utf-8')
stoplist = list(stopwords.stopword)

# Register the custom dictionary with jieba.  The path is stored under a
# descriptive name rather than `dict`, which would shadow the builtin type
# for the rest of the module.
userdict_path = 'C:\\Users\\test\\自定义词典.txt'
jieba.load_userdict(userdict_path)

def m_cut(intxt):
    """Segment *intxt* with jieba and return the tokens worth keeping.

    A token is kept when it is longer than one character and does not
    appear in the module-level ``stoplist``.  The result is a list of
    tokens in their original order.
    """
    kept = []
    for token in jieba.cut(intxt):
        # Skip single-character tokens and stop words.
        if len(token) <= 1 or token in stoplist:
            continue
        kept.append(token)
    return kept

# Document pre-processing: segment every document exactly once.
# `chaplist` holds the token list of each document.
chaplist = [m_cut(w) for w in chapter.EntryDescription]

# `cleanchap` is the same tokens joined by spaces, the input format
# CountVectorizer expects.  It is derived from `chaplist` instead of calling
# m_cut a second time, which would repeat the segmentation for every document.
cleanchap = [" ".join(words) for words in chaplist]
###去除低频词
#from collections import defaultdict
#frequency=defaultdict(int)
#for text in chaplist:
      #for word in text:
           # frequency[word]+=1
#chaplist2=[[word for word in text if frequency[word]>1] for text in chaplist]

# Term-count matrix; min_df=2 keeps only terms that occur in at least
# two documents.
countvec = CountVectorizer(min_df=2)
resmtx = countvec.fit_transform(raw_documents=cleanchap)

# Re-weight the raw counts into TF-IDF values.
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X=resmtx)

# Pairwise cosine *distance* between the TF-IDF rows (one row per document).
# Note this is a distance, not a similarity: smaller means more alike.
t = pairwise_distances(X=tfidf, metric="cosine")

# Collect the document identifiers (RSSID) in row order; they become the
# labels of the correlation matrix below.
analyse = pd.DataFrame(list(chapter['RSSID']), columns=['RSSID'])
labels = analyse["RSSID"].tolist()

# Correlation coefficients between the columns of the distance matrix
# (rowvar=0 treats each column as one variable).
col = pd.DataFrame(np.corrcoef(t, rowvar=0))

# Label both axes with the RSSIDs so entries can be looked up by document id.
col.columns = labels
col.index = col.columns

#i = 43658
#l=[*zip(col[i],[*col.index])]
#l.sort(reverse=True)


#for i in chapter['RSSID']:
   # l=[*zip(col[i],[*col.index])]####选取i列 ,i为尾列 
    #l.sort(reverse=True)##排序
    #text=str(l)###转化为str 
   
# Scores of every document against the query row, which is labelled 'rank'.
scores = col['rank']

# Pair each score with its RSSID and order by score, highest first.
# Sorting on the score alone avoids comparing the labels when two scores tie:
# the labels mix database ids with the string 'rank', and comparing those
# could raise TypeError.
l = sorted(zip(scores, col.index), key=lambda pair: pair[0], reverse=True)

# Remove the query's own entry by its label rather than assuming it sorted
# first (a document with an equal score could otherwise be dropped instead).
l_new = [pair for pair in l if pair[1] != 'rank']

# Write each score back to its row, and always release the connection,
# even if the update fails.
try:
    cursor.executemany('UPDATE test Set rank = %s WHERE RSSID = %s', l_new)
    conn.commit()
finally:
    conn.close()

参考来源:
[1] https://docs.python.org/3.7/tutorial/index.html
[2] http://blog.sina.com.cn/s/blog_ae9d46c30102wzet.html
[3] https://www.oschina.net/question/4000602_2288253

你可能感兴趣的:(文本相似度)