How to compute text similarity with Python

import nltk
import numpy as np
import jieba
import codecs
import os
from pymongo import MongoClient
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from scipy.cluster import hierarchy
from scipy.spatial import distance

# MongoDB connection (host, database name and credentials are intentionally left blank here)
connect = MongoClient("", 27017)
db2 = connect['']
db2.authenticate("", "")
collect = db2.****

class SummaryTxt:

    def __init__(self, stopwordspath):
        # number of top keywords to use
        self.N = 100
        # maximum distance (in words) between keywords in the same cluster
        self.CLUSTER_THRESHOLD = 5
        # number of top sentences to return
        self.TOP_SENTENCES = 5
        self.stopwords = {}
        # load the stop-word list
        if os.path.exists(stopwordspath):
            stoplist = [line.strip() for line in codecs.open(stopwordspath, 'r', encoding='utf8').readlines()]
            self.stopwords = {}.fromkeys(stoplist)

    def _split_sentences(self, texts):
        '''
        Split texts into individual sentences, stored in a list, using the
        punctuation marks in splitstr (!?。!? etc.) as split boundaries.
        :param texts: the input text
        :return: list of sentences
        '''
        splitstr = '!?。!?;;:'
        start = 0
        index = 0  # position of the current character
        sentences = []
        for text in texts:
            if text in splitstr:  # the current character is a sentence-ending punctuation mark
                sentences.append(texts[start:index + 1])  # keep the punctuation mark with the sentence
                start = index + 1  # start marks the beginning of the next sentence
            index += 1
        if start < len(texts):
            sentences.append(texts[start:])  # handle text that does not end with a punctuation mark

        return sentences
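
    # Illustrative example (not from the original post):
    #   _split_sentences("今天天气很好。我们去公园玩!好吗?")
    #   -> ["今天天气很好。", "我们去公园玩!", "好吗?"]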

    def _score_sentences(self, sentences, topn_words):
        '''
        Score each sentence using the top N keywords.
        :param sentences: list of sentences
        :param topn_words: list of keywords
        :return: list of (sentence_index, score) tuples
        '''
        scores = []
        sentence_idx = -1
        for s in [list(jieba.cut(s)) for s in sentences]:
            sentence_idx += 1
            word_idx = []
            for w in topn_words:
                try:
                    word_idx.append(s.index(w))  # position of the keyword within this sentence
                except ValueError:  # w is not in the sentence
                    pass
            word_idx.sort()
            if len(word_idx) == 0:
                continue
            # Group consecutive keyword positions into clusters using the distance threshold
            clusters = []
            cluster = [word_idx[0]]
            i = 1
            while i < len(word_idx):
                if word_idx[i] - word_idx[i - 1] < self.CLUSTER_THRESHOLD:
                    cluster.append(word_idx[i])
                else:
                    clusters.append(cluster[:])
                    cluster = [word_idx[i]]
                i += 1
            clusters.append(cluster)
            # Score every cluster; the highest cluster score becomes the sentence score
            max_cluster_score = 0
            for c in clusters:
                significant_words_in_cluster = len(c)
                total_words_in_cluster = c[-1] - c[0] + 1
                score = 1.0 * significant_words_in_cluster * significant_words_in_cluster / total_words_in_cluster
                if score > max_cluster_score:
                    max_cluster_score = score
            scores.append((sentence_idx, max_cluster_score))
        return scores
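
    # Worked example (illustrative, not from the original post): keywords found at
    # word positions [4, 6, 9] all lie within CLUSTER_THRESHOLD of their neighbours,
    # so they form one cluster of 3 significant words spanning 9 - 4 + 1 = 6 words,
    # giving the sentence a score of 3 * 3 / 6 = 1.5.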

    def summaryScoredtxt(self, text):
        # split the article into sentences
        sentences = self._split_sentences(text)

        # tokenize and drop stop words
        words = [w for sentence in sentences for w in jieba.cut(sentence) if w not in self.stopwords if
                 len(w) > 1 and w != '\t']
        # words = []
        # for sentence in sentences:
        #     for w in jieba.cut(sentence):
        #         if w not in stopwords and len(w) > 1 and w != '\t':
        #             words.append(w)

        # count word frequencies
        wordfre = nltk.FreqDist(words)

        # take the N most frequent words as keywords
        topn_words = [w[0] for w in sorted(wordfre.items(), key=lambda d: d[1], reverse=True)][:self.N]

        # score the sentences against the top N keywords
        scored_sentences = self._score_sentences(sentences, topn_words)

        # filter out unimportant sentences using the mean and standard deviation
        avg = np.mean([s[1] for s in scored_sentences])  # mean score
        std = np.std([s[1] for s in scored_sentences])   # standard deviation
        summarySentences = []

        for (sent_idx, score) in scored_sentences:
            if score > (avg + 0.5 * std):
                summarySentences.append(sentences[sent_idx])
                # print(sentences[sent_idx])
        return summarySentences
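
    # Note (illustrative): with sentence scores [1.5, 0.3, 2.0], avg ≈ 1.27 and
    # std ≈ 0.71, so only sentences scoring above 1.27 + 0.5 * 0.71 ≈ 1.62 are
    # kept (here only the sentence scoring 2.0).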

    def cos_sim(self, cons):
        # Convert the documents into a term-frequency matrix: element a[i][j] is the
        # frequency of term j in document i
        vectorizer = CountVectorizer()
        # TfidfTransformer turns the term-frequency matrix into tf-idf weights
        transformer = TfidfTransformer()
        # the inner fit_transform builds the term-frequency matrix,
        # the outer fit_transform computes the tf-idf weights
        tfidf = transformer.fit_transform(vectorizer.fit_transform(cons))
        # all terms in the bag-of-words model
        # word = vectorizer.get_feature_names()
        weight = tfidf.toarray()
        # print(weight.shape)
        Y = distance.pdist(weight, 'cityblock')  # condensed list of pairwise cityblock (Manhattan) distances
        Z = hierarchy.linkage(Y, 'average')      # average-linkage hierarchical clustering
        hierarchy.cophenet(Z, Y)                 # cophenetic correlation coefficient

        # with open("result.csv", 'a+', encoding="utf-8") as result:
        #     result.write(weight[0])
        #     result.write('\n')
        # return weight
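
Despite its name, cos_sim above computes cityblock (Manhattan) distances and an average-linkage hierarchical clustering rather than cosine similarity. If what you want is the pairwise cosine similarity between documents that the title asks about, a minimal sketch using scikit-learn's cosine_similarity could look like the following; the function name cosine_similarity_matrix and the whitespace-joined jieba tokenization are assumptions, not part of the original code.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def cosine_similarity_matrix(texts):
    # texts: list of raw Chinese strings; join the jieba tokens with spaces so that
    # TfidfVectorizer's default tokenizer can split them back into terms
    docs = [" ".join(jieba.cut(t)) for t in texts]
    tfidf = TfidfVectorizer().fit_transform(docs)
    # entry [i][j] of the returned (n_docs, n_docs) matrix is the cosine
    # similarity between document i and document j
    return cosine_similarity(tfidf)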

if __name__ == '__main__':
    obj = SummaryTxt('./stopwords.txt')
    i = 0
    # a_list = []
    for item in collect.find(no_cursor_timeout=True)[i:100]:
        txt = str(item["content"])
        con = [str(obj.summaryScoredtxt(txt)).replace("','", "").replace("['", "").replace("']", "")]
        if len(con[0]) > 3:
            obj.cos_sim(con)
            # a_list.append(obj.cos_sim(con))
            # obj.chenci(a_list)
            # print(a_list)
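
If you keep the linkage matrix Z produced by hierarchy.linkage inside cos_sim (for example by returning it), it can be turned into flat cluster labels, which is usually what you want when grouping similar documents. A minimal sketch, with an arbitrary distance cutoff chosen purely for illustration:

def flat_clusters(Z, max_distance=1.0):
    # documents whose merge distance in the dendrogram is below max_distance end up
    # in the same cluster; the cutoff of 1.0 is a placeholder, not a tuned value
    return hierarchy.fcluster(Z, t=max_distance, criterion='distance')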
