import nltk
import numpy as np
import jieba
import codecs
import os
from pymongo import MongoClient
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from scipy.cluster import hierarchy
from scipy import cluster
import scipy.spatial.distance
connect = MongoClient("", 27017)  # MongoDB host
db2 = connect['']  # database name
db2.authenticate("", "")  # username, password
collect = db2['****']  # target collection
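# Note: the __main__ block below reads item["content"] from each document, so the
# target collection is assumed to store the raw article text under a "content" field.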
class SummaryTxt:
def __init__(self,stopwordspath):
# number of top keywords to use
self.N = 100
# maximum allowed distance between significant words within one cluster
self.CLUSTER_THRESHOLD = 5
# number of top sentences to return
self.TOP_SENTENCES = 5
self.stopwords = {}
# load the stopword list
if os.path.exists(stopwordspath):
stoplist = [line.strip() for line in codecs.open(stopwordspath, 'r', encoding='utf8').readlines()]
self.stopwords = {}.fromkeys(stoplist)
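# The stopword file is expected to hold one stopword per line; each stripped line
# becomes a dictionary key so that membership tests during tokenization are O(1).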
def _split_sentences(self,texts):
'''
Split texts into individual sentences, stored in a list, using the punctuation marks in splitstr as the split points.
:param texts: the input text
:return: the list of sentences
'''
splitstr = '!?。!?;;:'  # sentence-ending punctuation marks
start = 0
index = 0  # position of the current character
sentences = []
for text in texts:
if text in splitstr:  # the current character ends a sentence
sentences.append(texts[start:index + 1])  # slice up to and including the punctuation mark
start = index + 1  # start marks the beginning of the next sentence
index += 1
if start < len(texts):
sentences.append(texts[start:])  # handle text that does not end with a punctuation mark
return sentences
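# A quick sketch of the splitting behaviour (with splitstr as defined above):
#   _split_sentences('今天天气很好。我们去公园玩。') -> ['今天天气很好。', '我们去公园玩。']
# Any trailing text without closing punctuation is appended as the final sentence.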
def _score_sentences(self,sentences, topn_words):
'''
Score each sentence using the top-N keywords.
:param sentences: list of sentences
:param topn_words: list of keywords
:return:
'''
scores = []
sentence_idx = -1
for s in [list(jieba.cut(s)) for s in sentences]:
sentence_idx += 1
word_idx = []
for w in topn_words:
try:
word_idx.append(s.index(w))  # position of the keyword's first occurrence in this sentence
except ValueError:  # w does not appear in this sentence
pass
word_idx.sort()
if len(word_idx) == 0:
continue
# group consecutive keyword positions into clusters using the distance threshold
clusters = []
cluster = [word_idx[0]]
i = 1
while i < len(word_idx):
if word_idx[i] - word_idx[i - 1] < self.CLUSTER_THRESHOLD:
cluster.append(word_idx[i])
else:
clusters.append(cluster[:])
cluster = [word_idx[i]]
i += 1
clusters.append(cluster)
# score each cluster; the sentence score is the maximum of its cluster scores
max_cluster_score = 0
for c in clusters:
significant_words_in_cluster = len(c)
total_words_in_cluster = c[-1] - c[0] + 1
score = 1.0 * significant_words_in_cluster * significant_words_in_cluster / total_words_in_cluster
if score > max_cluster_score:
max_cluster_score = score
scores.append((sentence_idx, max_cluster_score))
return scores
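# Worked example of the cluster score (a sketch): if the keyword positions in a
# sentence are [0, 1, 2, 8] and CLUSTER_THRESHOLD is 5, the gap 8 - 2 >= 5 splits
# them into the clusters [0, 1, 2] and [8]; their scores are 3 * 3 / 3 = 3.0 and
# 1 * 1 / 1 = 1.0, so the sentence is scored 3.0, the maximum over its clusters.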
def summaryScoredtxt(self,text):
# split the article into sentences
sentences = self._split_sentences(text)
# tokenize with jieba, dropping stopwords, single characters and tabs
words = [w for sentence in sentences for w in jieba.cut(sentence)
if w not in self.stopwords and len(w) > 1 and w != '\t']
# words = []
# for sentence in sentences:
# for w in jieba.cut(sentence):
# if w not in stopwords and len(w) > 1 and w != '\t':
# words.append(w)
# count word frequencies
wordfre = nltk.FreqDist(words)
# take the N most frequent words as keywords
topn_words = [w[0] for w in sorted(wordfre.items(), key=lambda d: d[1], reverse=True)][:self.N]
# score the sentences with the top-N keywords
scored_sentences = self._score_sentences(sentences, topn_words)
# filter out unimportant sentences using the mean and standard deviation of the scores
avg = np.mean([s[1] for s in scored_sentences])  # mean score
std = np.std([s[1] for s in scored_sentences])  # standard deviation of the scores
summarySentences = []
for (sent_idx, score) in scored_sentences:
if score > (avg + 0.5 * std):
summarySentences.append(sentences[sent_idx])
# print(sentences[sent_idx])
return summarySentences
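# Usage sketch (assuming a one-word-per-line stopword file at ./stopwords.txt):
#   obj = SummaryTxt('./stopwords.txt')
#   summary = obj.summaryScoredtxt(article_text)
# `summary` holds the sentences whose cluster score exceeds avg + 0.5 * std.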
def cos_sim(self,cons):
# convert the texts into a term-frequency matrix; element a[i][j] is the frequency of term j in text i
vectorizer = CountVectorizer()
# TfidfTransformer computes the tf-idf weight of every term
transformer = TfidfTransformer()
# the inner fit_transform builds the term-frequency matrix; the outer one converts it to tf-idf weights
tfidf = transformer.fit_transform(vectorizer.fit_transform(cons))
# all terms in the bag-of-words vocabulary
# word = vectorizer.get_feature_names()
weight = tfidf.toarray()
# print(weight.shape)
Y = scipy.spatial.distance.pdist(weight, 'cityblock')  # condensed pairwise distance vector
Z = hierarchy.linkage(Y, 'average')  # average-linkage hierarchical clustering
cluster.hierarchy.cophenet(Z, Y)  # cophenetic correlation coefficient
# with open("result.csv", 'a+', encoding="utf-8") as result:
# result.write(weight[0])
# result.write('\n')
# return weight
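# Shape sketch for cos_sim: with n input texts, `weight` is an (n, vocabulary_size)
# tf-idf matrix, pdist returns the condensed n * (n - 1) / 2 distance vector Y,
# linkage builds the (n - 1, 4) merge matrix Z, and cophenet(Z, Y) returns the
# cophenetic correlation coefficient together with the cophenetic distances.
# Hierarchical clustering is only meaningful when cons holds at least two texts.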
if __name__ == '__main__':
obj = SummaryTxt('./stopwords.txt')
i = 0
# a_list = []
for item in collect.find(no_cursor_timeout=True)[i:100]:
txt = str(item["content"])
con = [''.join(obj.summaryScoredtxt(txt))]  # join the summary sentences into one text
if len(con[0]) > 3:
obj.cos_sim(con)
# a_list.append(obj.cos_sim(con))
# obj.chenci(a_list)
# print(a_list)