自动摘要(抽取式)

wiki语料处理与word2vec模型训练
step1_wiki.py

# -*- encoding:utf-8 -*-

'''
Function: 解析.bz2格式语料包,得到.txt
'''

import requests
from gensim.corpora.wikicorpus import extract_pages, filter_wiki
import bz2file
import re
from tqdm import tqdm
import codecs

# Pages extracted from the compressed Wikipedia dump (the corpus to process).
# NOTE(review): the handle returned by bz2file.open() is never closed explicitly.
wikipath = extract_pages(bz2file.open('zhwiki-latest-pages-articles.xml.bz2'))   # corpus to process
# File the cleaned articles are written to by the script at the bottom.
wikioutpath = 'wiki.txt'

# HTTP endpoint and request headers used for Traditional -> Simplified conversion.
url = 'http://139.129.98.92:8080/nlp-cs-fjconvert/nlp/tools/cc'     # Traditional-to-Simplified conversion endpoint
headers = {"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8"}


# Helper: strip leftover wiki markup and convert Traditional to Simplified Chinese.
def wiki_replace(d):
    """Clean one wiki page and convert it, line by line, via the HTTP service.

    Args:
        d: Indexable page record; ``d[0]`` is used as the title and ``d[1]``
            as the raw wiki markup.

    Returns:
        A list holding one response body (``resp.text``) per line of the
        cleaned article, in article order.

    NOTE(review): the HTTP status of each response is not checked, so an
    error body would be stored as if it were converted text. The access
    token is hard-coded.
    """
    import io   # local import: only needed for the in-memory line split below

    s = d[1]
    # Raw strings avoid the invalid-escape warnings the plain literals caused.
    s = re.sub(r':*{\|[\s\S]*?\|}', '', s)
    # The pattern here had degenerated to '[\s\S]*?', which only ever matches
    # the empty string (a no-op). The surrounding <gallery> tags were
    # presumably lost when the code was extracted from HTML; restored here.
    s = re.sub(r'<gallery>[\s\S]*?</gallery>', '', s)
    s = re.sub(r'(.){{([^{}\n]*?\|[^{}\n]*?)}}', r'\1[[\2]]', s)
    s = filter_wiki(s)
    s = re.sub(r"\* *\n|'{2,}", '', s)
    s = re.sub(r'\n+', '\n', s)
    s = re.sub(r'\n[:;]|\n +', '\n', s)
    s = re.sub(r'\n==', '\n\n==', s)
    s = u'【' + d[0] + u'】\n' + s

    ans = []
    # Split into lines in memory instead of writing 'wikis.txt' and reading it
    # back; newline=None keeps the universal-newline behaviour of that
    # text-mode round trip.
    for line in io.StringIO(s, newline=None):
        print('原文:' + line)
        properties = {'text': line.encode("utf-8"), 'access_token': '123456', 'conversion': 'tw2s'}        # request parameters
        resp = requests.get(url, params=properties, headers=headers)        # service response
        print('译文:' + resp.text)
        ans.append(resp.text)
    return ans


# Script body: clean every accepted article and append it to the output file.
i = 0   # number of articles written so far
# The context manager guarantees the output file is flushed and closed even
# when a page fails to process (the original leaked the handle on error).
with codecs.open(wikioutpath, 'w', encoding='utf-8') as f:    # result file
    w = tqdm(wikipath, desc=u'已获取0篇文章')     # progress bar
    for d in w:     # one page record per iteration
        # Skip titles that start with an ASCII "word:" prefix, empty titles,
        # and pages whose text starts with '#'.
        if not re.findall('^[a-zA-Z]+:', d[0]) and d[0] and not re.findall(u'^#', d[1]):
            s = wiki_replace(d)     # list of converted lines
            f.writelines(s)
            f.write('\n')
            i += 1
            print(i)
            if i % 100 == 0:
                w.set_description(u'已获取%s篇文章' % i)      # refresh the label every 100 articles

step2_prehandle.py

# -*- encoding:utf-8 -*-

'''
Function: 预处理(分句、分词和去停用词)语料
'''

import jieba

# File locations used by this preprocessing script.
stopwordpath = 'stopword.txt'    # stop-word file loaded through stopwordslist()
wikipath = "wiki.txt"    # corpus read by the script body below
wikioutpath = "wiki_seg.txt"    # file the segmented text is written to


# Helper: split text into sentences.
def cut_sentences(sentence):
    """Lazily split a string into sentences ending at '。', '!' or '?'.

    Each terminator stays attached to the sentence it closes. Whatever follows
    the last terminator is always yielded as the final item, so that item is
    an empty string when the text is empty or ends with a terminator.
    """
    terminators = frozenset(u'。!?')
    start = 0
    for position, char in enumerate(sentence):
        if char in terminators:
            yield sentence[start:position + 1]
            start = position + 1
    yield sentence[start:]


# Helper: load the stop-word list.
def stopwordslist(filepath):
    """Return the lines of a UTF-8 stop-word file, stripped, as a list.

    Args:
        filepath: Path of the stop-word file, one entry per line.

    Returns:
        A list with one stripped string per line (blank lines become '').
    """
    # `with` closes the handle; the original left it open.
    with open(filepath, 'r', encoding='UTF-8') as stopword_file:
        return [line.strip() for line in stopword_file]


# Helper: tokenize a sentence and drop stop words.
_STOPWORD_SETS = {}     # stop-word sets already loaded, keyed by file path


def _stopword_set(filepath):
    """Return the stop words of ``filepath`` as a frozenset, loading it once."""
    if filepath not in _STOPWORD_SETS:
        _STOPWORD_SETS[filepath] = frozenset(stopwordslist(filepath))
    return _STOPWORD_SETS[filepath]


def seg_sentence(sentence):
    """Tokenize ``sentence`` with jieba and remove stop words and tabs.

    Args:
        sentence: Text of one sentence; it is stripped before tokenizing.

    Returns:
        The kept tokens, each followed by a single space, joined into one
        string ('' when nothing is kept).
    """
    # The original re-read the stop-word file for every sentence and tested
    # membership against a list; the cached set makes both steps cheap.
    stopwords = _stopword_set(stopwordpath)
    kept = [
        word
        for word in jieba.cut(sentence.strip())
        if word not in stopwords and word != '\t'
    ]
    return ''.join(word + " " for word in kept)


# Script body: read the corpus, split it into sentences, segment and write them.
with open(wikipath, "r", encoding='utf-8') as myfile:      # corpus to process
    text = myfile.read().replace('\n', '')      # whole corpus as a single line
print('原文:' + text)
token = cut_sentences(text)     # lazy sentence generator
# `with` closes the result file even if segmentation raises; the original
# only closed it on the success path.
with open(wikioutpath, "w", encoding='utf-8') as output:      # result file
    for sentence in token:
        print(sentence)
        line_seg = seg_sentence(sentence)       # tokenize and drop stop words
        output.write(line_seg)      # sentences are written back to back, no newline

step3_model_train.py

# encoding=utf-8

'''
Function: 模型训练
Algorithm: word2vec
'''

import logging
from gensim.models import word2vec

# Training corpus path (input) and destination of the trained model (output).
wikipath = u"/data01/chennan/wiki_chs.txt"
modeloutpath = '/data01/chennan/modelcn'

# The bare string literal below is a no-op expression statement that serves as
# a section label ("executable statements").
'''可执行语句'''
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
sentences = word2vec.Text8Corpus(wikipath)  # load the corpus
# NOTE(review): the `size` keyword presumably targets gensim < 4 — confirm the installed version.
model = word2vec.Word2Vec(sentences, size=200, min_count=2)     # train
model.save(modeloutpath)      # persist the model

自动摘要

extractiveCN.py

# -*- coding: utf-8 -*-

'''
Interface: ExtractiveCN(text, model, summarizenum=4, stoppath='stopword.txt')
SupportFile: stopword.txt  model
Function: 中文抽取式自动摘要
Algorithm: word2vec + Textrank
'''

import jieba
import math
from heapq import nlargest
from itertools import product, count
import numpy as np
from gensim.models import word2vec

# Have NumPy report every kind of floating-point error as a warning.
np.seterr(all='warn')


# Split text into sentences and return a generator.
# Note: the caller is expected to strip '\n' first, so that a line break in
# the middle of a sentence does not split it; only '。', '!' and '?' end a
# sentence here.
def cut_sentences(sentence):
    """Yield the sentences of ``sentence``, each ending at '。', '!' or '?'.

    The terminator is kept with its sentence. The text after the last
    terminator is always yielded last, even when it is the empty string.
    """
    terminators = frozenset(u'。!?')
    pending = []
    for char in sentence:
        pending.append(char)
        if char not in terminators:
            continue
        # A terminator closes the current sentence.
        yield ''.join(pending)
        pending = []
    yield ''.join(pending)


# Build the stop-word list.
def create_stopwords(stoppath):
    """Return the lines of a UTF-8 stop-word file, stripped, as a list.

    Args:
        stoppath: Path of the stop-word file, one entry per line.

    Returns:
        A list with one stripped string per line (blank lines become '').
    """
    # `with` closes the handle; the original left it open.
    with open(stoppath, 'r', encoding='utf-8') as stopword_file:
        return [line.strip() for line in stopword_file]


# Word-overlap similarity of two sentences.
def two_sentences_similarity(sents_1, sents_2):
    """Count the words of ``sents_1`` found in ``sents_2``, scaled by log length.

    Args:
        sents_1: Sequence of words of the first sentence.
        sents_2: Sequence of words of the second sentence.

    Returns:
        ``overlap / log(len(sents_1) + len(sents_2))``, or 0.0 when the two
        sentences hold at most one word in total.
    """
    total_words = len(sents_1) + len(sents_2)
    # log(1) == 0 raised ZeroDivisionError and log(0) raised ValueError in the
    # original; no overlap is possible with so few words, so the score is 0.
    if total_words <= 1:
        return 0.0
    overlap = sum(1 for word in sents_1 if word in sents_2)
    return overlap / math.log(total_words)


# Build the sentence-to-sentence similarity graph from the tokenized sentences.
def create_graph(model, word_sent):
    """Return a square matrix of pairwise sentence similarities.

    Entry ``[i][j]`` is ``compute_similarity_by_avg(model, word_sent[i],
    word_sent[j])`` for ``i != j``; the diagonal stays 0.0.
    """
    size = len(word_sent)
    graph = []
    for row_idx in range(size):
        row = []
        for col_idx in range(size):
            if row_idx == col_idx:
                # A sentence is not compared with itself.
                row.append(0.0)
            else:
                row.append(compute_similarity_by_avg(model, word_sent[row_idx], word_sent[col_idx]))
        graph.append(row)
    return graph


# Cosine similarity of two vectors.
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between ``vec1`` and ``vec2``.

    Args:
        vec1: First vector (array-like of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The cosine similarity, or 0.0 when either vector has zero norm.
    """
    tx = np.asarray(vec1)
    ty = np.asarray(vec2)
    norm_product = float(np.sqrt(np.sum(tx ** 2)) * np.sqrt(np.sum(ty ** 2)))
    # A zero vector has no direction; the original divided by zero here and
    # returned NaN, which then propagated into every downstream score.
    if norm_product == 0.0:
        return 0.0
    return np.sum(tx * ty) / norm_product


# Similarity of two sentences through their averaged word vectors.
def compute_similarity_by_avg(model, sents_1, sents_2):
    """Return the cosine similarity of the mean word vectors of two sentences.

    Args:
        model: Object whose ``wv`` attribute maps a word to its vector; an
            object without ``wv`` is indexed directly.
        sents_1: Words of the first sentence; every word must have a vector.
        sents_2: Words of the second sentence; every word must have a vector.

    Returns:
        0.0 when either sentence is empty, otherwise the cosine similarity of
        the two averaged vectors.
    """
    if len(sents_1) == 0 or len(sents_2) == 0:
        return 0.0
    # Look vectors up through `wv`, as filter_model does, instead of the
    # deprecated `model[word]`; fall back to the object itself when it has no
    # `wv` attribute.
    vectors = getattr(model, 'wv', model)
    vec1 = np.sum([vectors[word] for word in sents_1], axis=0)
    vec2 = np.sum([vectors[word] for word in sents_2], axis=0)
    return cosine_similarity(vec1 / len(sents_1), vec2 / len(sents_2))


# Score of one sentence in the similarity graph.
def calculate_score(weight_graph, scores, i):
    """Return the damped score of node ``i`` for one ranking iteration.

    Args:
        weight_graph: Square matrix (list of rows) of edge weights.
        scores: Current score of every node.
        i: Index of the node to score.

    Returns:
        ``(1 - d) + d * sum_j(weight_graph[j][i] * scores[j] / out_j)`` with
        ``d = 0.85``, where ``out_j`` is the sum of row ``j`` (taken as 1 when
        that sum is 0).
    """
    d = 0.85
    added_score = 0.0
    for j, row in enumerate(weight_graph):
        denominator = sum(row)
        # Guard against division by zero only after the whole row is summed.
        # The original tested inside the summation loop, so any row whose
        # running sum was 0 at some point (always row 0, because the diagonal
        # is 0) got its denominator inflated by 1.
        if denominator == 0:
            denominator = 1
        added_score += row[i] * scores[j] / denominator
    return (1 - d) + d * added_score


# Take the similarity graph (matrix) and return the score of every sentence.
def weight_sentences_rank(weight_graph):
    """Iterate node scores until ``different`` reports no further change.

    Every score starts at 0.5. Within one sweep the scores are updated in
    place, so later nodes already see the new values of earlier ones.
    """
    node_count = len(weight_graph)
    scores = [0.5] * node_count
    old_scores = [0.0] * node_count
    while different(scores, old_scores):
        # Remember the scores of the previous sweep for the next comparison.
        old_scores[:] = scores
        for node in range(node_count):
            scores[node] = calculate_score(weight_graph, scores, node)
    return scores


# Tell whether the scores changed between two iterations.
def different(scores, old_scores):
    """Return True when any score differs from its old value by >= 0.0001."""
    return any(
        math.fabs(current - old_scores[position]) >= 0.0001
        for position, current in enumerate(scores)
    )


# Remove stop words and symbols.
def filter_symbols(sents, stoppath):
    """Strip stop words and a few symbols from every tokenized sentence.

    Args:
        sents: List of sentences, each a list of word strings. The inner
            lists are filtered in place.
        stoppath: Path of the stop-word file read by ``create_stopwords``.

    Returns:
        A new list holding the same (now filtered) sentence lists, without
        the ones that became empty.
    """
    unwanted = frozenset(create_stopwords(stoppath) + ['。', ' ', '.'])
    _sents = []
    for sentence in sents:
        # Rebuild the list instead of calling remove() while iterating over
        # it, which skipped the element following every removal. Slice
        # assignment keeps the same list object, as the original did.
        sentence[:] = [word for word in sentence if word not in unwanted]
        if sentence:
            _sents.append(sentence)
    return _sents


# Drop words the model was not trained on.
def filter_model(model, sents):
    """Remove out-of-vocabulary words from every tokenized sentence.

    Args:
        model: Object whose ``wv`` attribute exposes the vocabulary as
            ``key_to_index`` or, failing that, ``vocab``.
        sents: List of sentences, each a list of word strings. The inner
            lists are filtered in place.

    Returns:
        A new list holding the same (now filtered) sentence lists, without
        the ones that became empty.
    """
    keyed_vectors = model.wv
    # Prefer the newer `key_to_index` mapping when present, else `vocab`.
    known = getattr(keyed_vectors, 'key_to_index', None)
    if known is None:
        known = keyed_vectors.vocab
    _sents = []
    for sentence in sents:
        # Rebuild the list instead of calling remove() while iterating over
        # it; that skipped the element after each removal, so one pass left
        # unknown words behind.
        sentence[:] = [word for word in sentence if word in known]
        if sentence:
            _sents.append(sentence)
    return _sents


# Ascending in-place sort (name kept for existing callers).
def bubble_sort(array):
    """Sort ``array`` in place in ascending order and return it.

    Args:
        array: Mutable list of mutually comparable items.

    Returns:
        The same list object, now sorted.
    """
    # The built-in sort replaces the hand-written O(n^2) exchange loop.
    array.sort()
    return array


# Build the summary.
def summarize(model, text, n, stoppath):
    """Pick the ``n`` best-ranked sentences of ``text``, in document order.

    Args:
        model: Word-vector model handed to ``filter_model`` and
            ``create_graph``.
        text: Text to summarize.
        n: Number of sentences to return.
        stoppath: Path of the stop-word file handed to ``filter_symbols``.

    Returns:
        A list of ``n`` original sentences ordered as in ``text``, or None
        when fewer than ``n`` sentences survive filtering.
    """
    sentences = list(cut_sentences(text))
    tokenized = [[word for word in jieba.cut(sent) if word] for sent in sentences]
    # The filters drop sentences that become empty, so positions in the
    # filtered list no longer match `sentences`. They return the same list
    # objects they were given, so object identity maps each survivor back to
    # its original position. The original indexed `sentences` with filtered
    # positions and could return the wrong sentences.
    position_of = {id(words): index for index, words in enumerate(tokenized)}

    candidates = filter_symbols(tokenized, stoppath)
    # Filter until nothing more is removed; this replaces thirteen hard-coded
    # repeat calls and works whether or not one pass is enough.
    while True:
        remaining = sum(len(words) for words in candidates)
        candidates = filter_model(model, candidates)
        if sum(len(words) for words in candidates) == remaining:
            break

    graph = create_graph(model, candidates)     # pairwise similarity matrix
    scores = weight_sentences_rank(graph)       # one score per candidate
    top = nlargest(n, zip(scores, count()))     # (score, candidate index) pairs
    if len(top) < n:
        return None
    chosen = sorted(position_of[id(candidates[rank])] for _, rank in top)
    return [sentences[index] for index in chosen]


def Model(modelpath):
    """Load the trained word2vec model stored at ``modelpath`` and return it."""
    return word2vec.Word2Vec.load(modelpath)


def ExtractiveCN(text, model, summarizenum=4, stoppath='stopword.txt'):
    """Return an extractive summary of ``text`` as a single string.

    Args:
        text: Text to summarize.
        model: Already-loaded word-vector model passed on to ``summarize``.
        summarizenum: Number of sentences to keep.
        stoppath: Path of the stop-word file.

    Returns:
        The selected sentences concatenated in order, or ``text`` unchanged
        when ``summarize`` returns None.
    """
    # The original ran summarize() twice and discarded the first result.
    summary = summarize(model, text, summarizenum, stoppath)
    if summary is None:
        return text
    return "".join(summary)

你可能感兴趣的:(nlp)