Using nltk synonym replacement and word spell correction to produce pseudo-original articles

1. Bayesian word spelling correction

The script below follows Peter Norvig's classic spelling corrector: it learns word frequencies from big.txt, then returns the most frequent known candidate within edit distance two of the input word.

# -*- coding: utf-8 -*-
# @Time    : 2019/11/26 10:13
# @Author  :
# @FileName: word_check.py

import os
import re
import collections

# Download the big.txt corpus file to the local directory
# import requests
# url = "http://norvig.com/big.txt"
# response = requests.get(url=url)
# with open("big.txt","w",encoding="utf-8") as f:
#     f.write(response.text)

def words(text):
    return re.findall('[a-z]+', text.lower())


def train(features):
    # Frequency table of corpus words; defaultdict(lambda: 1) starts every
    # counted word at 1, a crude form of add-one smoothing
    model = collections.defaultdict(lambda: 1)
    for f in features:
        model[f] += 1
    return model


# os.path.dirname(__file__) is empty when the script is run from its own
# directory, so resolve an absolute path before joining
filepath = os.path.dirname(os.path.abspath(__file__))
NWORDS = train(words(open(os.path.join(filepath, 'big.txt'), encoding='utf-8').read()))

alphabet = 'abcdefghijklmnopqrstuvwxyz'


def edits1(word):
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [a + b[1:] for a, b in splits if b]
    transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1]
    replaces = [a + c + b[1:] for a, b in splits for c in alphabet if b]
    inserts = [a + c + b for a, b in splits for c in alphabet]
    return set(deletes + transposes + replaces + inserts)
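
# For a word of length n, edits1 produces n deletions, n-1 transpositions,
# 26n replacements and 26(n+1) insertions: 54n + 25 candidate strings
# before set() removes duplicates.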


def known_edits2(word):
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS)


def known(words): return set(w for w in words if w in NWORDS)


def correct(word):
    # Prefer, in order: the word itself if known, then known words one edit
    # away, then two edits away; fall back to the word unchanged.
    # Ties are broken by corpus frequency.
    candidates = known([word]) or known(edits1(word)) or known_edits2(word) or [word]
    return max(candidates, key=NWORDS.get)


if __name__ == '__main__':
    print(correct("Hammett"))
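
A quick sanity check (illustrative only; which correction wins depends on the word frequencies in your local copy of big.txt):

>>> from word_check import correct
>>> correct("speling")
'spelling'
>>> correct("korrectud")
'corrected'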

2. Synonym replacement to generate a pseudo-original article

The script below expands contractions, splits the text into sentences, POS-tags each one with NLTK, swaps eligible words for WordNet synonyms, re-inflects present participles and plural nouns with inflect, and runs each re-inflected word through the spell corrector above.

# -*- coding: utf-8 -*-
# @Time    : 2019/11/26 10:13
# @Author  :

import re

import nltk
import inflect
from nltk.corpus import wordnet as wn

from word_check import correct
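
# NLTK data packages required at runtime (download once):
#   nltk.download('punkt')                       # sentence/word tokenizers
#   nltk.download('averaged_perceptron_tagger')  # POS tagger
#   nltk.download('wordnet')                     # WordNet corpus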

# Contraction expansion patterns (both straight and curly apostrophes)
replacement_patterns = [
    (r"won't", "will not"),
    (r"won’t", "will not"),
    (r"can't", "cannot"),
    (r"can’t", "cannot"),
    (r"i'm", "i am"),
    (r"i’m", "i am"),
    (r"ain't", "is not"),
    (r"ain’t", "is not"),
    (r"(\w+)'ll", r"\g<1> will"),
    (r"(\w+)’ll", r"\g<1> will"),
    (r"(\w+)n't", r"\g<1> not"),
    (r"(\w+)n’t", r"\g<1> not"),
    (r"(\w+)'ve", r"\g<1> have"),
    (r"(\w+)’ve", r"\g<1> have"),
    (r"(\w+)'s", r"\g<1> is"),
    (r"(\w+)’s", r"\g<1> is"),
    (r"(\w+)'re", r"\g<1> are"),
    (r"(\w+)’re", r"\g<1> are"),
    (r"(\w+)'d", r"\g<1> would"),
    (r"(\w+)’d", r"\g<1> would")
]


class RegexpReplacer(object):
    def __init__(self, patterns=replacement_patterns):
        """Expand English contractions using the patterns above."""
        self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]

    def rep(self, text):
        s = text
        for (pattern, repl) in self.patterns:
            s = pattern.sub(repl, s)
        return s
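
# Illustrative use of the replacer:
#   replacer = RegexpReplacer()
#   replacer.rep("she can't say it's done")  # -> "she cannot say it is done"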


def generate_the_article(text):
    """
    Generate a pseudo-original article from the input text.
    :param text: source article text
    :return: rewritten text with synonyms substituted
    """
    replacer = RegexpReplacer()
    text = replacer.rep(text)

    # Split the article into sentences
    sentences = nltk.sent_tokenize(text)
    all_words = []
    for sent in sentences:

        # Part-of-speech tagging
        tokens = nltk.word_tokenize(sent)
        pos_tags = nltk.pos_tag(tokens)
        new_words = []

        for word, pos in pos_tags:
            pos_list = ["VBG", "JJ", "NN", "NNS", "MD", "VB", "VBD"]
            if pos in pos_list:
                word_list = []

                # Synonym replacement: map the Penn Treebank tag to a
                # WordNet part of speech before looking up synsets
                if pos in ['NN', 'NNS', 'NNP', 'NNPS']:
                    word_set = wn.synsets(word, pos='n')
                elif pos in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'MD']:
                    word_set = wn.synsets(word, pos='v')
                elif pos in ['RB', 'RBR', 'RBS']:
                    word_set = wn.synsets(word, pos='r')
                elif pos in ['JJ', 'JJR', 'JJS']:
                    word_set = wn.synsets(word, pos='a')
                else:
                    word_set = wn.synsets(word)
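                # Illustrative WordNet lookup (not part of this flow):
                #   wn.synsets("car", pos='n')[0].lemma_names()
                #   -> ['car', 'auto', 'automobile', 'machine', 'motorcar']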
                # Collect candidate lemmas through the public lemma_names() API
                w_list = [synset.lemma_names() for synset in word_set]
                for w in w_list:
                    word_list.extend(w)
                if word_list:
                    word_set = set(word_list)
                if word_set:
                    # Drop the original word so it cannot "replace" itself
                    word_set.discard(word)
                    if word_set:
                        if word.istitle():
                            nw = word_set.pop().replace("_", " ").capitalize()
                        else:
                            p_word = word_set.pop().replace("_", " ")
                            if p_word.lower() == word.lower():
                                nw = word
                            else:
                                nw = p_word
                        if pos in ["VBG"]:

                            # 动词现在进行时转换
                            nw = p.present_participle(nw)
                            # 单词拼写校正
                            nw = correct(nw)

                        elif pos in ["NNS", "NNPS"]:
                            if not nw.endswith("s"):
                                # 名词复数转换
                                nw = p.plural_noun(nw)
                                # 单词拼写校正
                                nw = correct(nw)
                    else:
                        nw = word
                else:
                    nw = word
                new_words.append(nw)
            else:
                new_words.append(word)
            if new_words:
                # If the last token is punctuation, attach it to the previous
                # word so the final join leaves no space before it
                if new_words[-1] in [',', ':', ',', '.', ';', '。', ';', '-', '—', '?', '?', '!', '!', ']', '】', '}',
                                     '}', ')',
                                     ')', '|']:
                    if len(new_words) > 1:
                        point = new_words.pop(-1)
                        new_words[-1] = new_words[-1] + point

        all_words.extend(new_words)

    # Strip the space the tokenizer leaves after opening brackets
    return " ".join(all_words).replace("( ", "(").replace("( ", "(").replace('[ ', "[").replace('【 ', '【').replace(
        '{ ', '{').replace('{ ', '{')


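# Illustrative inflect behavior (exact outputs can vary with inflect version):
#   p = inflect.engine()
#   p.plural_noun("child")        # -> 'children'
#   p.present_participle("make")  # -> 'making'
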
def main():
    # generate_the_article reads the inflect engine from the module-level name `p`
    global p
    text = """NORMANDY PARK, Wash. — There was just one problem with the brand-new, wide-open layout of Kay and Bob Comiskey's home: it was really open.
"We remodelled and didn't have money for furniture," says Kay. "We lived for three years with almost nothing. We had one sectional we'd move from room to room. The UPS driver asked if we ballroom-danced."
They did not. But the Comiskeys are promenading on air these days, now that Bjarko5/8Serra Architects (who created those wide-open spaces) connected them with Amy May, of MAY Designs (who creatively filled those wide-open spaces).
The Comiskeys love their family-friendly neighbourhood and their delightful midcentury home (originally an old, dark A-frame), which served admirably for years as a flexible, active hub for their three kids and their coming-and-going lifestyle. But once the nest emptied, May says, "They wanted a new way to enjoy the house, such as creating intimate gathering spaces with dedicated furniture."
May didn't have to look far for decor inspiration: sand and saltwater shimmer forever, just outside a west-facing wall of windows.
"The clients wanted to maintain a neutral palette that acted as a backdrop and setting for the natural beauty of the Puget Sound," May says.
And now, a beautiful blend of natural simplicity and industrial touches artfully flows through the reimagined first floor, in driftwood, coral and beachy glass; colour-popping art pieces; and all-new, fantastically functional furniture whose only movement is the occasional swivel of a purposely placed chair.
•In the warmly welcoming living room, May softened the existing two-storey, black-clad fireplace with a giant, artsy, battery-operated clock that hangs from the 20-foot ceiling. 
"It casts interesting shadows and helps break up the mass of the black background," she says. 
"""
    p = inflect.engine()
    text_nw = generate_the_article(text=text)
    print(text_nw)

    # Grammar correction via a third-party service (not implemented here)
    # print(grammar_check(text_nw))


if __name__ == '__main__':
    main()

# POS tagging: https://www.jianshu.com/p/418cbdbf5e20
# Synonym replacement: https://blog.csdn.net/jining11/article/details/89458865
# Word inflection (inflect): https://pypi.org/project/inflect/

# Spell checking: https://blog.csdn.net/Pwiling/article/details/50573650
# textblob: https://textblob.readthedocs.io/en/dev/quickstart.html
