1. Bayesian word spelling correction
import os
import re
import collections


def words(text):
    # Split raw text into lowercase alphabetic tokens.
    return re.findall('[a-z]+', text.lower())


def train(features):
    # Word-frequency model; unseen words default to a count of 1
    # (simple additive smoothing), so every candidate has nonzero weight.
    model = collections.defaultdict(lambda: 1)
    for f in features:
        model[f] += 1
    return model


# Training corpus: a plain-text file named big.txt next to this script
# (e.g. Peter Norvig's corpus from http://norvig.com/big.txt).
filepath = os.path.dirname(os.path.abspath(__file__))
NWORDS = train(words(open(os.path.join(filepath, 'big.txt')).read()))
alphabet = 'abcdefghijklmnopqrstuvwxyz'


def edits1(word):
    # Every string at edit distance 1 from `word`: for a word of length n
    # this is n deletions, n-1 transpositions, 26n replacements and
    # 26(n+1) insertions, i.e. 54n + 25 candidates before deduplication.
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [a + b[1:] for a, b in splits if b]
    transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1]
    replaces = [a + c + b[1:] for a, b in splits for c in alphabet if b]
    inserts = [a + c + b for a, b in splits for c in alphabet]
    return set(deletes + transposes + replaces + inserts)
def known_edits2(word):
    # Strings at edit distance 2 that actually occur in the corpus.
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS)


def known(words):
    return set(w for w in words if w in NWORDS)


def correct(word):
    # Prefer the word itself, then known edit-distance-1 candidates, then
    # edit-distance-2 candidates, finally the word unchanged; among the
    # surviving candidates pick the most frequent one (the argmax of P(c)).
    candidates = known([word]) or known(edits1(word)) or known_edits2(word) or [word]
    return max(candidates, key=NWORDS.get)
if __name__ == '__main__':
    print(correct("Hammett"))
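Since section 2 imports this module as word_check (see the import below), the corrector can also be run token by token over a whole sentence. A minimal sketch, assuming the code above is saved as word_check.py next to big.txt:

import re
from word_check import correct

def correct_sentence(sentence):
    # Lowercase each alphabetic token before lookup, since NWORDS is built from lowercase text.
    return re.sub(r'[A-Za-z]+', lambda m: correct(m.group(0).lower()), sentence)

print(correct_sentence("a speling mistake"))  # likely "a spelling mistake", if big.txt contains "spelling"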
2. Synonym replacement to generate pseudo-original (spun) articles
import re

import nltk
import inflect
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

from word_check import correct

# Contraction patterns: each straight-apostrophe rule is duplicated with the
# curly apostrophe (’) so text copied from the web is handled as well.
replacement_patterns = [
    (r"won't", "will not"),
    (r"won’t", "will not"),
    (r"can't", "cannot"),
    (r"can’t", "cannot"),
    (r"i'm", "i am"),
    (r"i’m", "i am"),
    (r"ain't", "is not"),
    (r"ain’t", "is not"),
    (r"(\w+)'ll", r"\g<1> will"),
    (r"(\w+)’ll", r"\g<1> will"),
    (r"(\w+)n't", r"\g<1> not"),
    (r"(\w+)n’t", r"\g<1> not"),
    (r"(\w+)'ve", r"\g<1> have"),
    (r"(\w+)’ve", r"\g<1> have"),
    (r"(\w+)'s", r"\g<1> is"),
    (r"(\w+)’s", r"\g<1> is"),
    (r"(\w+)'re", r"\g<1> are"),
    (r"(\w+)’re", r"\g<1> are"),
    (r"(\w+)'d", r"\g<1> would"),
    (r"(\w+)’d", r"\g<1> would")
]
class RegexpReplacer(object):
    """Expand English contractions (e.g. "won't" -> "will not") using the patterns above."""

    def __init__(self, patterns=replacement_patterns):
        self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]

    def rep(self, text):
        s = text
        for (pattern, repl) in self.patterns:
            s = pattern.sub(repl, s)
        return s
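# Usage sketch (assuming the class is used as-is):
#   RegexpReplacer().rep("She won't go because she can't drive")
#   -> "She will not go because she cannot drive"
# Caveat visible in the pattern list: the ("(\w+)'s", "\g<1> is") rule also
# expands possessives, so "Bob's car" becomes "Bob is car".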
def generate_the_article(text):
    """
    Generate a pseudo-original (spun) article: expand contractions, then
    replace selected words with WordNet synonyms, re-inflecting where needed.
    :param text: source article text
    :return: rewritten text
    """
    replacer = RegexpReplacer()
    text = replacer.rep(text)
    sentences = nltk.sent_tokenize(text)
    all_words = []
    for sent in sentences:
        tokens = nltk.word_tokenize(sent)
        pos_tags = nltk.pos_tag(tokens)
        new_words = []
        for word, pos in pos_tags:
            # Only these Penn Treebank tags are considered for replacement.
            pos_list = ["VBG", "JJ", "NN", "NNS", "MD", "VB", "VBD"]
            if pos in pos_list:
                word_list = []
                # Map the Penn Treebank tag to a WordNet POS for the lookup.
                if pos in ['NN', 'NNS', 'NNP', 'NNPS']:
                    word_set = wn.synsets(word, pos='n')
                elif pos in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'MD']:
                    word_set = wn.synsets(word, pos='v')
                elif pos in ['RB', 'RBR', 'RBS']:
                    word_set = wn.synsets(word, pos='r')
                elif pos in ['JJ', 'JJR', 'JJS']:
                    word_set = wn.synsets(word, pos='a')
                else:
                    word_set = wn.synsets(word)
                # Collect every lemma name from every matching synset.
                for synset in word_set:
                    word_list.extend(synset.lemma_names())
                if word_list:
                    word_set = set(word_list)
                    # Drop the original word so only real alternatives remain.
                    word_set.discard(word)
                    if word_set:
                        if word.istitle():
                            nw = word_set.pop().replace("_", " ").capitalize()
                        else:
                            p_word = word_set.pop().replace("_", " ")
                            if p_word.lower() == word.lower():
                                nw = word
                            else:
                                nw = p_word
                                if pos == "VBG":
                                    # Re-inflect the synonym as a present participle.
                                    nw = p.present_participle(nw)
                                    nw = correct(nw)
                                elif pos in ["NNS", "NNPS"]:
                                    if not nw.endswith("s"):
                                        nw = p.plural_noun(nw)
                                        nw = correct(nw)
                    else:
                        nw = word
                else:
                    nw = word
                new_words.append(nw)
            else:
                new_words.append(word)
        if new_words:
            # Re-attach trailing punctuation to the previous token so the joined
            # output does not end a sentence with a dangling " ,".
            if new_words[-1] in [',', ':', ',', '.', ';', '。', ';', '-', '—', '?', '?',
                                 '!', '!', ']', '】', '}', '}', ')',
                                 ')', '|']:
                if len(new_words) > 1:
                    point = new_words.pop(-1)
                    new_words[-1] = new_words[-1] + point
        all_words.extend(new_words)
    return " ".join(all_words).replace("( ", "(").replace("( ", "(").replace('[ ', "[").replace(
        '【 ', '【').replace('{ ', '{').replace('{ ', '{')
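# Note: sent_tokenize, pos_tag and the WordNet lookups above rely on NLTK data
# packages that must be downloaded once; with a recent NLTK release that is:
#   nltk.download('punkt'); nltk.download('averaged_perceptron_tagger')
#   nltk.download('wordnet'); nltk.download('omw-1.4')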
def main():
    global p, lemmatizer
    text = """NORMANDY PARK, Wash. — There was just one problem with the brand-new, wide-open layout of Kay and Bob Comiskey's home: it was really open.
"We remodelled and didn't have money for furniture," says Kay. "We lived for three years with almost nothing. We had one sectional we'd move from room to room. The UPS driver asked if we ballroom-danced."
They did not. But the Comiskeys are promenading on air these days, now that Bjarko5/8Serra Architects (who created those wide-open spaces) connected them with Amy May, of MAY Designs (who creatively filled those wide-open spaces).
The Comiskeys love their family-friendly neighbourhood and their delightful midcentury home (originally an old, dark A-frame), which served admirably for years as a flexible, active hub for their three kids and their coming-and-going lifestyle. But once the nest emptied, May says, "They wanted a new way to enjoy the house, such as creating intimate gathering spaces with dedicated furniture."
May didn't have to look far for decor inspiration: sand and saltwater shimmer forever, just outside a west-facing wall of windows.
"The clients wanted to maintain a neutral palette that acted as a backdrop and setting for the natural beauty of the Puget Sound," May says.
And now, a beautiful blend of natural simplicity and industrial touches artfully flows through the reimagined first floor, in driftwood, coral and beachy glass; colour-popping art pieces; and all-new, fantastically functional furniture whose only movement is the occasional swivel of a purposely placed chair.
•In the warmly welcoming living room, May softened the existing two-storey, black-clad fireplace with a giant, artsy, battery-operated clock that hangs from the 20-foot ceiling.
"It casts interesting shadows and helps break up the mass of the black background," she says.
"""
    p = inflect.engine()  # inflect engine used as a global inside generate_the_article
    lemmatizer = WordNetLemmatizer()  # created for parity with the imports; not used below
    text_nw = generate_the_article(text=text)
    print(text_nw)


if __name__ == '__main__':
    main()
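The heart of the spinner is the WordNet lookup inside generate_the_article. A minimal standalone sketch of that step, assuming the NLTK wordnet data is installed (the helper name synonyms is illustrative, not part of the code above):

from nltk.corpus import wordnet as wn

def synonyms(word, wn_pos='n'):
    # Gather every lemma name from every synset of `word`, then drop the word itself.
    names = set()
    for synset in wn.synsets(word, pos=wn_pos):
        names.update(synset.lemma_names())
    names.discard(word)
    return {n.replace('_', ' ') for n in names}

print(synonyms('house'))  # e.g. {'firm', 'family', 'theater', ...}, depending on the WordNet version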