# Spelling-correction code in Python

from nltk import *
from nltk.corpus import brown
#每次访问数据需要添加数据至路径当中
corpus = brown.sents()
#.sent()整个语料库中的句子,sents(fileids=[f1,f2..],categories=[c1,c2...])
import numpy as np


# Load the vocabulary dictionary: one word per line, deduplicated into a
# set for O(1) membership tests (sets also support intersection/union etc.).
# BUG FIX: the original left the file handle open; a context manager
# guarantees it is closed.
with open('vocab.txt') as vocab_file:
    vocabs = {line.rstrip() for line in vocab_file}


# 1. Generate the candidate set at edit distance 1 from a misspelled word.
def generate1(wrong_word):
    """Return (candidate_word, candidates) for *wrong_word*.

    candidates: every string reachable by one insert, replace, or delete.
    candidate_word: the subset of candidates present in the global
    ``vocabs`` set.
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'
    # All (left, right) splits of the word; +1 so an edit can happen at
    # the very end as well.
    splits = [(wrong_word[:i], wrong_word[i:]) for i in range(len(wrong_word) + 1)]
    insert = {left + ch + right for left, right in splits for ch in letters}
    # Only splits with a non-empty right part have a character to replace.
    replace = {left + ch + right[1:] for left, right in splits if right
               for ch in letters}
    # BUG FIX: the original also looped `for M in letters` here, producing
    # 26 identical deletions per split (deduped by the set, but wasted
    # work), and its empty-right split re-emitted the unchanged word.
    delete = {left + right[1:] for left, right in splits if right}
    candidates = insert | replace | delete  # set union
    candidate_word = {c for c in candidates if c in vocabs}
    return candidate_word, candidates

# 2. Expand a candidate set by one more edit step (for iterative search).
def generate2(candidates):
    """Apply generate1 to every candidate and return the unions of the
    in-vocabulary words and of all generated edit-distance-1 strings."""
    valid_words = set()   # union of every generate1 candidate_word
    all_edits = set()     # union of every generate1 candidates set
    for word in candidates:
        in_vocab, edits = generate1(word)
        valid_words |= in_vocab
        all_edits |= edits
    return valid_words, all_edits

def recognition(wrongword):
    """Search for in-vocabulary corrections of *wrongword*, widening the
    edit distance one step at a time (up to len(wrongword) steps) and
    returning the first non-empty set found.

    BUG FIX: the original implicitly returned ``None`` when the loop was
    exhausted without finding anything; we now always return a set
    (possibly empty) so callers can iterate safely.
    """
    cw, candidates = generate1(wrongword)
    if cw:
        return cw
    # Deepen the search: each pass expands the frontier by one more edit.
    for _ in range(len(wrongword)):
        found, candidates = generate2(candidates)
        cw.update(found)
        if cw:
            return cw
    return cw


# Channel model P(wrong | correct), read from spellerror.txt where each
# line has the form "correct:wrong1, wrong2, ...".
# BUG FIX: the original left the file handle open; use a context manager.
with open('spellerror.txt') as err_file:
    spell_err = err_file.readlines()
print(spell_err)
prob = {}  # prob[correct][wrong] = P(wrong | correct)
for lines in spell_err:
    # rstrip drops the trailing newline; split on ':' separates the
    # correct word from its comma-separated misspellings.
    line = lines.rstrip('\n').split(':')
    correct = line[0]
    # strip() removes the spaces left around each misspelling.
    wrong = [x.strip() for x in line[1].split(',')]
    # Uniform probability over the observed misspellings of this word.
    prob[correct] = {wro: 1.0 / len(wrong) for wro in wrong}

# Language model: unigram and bigram counts over the Brown corpus.
unigram_freqency = {}
bigram_freqency = {}
for sentence in corpus:
    # Prepend a sentence-start token so the first word has left context.
    sentence = [''] + sentence
    for i in range(len(sentence) - 1):
        unigram = sentence[i]
        # BUG FIX: the original joined the bigram with no separator
        # ("".join), but combination_prob looks bigrams up as
        # "left candidate" (space-separated), so no lookup could ever
        # match; join with a single space to make the keys consistent.
        bigram = " ".join(sentence[i:i + 2])
        unigram_freqency[unigram] = unigram_freqency.get(unigram, 0) + 1
        bigram_freqency[bigram] = bigram_freqency.get(bigram, 0) + 1

def combination_prob(wrong_word, corrected_candidate, left_word, right_word):
    """Score each candidate correction by a log-probability combining the
    bigram language model and the channel model:

        log P(left,cand) + log P(cand,right) + log P(wrong | cand)

    Uses the global ``unigram_freqency``/``bigram_freqency`` counts with
    add-one smoothing, and the global ``prob`` channel model; an unseen
    (correct, wrong) pair gets a flat 0.01 probability.
    Returns a dict mapping each candidate to its log-probability score.
    """
    scores = {}
    vocab_size = len(unigram_freqency)
    for candidate in corrected_candidate:
        score = 0.0
        # Language model P(correct): score both the left and right bigram.
        for bigram in (left_word + ' ' + candidate,
                       candidate + ' ' + right_word):
            count = unigram_freqency.get(candidate, 0)
            if count and bigram in bigram_freqency:
                # BUG FIX: the forward-bigram branch of the original took
                # np.log(bigram_count + 1) and divided OUTSIDE the log,
                # while the backward branch took log of the whole ratio;
                # both now use log((bigram_count + 1) / (count + V)).
                score += np.log((bigram_freqency[bigram] + 1)
                                / (count + vocab_size))
            else:
                # Smoothed fallback; when the candidate is unseen,
                # count == 0 and this reduces to log(1 / V).
                score += np.log(1 / (count + vocab_size))
        # Channel model P(wrong | correct).
        if candidate in prob and wrong_word in prob[candidate]:
            score += np.log(prob[candidate][wrong_word])
        else:
            score += np.log(0.01)
        scores[candidate] = score
    return scores



# Drive the corrector over testdata.txt: each line looks like
# "<id>\t<count>\t<sentence>"; any token absent from the vocabulary is
# treated as a misspelling and replaced by the best-scoring candidate.
# BUG FIX: the original left the file handle open; use a context manager.
with open('testdata.txt') as test_file:
    lines = test_file.readlines()
for l in lines:
    # Split the line into its three tab-separated fields.
    part = l.rstrip().split('\t')
    # The third field is the sentence; prepend a sentinel so the first
    # real word has a left-hand bigram context.
    testword = [''] + part[2].split()
    corrected_word = {}
    # Stop before the last token: it has no right-hand context for the
    # bigram model.
    for i in range(1, len(testword) - 1):
        if testword[i] not in vocabs:  # unknown word -> needs correction
            corrected_candidate = recognition(testword[i])
            # ROBUSTNESS: skip when no candidate was found at all, instead
            # of crashing on an empty/None candidate set.
            if not corrected_candidate:
                continue
            p = combination_prob(testword[i], corrected_candidate,
                                 testword[i - 1], testword[i + 1])
            # Pick the candidate with the highest log-probability
            # (cleaner than indexing keys by the position of max(values)).
            word = max(p, key=p.get)
            corrected_word[testword[i]] = word
            print(corrected_word[testword[i]])














# Tags: notes, natural language processing