Spelling Correction
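This post builds a noisy-channel spelling corrector. For a misspelled input s, we pick the candidate c that maximizes p(s|c) · p(c): p(s|c) is the channel (error) model, estimated from a list of observed misspellings, and p(c) is a bigram language model trained on the Reuters corpus. In log space the objective is

\hat{c} = \arg\max_{c \in \text{candidates}} \big[ \log p(s \mid c) + \log p(c) \big]

The code below implements each piece in turn.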

import numpy as np
# Vocabulary of known-correct words
vocab = set([line.rstrip() for line in open('./vocab.txt')])

# Generate all candidate corrections for a word
def generate_candidates(word):
    """
    word: the given (misspelled) input
    Returns all valid candidates, i.e. those present in the vocabulary.
    """
    # Generate words at edit distance 1 via: 1. insert  2. delete  3. replace
    # (For edit distance 2, apply the same step once more; omitted here.)
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    # insert: add one letter at any position
    inserts = [L + c + R for L, R in splits for c in letters]
    # delete: drop one letter
    deletes = [L + R[1:] for L, R in splits if R]
    # replace: substitute one letter
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    candidates = set(inserts + deletes + replaces)
    return [c for c in candidates if c in vocab]
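A quick, hypothetical sanity check (the actual output depends entirely on what vocab.txt contains):

# Hypothetical usage; results depend on the words present in vocab.txt
print(generate_candidates('appl'))   # e.g. ['apple', 'apply'] if those words are in the vocabulary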

from nltk.corpus import reuters
# Load the corpus used to build the language model
categories=reuters.categories()
corpus=reuters.sents(categories=categories)
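If the Reuters corpus has never been downloaded on this machine, the lines above raise a LookupError; a one-time download fixes that:

import nltk
nltk.download('reuters')   # one-time download of the Reuters corpus data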

# Build the language model: unigram and bigram counts
term_count = {}
bigram_count = {}
for doc in corpus:
    doc = [''] + doc  # prepend a sentence-start token
    for i in range(0, len(doc) - 1):
        # bigram spans positions [i, i+1]
        term = doc[i]
        bigram = ' '.join(doc[i:i + 2])
        term_count[term] = term_count.get(term, 0) + 1
        bigram_count[bigram] = bigram_count.get(bigram, 0) + 1
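These counts feed the language-model score used in the main loop below: a candidate c following the previous word w gets an add-one (Laplace) smoothed probability, where V is the number of distinct terms:

\log p(c \mid w) = \log \frac{\text{bigram\_count}(w\ c) + 1}{\text{term\_count}(w) + V}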
# Channel model: probability that the user typed each misspelling for a given word
channel_prob = {}
for line in open('./spell-errors.txt'):
    items = line.split(':')
    correct = items[0].strip()
    mistakes = [item.strip() for item in items[1].strip().split(',')]
    channel_prob[correct] = {}
    for mis in mistakes:
        # spread probability mass uniformly over the observed misspellings
        channel_prob[correct][mis] = 1.0 / len(mistakes)
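For illustration, if spell-errors.txt contained a hypothetical line `raining: rainning, raning`, the loop above would yield:

# Assuming the hypothetical entry "raining: rainning, raning":
print(channel_prob.get('raining'))   # {'rainning': 0.5, 'raning': 0.5}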

V = len(term_count)
for line in open('./testdata.txt', 'r'):
    items = line.rstrip().split('\t')
    words = items[2].split()
    for idx, word in enumerate(words):
        if word not in vocab:
            candidates = generate_candidates(word)
            if len(candidates) < 1:
                continue
            probs = []
            for candi in candidates:
                prob = 0
                # a. channel probability p(s|c)
                if candi in channel_prob and word in channel_prob[candi]:
                    prob += np.log(channel_prob[candi][word])
                else:
                    prob += np.log(0.00001)  # small floor for unseen misspellings
                # b. language-model probability p(c): add-one smoothed bigram
                prev = words[idx - 1] if idx > 0 else ''
                bigram = prev + ' ' + candi
                if bigram in bigram_count:
                    prob += np.log((bigram_count[bigram] + 1.0) /
                                   (term_count.get(prev, 0) + V))
                else:
                    prob += np.log(1.0 / V)
                probs.append(prob)
            max_idx = probs.index(max(probs))
            print(word, candidates[max_idx])
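Two design limitations worth noting: the corrector only triggers on out-of-vocabulary tokens, so real-word errors (e.g. typing "there" for "their") pass the `word not in vocab` check untouched; and the language model conditions only on the preceding word, so also scoring the bigram formed with the following word is a common extension.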
