I've been learning NLP-related techniques recently. These notes combine my own takeaways with my instructor's lessons, and I'm sharing them with any colleagues who are interested.
A spell-correction program checks a sentence or an article for misspelled words.
I'll walk through it in three steps.
[Some of the reference materials may be covered by others' copyrights, so I'm not uploading them as attachments.]
My tools: Python 3.6 and PyCharm, on a Mac laptop.
Write a simple program text.py in PyCharm with the following code:
import nltk
nltk.download()
Running this opens the NLTK downloader, from which you can fetch packages as needed. If the download fails no matter which route you try, you can grab just the two packages this example needs, reuters and punkt: under your local nltk_data directory create two folders, corpora and tokenizers, then place reuters in corpora and punkt in tokenizers.
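If the GUI downloader gives you trouble, the same two packages can also be fetched programmatically; a minimal sketch (nltk.download also accepts a single package id):

import nltk
nltk.download('reuters')   # corpus used below to build the language model
nltk.download('punkt')     # tokenizer models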
(1) Load the vocabulary
(2) Load the corpus
(3) Define a function that generates candidate words
(4) Build the language model
(5) Check each word against the vocabulary, flag the misspellings, and pick the highest-probability replacement
[The code is adapted from https://blog.csdn.net/weixin_41250910/article/details/100080242]
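Before the code, here is what the three input files are assumed to look like (formats inferred from how the code below parses them; adjust to your own data):

vocab.txt, one dictionary word per line:
apple
apply

spell-errors.txt, a correct word, a colon, then its observed misspellings separated by commas:
raining: rainning, raning

testdata.txt, tab-separated fields with the sentence text in the third column (the id and error count in the first two columns are an assumption here):
1	1	I will see you tommorrow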
from nltk.corpus import reuters
import numpy as np

# Load the vocabulary (one word per line)
vocab = set([line.strip() for line in open('vocab.txt')])
# print('vocab', vocab)
# Generate the candidate set: every string within edit distance 1 of word
def generate_candidates(word):
    letters = 'abcdefghijklmnopqrstuvwxyz'
    # every way to split word into a left part and a right part
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    # insert one letter at each split point
    insert = [L + c + R for L, R in splits for c in letters]
    # delete the first letter of the right part
    delete = [L + R[1:] for L, R in splits if R]
    # replace the first letter of the right part
    replace = [L + c + R[1:] for L, R in splits for c in letters if R]
    candidates = set(insert + delete + replace)
    # keep only candidates that are actual dictionary words
    return [w for w in candidates if w in vocab]
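# Quick sanity check (assumes 'apple' and 'apply' are in vocab.txt):
# print(generate_candidates('appl'))  # -> something like ['apple', 'apply']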
# Load the corpus: all sentences from the Reuters corpus, as lists of tokens
categories = reuters.categories()
corpus = reuters.sents(categories=categories)
# Build the language model: unigram and bigram counts over the corpus
term_count = {}
bigram_count = {}
for doc in corpus:
    doc = ['<s>'] + doc  # sentence-start token so the first word has a left context
    for i in range(0, len(doc) - 1):
        term = doc[i]
        bigram = doc[i:i + 2]
        if term in term_count:
            term_count[term] += 1
        else:
            term_count[term] = 1
        bigram = ' '.join(bigram)  # key bigrams as "w1 w2"
        if bigram in bigram_count:
            bigram_count[bigram] += 1
        else:
            bigram_count[bigram] = 1
        # print('term', term, term_count)
        # print('bigram->', bigram, bigram_count)
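# Sanity check (actual numbers depend on your Reuters download):
# print(term_count.get('the'), bigram_count.get('of the'))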
# Channel model: the probability that a user mistypes each word, extracted from logs
channel_prob = {}
for line in open('spell-errors.txt'):
    items = line.split(':')
    correct = items[0].strip()
    mistakes = [item.strip() for item in items[1].split(',')]
    # make sure the key exists before filling it in
    channel_prob[correct] = {}
    for mis in mistakes:
        # spread the probability mass evenly over the observed misspellings
        channel_prob[correct][mis] = 1 / len(mistakes)
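# Example: the line "raining: rainning, raning" yields
# channel_prob['raining'] == {'rainning': 0.5, 'raning': 0.5}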
V = len(term_count.keys())  # size of the deduplicated vocabulary, used for smoothing
file = open('testdata.txt', 'r')
for line in file:
    items = line.rstrip().split('\t')
    words = items[2].split()
    for word in words:
        if word not in vocab:
            # generate the candidate set for the suspect word
            candidates = generate_candidates(word)
            if len(candidates) < 1:
                continue
            probs = []
            for candi in candidates:
                # We want the correction c that maximizes P(c|s), the probability
                # of c given the observed typo s. By Bayes' rule:
                #   max P(c|s) ∝ P(s|c)P(c), i.e. maximize log P(s|c) + log P(c)
                prob = 0
                # channel probability P(s|c)
                if candi in channel_prob and word in channel_prob[candi]:
                    prob = np.log(channel_prob[candi][word])
                else:
                    prob = np.log(0.0001)  # small floor for unseen typo pairs
                # Language-model probability P(c), using a bigram model with
                # add-one smoothing:
                #   P(Wi|Wi-1) = [C(Wi-1,Wi) + 1] / [C(Wi-1) + V]
                # where C(Wi-1,Wi) is how often Wi-1 and Wi occur together in the
                # corpus, C(Wi-1) is how often Wi-1 occurs on its own, and V is
                # the size of the deduplicated vocabulary.
                idx = words.index(word)
                prev = words[idx - 1] if idx > 0 else '<s>'
                bigram = prev + ' ' + candi
                if bigram in bigram_count and prev in term_count:
                    prob += np.log((bigram_count[bigram] + 1.0) / (term_count[prev] + V))
                else:
                    # unseen bigram: fall back to the smoothed floor 1/V
                    prob += np.log(1.0 / V)
                probs.append(prob)
            max_idx = probs.index(max(probs))
            # print the misspelled word and its most probable replacement
            print(word, candidates[max_idx])
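The script prints one line per flagged word: the suspect word followed by its best-scoring replacement. A hypothetical run over the sample testdata line above might print the following (actual output depends on your vocab.txt, spell-errors.txt, and corpus):

tommorrow tomorrow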