用到的库 以及 预处理语料。 清除所有符号,并分句,分词
import re
import zipfile
import lxml.etree
from collections import defaultdict
from math import log
from nltk.probability import ConditionalFreqDist, FreqDist
import joblib
def pre_data():
"""
获取xml中的有效文本 content 数据 keywords 标签
"""
with zipfile.ZipFile(r'D:\C\NLP\Data\ted_zh-cn-20160408.zip', 'r') as z:
doc = lxml.etree.parse(z.open('ted_zh-cn-20160408.xml', 'r'))
input_text = '\n'.join(doc.xpath('//content/text()')) # 获取标签下的文字
z.close()
del doc, z
input_text_noparens = re.sub(r'\([^)]*\)', '', input_text)
input_text_noparens = re.sub(r'([^)]*)', '', input_text_noparens)
sentences_strings_ted = []
for line in input_text_noparens.split('\n'):
m = re.match(r'^(?:(?P[^:]{,20}):)?(?P.*)$' , line)
sentences_strings_ted.extend(sent for sent in re.split('[。?!]', m.groupdict()['postcolon']) if sent)
del input_text_noparens, input_text
sentences_strings_ted = [re.sub(r'[^\w\s]', '', sent) for sent in sentences_strings_ted]
sentences_strings_ted = [re.sub(r'[a-zA-Z0-9]', '', sent) for sent in sentences_strings_ted]
sentences_strings_ted = filter(None, sentences_strings_ted)
data = ' '.join([re.sub(r'\s', '', sent) for sent in sentences_strings_ted]).split(' ')
fin_data = [' '.join(sent).split(' ') for sent in data]
del sentences_strings_ted, data
return fin_data
其实就是统计所有 1-n 元 ngram
重点函数
self.counter[n][gram[:-1]][gram[-1]] 套嵌字典
n =[1:n] 代表几元组
[gram[:-1]] 代表当前元组的
[gram[-1]] 代表$(w_{i-n+1},…w_{i-1}) $ 后存在的 ( w ′ ) (w') (w′)
value 为当前元组 ( w 1 , w 2 , . . . , w n ) (w_{1},w_{2},...,w_{n}) (w1,w2,...,wn) 的总数
例:self.counter=
3:[(a, b):[c:5, d:6, e:7], (a, c):[c:4, d:5, e:8, f:1, g:10]]
2:[(a):[b:6,c:6,d:7], (b):[y:1]]
1: a:23,b:21,c:10,d:20
class NGram:
def __init__(self, n):
"""
定义N元模型参数
nltk的 ConditionalFreqDist 很好用就没有复写
参考 from nltk.probability import ConditionalFreqDist, FreqDist
@param n: 定义 ngram 元
"""
self.N = n
self.counter = defaultdict(ConditionalFreqDist)
self.counter[1] = self.unigrams = FreqDist()
def prepare(self, sents):
"""
准备数据 分句在分字,句子头尾增加
@return:
"""
n = self.N
left = ['' ]
right = ['' ]
sents = list(left * (n - 1) + sent + right * (n - 1)for sent in sents)
return sents
def fit(self, sents):
"""
训练函数 其实就是统计所有 1-n 元 ngram
self.counter[n][gram[:-1]][gram[-1]]
n =[1:n] 代表几元组
[gram[:-1]] 代表当前元组的(wi-n-1,...wi-1)
[gram[-1]]代表(wi-n-1,...wi-1)后存在的(w`)
例:self.counter=
3:[(a, b):[c, d, e], (a, c):[c, d, e]]
2:[(a):[b,c,d]]
1:a,b,c,d
@param sents: 输入形式[[1,2,3,4,5],[6,7,8,9],[10,11]] 字粒度
@return:
"""
ready = self.prepare(sents)
n = 1
while n <= self.N:
for sent in ready:
for i in range(len(sent) - n + 1):
gram = tuple(sent[i:i + n])
if n == 1:
self.unigrams[gram[0]] += 1
continue
self.counter[n][gram[:-1]][gram[-1]] += 1
n += 1
self.d() # modified_alpha_gamma 中使用
bigram 的插值 Interpolation Kneser-Ney Smoothing 公式
P K N ( w i ∣ w i − 1 ) = m a x ( C ( w i − 1 w i ) − d , 0 ) C ( w i − 1 ) + γ ( w i − 1 ) P c o n t i n u t i o n ( w i ) P_{K N}(w_{i} | w_{i-1})=\frac{max (C(w_{i-1} w_{i})-d, 0)}{C(w_{i-1})}+\gamma(w_{i-1}) P_{ {continution }}(w_{i}) PKN(wi∣wi−1)=C(wi−1)max(C(wi−1wi)−d,0)+γ(wi−1)Pcontinution(wi)
m a x ( C ( w i − 1 w i ) − d , 0 ) max (C(w_{i-1} w_{i})-d, 0) max(C(wi−1wi)−d,0) 的目的是对 ngram计数 - d 后小于0的值 取0,避免成为复数。
γ \gamma γ为正则化常量。 ∣ { w : C ( w i − 1 , w ) > 0 } ∣ |\{w: C(w_{i-1}, w)>0\}| ∣{ w:C(wi−1,w)>0}∣ 为统计 C ( w i − 1 , w ) C(w_{i-1}, w) C(wi−1,w)的 样本数。
γ ( w i − 1 ) = d C ( w i − 1 ) ∣ { w : C ( w i − 1 , w ) > 0 } ∣ \gamma(w_{i-1})=\frac{d}{C(w_{i-1})} |\{w: C(w_{i-1}, w)>0\}| γ(wi−1)=C(wi−1)d∣{ w:C(wi−1,w)>0}∣
泛化的通用公式为:
P K N ( w i ∣ w i − n + 1 ⋯ w i − 1 ) = m a x ( C K N ( w i − n + 1 ⋯ w i ) − d , 0 ) C K N ( w i − n + 1 ⋯ w i − 1 ) + γ ( w i − n + 1 ⋯ w i − 1 ) ⋅ P K N ( w i ∣ w i − n + 2 ⋯ w i − 1 ) P_{K N}(w_{i} | w_{i-n+1} \cdots w_{i-1})=\frac{max(C_{K N}(w_{i-n+1} \cdots w_{i})-d, 0)}{C_{K N}(w_{i-n+1} \cdots w_{i-1})}+\gamma(w_{i-n+1} \cdots w_{i-1}) \cdot P_{K N}(w_{i} | w_{i-n+2} \cdots w_{i-1}) PKN(wi∣wi−n+1⋯wi−1)=CKN(wi−n+1⋯wi−1)max(CKN(wi−n+1⋯wi)−d,0)+γ(wi−n+1⋯wi−1)⋅PKN(wi∣wi−n+2⋯wi−1)
在代码中将定义
α = m a x ( C K N ( w i − n + 1 ⋯ w i ) − d , 0 ) C K N ( w i − n + 1 ⋯ w i − 1 ) \alpha=\frac{max(C_{K N}(w_{i-n+1} \cdots w_{i})-d, 0)}{C_{K N}(w_{i-n+1} \cdots w_{i-1})} α=CKN(wi−n+1⋯wi−1)max(CKN(wi−n+1⋯wi)−d,0)
γ = d ∑ w i C ( w i − n + 1 ⋯ w i − 1 ) ∣ { w : C ( w i − n + 1 ⋯ w i − 1 ) > 0 } ∣ \gamma=\frac{d}{\sum_{w_{i}}C(w_{i-n+1} \cdots w_{i-1})} |\{w: C(w_{i-n+1} \cdots w_{i-1})>0\}| γ=∑wiC(wi−n+1⋯wi−1)d∣{ w:C(wi−n+1⋯wi−1)>0}∣
重点
对于 C K N ( w i − n + 1 , . . . , w i − 1 ) C_{KN(w_{i-n+1},...,w_{i-1})} CKN(wi−n+1,...,wi−1) 存在 为0 的情况,可使用插值 Interpolation 以及 回退 back-off 方法
P K N ( w i ∣ w i − n + 1 i − 1 ) = { α if c ( w i − n + 1 i ) > 0 γ ( w i − n + 1 i − 1 ) p smooth ( w i ∣ w i − n + 2 i − 1 ) if c ( w i − n + 1 i ) = 0 ( B a c k − o f f ) P_{KN}(w_{i} | w_{i-n+1}^{i-1})=\{\begin{array}{ll} {\alpha} & {\text { if } c(w_{i-n+1}^{i})>0} \\ {\gamma(w_{i-n+1}^{i-1}) p_{\text {smooth }}(w_{i} | w_{i-n+2}^{i-1})} & {\text { if } c(w_{i-n+1}^{i})=0} \end{array} \qquad(Back-off) PKN(wi∣wi−n+1i−1)={ αγ(wi−n+1i−1)psmooth (wi∣wi−n+2i−1) if c(wi−n+1i)>0 if c(wi−n+1i)=0(Back−off)
P K N ( w i ∣ w i − n + 1 i − 1 ) = α + γ ( w i − n + 1 i − 1 ) P K N ( w i ∣ w i − n + 2 i − 1 ) ( I n t e r p o l a t i o n ) { if c ( w i − n + 1 i ) = 0 α = 0 } P_{KN}(w_{i} | w_{i-n+1}^{i-1})={\alpha}+\gamma(w_{i-n+1}^{i-1}) P_{KN}(w_{i} | w_{i-n+2}^{i-1}) (Interpolation )\qquad\{ {\text { if } c(w_{i-n+1}^{i})=0\quad{\alpha=0}}\} PKN(wi∣wi−n+1i−1)=α+γ(wi−n+1i−1)PKN(wi∣wi−n+2i−1)(Interpolation){ if c(wi−n+1i)=0α=0}
这里使用的是插值方法
其中:
c o n t e x t = ( w i − n + 1 , ⋯ , w i − 1 ) context = (w_{i-n+1} ,\cdots ,w_{i-1}) context=(wi−n+1,⋯,wi−1) 使用 c o n t e x t [ 1 : ] context[1:] context[1:]进行 C K N C_{KN} CKN的迭代
w o r d = w i word = w_{i} word=wi
p r e f i x _ c o u n t s [ w o r d ] = C K N ( w i − n + 1 ⋯ w i ) prefix\_counts[word] = C_{K N}(w_{i-n+1} \cdots w_{i}) prefix_counts[word]=CKN(wi−n+1⋯wi)
p r e f i x _ c o u n t s . N ( ) = ∑ w i C K N ( w i − n + 1 ⋯ w i − 1 ) prefix\_counts.N() = \sum_{w_{i}}C_{KN}(w_{i-n+1} \cdots w_{i-1}) prefix_counts.N()=∑wiCKN(wi−n+1⋯wi−1)
s = ∣ { w : C ( w i − n + 1 ⋯ w i − 1 ) > 0 } ∣ s = |\{w: C(w_{i-n+1} \cdots w_{i-1})>0\}| s=∣{ w:C(wi−n+1⋯wi−1)>0}∣
def kneser_ney(self, word, context, d=0.1):
"""
计算 kneser_ney平滑公式的两个部分
@return:
"""
prefix_counts = self.counter[len(context) + 1][context]
if prefix_counts[word] > 0:
alpha = max(prefix_counts[word] - d, 0.0) / prefix_counts.N()
else:
alpha = 0
s = sum(1.0 for c in prefix_counts.values() if c > 0)
gamma = d * s / prefix_counts.N()
return alpha, gamma
主要正对于d的取值进行改进。
Y = n 1 n 1 + 2 n 2 Y=\frac{n_{1}}{n_{1}+2 n_{2}} Y=n1+2n2n1 \qquad D = { 0 if c = 0 D 1 if c = 1 D 2 if c = 2 D 3 + if c > 2 D=\left\{\begin{array}{ll}{0} & {\text { if } c=0} \\{D_{1}} & {\text { if } c=1} \\{D_{2}} & {\text { if } c=2} \\{D_{3+}} & {\text { if } c>2}\end{array}\right. D=⎩⎪⎪⎨⎪⎪⎧0D1D2D3+ if c=0 if c=1 if c=2 if c>2 \qquad D 1 = 1 − 2 Y n 2 n 1 D 2 = 2 − 3 Y n 3 n 2 D 3 + = 3 − 4 Y n 4 n 3 \begin{aligned}D_{1} &=1-2 Y \frac{n_{2}}{n_{1}} \\D_{2}&=2-3 Y \frac{n_{3}}{n_{2}} \\D_{3+} &=3-4 Y \frac{n_{4}}{n_{3}}\end{aligned} D1D2D3+=1−2Yn1n2=2−3Yn2n3=3−4Yn3n4 \qquad
n 1 n_{1} n1表示 ngram 出现的次数为1 的总数 n 2 n_{2} n2表示 ngram 出现的次数为2 的总数 类推。
def d(self):
"""
计算D1, D2, D3
@return:
"""
# 计算公式中常数D(Y = n1 / (n1 + 2*n2))
n1, n2, n3, n4 = 0, 0, 0, 0
for ngram in self.counter[self.N]:
for gram in ngram:
num = self.counter[self.N][ngram][gram]
if num == 1:
n1 += 1
elif num == 2:
n2 += 1
elif num == 3:
n3 += 1
elif num == 4:
n4 += 1
Y = n1 / (n1 + 2 * n2)
self.D1 = 1 - 2 * Y * (n2/n1)
self.D2 = 2 - 3 * Y * (n3/n2)
self.D3 = 3 - 4 * Y * (n4/n3)
最终改进后的公式为:
p K N ( w i ∣ w i − n + 1 i − 1 ) = c ( w i − n + 1 i ) − D ( c ( w i − n + 1 i ) ) ∑ w i c ( w i − n + 1 i ) + γ ( w i − n + 1 i − 1 ) p K N ( w i ∣ w i − n + 2 i − 1 ) p_{ KN }\left(w_{i} | w_{i-n+1}^{i-1}\right)=\frac{c\left(w_{i-n+1}^{i}\right)-D\left(c\left(w_{i-n+1}^{i}\right)\right)}{\sum_{w_{i}} c\left(w_{i-n+1}^{i}\right)}+\gamma\left(w_{i-n+1}^{i-1}\right) p_{ KN }\left(w_{i} | w_{i-n+2}^{i-1}\right) pKN(wi∣wi−n+1i−1)=∑wic(wi−n+1i)c(wi−n+1i)−D(c(wi−n+1i))+γ(wi−n+1i−1)pKN(wi∣wi−n+2i−1)
γ high ( w i − n + 1 i − 1 ) = D 1 N 1 ( w i − n + 1 i − 1 ⋅ ) + D 2 N 2 ( w i − n + 1 i − 1 ⋅ ) + D 3 + N 3 + ( w i − n + 1 i − 1 ⋅ ) ∑ w i c ( w i − n + 1 i ) \gamma_\text{high}\left(w_{i-n+1}^{i-1}\right)=\frac{D_{1} N_{1}\left(w_{i-n+1}^{i-1} \cdot\right)+D_{2} N_{2}\left(w_{i-n+1}^{i-1} \cdot\right)+D_{3+} N_{3+}\left(w_{i-n+1}^{i-1} \cdot\right)}{\sum_{w_{i}} c\left(w_{i-n+1}^{i}\right)} γhigh(wi−n+1i−1)=∑wic(wi−n+1i)D1N1(wi−n+1i−1⋅)+D2N2(wi−n+1i−1⋅)+D3+N3+(wi−n+1i−1⋅)
γ low ( w i − n + 1 i − 1 ) = D 1 N 1 ( w i − n + 1 i − 1 ⋅ ) + D 2 N 2 ( w i − n + 1 i − 1 ⋅ ) + D 3 + N 3 + ( w i − n + 1 i − 1 ⋅ ) ∑ w i c ( w i − n + 1 i ) \gamma_\text{low}\left(w_{i-n+1}^{i-1}\right)=\frac{D_{1} N_{1}\left(w_{i-n+1}^{i-1} \cdot\right)+D_{2} N_{2}\left(w_{i-n+1}^{i-1} \cdot\right)+D_{3+} N_{3+}\left(w_{i-n+1}^{i-1} \cdot\right)}{\sum_{w_{i}} c\left(w_{i-n+1}^{i}\right)} γlow(wi−n+1i−1)=∑wic(wi−n+1i)D1N1(wi−n+1i−1⋅)+D2N2(wi−n+1i−1⋅)+D3+N3+(wi−n+1i−1⋅)
N 0 ( w i − n + 1 i − 1 ∙ ) = ∣ { w i : c ( w i − n + 1 i ) > 0 } ∣ N 1 ( w i − n + 1 i − 1 ∙ ) = ∣ { w i : c ( w i − n + 1 i ) = 1 } ∣ N 2 ( w i − n + 1 i − 1 ∙ ) = ∣ { w i : c ( w i − n + 1 i ) = 2 } ∣ N 3 + ( w i − n + 1 i − 1 ∙ ) = ∣ { w i : c ( w i − n + 1 i ) ≥ 3 } ∣ \begin{aligned} N_{0}\left(w_{i-n+1}^{i-1} \bullet\right) &=\left|\left\{w_{i}: c\left(w_{i-n+1}^{i}\right)>0\right\}\right| \\ N_{1}\left(w_{i-n+1}^{i-1} \bullet\right) &=\left|\left\{w_{i}: c\left(w_{i-n+1}^{i}\right)=1\right\}\right| \\ N_{2}\left(w_{i-n+1}^{i-1} \bullet\right) &=\left|\left\{w_{i}: c\left(w_{i-n+1}^{i}\right)=2\right\}\right| \\ N_{3+}\left(w_{i-n+1}^{i-1} \bullet\right) &=\left|\left\{w_{i}: c\left(w_{i-n+1}^{i}\right)≥3\right\}\right| \end{aligned} N0(wi−n+1i−1∙)N1(wi−n+1i−1∙)N2(wi−n+1i−1∙)N3+(wi−n+1i−1∙)=∣∣{ wi:c(wi−n+1i)>0}∣∣=∣∣{ wi:c(wi−n+1i)=1}∣∣=∣∣{ wi:c(wi−n+1i)=2}∣∣=∣∣{ wi:c(wi−n+1i)≥3}∣∣
def modified_kneser_ney(self, word, context):
"""
计算 改进 kneser_ney平滑公式的两个部分,主要对d的取值进行改进
@return:
"""
prefix_counts = self.counter[len(context) + 1][context]
if prefix_counts[word] == 1:
alpha = max(prefix_counts[word] - self.D1, 0.0) / prefix_counts.N()
elif prefix_counts[word] == 2:
alpha = max(prefix_counts[word] - self.D2, 0.0) / prefix_counts.N()
elif prefix_counts[word] >= 3:
alpha = max(prefix_counts[word] - self.D3, 0.0) / prefix_counts.N()
else:
alpha = 0
N0 = sum(1.0 for c in prefix_counts.values() if c > 0)
N1 = sum(1.0 for c in prefix_counts.values() if c == 1)
N2 = sum(1.0 for c in prefix_counts.values() if c == 2)
N3 = sum(1.0 for c in prefix_counts.values() if c >= 3)
if len(context) == self.N-1:
gamma = (self.D1 * N1 + self.D2 * N2 + self.D3 * N3) / prefix_counts.N()
else:
gamma = (self.D1 * N1 + self.D2 * N2 + self.D3 * N3) / N0
return alpha, gamma
得到 alpha 及 gamma 后,使用插值方法对 P K N h i g h 、 P K N l o w P_{KN_{high}}、P_{KN_{low}} PKNhigh、PKNlow 递归指导 len(ngram) == 1即 unigram
s e l f . p k n ( w , ( w i − n + 1 , ⋯ , w i − 1 ) ) = α + γ ∗ s e l f . p k n ( w , ( w i − n + 2 , ⋯ , w i − 1 ) ) self.pkn(w, (w_{i-n+1} ,\cdots ,w_{i-1})) = \alpha+\gamma*self.pkn(w, (w_{i-n+2} ,\cdots ,w_{i-1})) self.pkn(w,(wi−n+1,⋯,wi−1))=α+γ∗self.pkn(w,(wi−n+2,⋯,wi−1))
def pkn(self, word, context, smoothing):
"""
调用 kneser_ney 或者 modified_kneser_ney 计算Pkn
@return:
"""
if not context:
return 1.0 / len(self.unigrams)
if smoothing == 'modified':
alpha, gamma = self.modified_kneser_ney(word, context)
else:
alpha, gamma = self.kneser_ney(word, context)
return alpha + gamma * self.pkn(word, context[1:], smoothing)
困惑度公式:
p p ( t e s t _ n g r a m s ) = 2 − 1 n ∑ i n log 2 P K N pp(test\_ngrams) = 2^{-\frac{1}{n}\sum_{i}^{n}\log_{2}P_{KN}} pp(test_ngrams)=2−n1∑inlog2PKN
其中
n = l e n ( t e s t _ n g r a m s ) 即 几 组 n g r a m n=len(test\_ngrams)即几组ngram n=len(test_ngrams)即几组ngram
l o g s = ∑ i n log 2 P K N logs = \sum_{i}^{n}\log_{2}P_{KN} logs=∑inlog2PKN
e n t r o p y = − 1 n ∗ l o g s entropy = -\frac{1}{n}*logs entropy=−n1∗logs
p e r p l e x i t = 2. 0 e n t r o p y ) perplexit = 2.0^{entropy}) perplexit=2.0entropy)
def perplexity(self, test_ngrams, smoothing='modified'):
"""
困惑度
输入为 ngram 形式
@return:
"""
logs = sum(log(self.pkn(ngram[-1], ngram[:-1], smoothing), 2) for ngram in test_ngrams)
entropy = -1 * logs / len(test_ngrams)
perplexit = pow(2.0, entropy)
return perplexit
if __name__ == "__main__":
train_data = pre_data()
lm = NGram(3)
lm.fit(train_data)
# joblib.dump(lm, 'ngram.pkl') # 储存模型
# lm = joblib.load('ngram.pkl') # 复用模型
perplexity_modified = lm.perplexity([('我', '想', '你'), ('想', '上', '天')], 'modified')
perplexity_old = lm.perplexity([('我', '想', '你'), ('想', '上', '天')], 'kneser_ney')
print(perplexity_modified, perplexity_old)
Out[ ]:
perplexity_modified = 264.1499600564527
perplexity_old = 853.5240371819147
可见 modified_kneser_ney 提高了一定的困惑度
完整代码:https://github.com/RayX-X/NLPLearning/tree/master/LanguageModel
参考论文:
Chen, Goodmna.An empirical study of smoothing techniques for language modeling
Implementation of Modified Kneser-Ney Smoothing on Top of Generalized Language Models for Next Word Prediction