BLEU (Bilingual Evaluation Understudy) 是用来衡量机器翻译结果好坏的指标。
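最初的版本:
使用累加器统计 candidate 中的词在 reference 中出现的次数, 再除以 candidate 的总词数, 得到的比值称为 precision:
$$precision = \frac{\text{candidate 中出现在 reference 里的词数}}{\text{candidate 的总词数}}$$
但可能会出现以下情况, 例如:
candidate: the the the the the the the
reference: the cat is on the mat
此时 candidate 的 7 个词全部在 reference 中出现过, precision = 7/7 = 1,
显然这个计算方法是存在 bug 的: 一个毫无意义的翻译也能得到满分。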
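改良版:
对 candidate 中每个 n-gram 的计数做截断 (clip), 使其不超过该 n-gram 在任意一个 reference 中出现的最大次数:
$$Count_{clip} = \min(Count,\ Max\_Ref\_Count)$$
$$p_n = \frac{\sum_{ngram \in candidate} Count_{clip}(ngram)}{\sum_{ngram \in candidate} Count(ngram)}$$
可以解决大部分问题, 但是很短的 candidate 仍然可能得到很高的 precision,
所以加入短句惩罚 (brevity penalty, BP)。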
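此处定义一个概念: 当 candidate doc 的长度等于任何一个 reference doc 的长度的时候, 我们称此时为最佳匹配, 此时不需要对翻译的结果进行惩罚; 当 candidate doc 的长度不等于任何 reference doc 的长度的时候, 需要引入一个参考长度 (记做 $Reflen$, 取与 candidate 长度最接近的 reference 长度), 同时定义 candidate doc 的长度为 $c$, 那么惩罚因子计算公式如下:
$$BP = \begin{cases} 1 & c > Reflen \\ e^{\,1 - Reflen / c} & c \le Reflen \end{cases}$$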
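final equation:
$$BLEU = BP \cdot \exp\left(\sum_{n=1}^{N} w_n \log p_n\right)$$
$w_n$ is the weight of the n-gram precision $p_n$,
for example: w = [0.25, 0.25, 0.25, 0.25] (equal weights for N = 4)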
BP is the brevity penalty,
r is the length of the reference sentence,
c is the length of the candidate sentence:
if c > r, BP equals 1;
otherwise BP equals exp(1 - r/c)
BLEU源码
from __future__ import division
import math
import os
import collections
from nltk.tokenize import word_tokenize
from nltk import Counter
from nltk.util import ngrams
def bleu(candidate, references, weights):
    """Calculate the BLEU score of a candidate sentence.

    The score is ``BP * exp(sum(w_n * log(p_n)))``, where ``p_n`` is the
    modified n-gram precision of order ``n`` and ``BP`` is the brevity
    penalty.

    :param candidate: a candidate sentence
    :type candidate: list(str)
    :param references: reference sentences
    :type references: list(list(str))
    :param weights: weights for unigrams, bigrams, trigrams and so on;
        the position in the list (starting at 1) is the n-gram order
    :type weights: list(float)
    :return: 0 when the weighted log-precision sum cannot be computed
        (for example some precision is 0), otherwise the BLEU score
    """
    weighted_logs = []
    try:
        for order, weight in enumerate(weights, start=1):
            precision = _modified_precision(candidate, references, order)
            # math.log raises ValueError for a precision of 0.
            weighted_logs.append(weight * math.log(precision))
        log_score = math.fsum(weighted_logs)
    except ValueError:
        return 0

    penalty = _brevity_penalty(candidate, references)
    return penalty * math.exp(log_score)
def _modified_precision(candidate, references, n):
    """Return the modified (clipped) n-gram precision of ``candidate``.

    Every n-gram of the candidate is credited at most as many times as it
    occurs in the reference where it is most frequent. The credited total
    is divided by the number of n-grams in the candidate.

    :param candidate: tokens of the candidate sentence
    :param references: reference sentences, each a sequence of tokens
    :param n: order of the n-grams to compare
    :return: 0 when the candidate has no n-grams of order ``n``; otherwise
        the clipped count divided by the total count (0.0 when
        ``references`` is empty instead of raising ``KeyError``)
    """
    # Use the standard-library Counter through the already imported
    # ``collections`` module rather than a name re-exported by nltk.
    counts = collections.Counter(ngrams(candidate, n))
    if not counts:
        return 0

    max_counts = collections.Counter()
    for reference in references:
        # Counter union keeps, per n-gram, the larger of the two counts.
        max_counts |= collections.Counter(ngrams(reference, n))

    # Counter intersection keeps, per n-gram, the smaller of the two counts,
    # i.e. the candidate count clipped by the best reference count.
    clipped_counts = counts & max_counts
    return sum(clipped_counts.values()) / sum(counts.values())
def _brevity_penalty(candidate, references):
    """Return the brevity penalty (BP) for ``candidate``.

    The reference length ``r`` is the length of the reference closest in
    length to the candidate; on a tie the shorter reference is used.

    :param candidate: tokens of the candidate sentence
    :param references: reference sentences, each a sequence of tokens
    :return: 1 when the candidate is longer than ``r``, 0 when the
        candidate is empty, otherwise ``exp(1 - r / c)``
    :raises ValueError: if ``references`` is empty (``min`` of nothing)
    """
    c = len(candidate)
    # Sort key: distance to the candidate length first, then the length
    # itself so that ties resolve to the shorter reference.
    r = min(
        (len(reference) for reference in references),
        key=lambda ref_len: (abs(ref_len - c), ref_len),
    )
    if c > r:
        return 1
    if c == 0:
        # An empty candidate would otherwise divide by zero below.
        return 0
    return math.exp(1 - r / c)
"""Calculate BLEU score (Bilingual Evaluation Understudy)
:param candidate: a candidate sentence
:type candidate: list(str)
:param references: reference sentences
:type references: list(list(str))
:param weights: weights for unigrams, bigrams, trigrams and so on
:type weights: list(float)"""
# Demo: score candidate1 against three reference translations, using equal
# weights for the 1-gram to 4-gram precisions.
weights = [0.25] * 4

candidate1 = [
    'It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures', 'that',
    'the', 'military', 'always', 'obeys', 'the', 'commands', 'of', 'the',
    'party',
]
# candidate2 is defined but is not scored below.
candidate2 = [
    'It', 'is', 'to', 'insure', 'the', 'troops', 'forever', 'hearing',
    'the', 'activity', 'guidebook', 'that', 'party', 'direct',
]

reference1 = [
    'It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures', 'that',
    'the', 'military', 'will', 'forever', 'heed', 'Party', 'commands',
]
reference2 = [
    'It', 'is', 'the', 'guiding', 'principle', 'which', 'guarantees',
    'the', 'military', 'forces', 'always', 'being', 'under', 'the',
    'command', 'of', 'the', 'Party',
]
reference3 = [
    'It', 'is', 'the', 'practical', 'guide', 'for', 'the', 'army',
    'always', 'to', 'heed', 'the', 'directions', 'of', 'the', 'party',
]

# The returned score is neither stored nor printed here.
bleu(candidate1, [reference1, reference2, reference3], weights)