在完成模型搭建之后，先简单总结一下 DeepSpeech 所用的训练参数和模型选择。
Levenshtein.distance(str1, str2)
计算编辑距离(也称Levenshtein距离)。是描述由一个字串转化成另一个字串最少的操作次数,在其中的操作包括插入、删除、替换。算法实现:动态规划。
Levenshtein.hamming(str1, str2)
计算汉明距离。要求str1和str2必须长度一致。是描述两个等长字串之间对应位置上不同字符的个数。
import Levenshtein as Lev
def wer(self, s1, s2):
    """Compute the Word Error Rate between two space-separated sentences.

    Each distinct word is mapped to a unique single character, so the
    character-level Levenshtein distance over the encoded strings equals
    the word-level edit distance (the Levenshtein package only accepts
    strings, not word lists).

    Arguments:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence
    """
    # One code point per distinct word across both sentences.
    vocabulary = set(s1.split() + s2.split())
    encoding = {word: chr(index) for index, word in enumerate(vocabulary)}
    encoded1 = ''.join(encoding[word] for word in s1.split())
    encoded2 = ''.join(encoding[word] for word in s2.split())
    return Lev.distance(encoded1, encoded2)
def cer(self, s1, s2):
    """Compute the Character Error Rate.

    Defined as the character-level edit distance after all spaces are
    stripped from both sentences.

    Arguments:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence
    """
    stripped1 = s1.replace(' ', '')
    stripped2 = s2.replace(' ', '')
    return Lev.distance(stripped1, stripped2)
# Decode the network output and convert the padded targets to strings.
decoded_output, _ = decoder.decode(out, output_sizes)
target_strings = decoder.convert_to_strings(split_targets)
wer, cer = 0, 0
# Sum the per-utterance error rates over the batch; only the best
# hypothesis (index 0) of each utterance is scored.
for index, target in enumerate(target_strings):
    transcript = decoded_output[index][0]
    reference = target[0]
    # NOTE(review): an empty reference raises ZeroDivisionError here —
    # presumably empty transcripts are filtered upstream; confirm.
    wer += decoder.wer(transcript, reference) / float(len(reference.split()))
    cer += decoder.cer(transcript, reference) / float(len(reference))
total_cer += cer
total_wer += wer
del out  # release the network output tensor before the next batch
class GreedyDecoder(Decoder):
    """Greedy (best-path) CTC decoder.

    Takes the argmax label at each time step, then collapses repeated
    labels and removes blanks to produce the transcript.
    """

    def __init__(self, labels, blank_index=0):
        """
        Arguments:
            labels: label alphabet handed to the base Decoder
            blank_index: index of the CTC blank symbol in `labels`
        """
        super().__init__(labels, blank_index)

    def convert_to_strings(self, sequences, sizes=None, remove_repetitions=False, return_offsets=False):
        """Given a list of numeric sequences, return the corresponding strings.

        Arguments:
            sequences: batch of integer label sequences
            sizes (optional): valid length of each sequence; when None the
                full sequence length is used
            remove_repetitions: collapse consecutive duplicate labels
            return_offsets: also return the time-step offset of each
                emitted character
        Returns:
            strings, or (strings, offsets) when return_offsets is True.
            Each entry is a single-element list (only one path is returned).
        """
        strings = []
        offsets = [] if return_offsets else None
        # Bug fix: `xrange` is Python 2 only and raises NameError on
        # Python 3 (this file already relies on Python 3 / PyTorch APIs).
        for i in range(len(sequences)):
            seq_len = sizes[i] if sizes is not None else len(sequences[i])
            string, string_offsets = self.process_string(sequences[i], seq_len, remove_repetitions)
            strings.append([string])  # we only return one path
            if return_offsets:
                offsets.append([string_offsets])
        if return_offsets:
            return strings, offsets
        else:
            return strings

    def process_string(self, sequence, size, remove_repetitions=False):
        """Convert one integer sequence to a string, dropping blanks.

        Arguments:
            sequence: tensor of label indices (elements support `.item()`)
            size: number of valid entries in `sequence`
            remove_repetitions: skip a label equal to its predecessor
        Returns:
            (string, offsets) where offsets is an int tensor holding the
            time step at which each emitted character was produced.
        """
        string = ''
        offsets = []
        for i in range(size):
            char = self.int_to_char[sequence[i].item()]
            if char != self.int_to_char[self.blank_index]:
                # if this char is a repetition and remove_repetitions=true, then skip
                if remove_repetitions and i != 0 and char == self.int_to_char[sequence[i - 1].item()]:
                    pass
                elif char == self.labels[self.space_index]:
                    string += ' '
                    offsets.append(i)
                else:
                    string = string + char
                    offsets.append(i)
        return string, torch.tensor(offsets, dtype=torch.int)

    def decode(self, probs, sizes=None):
        """
        Returns the argmax decoding given the probability matrix. Removes
        repeated elements in the sequence, as well as blanks.
        Arguments:
            probs: Tensor of character probabilities from the network. Expected shape of batch x seq_length x output_dim
            sizes(optional): Size of each sequence in the mini-batch
        Returns:
            strings: sequences of the model's best guess for the transcription on inputs
            offsets: time step per character predicted
        """
        # Argmax over the output-dim axis gives the best label per step.
        _, max_probs = torch.max(probs, 2)
        strings, offsets = self.convert_to_strings(
            max_probs.view(max_probs.size(0), max_probs.size(1)), sizes,
            remove_repetitions=True, return_offsets=True)
        return strings, offsets
class BeamCTCDecoder(Decoder):
    """Beam-search CTC decoder backed by the `ctcdecode` package,
    optionally rescored with an external language model."""

    def __init__(self, labels, lm_path=None, alpha=0, beta=0, cutoff_top_n=40, cutoff_prob=1.0, beam_width=100,
                 num_processes=4, blank_index=0):
        """
        Arguments:
            labels: label alphabet
            lm_path (optional): path to a language model binary
            alpha, beta: LM weight and word-insertion bonus
            cutoff_top_n, cutoff_prob: per-step pruning parameters
            beam_width: number of beams kept per step
            num_processes: parallelism for the decoder
            blank_index: index of the CTC blank symbol in `labels`
        """
        # Bug fix: forward blank_index to the base class. It was silently
        # dropped, so the base decoder kept its default blank even when the
        # caller overrode it (GreedyDecoder forwards it, proving the base
        # __init__ accepts a second argument).
        super().__init__(labels, blank_index)
        try:
            from ctcdecode import CTCBeamDecoder
        except ImportError:
            # Bug fix: the message named a non-existent "paddledecoder"
            # package while the actual dependency is `ctcdecode`.
            raise ImportError("BeamCTCDecoder requires the ctcdecode package.")
        self._decoder = CTCBeamDecoder(labels, lm_path, alpha, beta, cutoff_top_n, cutoff_prob, beam_width,
                                       num_processes, blank_index)

    def convert_to_strings(self, out, seq_len):
        """Map integer beam outputs to transcript strings.

        Arguments:
            out: per-batch, per-beam integer label tensors
            seq_len: seq_len[b][p] is the valid length of beam p in batch b
        Returns:
            list (batch) of lists (beam) of transcript strings.
        """
        results = []
        for b, batch in enumerate(out):
            utterances = []
            for p, utt in enumerate(batch):
                size = seq_len[b][p]
                if size > 0:
                    transcript = ''.join(self.int_to_char[x.item()] for x in utt[0:size])
                else:
                    transcript = ''
                utterances.append(transcript)
            results.append(utterances)
        return results

    def convert_tensor(self, offsets, sizes):
        """Trim per-beam offset tensors to their valid lengths.

        Arguments:
            offsets: per-batch, per-beam offset tensors
            sizes: sizes[b][p] is the valid length of beam p in batch b
        Returns:
            list (batch) of lists (beam) of trimmed int tensors; an empty
            int tensor is substituted for zero-length beams.
        """
        results = []
        for b, batch in enumerate(offsets):
            utterances = []
            for p, utt in enumerate(batch):
                size = sizes[b][p]
                if size > 0:
                    utterances.append(utt[0:size])
                else:
                    utterances.append(torch.tensor([], dtype=torch.int))
            results.append(utterances)
        return results

    def decode(self, probs, sizes=None):
        """
        Decodes probability output using ctcdecode package.
        Arguments:
            probs: Tensor of character probabilities, where probs[c,t]
                is the probability of character c at time t
            sizes: Size of each sequence in the mini-batch
        Returns:
            string: sequences of the model's best guess for the transcription
        """
        # ctcdecode operates on CPU tensors.
        probs = probs.cpu()
        out, scores, offsets, seq_lens = self._decoder.decode(probs, sizes)
        strings = self.convert_to_strings(out, seq_lens)
        offsets = self.convert_tensor(offsets, seq_lens)
        return strings, offsets