deepspeech2 代码之数据处理

以Librispeech为例

step 1 下载数据集

下载地址: http://www.openslr.org/12/
下载文件
deepspeech2 代码之数据处理_第1张图片

step 2 解压文件

for f in *.tar.gz; do tar -zxvf "$f" -C ./; done

此处可以不解压 代码中可以边解压边读取边删除 但是为了效率 这里选择先解压
deepspeech2 代码之数据处理_第2张图片

step 3 定义参数

class parser():
    """Hard-coded run configuration (stand-in for argparse inside a notebook)."""

    def __init__(self):
        self.target_dir = '../librispeech_out/'   # output root for wav/txt/manifests
        self.sample_rate = 16000                  # target wav sample rate in Hz
        self.files_to_use = '/data/LibriSpeech/LibriSpeech/'  # extracted dataset root
        self.min_duration = 1                     # seconds; train clips shorter are pruned
        self.max_duration = 15                    # seconds; train clips longer are pruned


args = parser()

# Split name -> directory of the corresponding extracted LibriSpeech subset.
LIBRI_SPEECH_DICTS = {
    'train': args.files_to_use + 'train-clean-360',
    'val': args.files_to_use + 'dev-clean',
    'test-clean': args.files_to_use + 'test-clean',
}
# FIX: removed the stray bare `LIBRI_SPEECH_URLS` expression that followed the
# dict -- the name is never defined in this file, so it raised NameError.

这里因为使用的jupyter 所以写死了,如果想改的话可以用argparse写入即可

step 4 数据处理

代码分为三部分
1.创建文件夹
2.将原解压文件(解压好的文件)中的文件用os.walk深度优先遍历复制到自己创建的文件目录中
3.制作wav ,txt对应的csv文件

def main():
    """Build per-split wav/ and txt/ folders, convert every .flac, write manifests."""
    out_root = args.target_dir
    os.makedirs(out_root, exist_ok=True)
    for split_name, source_dir in LIBRI_SPEECH_DICTS.items():
        # Layout: librispeech_out/<split>/{wav,txt}
        split_dir = os.path.join(out_root, split_name)
        wav_dir = os.path.join(split_dir, 'wav')
        txt_dir = os.path.join(split_dir, 'txt')
        for directory in (split_dir, wav_dir, txt_dir):
            os.makedirs(directory, exist_ok=True)

        # Optimization idea (disabled): skip pre-extraction and unpack the
        # tarball into a temp dir here, deleting it after processing:
        #   extracted_dir = os.path.join(split_dir, "LibriSpeech")
        #   if os.path.exists(extracted_dir):
        #       shutil.rmtree(extracted_dir)   # wipe the whole temp tree
        #   target_filename = 'train-clean-100.tar.gz'
        #   tar = tarfile.open(target_filename)
        #   tar.extractall(split_dir)
        #   tar.close()
        #   os.remove(target_filename)

        # Walk the extracted tree; every .flac gets converted into wav/ + txt/.
        for root, _subdirs, files in tqdm(os.walk(source_dir)):
            for fname in files:
                if '.flac' in fname:
                    _process_file(wav_dir=wav_dir, txt_dir=txt_dir,
                                  base_filename=fname, root_dir=root)
        # shutil.rmtree(extracted_dir)  # would clean up the temp extraction

        manifest_name = 'libri_' + split_name + '_manifest.csv'
        if split_name == 'train':
            # Only the training split is pruned to [min, max] duration.
            create_manifest(split_dir, manifest_name,
                            args.min_duration, args.max_duration)
        else:
            create_manifest(split_dir, manifest_name)

4.1 创建文件夹

代码前13行是在创建保存数据的文件夹
deepspeech2 代码之数据处理_第3张图片

4.2 复制文件到自定义目录

def _preprocess_transcript(phrase):
    return phrase.strip().upper()


def _process_file(wav_dir, txt_dir, base_filename, root_dir):
    """Convert one LibriSpeech .flac to a 16-bit mono wav and write its transcript.

    wav_dir / txt_dir: destination folders for the converted audio and text.
    base_filename: the .flac file name, e.g. "84-121123-0000.flac".
    root_dir: directory containing both the .flac and its *.trans.txt file.
    """
    full_recording_path = os.path.join(root_dir, base_filename)
    assert os.path.exists(full_recording_path) and os.path.exists(root_dir)
    wav_recording_path = os.path.join(wav_dir, base_filename.replace(".flac", ".wav"))
    # FIX: pass an argument list with the default shell=False instead of
    # interpolating paths into a shell string -- the old form broke on paths
    # containing spaces/quotes and was open to shell injection.
    subprocess.call(["sox", full_recording_path,
                     "-r", str(args.sample_rate), "-b", "16", "-c", "1",
                     wav_recording_path])
    # The chapter transcript xxx-yyy.trans.txt holds one line per utterance:
    # "<speaker>-<chapter>-<utt> TRANSCRIPT ...".
    txt_transcript_path = os.path.join(txt_dir, base_filename.replace(".flac", ".txt"))
    transcript_file = os.path.join(root_dir, "-".join(base_filename.split('-')[:-1]) + ".trans.txt")
    assert os.path.exists(transcript_file), "Transcript file {} does not exist.".format(transcript_file)
    # FIX: close the transcript file -- the original open(...).read() leaked
    # the file handle.
    with open(transcript_file) as tf:
        lines = tf.read().strip().split("\n")
    # Map utterance id (last dash-separated field of the first token) -> text.
    transcriptions = {t.split()[0].split("-")[-1]: " ".join(t.split()[1:]) for t in lines}
    with open(txt_transcript_path, "w") as f:
        key = base_filename.replace(".flac", "").split("-")[-1]
        assert key in transcriptions, "{} is not in the transcriptions".format(key)
        f.write(_preprocess_transcript(transcriptions[key]))
        f.flush()

4.3 创建csv文件


def create_manifest(data_path, output_path, min_duration=None, max_duration=None):
    """Write a manifest CSV of "wav_path,txt_path" lines, sorted by duration.

    data_path: split directory containing wav/ and txt/ subfolders.
    output_path: manifest file to create (relative to the current directory).
    min_duration / max_duration: optional pruning bounds in seconds; pruning
        is applied only when both are supplied (see order_and_prune_files).
    """
    wav_paths = [os.path.join(dirpath, f)
                 for dirpath, _dirnames, files in os.walk(data_path)
                 for f in fnmatch.filter(files, '*.wav')]
    wav_paths = order_and_prune_files(wav_paths, min_duration, max_duration)
    # FIX: use the built-in open() in text mode with explicit utf-8 instead of
    # the low-level io.FileIO + manual .encode() -- the io docs recommend
    # open() for regular file writing, and this also guarantees the handle is
    # closed via the context manager.
    with open(output_path, "w", encoding="utf-8") as manifest:
        for wav_path in tqdm(wav_paths, total=len(wav_paths)):
            # The transcript mirrors the wav path: .../wav/x.wav -> .../txt/x.txt
            transcript_path = wav_path.replace('/wav/', '/txt/').replace('.wav', '.txt')
            manifest.write(os.path.abspath(wav_path) + ',' + os.path.abspath(transcript_path) + '\n')
    print('\n')


def order_and_prune_files(file_paths, min_duration, max_duration):
    """Sort wav paths ascending by duration (via `soxi -D`), optionally pruning.

    When both min_duration and max_duration are given, files whose duration in
    seconds falls outside [min_duration, max_duration] are dropped first.
    Returns the list of paths only (durations discarded).
    """
    print("Sorting manifests...")
    # FIX: invoke soxi with an argument list (default shell=False) instead of a
    # string-interpolated shell command -- the old form broke on paths with
    # quote characters and was open to shell injection.
    duration_file_paths = [
        (path, float(subprocess.check_output(['soxi', '-D', path.strip()])))
        for path in file_paths]
    # FIX: explicit None checks so a legitimate min_duration of 0 still prunes
    # (the old truthiness test silently skipped pruning for 0).
    if min_duration is not None and max_duration is not None:
        print("Pruning manifests between %d and %d seconds" % (min_duration, max_duration))
        duration_file_paths = [(path, duration)
                               for path, duration in duration_file_paths
                               if min_duration <= duration <= max_duration]
    duration_file_paths.sort(key=lambda entry: entry[1])
    return [path for path, _duration in duration_file_paths]  # drop durations

你可能感兴趣的:(语音识别)