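# Prerequisite (shell, not Python): extract every downloaded archive into the current directory, e.g.
#   for f in *.tar.gz; do tar -zxvf "$f" -C ./; done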
class parser:
    """Plain settings holder standing in for an ``argparse`` namespace.

    Calling ``parser()`` with no arguments yields the original hard-coded
    configuration; any value can be overridden by keyword.

    Attributes are documented next to their assignment below.
    """

    def __init__(self,
                 target_dir='../librispeech_out/',
                 sample_rate=16000,
                 files_to_use='/data/LibriSpeech/LibriSpeech/',
                 min_duration=1,
                 max_duration=15):
        # Root directory that receives the per-split wav/txt folders.
        self.target_dir = target_dir
        # Sample rate (Hz) passed to sox when converting recordings.
        self.sample_rate = sample_rate
        # Prefix prepended to the subset names; subset names are appended by
        # plain string concatenation, so keep the trailing separator.
        self.files_to_use = files_to_use
        # Duration bounds forwarded to the manifest pruning for the train split.
        self.min_duration = min_duration
        self.max_duration = max_duration
# Run configuration shared by the functions below.
args = parser()

# Maps each output split name to the extracted LibriSpeech subset directory it
# is built from (prefix + subset name, joined by plain concatenation).
LIBRI_SPEECH_DICTS = {
    'train': args.files_to_use + 'train-clean-360',
    'val': args.files_to_use + 'dev-clean',
    'test-clean': args.files_to_use + 'test-clean',
}
# NOTE(review): a bare `LIBRI_SPEECH_URLS` expression used to follow this dict.
# Nothing visible in this file defines that name, so evaluating it would raise
# NameError when the file runs as a script; it has been dropped.
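# The settings above are hard-coded because this code was written in a Jupyter
# notebook; use argparse instead if you need to change them from the command line.
# The code has three parts:
# 1. Create the output directories.
# 2. Traverse the already-extracted source directory with os.walk and write its
#    files into the directories created in step 1.
# 3. Build the CSV manifest that pairs each wav file with its txt transcript.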
def main():
    """Convert every configured split into wav/txt folders and write its manifest.

    For each ``split name -> source directory`` entry in ``LIBRI_SPEECH_DICTS``:

    1. create ``<target_dir>/<split>/wav`` and ``<target_dir>/<split>/txt``;
    2. walk the source directory and pass every ``.flac`` file to
       ``_process_file``;
    3. call ``create_manifest`` for the split. Only the ``train`` split is
       given ``args.min_duration`` / ``args.max_duration``.

    The manifest name ``libri_<split>_manifest.csv`` is a relative path, so the
    file is created relative to the current working directory.
    """
    target_dir = args.target_dir
    # exist_ok replaces the check-then-create pattern, which could fail if the
    # directory appeared between the check and the creation.
    os.makedirs(target_dir, exist_ok=True)

    for split_type, lst_libri_dic in LIBRI_SPEECH_DICTS.items():
        split_dir = os.path.join(target_dir, split_type)  # e.g. librispeech_out/train
        split_wav_dir = os.path.join(split_dir, 'wav')    # e.g. librispeech_out/train/wav
        split_txt_dir = os.path.join(split_dir, 'txt')    # e.g. librispeech_out/train/txt
        # makedirs also creates split_dir itself as an intermediate directory.
        os.makedirs(split_wav_dir, exist_ok=True)
        os.makedirs(split_txt_dir, exist_ok=True)

        # The source directory must already be extracted; the on-the-fly
        # archive extraction that used to sit here was disabled.
        for root, _subdirs, files in tqdm(os.walk(lst_libri_dic)):
            for f in files:
                # Match on the extension only, so names that merely contain
                # ".flac" somewhere in the middle are not sent to sox.
                if f.endswith('.flac'):
                    _process_file(wav_dir=split_wav_dir, txt_dir=split_txt_dir,
                                  base_filename=f, root_dir=root)

        manifest_name = 'libri_' + split_type + '_manifest.csv'
        if split_type == 'train':  # Prune to min/max duration
            create_manifest(split_dir, manifest_name, args.min_duration, args.max_duration)
        else:
            create_manifest(split_dir, manifest_name)
def _preprocess_transcript(phrase):
return phrase.strip().upper()
def _process_file(wav_dir, txt_dir, base_filename, root_dir):
    """Convert one FLAC recording to WAV and write its transcript to a text file.

    The recording is converted with ``sox`` to ``args.sample_rate`` Hz, 16-bit,
    single channel. The transcript is looked up in the ``*.trans.txt`` file
    that sits next to the recording, passed through
    ``_preprocess_transcript`` and written to ``<txt_dir>/<stem>.txt``.

    Args:
        wav_dir: Directory that receives ``<stem>.wav``.
        txt_dir: Directory that receives ``<stem>.txt``.
        base_filename: Name of the recording inside ``root_dir``. Its last
            ``-``-separated field is the utterance key; the preceding fields
            name the ``.trans.txt`` file.
        root_dir: Directory holding the recording and its transcript file.

    Raises:
        AssertionError: If the recording, the transcript file, or the
            utterance key inside the transcript file is missing.
    """
    full_recording_path = os.path.join(root_dir, base_filename)
    assert os.path.exists(full_recording_path) and os.path.exists(root_dir)

    stem = os.path.splitext(base_filename)[0]
    wav_recording_path = os.path.join(wav_dir, stem + ".wav")
    # Argument list instead of a shell string: paths containing spaces or
    # shell metacharacters are passed to sox verbatim.
    subprocess.call(["sox", full_recording_path,
                     "-r", str(args.sample_rate), "-b", "16", "-c", "1",
                     wav_recording_path])

    # process transcript
    txt_transcript_path = os.path.join(txt_dir, stem + ".txt")
    transcript_file = os.path.join(root_dir, "-".join(base_filename.split('-')[:-1]) + ".trans.txt")
    assert os.path.exists(transcript_file), "Transcript file {} does not exist.".format(transcript_file)

    # Each non-blank line is "<id> <words...>"; index by the last "-" field of the id.
    transcriptions = {}
    with open(transcript_file, encoding="utf-8") as transcript:
        for line in transcript:
            fields = line.split()
            if not fields:  # tolerate blank lines instead of raising IndexError
                continue
            transcriptions[fields[0].split("-")[-1]] = " ".join(fields[1:])

    key = stem.split("-")[-1]
    # Validate before opening the output so a failure leaves no empty .txt behind.
    assert key in transcriptions, "{} is not in the transcriptions".format(key)
    with open(txt_transcript_path, "w", encoding="utf-8") as f:
        f.write(_preprocess_transcript(transcriptions[key]))
def _transcript_path_for(wav_path):
    """Return the transcript path for *wav_path*: last ``wav`` directory -> ``txt``, extension -> ``.txt``."""
    directory, wav_name = os.path.split(wav_path)
    parts = directory.split(os.sep)
    # Rewrite only the innermost "wav" directory so that a parent directory
    # that happens to be called "wav" is left untouched.
    for index in range(len(parts) - 1, -1, -1):
        if parts[index] == 'wav':
            parts[index] = 'txt'
            break
    return os.path.join(os.sep.join(parts), os.path.splitext(wav_name)[0] + '.txt')


def create_manifest(data_path, output_path, min_duration=None, max_duration=None):
    """Write a CSV manifest pairing every ``.wav`` under *data_path* with its transcript.

    The wav files are collected recursively, passed through
    ``order_and_prune_files`` (which receives the duration bounds unchanged)
    and written one per line as ``<abs wav path>,<abs txt path>``, UTF-8
    encoded with ``\\n`` line endings.

    Args:
        data_path: Directory searched recursively for ``*.wav`` files.
        output_path: Path of the manifest file to create (overwritten).
        min_duration: Forwarded to ``order_and_prune_files``.
        max_duration: Forwarded to ``order_and_prune_files``.
    """
    file_paths = [os.path.join(dirpath, f)
                  for dirpath, _dirnames, files in os.walk(data_path)
                  for f in fnmatch.filter(files, '*.wav')]
    file_paths = order_and_prune_files(file_paths, min_duration, max_duration)
    # Binary mode keeps the "\n" terminator byte-exact on every platform.
    with open(output_path, "wb") as manifest:
        for wav_path in tqdm(file_paths, total=len(file_paths)):
            transcript_path = _transcript_path_for(wav_path)
            sample = os.path.abspath(wav_path) + ',' + os.path.abspath(transcript_path) + '\n'
            manifest.write(sample.encode('utf-8'))
    print('\n')
def order_and_prune_files(file_paths, min_duration, max_duration):
    """Return *file_paths* sorted by audio duration, optionally pruned to a range.

    The duration of each file is obtained by running ``soxi -D`` on it.

    Args:
        file_paths: Paths of the audio files to measure.
        min_duration: Lower bound in seconds (inclusive), or ``None``.
        max_duration: Upper bound in seconds (inclusive), or ``None``.
            Pruning happens only when both bounds are given; a bound of ``0``
            counts as given.

    Returns:
        The remaining paths, ordered by ascending duration.

    Raises:
        subprocess.CalledProcessError: If ``soxi`` exits with a non-zero status.
        ValueError: If the ``soxi`` output cannot be parsed as a float.
    """
    print("Sorting manifests...")
    # Argument list instead of a shell string: quotes, spaces or "$" in a path
    # can no longer break or alter the command.
    duration_file_paths = [
        (path, float(subprocess.check_output(["soxi", "-D", path.strip()])))
        for path in file_paths
    ]
    # Compare against None so that a legitimate bound of 0 still enables pruning.
    if min_duration is not None and max_duration is not None:
        print("Pruning manifests between %d and %d seconds" % (min_duration, max_duration))
        duration_file_paths = [(path, duration) for path, duration in duration_file_paths
                               if min_duration <= duration <= max_duration]

    duration_file_paths.sort(key=lambda entry: entry[1])
    return [path for path, _duration in duration_file_paths]  # Remove durations