参考资料: 如何在Notebook中上传下载OBS文件?
# Download the OBS folder sub_dir_0 from OBS to the Notebook
mox.file.copy_parallel('obs://bucket_name/sub_dir_0', '/home/ma-user/work/sub_dir_0')
# Download the OBS file obs_file.txt from OBS to the Notebook
mox.file.copy('obs://bucket_name/obs_file.txt', '/home/ma-user/work/obs_file.txt')
# Upload the folder sub_dir_0 from the Notebook to OBS
mox.file.copy_parallel('/home/ma-user/work/sub_dir_0', 'obs://bucket_name/sub_dir_0')
# Upload the file obs_file.txt from the Notebook to OBS
mox.file.copy('/home/ma-user/work/obs_file.txt', 'obs://bucket_name/obs_file.txt')
基于此编写代码,思路是:通过os.system调用unzip命令将压缩包解压,再将解压后的文件回传至OBS。
import moxing as mox
import os
import argparse
def unzip(data_path, zip_name, desc_path):
    """Extract an archive located in ``data_path`` and upload its contents to OBS.

    The archive is extracted in place (into ``data_path``) with the external
    ``unzip`` command, then every file found under ``data_path`` except the
    archive itself is copied to ``desc_path`` with the same relative layout.

    Args:
        data_path: Local directory that contains the archive.
        zip_name: File name of the archive inside ``data_path``.
        desc_path: Destination prefix (e.g. an ``obs://`` URL) for the upload.

    Returns:
        -1 when ``zip_name`` does not end with ``zip``; otherwise None.
    """
    # Local import keeps this block self-contained; the file's import list
    # does not include shlex.
    import shlex

    tar_path = os.path.join(data_path, zip_name)
    if zip_name.endswith('zip'):
        # Quote both paths so spaces or shell metacharacters in them cannot
        # break (or hijack) the command line handed to the shell.
        command = 'unzip -d {} {}'.format(
            shlex.quote(data_path + '/'), shlex.quote(tar_path))
    else:
        print('压缩包格式不符合要求!')
        return -1
    print('*'*10+'start unzip'+'*'*10)
    print(command)
    status = os.system(command)
    if status != 0:
        # Best effort: report the failure but still upload whatever was
        # extracted, as the original flow did.
        print('unzip exited with status {}'.format(status))
    print('*'*10+'finish unzip'+'*'*10)
    for root, _, files in os.walk(data_path):
        for name in files:
            file_path = os.path.join(root, name)
            if file_path == tar_path:
                # Do not upload the archive itself.
                continue
            # relpath strips only the leading data_path; str.replace would
            # also delete any later occurrence of that string in the path.
            relative = os.path.relpath(file_path, data_path)
            target = os.path.join(desc_path, *relative.split(os.sep))
            mox.file.copy(file_path, target)
    print('*'*10+'finish transfer'+'*'*10)
if __name__ == '__main__':
    # Entry point for the training job: extract data.zip found under
    # --data_url and upload the extracted files to obs://gyz/output/.
    parser = argparse.ArgumentParser()
    # The value is passed straight to os.path.abspath below, which raises
    # TypeError on None, so the option has to be mandatory.
    parser.add_argument('--data_url', required=True, help='Location of data.')
    # parse_known_args tolerates extra options that are not declared here.
    args, _ = parser.parse_known_args()
    data_path = os.path.abspath(args.data_url)
    unzip(data_path, 'data.zip', 'obs://gyz/output/')
关于如何在ModelArts上运行程序,请参考《ModelArts运行mindspore程序实例教程》。目前只支持zip格式,如需解压其他格式的压缩包,请修改下面这段代码。
# Branch inside unzip() that builds the extraction command; add further
# branches here to support other archive formats.
if zip_name.endswith('zip'):
    command = 'unzip -d {} {}'.format(data_path + '/', tar_path)
在创建训练作业时指定压缩包在OBS上的位置。
运行程序,会将数据解压,最后回传到指定路径obs://gyz/output/中。
进入notebook之后新建Python文件;
使用moxing复制压缩包到notebook中;
import moxing as mox
# Copy the archive ImageNet1K.tar from OBS to the local relative path ImageNet1K.tar
mox.file.copy_parallel('obs://automation-system/data_vision/ImageNet1K/ImageNet1K.tar', 'ImageNet1K.tar')
打开terminal;
运行python脚本,可发现压缩包被复制到notebook中。
然后解压压缩包,编写代码把文件回传到OBS中。新建transfer_file.py脚本,编写代码如下:
import moxing as mox
import os
if __name__ == '__main__':
    # Upload every file under ./data to OBS, keeping the directory layout.
    data_folder = 'data'
    desc_path = 'obs://gyz/output/'
    cwd = os.getcwd()
    data_path = os.path.join(cwd, data_folder)
    # All uploads land below <desc_path><data_folder>/; computed once
    # because it does not change per file.
    obs_root = desc_path + data_folder + '/'
    for root, _, files in os.walk(data_path):
        for name in files:
            file_path = os.path.join(root, name)
            # relpath strips only the leading data_path; str.replace would
            # also delete any later occurrence of that string in the path.
            relative = os.path.relpath(file_path, data_path)
            tmp_desc_path = os.path.join(obs_root, *relative.split(os.sep))
            mox.file.copy(file_path, tmp_desc_path)
执行脚本,即可将解压的文件复制到OBS中。
在解压OBS上150G大小的数据集ImageNet1K.tar时,发现上述两种方式均失败了。上述两种方式其实都是把数据复制到ModelArts中,解压后再回传到OBS中,而ModelArts可能禁止一次性传输太大的数据或者存在空间不足的问题。经过尝试,摸索了一种方式,首先直接使用OBS将ImageNet1K.tar下载到本地,本地解压后本来打算直接用OBS上传,但发现OBS直接上传会很慢。因此采用了一种折中的方式,首先按子文件夹将数据压缩成多个压缩包;然后将压缩包上传至OBS;最后使用ModelArts中notebook依次解压OBS上各个小型压缩包,并把数据回传至OBS;过程异常曲折,更好的办法还在探寻中。
ImageNet1K.tar解压后,将train下面的多个子文件夹分别进行压缩;
import os
import zipfile
def zip_dir(dirname, zipfilename):
    """Create a deflate-compressed zip archive from a directory or a single file.

    Args:
        dirname: Directory whose files are archived recursively, or the path
            of a single file to archive.
        zipfilename: Path of the zip archive to create (overwritten if it
            already exists).

    Entries are stored relative to ``dirname``; a single file is stored under
    its base name.
    """
    # Collect the inputs before the archive is opened, so the list of files
    # is fixed before the output file is created.
    if os.path.isfile(dirname):
        # Slicing off len(dirname) would leave an empty archive name here,
        # so use the file's base name instead.
        entries = [(dirname, os.path.basename(dirname))]
    else:
        entries = []
        for root, _, names in os.walk(dirname):
            for name in names:
                path = os.path.join(root, name)
                entries.append((path, os.path.relpath(path, dirname)))
    # The context manager closes the archive even if a write fails.
    with zipfile.ZipFile(zipfilename, "w", zipfile.ZIP_DEFLATED) as zf:
        for path, arcname in entries:
            zf.write(path, arcname)
if __name__ == '__main__':
    # Compress each entry of the train folder into a sibling <name>.zip.
    data_folder = r'E:\CLS-LOC\train'
    # Filter out archives left by a previous run up front, so the progress
    # total only counts entries that are actually compressed.
    folder_list = [name for name in os.listdir(data_folder)
                   if not name.endswith('zip')]
    folder_num = len(folder_list)
    for idx, folder in enumerate(folder_list, start=1):
        folder_path = os.path.join(data_folder, folder)
        zip_path = folder_path + '.zip'
        zip_dir(folder_path, zip_path)
        print('finish: {}/{}'.format(idx, folder_num))
使用ModelArts中Notebook运行如下程序,将多个子压缩包先复制到Notebook中,然后解压之后依次回传到OBS;为了防止Notebook中空间不够,每完成一个,则删除Notebook上复制的压缩包与解压的文件。
import moxing as mox
import os
def unzip_in_obs(obs_folder, output_folder):
    """Extract every zip archive in an OBS folder and upload the contents back.

    For each ``*.zip`` entry listed in ``obs_folder`` the archive is copied
    into ``output_folder``, extracted with the external ``unzip`` command into
    a sub-directory named after the archive, and each extracted file is
    uploaded to ``obs_folder`` under that same sub-directory. The local
    archive and extracted files are deleted before the next archive is
    processed, to limit local disk usage.

    Args:
        obs_folder: OBS URL of the folder that holds the zip archives; also
            the upload destination.
        output_folder: Local scratch directory (created if missing).
    """
    # Local imports keep this block self-contained; the file's import list
    # does not include these modules.
    import shlex
    import shutil

    os.makedirs(output_folder, exist_ok=True)
    file_list = mox.file.list_directory(obs_folder)
    for zip_file in file_list:
        if not zip_file.endswith('.zip'):
            continue
        obs_path = os.path.join(obs_folder, zip_file)
        zip_path = os.path.join(output_folder, zip_file)
        mox.file.copy_parallel(obs_path, zip_path)
        print('copy file: '+zip_file)
        # splitext removes only the trailing extension; str.replace would
        # also remove a '.zip' in the middle of the name.
        zip_name = os.path.splitext(zip_file)[0]
        zip_folder = os.path.join(output_folder, zip_name)
        os.makedirs(zip_folder, exist_ok=True)
        # Quote both paths so spaces or shell metacharacters are safe.
        command = 'unzip -d {} {}'.format(
            shlex.quote(zip_folder + '/'), shlex.quote(zip_path))
        print(command)
        status = os.system(command)
        if status != 0:
            # Best effort: report and carry on with whatever was extracted.
            print('unzip exited with status {}'.format(status))
        for root, _, names in os.walk(zip_folder):
            for name in names:
                file_path = os.path.join(root, name)
                # relpath strips only the leading output_folder; with a
                # short folder name such as 'train', str.replace would also
                # mangle any file or directory name containing that text.
                relative = os.path.relpath(file_path, output_folder)
                desc_path = os.path.join(obs_folder, *relative.split(os.sep))
                mox.file.copy_parallel(file_path, desc_path)
        # Free local disk space before handling the next archive.
        try:
            os.remove(zip_path)
        except FileNotFoundError:
            pass
        shutil.rmtree(zip_folder, ignore_errors=True)
if __name__ == '__main__':
    # Process the archives stored in the OBS train folder, using the local
    # directory 'train' as scratch space.
    obs_folder, output_folder = 'obs://gyz/data/CLS-LOC/train/', 'train'
    unzip_in_obs(obs_folder=obs_folder, output_folder=output_folder)