The version without async:
# -*- coding: utf-8 -*-
import json
import os
from urllib import request
import csv
import sys
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--csv_dir', default='tmp/csv', type=str,
                    help="directory containing the exported CSV files")
parser.add_argument('--save_dir', default='tmp/save_1', type=str,
                    help="output root; ./images and ./labels_json are created under it")
arg = parser.parse_args()

csv.field_size_limit(sys.maxsize)
csv_dir = arg.csv_dir
csv_all = os.listdir(csv_dir)
save_dir = arg.save_dir
save_images_path = os.path.join(save_dir, "images")
save_labels_path = os.path.join(save_dir, "labels_json")
if not os.path.exists(save_images_path):
    os.makedirs(save_images_path)
if not os.path.exists(save_labels_path):
    os.makedirs(save_labels_path)

for csv_name in csv_all:
    # encoding='gb2312' may be needed for some exports
    with open(os.path.join(csv_dir, csv_name), encoding='UTF8') as f:
        csv_reader = csv.reader(f)
        i = 0
        for line in csv_reader:
            if i > 0:  # row 0 is the header
                data_id = line[0]
                jpg_name = line[2].split('/')[-1]
                if not jpg_name.endswith('jpg'):
                    continue
                jpg_url = line[2]
                json_1 = line[3]
                json_result = json.loads(json_1)
                is_value = json_result['info'][0]['value']
                if is_value == 'No':  # skip rows labelled "No"
                    continue
                with open(save_labels_path + '/' + jpg_name.split('.')[0] + '.json', 'w', encoding='utf-8') as ftxt:
                    json.dump(json_result, ftxt, ensure_ascii=False)
                try:
                    request.urlretrieve(jpg_url, save_images_path + '/' + jpg_name)
                except Exception:
                    print("error image", jpg_name)
                    continue
            i = i + 1
        print(i)  # rows processed for this CSV (header included; skipped rows are not counted)
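For reference, both scripts assume each CSV row has at least four columns, of which only columns 0, 2, and 3 are read. A hypothetical example row follows; the values and the unused column are illustrative, not taken from the original export.

sample_row = [
    "10086",                                # line[0]: data_id
    "unused",                               # line[1]: not read by these scripts (illustrative)
    "http://example.com/images/0001.jpg",   # line[2]: image URL; the file name is its last path segment
    '{"info": [{"value": "Yes"}]}',         # line[3]: label JSON; rows whose info[0].value is "No" are skipped
]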
With asynchronous programming, several images can be downloaded at the same time. As originally written, this version downloaded fewer images than the one above; the likely cause is that a row falling exactly on a batch boundary was never queued, and that the last partial batch was never awaited. Both are fixed in the version below.
# -*- coding: utf-8 -*-
import json
import os
import csv
import sys
import argparse
import asyncio
import requests_async as requests

parser = argparse.ArgumentParser()
parser.add_argument('--csv_dir', default='tmp/csv', type=str,
                    help="directory containing the exported CSV files")
parser.add_argument('--save_dir', default='tmp/save_1', type=str,
                    help="output root; ./images and ./labels_json are created under it")
arg = parser.parse_args()

csv.field_size_limit(sys.maxsize)
csv_dir = arg.csv_dir
csv_all = os.listdir(csv_dir)
save_dir = arg.save_dir
save_images_path = os.path.join(save_dir, "images")
save_labels_path = os.path.join(save_dir, "labels_json")

async def download_images_and_save2file(page, inx, jpg_name, image_url):
    try:
        response = await requests.get(image_url)
        with open(f"{save_images_path}/{jpg_name}", "wb") as fp:
            fp.write(response.content)
        print(f"page: {page}, inx: {inx}, url: {image_url} done!")
    except Exception as e:
        print(f"e: {e}")

async def download_imgs():
    if not os.path.exists(save_images_path):
        os.makedirs(save_images_path)
    if not os.path.exists(save_labels_path):
        os.makedirs(save_labels_path)
    for csv_name in csv_all:
        # encoding='gb2312' may be needed for some exports
        with open(os.path.join(csv_dir, csv_name), encoding='UTF8') as f:
            csv_reader = csv.reader(f)
            inx = 0
            page = 1
            await_list = []
            for line in csv_reader:
                if "data_id" in line:  # skip the header row
                    continue
                jpg_name = line[2].split('/')[-1]
                jpg_url = line[2]
                json_1 = line[3]
                json_result = json.loads(json_1)
                is_value = json_result['info'][0]['value']
                if is_value == 'No':  # skip rows labelled "No"
                    continue
                with open(save_labels_path + '/' + jpg_name.split('.')[0] + '.json', 'w', encoding='utf-8') as ftxt:
                    json.dump(json_result, ftxt, ensure_ascii=False)
                inx += 1
                print(f"page: {page}, inx: {inx} queued...")
                await_list.append(download_images_and_save2file(page=page, inx=inx, jpg_name=jpg_name, image_url=jpg_url))
                if inx % 50 == 0:  # download in batches of 50
                    await asyncio.gather(*await_list)
                    await_list = []
                    page += 1
            if await_list:  # flush the final partial batch
                await asyncio.gather(*await_list)

if __name__ == '__main__':
    asyncio.run(download_imgs())
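As an alternative to gathering in fixed batches of 50, a semaphore can cap how many downloads are in flight at once, so one slow URL only delays its own task rather than the whole batch. This is a minimal sketch assuming aiohttp is installed; MAX_CONCURRENCY, fetch_one, and fetch_all are illustrative names, not part of the original script.

import asyncio
import aiohttp

MAX_CONCURRENCY = 50  # illustrative cap on simultaneous downloads

async def fetch_one(session, sem, save_path, image_url):
    async with sem:  # at most MAX_CONCURRENCY requests run concurrently
        try:
            async with session.get(image_url) as resp:
                data = await resp.read()
            with open(save_path, "wb") as fp:
                fp.write(data)
        except Exception as e:
            print(f"error image {save_path}: {e}")

async def fetch_all(path_url_pairs):
    sem = asyncio.Semaphore(MAX_CONCURRENCY)
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(fetch_one(session, sem, p, u)
                               for p, u in path_url_pairs))

# usage: asyncio.run(fetch_all([("tmp/save_1/images/0001.jpg", "http://example.com/images/0001.jpg")]))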