数据下载


AI Meets Beauty

Perfect Half Million Beauty Product Image Recognition Challenge


分析:比赛提供的数据集是一个 CSV 文件(data.csv),每行包含图片名称和对应的下载地址,因此首先要根据该文件把图片下载到本地。
数据集总共包含 520k 张图像,来自 14 个电商网站。

data.csv
数据下载_第1张图片
Dataset

数据下载脚本如下

# -*- coding: utf-8 -*-
# download.py --下载数据集

from __future__ import print_function 
import os, csv
from skimage import io
from multiprocessing import Pool

def signal_handler(signum, frame, img_url):
    """Raise a time-out exception tagged with the image URL.

    Args:
        signum: Not used by this function.
        frame: Not used by this function.
        img_url: Value attached to the raised exception as its second
            argument.

    Raises:
        Exception: Always, with ``args == ('Time out--', img_url)``.
    """
    # NOTE(review): nothing in the visible script references this function,
    # and its three-parameter signature differs from the (signum, frame)
    # pair that signal.signal passes to handlers -- confirm how it is wired
    # up before relying on it.
    timeout_error = Exception('Time out--', img_url)
    raise timeout_error

def _record_failure(entry):
    """Print *entry* and append it as one line to ``failure.txt``."""
    print(entry)
    # Open per call and close immediately: this runs inside pool workers,
    # so a long-lived handle would be leaked in every process.
    with open('failure.txt', 'a') as failures:
        failures.write(entry + '\n')


def getFile(url):
    """Download one image described by a ``data.csv`` line into ./dataset.

    Args:
        url: One raw CSV line of the form ``<name>,<image url>``. The URL
            may be wrapped in double quotes and the line may still carry
            its trailing newline.

    Returns:
        None. On success the image is written to ``./dataset/<name>.jpg``.
        Blank lines are ignored. A line that does not contain both fields
        is logged verbatim to ``failure.txt``; a failed download or save is
        logged as ``<name> <url>``. No exception propagates to the caller,
        so one bad record cannot abort a ``Pool.map`` run.
    """
    line = url.strip()
    if not line:
        return

    # csv.reader removes the line terminator and unwraps quoted fields, so
    # a quoted URL keeps any commas it contains and loses both quotes.
    try:
        fields = next(csv.reader([line]))
    except csv.Error:
        fields = []
    if len(fields) < 2:
        _record_failure(line)
        return

    imgname = fields[0]
    # The extra strip covers a quote that follows whitespace, which the
    # csv parser treats as literal text.
    imgurl = fields[1].strip().strip('"')
    imgformat = '.jpg'

    try:
        image = io.imread(imgurl)
        io.imsave("./dataset/" + imgname + imgformat, image)
    except Exception:
        # Deliberately broad: any network, decoding or disk error for one
        # image is recorded for a later retry instead of stopping the run.
        _record_failure(imgname + ' ' + imgurl)
    
if __name__ == "__main__":
    # Script entry point: read data.csv from the current directory and
    # download every listed image in parallel via getFile().

    # Output directory that getFile() saves images into.
    if not os.path.exists('./dataset'):
        os.mkdir('./dataset')

    # Blank lines carry no record, so drop them before dispatching work.
    with open('data.csv', 'r') as f:
        csvf = [line for line in f if line.strip()]

    parallel_num = 10
    p = Pool(parallel_num)
    try:
        # Hand the whole list to the pool in a single call. Slicing it into
        # batches of parallel_num skipped the final
        # len(csvf) % parallel_num lines and made every batch wait for its
        # slowest download.
        p.map(getFile, csvf)
    finally:
        # Always release the worker processes, even if map() raised.
        p.close()
        p.join()

你可能感兴趣的:(数据下载)