threading模块多线程下载斗图图片

涉及：
lxml 模块的 CSS 选择器
requests 库
threading 多线程
测试环境为 Python 2.7。

#coding=utf8

import re
import os
import lxml.html
import requests
import threading
import time

def get_url(url):
    """Scrape one listing page and start downloading the images found on it.

    Fetches ``url``, selects every ``a.col-xs-6.col-sm-3`` element with a CSS
    selector, reads the image address from the ``data-original`` attribute in
    the element's markup and the title from its text content, then hands both
    lists to ``start_save_img``.

    Entries whose markup has no matching ``data-original`` attribute are
    skipped, so the address list and the title list always stay aligned.

    Args:
        url: Address of the listing page to scrape.
    """
    # Without a timeout a stalled connection would block the crawl forever.
    html = requests.get(url, timeout=30).content
    soup = lxml.html.fromstring(html)
    src_list = soup.cssselect('a.col-xs-6.col-sm-3')  # CSS selector for the image entries
    # Compiled once instead of once per entry; group 2 is the address with
    # the optional "http:" scheme and the leading "//" removed.
    img_pattern = re.compile('data-original="(http:)?//(.*?)"')
    url_list = []
    title_list = []
    for src in src_list:
        match = img_pattern.search(lxml.html.tostring(src))
        if match is None:
            # No usable image address in this entry: skip it rather than
            # abort the whole page with an AttributeError.
            continue
        url_list.append(match.group(2))
        # Title is the entry's text with newlines and spaces removed.
        title = src.text_content().replace('\n', '').replace(' ', '').strip()
        title_list.append(title)
    start_save_img(url_list, title_list)

def save_img(img_url, title):
    """Download one image and store it as ``img/<title><extension>``.

    Addresses that contain none of ``.jpg``, ``.png`` or ``.gif`` are
    reported and skipped without any network or file access.

    Args:
        img_url: Image address without its scheme; ``http://`` is prepended.
        title: Image title, used as the file name. Path separators in it are
            replaced with ``_`` so the file always lands inside ``img``.
    """
    img_url = 'http://' + img_url
    # Every alternative carries an escaped dot; previously ".gif" was
    # unescaped, so any character followed by "gif" counted as an extension.
    ext_match = re.search(r'\.(?:jpg|png|gif)', img_url)
    if ext_match is None:
        print('跳过不支持的图片格式: ' + img_url)
        return
    img_format = ext_match.group()
    print('正在下载' + img_url)
    # Without a timeout a stalled connection would keep this thread (and the
    # interpreter, since the thread is non-daemon) alive forever.
    img_content = requests.get(img_url, timeout=30).content
    if not os.path.isdir('img'):
        try:
            os.mkdir('img')
        except OSError:
            # Another download thread may have created the directory between
            # the check and the mkdir; only a still-missing directory is an error.
            if not os.path.isdir('img'):
                raise
    safe_title = title.replace('/', '_').replace('\\', '_')
    with open('img/{}{}'.format(safe_title.encode('utf-8'), img_format), 'wb') as f:
        f.write(img_content)

def start_save_img(url_list, title_list):
    """Start one download thread per image address.

    Each thread runs ``save_img`` with the address at a given position in
    ``url_list`` and the title at the same position in ``title_list``.
    The threads are started but not joined here.

    Args:
        url_list: Image addresses to download.
        title_list: Titles, indexed in parallel with ``url_list``.
    """
    for position, address in enumerate(url_list):
        worker = threading.Thread(
            target=save_img,
            args=(address, title_list[position]),
        )
        worker.start()

def start(first_page=1, last_page=9, delay=3):
    """Scrape a range of listing pages, pausing after each one.

    Called without arguments it scrapes pages 1 through 9 with a 3 second
    pause after each page.

    Args:
        first_page: Number of the first listing page to scrape.
        last_page: Number of the last listing page to scrape (inclusive).
        delay: Seconds to sleep after each page.
    """
    for page in range(first_page, last_page + 1):
        url = 'https://www.doutula.com/photo/list/?page={}'.format(page)
        get_url(url)
        time.sleep(delay)

# Run the crawler only when this file is executed as a script.
if __name__ == '__main__':
    start()

你可能感兴趣的:(threading模块多线程下载斗图图片)