爬取斗图网的图片

单线程爬取斗图网的图片

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
import lxml,urllib
from lxml import etree
import os
# Base URL of the listing pages; the page number is appended to it.
BASE_URL = 'https://www.doutula.com/photo/list/?page='
# Accumulates the URL of every listing page to crawl (filled by get_urls_list).
PAGE_URLS = []
# Browser-like User-Agent header so the site does not reject our requests.
headers = {
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
}
# Download a single image belonging to one listing page.
def get_down_image(url, index):
    """Download one image into ./images/page{index}_image/.

    url   -- direct image URL; the last path segment may carry a
             "!style" suffix (e.g. "abc.jpg!dta") that must be stripped
    index -- zero-based page number, used to name the target folder
    """
    # Last path segment, e.g. "abc.jpg!dta" or plain "abc.jpg".
    filename = url.split('/')[-1]
    # Strip the optional "!style" suffix. Using [0] instead of the
    # original [-2] avoids an IndexError when no '!' is present
    # (split on a missing separator yields a single-element list).
    filename = filename.split('!')[0]
    os.makedirs('./images/page{}_image/'.format(index), exist_ok=True)  # ensure folder exists
    path = os.path.join('images/page{}_image'.format(index), filename)
    urlretrieve(url, filename=path)  # fetch the image to disk

# Collect the image URLs of one listing page and download each of them.
def get_image_urls(url, index):
    """Fetch one listing page and download every emoticon image on it.

    url   -- listing-page URL to scrape
    index -- zero-based page number, forwarded to get_down_image
    """
    page_html = requests.get(url, headers=headers).text
    tree = etree.HTML(page_html)
    # The images are lazy-loaded: the real source sits in the
    # data-original attribute rather than in src.
    xpath_expr = "//div[@class='page-content text-center']//img/@data-original"
    for img_url in tree.xpath(xpath_expr):
        get_down_image(img_url, index)

# Build the URL of every listing page to crawl.
def get_urls_list(pages=5):
    """Build the listing-page URLs (page 0 .. pages-1) and return them.

    pages -- number of listing pages to crawl; defaults to 5, matching
             the previous hard-coded behaviour, but is now configurable.

    The URLs are also appended to the module-level PAGE_URLS list so
    existing callers that read the global keep working.
    """
    urls = [BASE_URL + str(page) for page in range(pages)]
    PAGE_URLS.extend(urls)
    return PAGE_URLS

def main():
    """Crawl every listing page in order and download its images."""
    for page_index, page_url in enumerate(get_urls_list()):
        get_image_urls(page_url, page_index)

if __name__ == '__main__':
    main()

你可能感兴趣的:(爬虫)