# -*- coding: utf-8 -*-
import requests
import re
import os
import time
from scrapy import Selector
from fake_useragent import UserAgent
# Build request headers with a randomised User-Agent
ua = UserAgent()
headers = {
    'Referer': '**********',
    'Sec-Fetch-Mode': 'no-cors',
    'User-Agent': ua.random
}
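# Note: ua.random is evaluated once above, so every request below reuses the
# same User-Agent. A minimal sketch of true per-request rotation (fresh_headers
# is a hypothetical helper, reusing the fake_useragent setup above):
def fresh_headers():
    return {
        'Referer': '**********',
        'Sec-Fetch-Mode': 'no-cors',
        'User-Agent': ua.random  # re-sampled on every call
    }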
# Walk the list pages
for first_i in range(1, 249):
    url = '*********/page/' + str(first_i) + '/'
    response_first = requests.get(url=url, headers=headers)
    print('response_first ---code===' + str(response_first.status_code))
    selector_first = Selector(text=response_first.text)
    # XPath: collect the hyperlinks to the detail pages
    href_list_first = selector_first.xpath('//*[@id="pins"]/li/a/@href').getall()
    for href_first in href_list_first:
        print(href_first)
        response_first = requests.get(url=href_first, headers=headers)
        response_first.encoding = 'utf-8'
        selector_first = Selector(text=response_first.text)
        # Grab the detail page's title; it becomes the download folder name
        name = selector_first.xpath('/html/body/div[2]/div[1]/h2/text()').extract_first()
        # Strip characters that Windows does not allow in folder names
        name = re.sub(r'[\*"/:?\\|<>]', '', name)
        print(name)
        # Read the image count of the detail page from its pagination bar
        page = selector_first.xpath('/html/body/div[2]/div[1]/div[4]/a[5]/span/text()').extract_first()
        if page is None:
            # No pagination bar found, skip this detail page
            continue
        elif page == '下一页»':
            # The fifth pager link is "next page", so the last page number is in a[3]
            page = selector_first.xpath('/html/body/div[2]/div[1]/div[4]/a[3]/span/text()').extract_first()
            page = int(page) + 1
        else:
            page = int(page) + 1
        print(page)
        # Request every image page of this detail page
        for i in range(1, page):
            url_second = href_first + '/' + str(i)
            response_second = requests.get(url=url_second, headers=headers)
            print('response_second ---code===' + str(response_second.status_code))
            # The regex body was lost in the source; this pattern is an assumption
            # about the detail page's main-image markup
            picture_url = re.findall('<img src="(.*?)"', response_second.text)
            if picture_url:
                response_third = requests.get(url=picture_url[0], headers=headers)
                if response_third.status_code == 200:
                    os.makedirs(name, exist_ok=True)
                    with open(os.path.join(name, str(i) + '.jpg'), 'wb') as f:
                        f.write(response_third.content)
                    # One request per second to avoid getting the IP banned
                    time.sleep(1)
Since this downloads only one image per second and there are a lot of images, anyone who has a working IP (proxy) pool is welcome to get in touch!
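As a starting point, here is a minimal sketch of per-request proxy rotation with requests; the pool below is a placeholder, so fill it with real endpoints from your own provider:

import random

# Hypothetical proxy pool -- these addresses are placeholders, not real proxies
PROXY_POOL = [
    'http://127.0.0.1:8001',
    'http://127.0.0.1:8002',
]

def get_with_proxy(url):
    # Pick a random proxy per request so a ban only burns one address
    proxy = random.choice(PROXY_POOL)
    return requests.get(url, headers=headers,
                        proxies={'http': proxy, 'https': proxy}, timeout=10)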