Crawler study notes: saving images (meizitu)


# -*- coding: utf-8 -*-
import requests
import re
import os
import time
from scrapy import Selector
from fake_useragent import UserAgent

# Rotate in a random User-Agent for the request headers
ua = UserAgent()

headers = {
	'Referer': '**********',
	'Sec-Fetch-Mode': 'no-cors',
	'User-Agent': ua.random
}

# Fetch each listing page
for first_i in range(1, 249):
	url = '*********/page/' + str(first_i) + '/'
	response_first = requests.get(url = url, headers = headers)
	print('response_first ---code===' + str(response_first.status_code))
	
	selector_first = Selector(response_first)
	# Use XPath to collect the links to every detail page
	href_list_first = selector_first.xpath('//*[@id="pins"]/li/a/@href').getall()
	for href_first in href_list_first:
		print(href_first)
		
		response_detail = requests.get(url = href_first, headers = headers)
		response_detail.encoding = 'utf-8'
		selector_detail = Selector(response_detail)
		# Grab the gallery title; it becomes the save-folder name
		name = selector_detail.xpath('/html/body/div[2]/div[1]/h2/text()').extract_first()
		# Strip characters that Windows forbids in folder names
		name = re.sub(r'[\\*"/:?|<>]', '', name)
		print(name)
		# Read the gallery's image count from the pager
		page = selector_detail.xpath('/html/body/div[2]/div[1]/div[4]/a[5]/span/text()').extract_first()
		if page is None:
			# No pager found, so skip this gallery
			continue
		elif page == '下一页»':
			# The fifth pager slot is the "next page" arrow ('下一页»'),
			# so the real count sits one anchor earlier
			page = selector_detail.xpath('/html/body/div[2]/div[1]/div[4]/a[3]/span/text()').extract_first()
			page = int(page) + 1
		else:
			page = int(page) + 1
		print(page)
		# Request every image page in the gallery
		for i in range(1, page):
			
			url_second = href_first + '/' + str(i)
			response_second = requests.get(url = url_second, headers = headers)
			print('response_second ---code===' + str(response_second.status_code))
			selector_second = Selector(response_second)
			# The original regex here was truncated; this image XPath is an assumption
			picture_url = selector_second.xpath('//div[@class="main-image"]//img/@src').extract_first()
			if picture_url:
				response_picture = requests.get(url = picture_url, headers = headers)
				if response_picture.status_code == 200:
					os.makedirs(name, exist_ok = True)
					with open(os.path.join(name, str(i) + '.jpg'), 'wb') as f:
						f.write(response_picture.content)
				# One image per second to keep the IP from getting banned
				time.sleep(1)
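
One small refinement worth noting: ua.random above is evaluated once at startup, so every request shares the same User-Agent. A minimal sketch of sampling a fresh one per request (the fresh_headers helper is my name, not part of the original script):

def fresh_headers():
	# Sample a new User-Agent on every call so requests don't all share one UA
	return {
		'Referer': '**********',
		'Sec-Fetch-Mode': 'no-cors',
		'User-Agent': ua.random
	}

# Usage: requests.get(url, headers = fresh_headers())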

Since it's one image per second and there are a lot of images, this runs slowly! If anyone has a working IP proxy pool set up, I'd be glad to hear from you!!!
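
In the meantime, here's a minimal sketch of what rotating requests through a proxy pool could look like; the addresses in PROXY_POOL are placeholders, not working proxies:

import random
import requests

# Placeholder proxies -- swap in live addresses from your own pool
PROXY_POOL = [
	'http://127.0.0.1:8001',
	'http://127.0.0.1:8002',
]

def get_with_proxy(url, headers):
	# Route each request through a randomly chosen proxy so no single IP
	# carries all of the traffic
	proxy = random.choice(PROXY_POOL)
	return requests.get(url, headers = headers,
			proxies = {'http': proxy, 'https': proxy}, timeout = 10)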
