Batch-crawling images from the 27270 beauty gallery

Ran it overnight; on my slow connection it only managed to pull down a few tens of thousands of images.

Failed requests are retried, with the retry count set to eight.
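
The retry lives in download_img below: its num parameter counts down and the function calls itself again when a fetch fails. A minimal non-recursive sketch of the same idea (fetch_with_retry is just an illustrative name, not part of the script):

import time
import requests

def fetch_with_retry(url, attempts=8):
	# Try the request up to `attempts` times; return the string 'error' if every
	# attempt fails, mirroring the convention get_url() in the script below uses
	for _ in range(attempts):
		try:
			return requests.get(url, timeout=30)
		except requests.RequestException:
			time.sleep(1)
	return 'error'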

Logging was wired up as well, but after some thought most of the log calls were commented out.
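
The script still configures a logger that writes to test.log; most of the calls to it are the commented-out lines you will see. If you also want to watch progress on the console, one option (not something the original script does) is to attach a StreamHandler next to the file handler:

import sys
import logging

logger = logging.getLogger("AppName")
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)-8s: %(message)s'))
logger.addHandler(console_handler)
logger.setLevel(logging.INFO)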

The code still carries plenty of traces of tinkering; feel free to take it and clean it up.


Target site: http://www.27270.com/

Python version used: 3.5.2

# -*- coding:utf-8 -*-
import os
import sys
import time
import random
import logging
import requests
import multiprocessing
from multiprocessing import Pool
from bs4 import BeautifulSoup

img_href = []  # gallery links (<a> tags) collected from the list pages
a_index = {}   # pagination label -> list-page URL
flag = True    # unused toggle (also shadowed by the flag class below)
html_index = ''
error_num = []
error_href = []
error_path = []
index = {'start': '', 'end': ''}  # first and last pagination numbers of the column
url_index = 'http://www.27270.com/ent/meinvtupian/'

sys.setrecursionlimit(1000000)
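# Note: pagination (get_page_href), per-gallery paging (get_child_href) and the
# download retries (download_img) are all written recursively, which is why the
# default recursion limit is raised above.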

# Get a logger instance; an empty name would return the root logger
logger = logging.getLogger("AppName")

# Log line format
formatter = logging.Formatter('%(asctime)s %(levelname)-8s: %(message)s')

# File handler
file_handler = logging.FileHandler("test.log")
file_handler.setFormatter(formatter)  # attach the format to the handler

# Register the handler on the logger
logger.addHandler(file_handler)

# Minimum level to emit (the default would be WARNING)
logger.setLevel(logging.INFO)


class flag(object):
	# Simple boolean flag holder (not actually used by the crawl loop)

	def __init__(self):
		self.f = True

	def get_f(self):
		return self.f

	def set_f(self):
		self.f = False


def is_folder(file_name=''):
	# Create the image storage folder if it does not exist yet
	cwd = os.getcwd() + file_name
	if not os.path.exists(cwd):
		os.mkdir(cwd)
		print('Created image folder %s' % file_name)
	else:
		# print("Image folder already exists")
		pass


def get_url(url='', host=''):
	# Fetch a URL and return the Response, or the string 'error' on failure
	response = ''
	header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36',
			  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
			  'Accept-Encoding': 'gzip, deflate',
			  'Accept-Language': 'zh-CN,zh;q=0.9',
			  'Cache-Control': 'max-age=0',
			  'Connection': 'keep-alive',
			  'Upgrade-Insecure-Requests': '1'
	}
	# Cookie captured from a browser session (defined but not passed to requests.get)
	cookie = {
		'Cookie': 'Hm_lvt_63a864f136a45557b3e0cbce07b7e572=1519296125,1519296217,1519306647,1519309454; Hm_lpvt_63a864f136a45557b3e0cbce07b7e572=1519310130'}
	# Optional proxy settings (pass proxies=proxies to requests.get to enable them)
	proxies = {
		"http": "http://" + '61.155.164.106:3128',
		"https": "http://" + '61.155.164.106:3128',
	}
	if host != '':
		header['Host'] = host
	'''
	try:
		print(header['Host'])
	except Exception:
		print('none host')
	'''
	try:
		response = requests.get(url, headers=header, timeout=30)
	except Exception:
		response = 'error'
		logger.error('%s \t\t get error' % url)
	finally:
		# print(url)
		if host != '':
			del header['Host']
		# Random delay between requests to go easy on the server
		time.sleep(random.randint(1, 4))
		return response


def download_img(url='http://t1.27270.com/uploads/tu/201802/726/e6e5afe62c.jpg', name='', the_path='', num=8):
	# Download a single image, retrying up to `num` times on failure
	response = get_url(url, host='t2.hddhhn.com')
	if response != 'error':
		cwd = os.getcwd() + r'\woman'
		file_name = name + '.' + url.split('/')[-1].split('.')[-1]
		logger.warning('%s \t\t download...' % url)

		with open(cwd + '\\' + the_path + file_name, 'wb') as f:
			f.write(response.content)
	else:
		if num > 0:
			return download_img(url, name=name, the_path=the_path, num=num - 1)
		print('download error')
	return
		
		
def get_index(url_index):
	# Fetch the column's index page and return its HTML
	response = get_url(url_index)
	response.encoding = 'gb2312'
	return response.text

		
def get_start_end(url=''):
	# Work out the first and last pagination numbers of the column
	response = get_url(url)
	response.encoding = 'gb2312'
	html_index = response.text
	soup = BeautifulSoup(html_index, "html.parser")
	a_index_a_all = soup.find("div", class_="NewPages").find('ul').find_all('a', target='_self')
	for a_index_a in a_index_a_all:
		a_index[a_index_a.string] = (url_index + a_index_a['href'])

	html_index = get_index(url_index)
	soup = BeautifulSoup(html_index, "html.parser")
	index['start'] = soup.find("div", class_="NewPages").find('li', class_='thisclass').find('a').string
	# '末页' is the site's "last page" button
	response = get_url(a_index['末页'])
	response.encoding = 'gb2312'
	html_index = response.text
	soup = BeautifulSoup(html_index, "html.parser")
	index['end'] = soup.find("div", class_="NewPages").find('li', class_='thisclass').find('a').string

def get_page_href(url=''):
	# Recursively follow the pagination links, collecting every list-page URL into a_index
	new_num = 0
	response = get_url(url)
	if response != 'error':
		response.encoding = 'gb2312'
		html_index = response.text
		soup = BeautifulSoup(html_index, "html.parser")
		a_index_a_all = soup.find("div", class_="NewPages").find('ul').find_all('a', target='_self')
		for a_index_a in a_index_a_all:
			a_index[a_index_a.string] = (url_index + a_index_a['href'])
			if str(a_index_a.string).isdigit() and int(a_index_a.string) > int(new_num):
				new_num = a_index_a.string
		print('Progress: %.2f%%' % (int(new_num) * 100 / int(index['end'])))
		if int(new_num) >= int(index['end']):
			return
		get_page_href(a_index[new_num])
	else:
		# Request failed: retry the same page
		print('page error')
		get_page_href(url)
	
	
def get_father_img(url_index_child):
	# Return all gallery links (<a class="MMPic">) found on one list page
	a_index_a_all = ''
	response = get_url(url_index_child)
	if response != 'error':
		response.encoding = 'gb2312'
		html_index = response.text
		soup = BeautifulSoup(html_index, "html.parser")
		a_index_a_all = soup.find('div', class_='MeinvTuPianBox').find('ul').find_all('a', class_='MMPic')
	return a_index_a_all


def download_children_img(url, title):
	# Collect every page of one gallery, then download each image into its own folder
	num = 0
	global child_img_href
	max_index = '0'
	child_img_href = {'1': url}
	# print(child_img_href)
	get_child_href(url, max_index, title)
	print('%d images, downloading...\n' % len(child_img_href))
	for key, val in child_img_href.items():
		try:
			response = get_url(val)
			if response != 'error':
				response.encoding = 'gb2312'
				html_index = response.text
				soup = BeautifulSoup(html_index, "html.parser")
				href = str(soup.find('div', class_='articleV4Body').find('img')['src'])
				# print(href)
				is_folder(r'\woman\\' + title)
				download_img(href, str(num), title + '\\')
				num += 1
		except Exception:
			print('image download failed')
	
def get_child_href(url_index_child, max_index, file_name=''):
	# Recursively collect the URLs of every page inside one gallery into child_img_href
	num = '0'
	response = get_url(url_index_child)
	if response != 'error':
		if file_name != '':
			is_folder(r'\woman\\' + file_name)
		response.encoding = 'gb2312'
		html_index = response.text
		# print(html_index)
		soup = BeautifulSoup(html_index, "html.parser")
		# Total number of pages in this gallery
		max_index = soup.find('div', class_='page-tag oh').find('ul').find('li', class_='hide')['pageinfo']
		a_index_a_first = soup.find("div", class_="page-tag oh").find('ul').find('li', class_='thisclass')
		for sibling in a_index_a_first.next_siblings:
			if str(sibling.string).isdigit():
				if int(sibling.string) > int(num):
					num = int(sibling.string)
				child_img_href[str(sibling.string)] = '/'.join(url_index_child.split('/')[:-1]) + '/' + sibling.find('a')['href']
		# print(num)
		if int(num) >= int(max_index):
			return
	else:
		# Request failed: fall back to the next stored page and keep going
		num = str(int(num) + 1)
	# print(num)
	get_child_href(child_img_href[str(num)], max_index)
	
	
def download_url_all():
	# Walk every list page in a_index and collect all gallery links into img_href
	page_num = 1
	zz = 0
	# a_index = {'1': 'http://www.27270.com/ent/meinvtupian/list_11_1.html', 2: 'http://www.27270.com/ent/meinvtupian/list_11_2.html'}
	for key, value in a_index.items():
		img_index = []
		a_index_a_all = get_father_img(value)
		print('%d / %d' % (page_num, len(a_index)))
		# print('Round ' + str(page_num) + ' of collection is about to start')
		for a_index_a in a_index_a_all:
			# print(a_index_a)
			img_href.append(a_index_a)
			# download_children_img(a_index_a['href'], a_index_a['title'])
			# print(a_index_a)
			# logger.warning('gallery set %d: %s %s' % (zz, a_index_a['href'], a_index_a['title']))
			# download_img(a_index_a['href'], str(zz))
			zz += 1
		# print('Please wait for the next round\n\n')
		page_num += 1
		# The collected links are split between two worker processes in __main__ below
	print('split point: %d' % int(len(img_href) / 2))

def func(all_href):
	# Worker: download every gallery in the given slice of links
	for a_index_a in all_href:
		# print(a_index_a)
		download_children_img(a_index_a['href'], a_index_a['title'])

if __name__ == '__main__':

	get_start_end(url_index)
	get_page_href(url_index)
	# Drop the non-numeric pagination buttons (home / last / prev / next)
	del a_index['首页']
	del a_index['末页']
	del a_index['上一页']
	del a_index['下一页']
	# for key, value in a_index.items():
	# 	logger.warning('pagination button: %s  :  %s' % (key, value))

	is_folder(r'\woman')
	download_url_all()
	# print(len(img_href))
	# Split the collected gallery links evenly across two worker processes
	img_href_first = img_href[:int(len(img_href) / 2)]
	img_href_second = img_href[int(len(img_href) / 2):]
	p1 = multiprocessing.Process(target=func, args=(img_href_first,))
	p2 = multiprocessing.Process(target=func, args=(img_href_second,))
	p1.start()
	p2.start()
	p1.join()
	p2.join()
	input('end')
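
To run the script, save it under any name (for example crawler.py; the original post does not name the file) and launch it with Python 3 on Windows, since the paths are built with backslashes. It creates a woman folder under the current working directory with one subfolder per gallery, and failed requests are logged to test.log. The input('end') at the bottom simply keeps the console window open after both worker processes have finished.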

