Ran it overnight on a slow connection, so it only crawled a few tens of thousands of images.
Added retrying of failed fetches, set to eight attempts (a quick sketch of the retry idea is shown below, before the full script).
Wrote some logging as well, but after some thought commented most of it out.
The code still carries a lot of leftover tinkering;
feel free to take the program and clean it up if you like.
The site being scraped is: http://www.27270.com/
The Python version used is Python 3.5.2.
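In the script the eight-attempt retry is done recursively inside download_img; purely as an illustration (the function name and URL below are placeholders, not part of the script), the same idea written as a plain loop looks like this:

import time
import requests

def fetch_with_retry(url, attempts=8):
    # Try the request up to `attempts` times, pausing briefly between tries
    for _ in range(attempts):
        try:
            return requests.get(url, timeout=30)
        except requests.RequestException:
            time.sleep(2)
    return None  # every attempt failed

The full script follows.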
# -*- coding:utf-8 -*-
import os
import sys
import time
import random
import logging
import requests
import multiprocessing
from multiprocessing import Pool
from bs4 import BeautifulSoup
img_href = []                     # <a> tags, one per image set found on the listing pages
a_index = {}                      # pagination buttons: page label -> URL
flag = 'true'                     # unused; also shadowed by the flag class below
html_index = ''
error_num = []                    # leftovers from earlier revisions, not used below
error_href = []
error_path = []
index = {'start': '', 'end': ''}  # first and last listing-page numbers
url_index = 'http://www.27270.com/ent/meinvtupian/'
sys.setrecursionlimit(1000000)    # paging and retries below are recursive
# Get a logger instance; with an empty name this would return the root logger
logger = logging.getLogger("AppName")
# Output format for the log records
formatter = logging.Formatter('%(asctime)s %(levelname)-8s: %(message)s')
# Log to a file
file_handler = logging.FileHandler("test.log")
file_handler.setFormatter(formatter)  # setFormatter sets the output format
# Attach the handler to the logger
logger.addHandler(file_handler)
# Minimum level that gets logged; the default is WARNING
logger.setLevel(logging.INFO)
class flag(object):
    # Never instantiated below; it also shadows the module-level flag variable
    def __init__(self):
        self.f = True
    def get_f(self):
        return self.f
    def set_f(self):
        self.f = False
def is_folder(file_name=''):
    # Create the image folder if it does not already exist
    cwd = os.getcwd() + file_name
    if not os.path.exists(cwd):
        os.mkdir(cwd)
        print('Created image folder %s' % file_name)
    else:
        # print("Image folder already exists")
        pass
def get_url(url='', host=''):
    # Fetch a URL and return the response, or the string 'error' if the request fails
    response = ''
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36',
              'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
              'Accept-Encoding': 'gzip, deflate',
              'Accept-Language': 'zh-CN,zh;q=0.9',
              'Cache-Control': 'max-age=0',
              'Connection': 'keep-alive',
              'Upgrade-Insecure-Requests': '1'
              }
    # Cookies copied from the browser; not currently passed to requests.get
    cookie = {
        'Cookie': 'Hm_lvt_63a864f136a45557b3e0cbce07b7e572=1519296125,1519296217,1519306647,1519309454; Hm_lpvt_63a864f136a45557b3e0cbce07b7e572=1519310130'}
    # Optional proxy; only used if you pass proxies=proxies to requests.get below
    proxies = {
        "http": "http://" + '61.155.164.106:3128',
        "https": "http://" + '61.155.164.106:3128',
    }
if host != '':
header['Host'] = host
'''
try:
print(header['Host'])
except Exception:
print('none host')
'''
try:
response = requests.get(url, headers=header, timeout=30)
except Exception:
response = 'error'
logger.error('%s \t\t get error' % url)
finally:
# print(url)
if host != '':
del header['Host']
time.sleep(random.randint(1, 4))
return response
def download_img(url='http://t1.27270.com/uploads/tu/201802/726/e6e5afe62c.jpg', name='', the_path='', num=8):
    # Download a single image, retrying up to `num` more times if the request fails
    response = get_url(url, host='t2.hddhhn.com')
    if response != 'error':
        cwd = os.getcwd() + r'\woman'
        file_name = name + '.' + url.split('/')[-1].split('.')[-1]
        logger.warning('%s \t\t download...' % url)
with open(cwd + '\\' + the_path + file_name, 'wb') as f:
file_data = response.content
f.write(file_data)
else:
if num > 0:
return download_img(url, name=name, the_path=the_path, num=num - 1)
print('download error')
return
def get_index(url_index):
    # Fetch the listing index page and return its HTML
response = get_url(url_index)
response.encoding = 'gb2312'
return response.text
def get_start_end(url=''):
    # Work out the first and last listing-page numbers from the pagination bar
    response = get_url(url)
response.encoding = 'gb2312'
html_index = response.text
soup = BeautifulSoup(html_index, "html.parser")
a_index_a_all = soup.find("div", class_="NewPages").find('ul').find_all('a', target='_self')
for a_index_a in a_index_a_all:
a_index[a_index_a.string] = (url_index + a_index_a['href'])
html_index = get_index(url_index)
soup = BeautifulSoup(html_index, "html.parser")
index['start'] = soup.find("div", class_="NewPages").find('li', class_='thisclass').find('a').string
    response = get_url(a_index['末页'])  # '末页' is the site's "last page" button
response.encoding = 'gb2312'
html_index = response.text
soup = BeautifulSoup(html_index, "html.parser")
index['end'] = soup.find("div", class_="NewPages").find('li', class_='thisclass').find('a').string
def get_page_href(url=''):
    # Walk the pagination buttons and collect the URL of every listing page
    new_num = 0
    response = get_url(url)
    if response != 'error':
        response.encoding = 'gb2312'
        html_index = response.text
        soup = BeautifulSoup(html_index, "html.parser")
        a_index_a_all = soup.find("div", class_="NewPages").find('ul').find_all('a', target='_self')
        for a_index_a in a_index_a_all:
            a_index[a_index_a.string] = (url_index + a_index_a['href'])
            if str(a_index_a.string).isdigit() and int(a_index_a.string) > int(new_num):
                new_num = a_index_a.string
        print('progress: %.2f%%' % (int(new_num) * 100 / int(index['end'])))
        if int(new_num) >= int(index['end']):
            return
    else:
        # Retry the same listing page if the request failed
        print('page error')
        get_page_href(url)
        return
    get_page_href(a_index[new_num])
def get_father_img(url_index_child):
    # Return the <a class="MMPic"> tags (one per image set) on a listing page
    a_index_a_all = ''
response = get_url(url_index_child)
if response != 'error':
response.encoding = 'gb2312'
html_index = response.text
soup = BeautifulSoup(html_index, "html.parser")
a_index_a_all = soup.find('div', class_='MeinvTuPianBox').find('ul').find_all('a', class_='MMPic')
return a_index_a_all
def download_children_img(url, title):
    # Download every image in one image set
    num = 0
global child_img_href
max_index = '0'
child_img_href = {'1' : url}
# print(child_img_href)
get_child_href(url, max_index, title)
    print('%d images in this set, downloading\n' % len(child_img_href))
for key, val in child_img_href.items():
try:
response = get_url(val)
if response != 'error':
response.encoding = 'gb2312'
html_index = response.text
soup = BeautifulSoup(html_index, "html.parser")
href = str(soup.find('div', class_='articleV4Body').find('img')['src'])
# print(href)
is_folder(r'\woman\\' + title)
download_img(href, str(num), title+'\\')
num += 1
except Exception:
            print('image download failed')
def get_child_href(url_index_child, max_index, file_name=''):
    # Collect the URL of every page inside one image set (a set is spread across several pages)
    num = '0'
response = get_url(url_index_child)
if response != 'error':
if file_name != '':
is_folder(r'\woman\\' + file_name)
response.encoding = 'gb2312'
html_index = response.text
# print(html_index)
soup = BeautifulSoup(html_index, "html.parser")
max_index = soup.find('div', class_='page-tag oh').find('ul').find('li', class_='hide')['pageinfo']
a_index_a_first = soup.find("div", class_="page-tag oh").find('ul').find('li', class_='thisclass')
for sibling in a_index_a_first.next_siblings:
if str(sibling.string).isdigit():
if int(sibling.string) > int(num):
num = int(sibling.string)
child_img_href[str(sibling.string)] = '/'.join(url_index_child.split('/')[:-1]) + '/' + sibling.find('a')['href']
# print(num)
if int(num) >= int(max_index):
return
else:
num = ''+str(int(num)+1)
# print(num)
get_child_href(child_img_href[str(num)], max_index)
def download_url_all():
    # Walk every listing page and collect the image-set links into img_href
    index = 1
zz = 0
# a_index = {'1': 'http://www.27270.com/ent/meinvtupian/list_11_1.html', 2: 'http://www.27270.com/ent/meinvtupian/list_11_2.html'}
for key, value in a_index.items():
img_index = []
a_index_a_all = get_father_img(value)
print('%d / %s' % (index, len(a_index)))
        # print('Round ' + str(index) + ' of downloads is about to start')
for a_index_a in a_index_a_all:
# print(a_index_a)
img_href.append(a_index_a)
# download_children_img(a_index_a['href'], a_index_a['title'])
# print(a_index_a)
            # logger.warning('image set %d : %s %s' % (zz, a_index_a['href'], a_index_a['title']))
# download_img(a_index_a['href'], str(zz))
zz += 1
        # print('Waiting for the next round of downloads\n\n')
index += 1
    # The collected links are split between two worker processes (see __main__ below)
    print('midpoint: %d' % int(len(img_href) / 2))
def func(all_href):
    # Worker: download each image set from a list of <a> tags
    for a_index_a in all_href:
# print(a_index_a)
download_children_img(a_index_a['href'], a_index_a['title'])
if __name__ == '__main__':
get_start_end(url_index)
get_page_href(url_index)
    # Drop the non-numeric pagination buttons: 首页 (home), 末页 (last), 上一页 (previous), 下一页 (next)
    del a_index['首页']
    del a_index['末页']
    del a_index['上一页']
    del a_index['下一页']
# for key, value in a_index.items():
    #     logger.warning('pagination button %s : %s' % (key, value))
is_folder(r'\woman')
download_url_all()
# print(len(img_href))
    # Split the collected links in half, one chunk per worker process
    img_href_first = img_href[:int(len(img_href) / 2)]
    img_href_second = img_href[int(len(img_href) / 2):]
p1 = multiprocessing.Process(target=func, args=(img_href_first,))
p2 = multiprocessing.Process(target=func, args=(img_href_second,))
p1.start()
p2.start()
p1.join()
p2.join()
input('end')
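The script imports Pool but ends up splitting img_href by hand and starting two Process objects. If you prefer the process-pool approach, a rough sketch using the same func and img_href from the script (untested against the site) could look like this:

if __name__ == '__main__':
    # Two pool workers, each given half of the collected image-set links
    half = int(len(img_href) / 2)
    chunks = [img_href[:half], img_href[half:]]
    with Pool(2) as pool:
        pool.map(func, chunks)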