#!/usr/bin/env python
# -*- coding: utf-8 -*-
from urllib import request
from lxml import etree
class Spider:
    """Interactive scraper that downloads images from doutula.com listings.

    Starting at ``page``, each listing page is fetched, every image URL found
    in a ``data-original`` attribute of an ``<img>`` directly inside an
    ``<a>`` is downloaded into the current working directory, and the user is
    then prompted before moving on to the next page.
    """

    # Listing-page URL; the page number is appended to it.
    LIST_URL = "http://www.doutula.com/photo/list/?page="

    # Browser-like User-Agent header sent with every request.
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"}

    # Number of trailing characters removed from each scraped image URL
    # before it is requested.
    # NOTE(review): presumably the site appends a 4-character style suffix
    # (something like "!dta") to the real image URL -- confirm.
    URL_SUFFIX_LEN = 4

    # Maximum number of trailing characters of the URL's last path segment
    # used as the local file name. Must be positive.
    FILE_NAME_LEN = 10

    def __init__(self, page):
        """Create a spider that starts at listing page number ``page``."""
        self.page = page

    def get_html(self, url):
        """Fetch ``url`` and return the raw response body as ``bytes``.

        Errors raised by ``urllib.request.urlopen`` propagate to the caller.
        """
        req = request.Request(url, headers=self.HEADERS)
        # The context manager closes the response (and its socket) even if
        # reading fails part-way through.
        with request.urlopen(req) as response:
            return response.read()

    def get_image_list(self, html):
        """Return every ``//a/img/@data-original`` value found in ``html``."""
        html_obj = etree.HTML(html)  # parse the markup into an element tree
        return html_obj.xpath('//a/img/@data-original')

    @classmethod
    def _split_image_url(cls, img_url):
        """Return ``(download_url, file_name)`` for one scraped image URL.

        ``download_url`` is ``img_url`` without its last ``URL_SUFFIX_LEN``
        characters. ``file_name`` is the last ``FILE_NAME_LEN`` characters of
        the final path segment of ``download_url``; using only the final
        segment guarantees the name never contains a ``/``.
        """
        keep = max(len(img_url) - cls.URL_SUFFIX_LEN, 0)
        download_url = img_url[:keep]
        last_segment = download_url.rsplit("/", 1)[-1]
        return download_url, last_segment[-cls.FILE_NAME_LEN:]

    def save_iamge(self, img_data):
        """Download each URL in ``img_data`` into the working directory.

        URLs that yield an empty file name (for example an empty attribute
        value) are skipped because there is nothing sensible to request or
        write. Download and file-system errors propagate to the caller.
        """
        for img_url in img_data:
            download_url, file_name = self._split_image_url(img_url)
            if not file_name:
                continue
            img_bytes = self.get_html(download_url)
            with open(file_name, "wb") as f:
                f.write(img_bytes)

    def start(self):
        """Scrape page after page until the user types ``quit`` at the prompt."""
        print("正在准备开始采集...")
        while True:
            html = self.get_html(self.LIST_URL + str(self.page))
            self.save_iamge(self.get_image_list(html))
            print("***第%d页处理完毕..***" % self.page)
            if input("按回车键继续采集下一页,quit退出") == "quit":
                break
            self.page += 1
if __name__ == "__main__":
    # Script entry point: begin scraping from the first listing page.
    Spider(page=1).start()