Python3爬虫lxml的应用之爬取斗图啦表情包

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from urllib import request
from lxml import etree


class Spider:
    """Interactive, page-by-page image downloader for doutula.com list pages.

    Starting at ``page``, :meth:`start` fetches one list page at a time,
    collects the ``data-original`` attribute of every ``<img>`` that sits
    directly inside an ``<a>``, and saves each image into the current
    working directory.
    """

    # Browser-like User-Agent sent with every request.
    USER_AGENT = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
    )
    # List-page URL; the page number is appended to it.
    LIST_URL = "http://www.doutula.com/photo/list/?page="
    # Saved files are named after the last N characters of the image's
    # base name (kept from the original implementation).
    FILE_NAME_LENGTH = 10

    def __init__(self, page):
        """Remember the number of the first list page to crawl."""
        self.page = page

    def get_html(self, url, timeout=30):
        """Fetch ``url`` and return the raw response body as ``bytes``.

        Args:
            url: Address to request.
            timeout: Seconds to wait on blocking network operations before
                giving up, so a stalled server cannot hang the crawl.

        Raises:
            urllib.error.URLError: If the request fails (``HTTPError`` for
                HTTP error statuses). Both are ``OSError`` subclasses.
            ValueError: If ``url`` is not a valid absolute URL.
        """
        headers = {"User-Agent": self.USER_AGENT}
        req = request.Request(url, headers=headers)
        # The context manager closes the connection even if read() fails.
        with request.urlopen(req, timeout=timeout) as response:
            return response.read()

    def get_image_list(self, html):
        """Return the ``data-original`` values of all ``<a><img>`` elements.

        Args:
            html: Markup of a list page, as returned by :meth:`get_html`.
        """
        html_obj = etree.HTML(html)  # parse the markup into an element tree
        imgsrc_list = html_obj.xpath('//a/img/@data-original')
        return imgsrc_list

    def _split_image_url(self, img_url):
        """Return ``(download_url, file_name)`` for one scraped image URL.

        A trailing ``!marker`` after the last path segment is dropped from
        the download URL. NOTE(review): the original code unconditionally
        removed the last four characters, presumably a ``!dta``-style
        suffix -- confirm against the live site. URLs without such a marker
        are now left intact instead of losing their extension.
        """
        base, marker, tail = img_url.rpartition("!")
        if marker and "/" not in tail:
            download_url = base
        else:
            download_url = img_url
        # Name the file after the last path segment only, so the result can
        # never contain a "/" and point into a non-existent directory.
        base_name = download_url.rsplit("/", 1)[-1]
        return download_url, base_name[-self.FILE_NAME_LENGTH:]

    def save_iamge(self, img_data):
        """Download every URL in ``img_data`` into the working directory.

        A failure on one image (bad URL, network/HTTP error, unwritable
        file) is reported on stdout and skipped so the remaining images are
        still processed.

        Args:
            img_data: Iterable of image URL strings.
        """
        for img_url in img_data:
            download_url, file_name = self._split_image_url(img_url)
            try:
                img_bytes = self.get_html(download_url)
                with open(file_name, "wb") as f:
                    f.write(img_bytes)
            except (OSError, ValueError) as exc:
                # URLError/HTTPError and timeouts are OSError subclasses;
                # ValueError covers URLs urllib cannot interpret.
                print("下载失败,已跳过: %s (%s)" % (img_url, exc))

    # Correctly spelled alias; ``save_iamge`` is kept for existing callers.
    save_image = save_iamge

    def start(self):
        """Crawl list pages one by one until the user asks to stop.

        After each page the user is prompted: pressing Enter continues with
        the next page, typing ``quit`` (or closing stdin) ends the loop.
        """
        print("正在准备开始采集...")
        while True:
            html = self.get_html(self.LIST_URL + str(self.page))
            image_list = self.get_image_list(html)
            self.save_iamge(image_list)
            print("***第%d页处理完毕..***" % self.page)
            try:
                command = input("按回车键继续采集下一页,quit退出")
            except EOFError:
                # No interactive input available: stop instead of crashing.
                break
            if command.strip() == "quit":
                break
            self.page += 1


if __name__ == "__main__":
    # Script entry point: start an interactive crawl at list page 1.
    crawler = Spider(page=1)
    crawler.start()

你可能感兴趣的:(Python)