Python3爬虫之爬取百度高清图片

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author:  OFZFZS 
# Datetime:2018/3/23 11:00
# Description: 百度图片爬取 这里只做了简单处理,注意百度图片返回的数据是ajax数据,
# 每次返回的是30条,但是我只看到普通图片,高清的图片地址好像加密了,所以我这里只取三十张,没有用Ajax.
import re
import urllib
from urllib import request
from urllib import parse
import os


class Spider:
    """Download Baidu image-search results for a single keyword.

    Typical use is ``loadPage()`` followed by ``writeToFile()``: the first
    fetches one search-result page and extracts image URLs from it, the
    second downloads the extracted "middle" URLs to disk.

    Public attributes:
        name: search keyword, sent as the ``word`` query parameter.
        item_list_middle: URLs captured from ``"middleURL"`` fields.
        item_list_obj: URLs captured from ``"objURL"`` fields. They are
            collected but never downloaded by this class.
        i: number used for the next saved file name (starts at 1).
    """

    # Search endpoint; the url-encoded ``word=...`` pair is appended to it.
    _SEARCH_URL = "https://image.baidu.com/search/index?tn=baiduimage&"

    # Browser-like User-Agent sent with the search request.
    # NOTE(review): presumably needed so the request is treated like a normal
    # browser visit -- not verified here.
    _HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"}

    # Compiled once for the class instead of on every loadPage() call.
    # Each captures the quoted value that follows the given key in the page.
    _MIDDLE_URL_RE = re.compile(r'\"middleURL\"\:\"(.*?)\"\,', re.S)
    _OBJ_URL_RE = re.compile(r'\"objURL\"\:\"(.*?)\"\,', re.S)

    def __init__(self, name):
        """Create a spider for the search keyword ``name``."""
        self.name = name
        self.item_list_middle = []
        self.item_list_obj = []
        self.i = 1  # number used when naming the next saved image

    def loadPage(self):
        """Fetch the search-result page for ``self.name`` and extract URLs.

        Replaces ``item_list_middle`` and ``item_list_obj`` with the values
        found in the page, in order of appearance.

        Raises:
            urllib.error.URLError: propagated from ``urlopen`` when the
                request fails.
        """
        # urlencode() percent-encodes the keyword and builds the "word=..."
        # query pair in one step.
        query = parse.urlencode({"word": self.name})
        req = request.Request(self._SEARCH_URL + query, headers=self._HEADERS)

        # The context manager closes the HTTP response even if read() or
        # decode() raises.
        with request.urlopen(req) as response:
            # Only URLs are extracted from the page, so undecodable bytes
            # elsewhere in it are replaced rather than aborting the run.
            html = response.read().decode("utf-8", errors="replace")

        self._parsePage(html)

    def _parsePage(self, html):
        """Store every middleURL / objURL value found in ``html``."""
        # Standard-resolution image addresses.
        self.item_list_middle = self._MIDDLE_URL_RE.findall(html)
        # Full-resolution image addresses.
        self.item_list_obj = self._OBJ_URL_RE.findall(html)

    def writeToFile(self, directory="爬取图片"):
        """Download every URL in ``item_list_middle`` into ``directory``.

        Files are saved as ``<number>.jpg`` where the number comes from
        ``self.i``; the counter advances only after a successful download,
        so saved files are numbered without gaps. A URL that cannot be
        downloaded is reported on stdout and skipped instead of aborting
        the remaining downloads.

        Args:
            directory: target folder, created (with parents) if missing.

        Raises:
            OSError: if ``directory`` cannot be created.
        """
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(directory, exist_ok=True)

        for item in self.item_list_middle:
            print(item)
            target = os.path.join(directory, "%d.jpg" % self.i)
            try:
                request.urlretrieve(item, target)
            except (OSError, ValueError) as err:
                # URLError/HTTPError are OSError subclasses; ValueError is
                # raised for strings that are not valid URLs.
                # The message reads "download failed", matching the
                # program's other Chinese output.
                print("下载失败: %s (%s)" % (item, err))
                continue
            self.i += 1
        print("处理成功..")


if __name__ == "__main__":
    # Script entry point: build a spider for the hard-coded keyword, fetch
    # its search-result page, then download the image URLs found in it.
    crawler = Spider("美女")
    crawler.loadPage()
    crawler.writeToFile()

你可能感兴趣的:(Python)