#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author: OFZFZS
# Datetime:2018/3/23 11:00
# Description: 百度图片爬取 这里只做了简单处理,注意百度图片返回的数据是ajax数据,
# 每次返回的是30条,但是我只看到普通图片,高清的图片地址好像加密了,所以我这里只取三十张,没有用Ajax.
import re
import urllib
from urllib import request
from urllib import parse
import os
class Spider:
    """Minimal Baidu image-search scraper.

    ``loadPage`` downloads the search result page for a keyword and extracts
    the ``"middleURL":"..."`` and ``"objURL":"..."`` values from the response
    text.  ``writeToFile`` then downloads every ``middleURL`` into the local
    output directory as ``1.jpg``, ``2.jpg``, ...

    Attributes:
        name: Search keyword, sent as the ``word`` query parameter.
        item_list_middle: ``middleURL`` strings found by ``loadPage``.
        item_list_obj: ``objURL`` strings found by ``loadPage`` (collected
            but not downloaded by this class).
        i: Number used to name the next saved image; starts at 1.
    """

    SEARCH_URL = "https://image.baidu.com/search/index?tn=baiduimage&"
    USER_AGENT = (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"
    )
    OUTPUT_DIR = "爬取图片"

    # Compiled once at class creation instead of on every loadPage() call.
    # Non-greedy capture of everything between the opening quote and the
    # first following `",` sequence.
    _MIDDLE_URL_RE = re.compile(r'"middleURL":"(.*?)",', re.S)
    _OBJ_URL_RE = re.compile(r'"objURL":"(.*?)",', re.S)

    def __init__(self, name):
        """Create a scraper for the search keyword ``name``."""
        self.name = name
        self.item_list_middle = []
        self.item_list_obj = []
        self.i = 1  # file-name counter for saved images

    def loadPage(self):
        """Fetch the result page for ``self.name`` and collect image URLs.

        Replaces ``item_list_middle`` and ``item_list_obj`` with the matches
        found in the downloaded page.

        Raises:
            urllib.error.URLError: If the request fails.
            UnicodeDecodeError: If the response body is not valid UTF-8.
        """
        # urlencode percent-encodes the keyword and yields "word=<encoded>".
        query = parse.urlencode({"word": self.name})
        req = request.Request(
            self.SEARCH_URL + query,
            headers={"User-Agent": self.USER_AGENT},
        )
        # The context manager closes the connection even if read/decode fails.
        with request.urlopen(req) as response:
            html = response.read().decode("utf-8")

        self.item_list_middle = self._MIDDLE_URL_RE.findall(html)
        self.item_list_obj = self._OBJ_URL_RE.findall(html)

    def writeToFile(self):
        """Download every URL in ``item_list_middle`` into ``OUTPUT_DIR``.

        Files are named ``<i>.jpg`` using the running counter ``self.i``.
        A URL that cannot be downloaded is reported and skipped so one bad
        link does not abort the rest; the counter only advances on success,
        which keeps the saved file names contiguous.
        """
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(self.OUTPUT_DIR, exist_ok=True)
        for item in self.item_list_middle:
            print(item)
            target = "%s/%d.jpg" % (self.OUTPUT_DIR, self.i)
            try:
                request.urlretrieve(item, target)
            except (OSError, ValueError) as err:
                # OSError covers URLError/HTTPError and local write errors;
                # ValueError is raised for strings that are not valid URLs.
                print("下载失败: %s (%s)" % (item, err))
                continue
            self.i += 1
        print("处理成功..")
if __name__ == "__main__":
    # Script entry point: build a scraper for the fixed search keyword, then
    # run its two steps in order (fetch the result page, save the images).
    spider = Spider("美女")
    for step in (spider.loadPage, spider.writeToFile):
        step()