import os
import re
import urllib
import urllib2

# Characters left untouched when escaping a URL: the RFC 3986 reserved
# characters plus "%", so escapes already present are not encoded twice.
_URL_SAFE_CHARS = "!#$%&'()*+,/:;=?@[]~"

# Seconds to wait on the network before giving up instead of hanging forever.
_REQUEST_TIMEOUT = 30


def _escape_url(url):
    """Percent-encode characters that are not legal in a URL (spaces, non-ASCII)."""
    if isinstance(url, unicode):
        # Python 2's urllib.quote only handles byte strings reliably.
        url = url.encode("utf-8")
    return urllib.quote(url, safe=_URL_SAFE_CHARS)


def process_item(self, item, spider):
    """Download the image at ``item['addr']`` and store it under ./down_pic.

    The image is written to
    ``./down_pic/<first "_"-separated part of item['name']>_<item['picCount']>/<item['name']>.jpg``
    and the resulting path is appended as one line to ``./savePath.txt``.

    Args:
        item: mapping providing the keys ``'addr'`` (image URL), ``'name'``
            and ``'picCount'``.
        spider: not used by this function.

    Returns:
        None.

    Raises:
        urllib2.URLError: the request failed (``urllib2.HTTPError`` for an
            HTTP error status).
        OSError / IOError: the target directory or files cannot be written.
    """
    # NOTE(review): the signature looks like a Scrapy item pipeline hook, which
    # would be expected to return the item; the original returns None and that
    # is preserved here -- confirm against the caller.
    headers = {
        # "Host" is deliberately not set: urllib2 derives it from the URL, and
        # a hard-coded value is wrong for images served from any other host.
        # "Accept-Encoding" is deliberately not set either: urllib2 does not
        # decompress responses, so advertising gzip could store a compressed
        # body as the .jpg file.
        "User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:53.0) Gecko/20100101 Firefox/53.0',
        "Accept": 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        "Accept-Language": 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        "Connection": 'keep-alive',
        "Upgrade-Insecure-Requests": "1",
    }

    # An unescaped URL is what made some requests fail with
    # "HTTP Error 400: Bad Request"; urllib2 sends the URL verbatim.
    req = urllib2.Request(url=_escape_url(item['addr']), headers=headers)
    res = urllib2.urlopen(req, timeout=_REQUEST_TIMEOUT)
    try:
        data = res.read()
    finally:
        # urllib2 responses are not context managers on Python 2.
        res.close()

    save_dir = os.path.join(
        os.path.curdir,
        "down_pic",
        item['name'].split("_")[0] + "_" + str(item["picCount"]),
    )
    try:
        # makedirs (unlike mkdir) also creates missing parent directories.
        os.makedirs(save_dir)
    except OSError:
        # Already existing (possibly created concurrently) is fine; anything
        # else is a real error.
        if not os.path.isdir(save_dir):
            raise

    file_name = os.path.join(save_dir, item['name'] + '.jpg')
    with open(file_name, 'wb') as fp:
        fp.write(data)
    with open("./savePath.txt", "a") as fh:
        fh.write(file_name + "\n")
用 urllib2 抓取图片时,部分请求会报错:HTTP Error 400: Bad Request。
参照:https://stackoverflow.com/questions/8840303/urllib2-http-error-400-bad-request?answertab=active#tab-top
原因(引自该回答):"because you aren't escaping the string for a URL"——即 URL 中含有未经转义的字符(例如空格或非 ASCII 字符)时,urllib2 会原样发送该 URL,服务器因此返回 400。
改用 requests 后问题成功解决:requests 在发送请求前会自动对 URL 中未转义的字符进行转义。
import os
import requests

# Seconds to wait on the network; without a timeout requests.get can block
# indefinitely on an unresponsive server.
_REQUEST_TIMEOUT = 30


def process_item(self, item, spider):
    """Download the image at ``item['addr']`` and store it under ./down_pic.

    The image is written to
    ``./down_pic/<first "_"-separated part of item['name']>_<item['picCount']>/<item['name']>.jpg``
    and the resulting path is appended as one line to ``./savePath.txt``.

    Args:
        item: mapping providing the keys ``'addr'`` (image URL), ``'name'``
            and ``'picCount'``.
        spider: not used by this function.

    Returns:
        None.

    Raises:
        requests.RequestException: the request failed, timed out, or the
            server answered with a 4xx/5xx status.
        OSError / IOError: the target directory or files cannot be written.
    """
    # NOTE(review): the signature looks like a Scrapy item pipeline hook, which
    # would be expected to return the item; the original returns None and that
    # is preserved here -- confirm against the caller.
    headers = {
        # "Host" is deliberately not set: requests derives it from the URL, and
        # a hard-coded value is wrong for images served from any other host.
        "User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:53.0) Gecko/20100101 Firefox/53.0',
        "Accept": 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        "Accept-Language": 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        "Accept-Encoding": 'gzip, deflate',
        "Connection": 'keep-alive',
        "Upgrade-Insecure-Requests": "1",
    }

    res = requests.get(item['addr'], headers=headers, timeout=_REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of saving an error page as a .jpg file.
    res.raise_for_status()

    save_dir = os.path.join(
        os.path.curdir,
        "down_pic",
        item['name'].split("_")[0] + "_" + str(item["picCount"]),
    )
    try:
        # makedirs (unlike mkdir) also creates missing parent directories.
        os.makedirs(save_dir)
    except OSError:
        # Already existing (possibly created concurrently) is fine; anything
        # else is a real error.
        if not os.path.isdir(save_dir):
            raise

    file_name = os.path.join(save_dir, item['name'] + '.jpg')
    with open(file_name, 'wb') as fp:
        # res.content is the raw (already decompressed) response body as bytes.
        fp.write(res.content)
    with open("./savePath.txt", "a") as fh:
        fh.write(file_name + "\n")