由于微信的反爬虫策略升级,原有的方法已经失效,上一篇文章已经不可用
上一篇文章: Python2.7下载微信公众号文章的图片
大概是10月份前后的一次升级,微信对于任何爬虫和浏览器,无论对方有没有提供Accept-Encoding字段,都返回gzip压缩数据,但是httplib2对解压gzip数据有天生的缺陷,所以会报异常
File "d:\WingIDEProject\getWeiXinGZHPhoto.py", line 13, in <module>
    resp, content = h.request(url)
File "D:\Python27\Lib\site-packages\httplib2\__init__.py", line 1609, in request
    (response, content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)
File "D:\Python27\Lib\site-packages\httplib2\__init__.py", line 1351, in _request
    (response, content) = self._conn_request(conn, request_uri, method, body, headers)
File "D:\Python27\Lib\site-packages\httplib2\__init__.py", line 1337, in _conn_request
    content = _decompressContent(response, content)
File "D:\Python27\Lib\site-packages\httplib2\__init__.py", line 403, in _decompressContent
    content = zlib.decompress(content)
zlib.error: Error -3 while decompressing data: incorrect header check
网上提供的方法都是要修改模块内代码,这样并不好,万一哪天忘了可能又要重新改,所以笔者这里直接改成urllib2访问,这样至少不用改模块了
本文使用的httplib2和BeautifulSoup4模块不在默认的python安装包里,可以通过pip命令进行安装,pip命令如下
pip install httplib2
pip install BeautifulSoup4
代码如下
# -*- coding: UTF-8 -*- import os import shutil import httplib2 from bs4 import BeautifulSoup import re import binascii import urllib2 import zlib import time h = httplib2.Http() url = 'http://mp.weixin.qq.com/s?__biz=MjM5MDk0OTEyNg==&mid=213324685&idx=1&sn=186ee1433c04b23f6f123566e33e0b79&3rd=MzA3MDU4NTYzMw==&scene=6#rd' request = urllib2.Request(url) request.add_header('Accept-encoding', 'gzip') opener = urllib2.build_opener() response = opener.open(request) html = response.read() gzipped = response.headers.get('Content-Encoding') if gzipped: html = zlib.decompress(html, 16+zlib.MAX_WBITS) #print html content = html # 正则表达式javascript里的获取相关变量 matchnickname = re.search(r'var\s*nickname\s*=\s*[\'\"](?P<nickname>\S*)[\'\"];', content) matchappuin = re.search(r'var\s*appuin\s*=\s*[\'\"](?P<appuin>\S*)[\'\"];',content) matchct = re.search(r'var\s*ct\s*=\s*[\'\"](?P<ct>\S*)[\'\"];', content) matchuser_name = re.search(r'var\s*user_name\s*=\s*[\'\"](?P<user_name>\S*)[\'\"];', content) matchmsg_cdn_url = re.search(r'var\s*msg_cdn_url\s*=\s*[\'\"](?P<msg_cdn_url>\S*)[\'\"];', content) idx = re.search(r'idx=(?P<idx>[0-9]+)',url).group('idx') nickname = matchnickname.group('nickname') appuin = matchappuin.group('appuin') ct = matchct.group('ct') user_name = matchuser_name.group('user_name') msg_cdn_url = matchmsg_cdn_url.group('msg_cdn_url') ctime = time.strftime("%Y%m%d%H%M%S",time.localtime(int(ct))) # int将字符串转成数字,不区分int和long,这里将时间秒数转成日期格式 # 建立文件夹 dir='WeiXinGZH/'+nickname.decode('utf-8').encode('gb2312') + '/' + ctime + '/' + idx + '/' print '文件夹为:'+ dir.decode('gb2312').encode('utf-8') try: os.makedirs(dir) # 建立相应的文件夹 except: shutil.rmtree(dir) # 无论文件夹是否为空都移除该文件夹 os.makedirs(dir) # 下载封面 url = msg_cdn_url print u'正在下载封面:'+url resp, contentface = h.request(url) file_name = dir +'封面'.decode('utf-8').encode('gb2312') + '.jpg' open(file_name, 'wb').write(contentface) # 下载其他图片 soup = BeautifulSoup(content, 'html.parser') count = 0 for link in soup.find_all('img'): if None != 
link.get('data-src'): count = count + 1 orurl = link.get('data-src') url = orurl.split('?')[0] # 重新构造url,原来的url有一部分无法下载 print u'正在下载:'+url resp, content = h.request(url) matchurlvalue = re.search(r'wx_fmt=(?P<wx_fmt>[^&]*)', orurl) # 无参数的可能是gif,也有可能是jpg if None!= matchurlvalue: wx_fmt = matchurlvalue.group('wx_fmt') # 优先通过wx_fmt参数的值判断文件类型 else: wx_fmt = binascii.b2a_hex(content[0:4]) # 读取前4字节转化为16进制字符串 print wx_fmt phototype = {'jpeg': '.jpg', 'gif': '.gif', 'png': '.png', 'jpg': '.jpg', '47494638': '.gif', 'ffd8ffe0': '.jpg', 'ffd8ffe1': '.jpg', 'ffd8ffdb': '.jpg', '89504e47': '.png'} # 方便写文件格式 file_name = dir + 'Picture' + str(count) + phototype[wx_fmt] open(file_name, 'wb').write(content) print u'图片下载完成'