Python: downloading the page content returned by a GET/POST request to a given URL, and handling gzip garbled output. When Accept-Encoding is set to gzip,deflate, the returned page comes back garbled.
1. Script
# -*- coding: utf-8 -*-
import urllib
import urllib2
import ssl

def getpicyanzhengma():  # originally fetched the latest CAPTCHA from the server and saved it as pic.png; here it downloads a forum page
    urlget = "https://xianzhi.aliyun.com/forum/topic/1805/"
    #ctl = {"ctl":"code"}
    #ctldata = urllib.urlencode(ctl)
    #reqget = urllib2.Request(urlget+'?'+ctldata)  # build the GET request with query parameters
    reqget = urllib2.Request(urlget)  # build the GET request
    # add the GET request headers
    reqget.add_header("Host","xianzhi.aliyun.com")
    reqget.add_header("Cache-Control","max-age=0")
    reqget.add_header("Upgrade-Insecure-Requests","1")
    reqget.add_header("User-Agent","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36")
    reqget.add_header("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
    reqget.add_header("Accept-Language","zh-CN,zh;q=0.8")
    reqget.add_header("Accept-Encoding","gzip, deflate, sdch, br")
    reqget.add_header("Cookie","cnz=X6ejEtcUBVMCAfJ77XgdkdPH; cna=YKejEpKOFU0CAXjte/LuiuWB; UM_distinctid=16000343ca4183-0e8093cc5e7b3-64191279-15f900-16000343ca575a; _uab_collina=151183659981086744617448; _ga=GA1.2.668866163.1511831906; aliyun_country=CN; aliyun_site=CN; isg=ApmZtNphJydPxfuAkp4Fb9c1qIWzjqX8QOIT1rtOAUA_wrlUA3adqAfSsrFO; _umdata=ED82BDCEC1AA6EB94F984760A4C6465E6DD138CC3777AF0CB131A783FCB0E006227E021A199C6A8DCD43AD3E795C914C3303D9E6CB380052D470743247B79D15; acw_tc=AQAAAJMuFXttQgkA8nvteBqARscCdcug; csrftoken=CkpJbhBYBvg6oTBvrwTrsrYcsF1SJXC4mdv0A0k1BmX6mDFT0K2izVlfJkaZI4zx; CNZZDATA1260716569=1195371503-1511830276-https%253A%252F%252Fwww.baidu.com%252F%7C1515457887")
    reqget.add_header("Connection","keep-alive")
    # proxy through the local machine to capture and inspect the raw packets
    #proxy_handler = urllib2.ProxyHandler({'http': '192.168.40.36:4455'})
    #opener = urllib2.build_opener(proxy_handler)
    #urllib2.install_opener(opener)
    context = ssl._create_unverified_context()  # enable SSL; remove this line for plain HTTP
    resget = urllib2.urlopen(reqget,context=context)  # open the request with the SSL context; for plain HTTP drop the context argument
    resgetdata = resget.read()
    print resgetdata
    # save the GET response body to a local file (the original CAPTCHA version saved pic.png)
    f = open("e:/pic/downloadxianzhi.html","wb")
    f.write(resgetdata)
    f.close()

getpicyanzhengma()
2. Running the script produces garbled output
��<鵶壑�?�3�? �4挧OQW$礩'郐蚽移涧懔�(�%+mf鐲謒蓫,_�!踨':\'柆��%@颛�� 奛獫dv9嘟飣鲄 x踅脀櫖憮N翎F鏀窿R"�饽�贱r揉!薸:2�##胿�z螑 榗妍+迩嫣N_�;釞琾9��.hR迱T%�猙 鄖鐍�7C氹撴鬲5U礀6瑭菮糰 嶄U蛨�3翦�慏#�/I@,鵴JR$C鈊V8�'ⅠS98�+浼G阁uG :胰O巼#婈�.K��!�?" 槩瑔2龖XF� 箻np�$酿橷�茻Qx�0苃P梤� 姖g蒐洸譟杫1�1*#漚Yz个FZ匴UC74.偄偖G(^T!肶崇\ L$J啰Esb噘縭⒒@Sx擣�7b� ��%醜pa觵@€湿��肏摴褟餚�楚i斀*尲\�4OFy鮸燔_ H�:�=b|e�?�)3Ja礌挘ガ嗶吉枰0jΠ甎麵�0瞾橑辝��<�{�&尞 龖琣鋥c1AQ�&VPs6輑"欻DSd眘€p_孨u颫Hヌ�搒謡w�<�⒊淕瓜q�=鴫>�;�'M�籵泪D� �憅ZU�$撮L靠h溳 絬窭^)6錮I聖]�)
3. Cause of the garbled response and how to handle it
Reference: https://zhidao.baidu.com/question/245594784767944644.html
https://www.crifan.com/set_accept_encoding_header_to_gzip_deflate_return_messy_code/
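The cause: because the request carries Accept-Encoding: gzip, deflate, sdch, br, the server compresses the response body, and the script then prints and saves those raw gzip bytes without decompressing them. You can confirm this inside getpicyanzhengma(), right after resgetdata = resget.read() (a two-line sketch that simply reuses resget and resgetdata from the script in section 1):

    print resget.info().get("Content-Encoding")  # expected to print: gzip
    print resgetdata[:2] == "\x1f\x8b"           # gzip magic number, expected: True

The simplest fix is to stop asking for a compressed response at all.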
Comment out this header field:
#reqget.add_header("Accept-Encoding","gzip, deflate, sdch, br")
Result of running the script again:
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>先知社区</title>
<meta name="description" content="先知社区,先知安全技术社区">
<meta name="viewport"
content="width=device-width,initial-scale=1.0,minimum-scale=1.0,maximum-scale=1.0,user-scalable=no">
<link rel="icon" href="/forum/static/icon/favicon.ico" type="image/x-icon">
....................
4. How to fix the garbled HTML response:
To get the correct page content instead of garbled bytes, there are two approaches (the snippets below are C#/.NET HttpWebRequest examples):
1. Do not set the Accept-Encoding header
//req.Headers.Add("Accept-Encoding", "gzip,deflate");
2. Set the Accept-Encoding header, and also enable the matching automatic decompression
req.Headers["Accept-Encoding"] = "gzip,deflate";
req.AutomaticDecompression = DecompressionMethods.GZip;
Which approach to use depends on your needs; a Python sketch of approach 2 follows below.
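Since the script in section 1 uses Python 2's urllib2 rather than C#, here is a minimal sketch of approach 2 in that style, assuming the same xianzhi.aliyun.com URL as above: keep the Accept-Encoding header, then gunzip the body yourself with the standard gzip module when the server answers with Content-Encoding: gzip.

# -*- coding: utf-8 -*-
# Sketch of approach 2 (assumptions: Python 2, same URL as section 1):
# keep Accept-Encoding and decompress the gzip body manually.
import gzip
import ssl
import urllib2
from StringIO import StringIO

url = "https://xianzhi.aliyun.com/forum/topic/1805/"
req = urllib2.Request(url)
req.add_header("Accept-Encoding", "gzip, deflate")
context = ssl._create_unverified_context()
res = urllib2.urlopen(req, context=context)
body = res.read()

# only decompress when the server actually compressed the body
if res.info().get("Content-Encoding") == "gzip":
    body = gzip.GzipFile(fileobj=StringIO(body)).read()

print body

A deflate-encoded response can be handled the same way with the standard zlib module.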