python获取cookies

在用python 写爬虫的时候,经常需要先获取cookies,然后才能开始其他的一系列爬取操作。这里整理一下网上的一些资料。在这里我们以豆瓣网 https://accounts.douban.com/passport/login为例。

方法1:python3+requests库获取:

import requests
from requests.cookies import RequestsCookieJar

# Headers that make the request look like a desktop Chrome browser.
headers = {
    'Host': 'accounts.douban.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive'
}
request_url = "https://accounts.douban.com/passport/login"
res = requests.get(request_url, headers=headers)

status_code = res.status_code
res_header = res.headers
res_cookies = res.cookies
# Two equivalent ways to flatten the cookie jar into a plain dict.
cookie1111 = res.cookies.get_dict()
cookie2222 = requests.utils.dict_from_cookiejar(res_cookies)
# Iterate the jar directly to get Cookie objects with .name / .value.
for ck in res_cookies:
    print(ck.name + "\t" + ck.value)

print("响应状态码:", status_code)
print("响应请求请求头:", res_header)
print("响应cookies:", res_cookies)
print("格式化cookie1111 :", cookie1111)
print("格式化cookie2222 :", cookie2222)

输出结果:

响应状态码: 200
响应请求请求头: {'Date': 'Wed, 22 Jul 2020 07:41:04 GMT', 'Content-Type': 'text/html; charset=utf-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Keep-Alive': 'timeout=30', 'Vary': 'Accept-Encoding, Accept-Encoding', 'X-Xss-Protection': '1; mode=block', 'X-Douban-Mobileapp': '0', 'Expires': 'Sun, 1 Jan 2006 01:00:00 GMT', 'Pragma': 'no-cache', 'Cache-Control': 'must-revalidate, no-cache, private', 'X-Frame-Options': 'SAMEORIGIN', 'Set-Cookie': 'bid=tGBxkXjx3OI; Expires=Thu, 22-Jul-21 07:41:04 GMT; Domain=.douban.com; Path=/', 'X-DOUBAN-NEWBID': 'tGBxkXjx3OI', 'X-DAE-App': 'accounts', 'X-DAE-Instance': 'default', 'Server': 'dae', 'Strict-Transport-Security': 'max-age=15552000', 'X-Content-Type-Options': 'nosniff', 'Content-Encoding': 'br'}
响应cookies: <RequestsCookieJar[<Cookie bid=tGBxkXjx3OI for .douban.com/>]>
格式化cookie1111 : {'bid': 'tGBxkXjx3OI'}
格式化cookie2222 : {'bid': 'tGBxkXjx3OI'}
bid	tGBxkXjx3OI

方法2:解析 response headers 中的 Set-Cookie 字段

# Method 2: parse the raw Set-Cookie response header by hand.
set_cookie = res.headers['Set-Cookie']
print("set_cookie:", set_cookie)

# Use partition('=') instead of split('=')[1]:
#  - valueless flags such as "Secure" / "HttpOnly" would raise IndexError
#    with split('=')[1];
#  - values that themselves contain '=' (e.g. base64 padding) would be
#    truncated. partition keeps everything after the first '='.
# strip() removes the leading space each attribute carries after "; ".
for cookie in set_cookie.split(";"):
    key, _, value = cookie.strip().partition('=')
    print(key, value)

运行结果:

set_cookie: bid=sH5N2LFyPog; Expires=Thu, 22-Jul-21 07:55:11 GMT; Domain=.douban.com; Path=/
bid sH5N2LFyPog
Expires Thu, 22-Jul-21 07:55:11 GMT
Domain .douban.com
Path /

如果要设置cookies,可以这样做:

import requests
from requests.cookies import RequestsCookieJar

cookieJar1 = RequestsCookieJar()
cookieJar2 = RequestsCookieJar()
print(cookieJar1)
# Copy the cookies from the earlier response into a fresh jar.
for cookie in res.cookies:
    cookieJar1.set(cookie.name, cookie.value)
print("cookieJar1:", cookieJar1)

# Attach the jar to the next request.
# Bug fixes vs. the original snippet:
#  - `url` was never defined; reuse request_url from method 1;
#  - headers must be a keyword argument -- `requests.get(url, headers)`
#    silently binds headers to the `params` parameter instead.
res = requests.get(request_url, headers=headers, cookies=cookieJar1)
# The raw Set-Cookie header carries the complete attribute list.
# partition('=') tolerates valueless flags and '=' inside values;
# strip() drops the leading space after each "; " separator.
for cookie in res.headers['Set-Cookie'].split(";"):
    key, _, value = cookie.strip().partition('=')
    cookieJar2.set(key, value)
print("cookieJar2:", cookieJar2)

运行结果:

<RequestsCookieJar[]>
cookieJar1: <RequestsCookieJar[<Cookie bid=sYSpXToGQ1o for />]>
cookieJar2: <RequestsCookieJar[<Cookie  Domain=.douban.com for />, <Cookie  Expires=Thu, 22-Jul-21 08:01:01 GMT for />, <Cookie  Path=/ for />, <Cookie bid=sYSpXToGQ1o for />]>

方法3:利用selenium 模拟驱动的方式

from selenium import webdriver

# NOTE(review): PhantomJS support was removed from recent Selenium releases;
# a headless Chrome/Firefox driver is the modern replacement.
driver = webdriver.PhantomJS()
url = "https://et.xiamenair.com/xiamenair/book/findFlights.action?lang=zh&tripType=0&queryFlightInfo=XMN,PEK,2018-01-15"
driver.get(url)
# get_cookies() returns a list of dicts, one per cookie.
cookie_list = driver.get_cookies()
# Bug fix: the original used cookie_dict without initialising it (NameError).
# Build the name -> value mapping with a dict comprehension.
cookie_dict = {cookie['name']: cookie['value'] for cookie in cookie_list}

方法4:使用 cookielib(Python 2)或 http.cookiejar(Python 3)

在python2使用

import cookielib
import urllib2
# Target URL whose cookies we want to capture (Python 2 only: cookielib /
# urllib2 were renamed to http.cookiejar / urllib.request in Python 3).
Url = "https://et.xiamenair.com/xiamenair/book/findFlights.action?lang=zh&tripType=0&queryFlightInfo=XMN,PEK,2018-01-15"
# The CookieJar collects every cookie the server sets during the exchange.
cj = cookielib.CookieJar()
# Build an opener whose HTTPCookieProcessor stores received cookies into cj.
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
# Install it globally so plain urllib2.urlopen() calls share the same jar.
urllib2.install_opener(opener)
resp = urllib2.urlopen(Url)
# Print each captured cookie with its index (Python 2 print statement).
for index, cookie in enumerate(cj):
    print '[',index, ']',cookie

python3中使用:

from urllib import request
from http import cookiejar

# Disable SSL certificate verification for every https request in this
# process (the article's workaround for sites with certificate problems).
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

if __name__ == '__main__':
    # The CookieJar accumulates every cookie the server sets.
    cookie = cookiejar.CookieJar()
    # HTTPCookieProcessor is the handler that feeds responses' cookies
    # into the jar and sends them back on subsequent requests.
    cookie_handler = request.HTTPCookieProcessor(cookie)
    # Build an opener around the handler and fetch the page with it.
    opener = request.build_opener(cookie_handler)
    resp = opener.open('http://www.baidu.com')
    # Dump the captured cookies, one Name/Value pair per cookie.
    for ck in cookie:
        print('Name = %s' % ck.name)
        print('Value = %s' % ck.value)

运行结果:

Name = BAIDUID
Value = 94CCDAFB95C81F517566AAE5796693AC:FG=1
Name = BIDUPSID
Value = 94CCDAFB95C81F515B73F5C9A79CE9BC
Name = H_PS_PSSID
Value = 
Name = PSTM
Value = 1595406265
Name = BDSVRTM
Value = 0
Name = BD_HOME
Value = 1

你可能感兴趣的:(【Python爬虫】)