大师兄的Python学习笔记(二十五): 爬虫(六)
大师兄的Python学习笔记(二十七): 爬虫(八)
八、使用Cookies模拟登录状态
- 通常,为了爬取数据,我们需要先登录网站。
- 为此,我们可以通过获取登录Cookies,并使用Cookies模拟登录状态。
- 以微博为例,思路如下:
1) 观察页面,了解到需要先通过请求获取预登录数据。
2) 输入密码后,观察登录表单,获取预登录数据的格式。
3) 使用预登录数据登录微博,并获取cookies。
4) 使用session + cookies访问页面,保持登录状态。
>>>import requests
>>>import json
>>>import urllib
>>>import base64
>>>import rsa
>>>import binascii
>>>import re
>>>import time
class Login:
    """Log in to Sina Weibo through its SSO endpoints using a shared session.

    The flow implemented here is:

    1. ``Prelogin`` requests the prelogin endpoint and reads ``pubkey``,
       ``servertime``, ``nonce`` and ``rsakv`` from the JSONP response.
    2. ``RSAEncoder`` encrypts the password with that public key.
    3. ``PostData`` assembles the login form.
    4. ``login`` posts the form and follows the first ``https`` URL found in
       the response, using the same ``requests.Session`` so that cookies set
       along the way are kept on ``self.session``.
    """

    # Matches an optional callback name, the parenthesised payload and an
    # optional trailing semicolon, e.g. ``cb({...});``.
    _JSONP_RE = re.compile(r'^\s*[\w.$]*\s*\((.*)\)\s*;?\s*$', re.DOTALL)

    def __init__(self, username, password):
        """Store the credentials and create the session shared by all requests.

        Args:
            username: Account name used to log in.
            password: Plain-text password; it is RSA-encrypted before sending.
        """
        self.login_url = "https://login.sina.com.cn/sso/login.php"
        # NOTE(review): this URL already carries a query string and Prelogin()
        # passes the same keys again via ``params`` -- confirm which values
        # the server actually honours.
        self.prelogin_url = r'https://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su=&rsakt=mod&client=ssologin.js(v1.4.15)'
        self.username = username
        self.password = password
        self.session = requests.Session()

    @staticmethod
    def _encode_username(username: str) -> str:
        """Return *username* URL-quoted and then base64-encoded, as text."""
        # Import the submodule explicitly: a bare ``import urllib`` does not
        # guarantee that ``urllib.parse`` is loaded.
        import urllib.parse

        quoted = urllib.parse.quote_plus(username)
        return base64.b64encode(quoted.encode('utf-8')).decode('utf-8')

    @classmethod
    def _parse_jsonp(cls, text: str) -> dict:
        """Decode a JSONP response body (``callback({...});``) into a dict.

        Text that is not wrapped in a callback is parsed as plain JSON.

        Raises:
            json.JSONDecodeError: If the payload is not valid JSON.
        """
        wrapped = cls._JSONP_RE.match(text)
        payload = wrapped.group(1) if wrapped else text
        return json.loads(payload)

    @staticmethod
    def _extract_cross_url(text: str):
        """Return the first quoted ``https:`` URL in *text*, or ``None``.

        Backslashes are removed because the URL arrives JSON-escaped
        (``https:\\/\\/...``).
        """
        found = re.search(r'"(https:.*?)"', text)
        if found is None:
            return None
        return found.group(1).replace('\\', '')

    def Prelogin(self):
        """Request the prelogin endpoint and return the login parameters.

        Returns:
            A tuple ``(su, pubkey, servertime, nonce, rsakv)`` where ``su`` is
            the encoded username and the remaining values are read from the
            prelogin response (``None`` for any key the response lacks).

        Raises:
            requests.RequestException: If the request fails or the server
                answers with an error status.
            json.JSONDecodeError: If the response body cannot be decoded.
        """
        su = self._encode_username(self.username)
        prelogin_params = {
            'entry': "account",
            'callback': "sinaSSOController.preloginCallBack",
            'su': su,
            'rsakt': "mod",
            'client': "ssologin.js(v1.4.15)",
            '_': int(time.time() * 1000),  # millisecond timestamp
        }
        try:
            response = self.session.get(self.prelogin_url, params=prelogin_params)
            # Turn HTTP error statuses into exceptions.
            response.raise_for_status()
        except requests.RequestException:
            print('failed to prelogin:')
            raise
        data_dict = self._parse_jsonp(response.text)
        pubkey = data_dict.get('pubkey')
        servertime = data_dict.get('servertime')
        nonce = data_dict.get('nonce')
        rsakv = data_dict.get('rsakv')
        return (su, pubkey, servertime, nonce, rsakv)

    def RSAEncoder(self, pubkey, servertime, nonce):
        """Encrypt the password with the server's RSA public key.

        The plaintext is ``"<servertime>\\t<nonce>\\n<password>"``.

        Args:
            pubkey: RSA modulus as a hexadecimal string.
            servertime: Server time value from the prelogin response.
            nonce: Nonce value from the prelogin response.

        Returns:
            The ciphertext as a lowercase hexadecimal string.
        """
        modulus = int(pubkey, 16)
        exponent = int('10001', 16)  # 65537
        key = rsa.PublicKey(modulus, exponent)
        message = str(servertime) + '\t' + str(nonce) + '\n' + str(self.password)
        encrypted = rsa.encrypt(message.encode('utf-8'), key)
        return binascii.b2a_hex(encrypted).decode('utf-8')

    def PostData(self, su, pubkey, servertime, nonce, rsakv):
        """Build the form data for the login request, mimicking ssologin.js.

        Args:
            su: Encoded username, as returned by ``Prelogin``.
            pubkey: RSA modulus (hex string) used to encrypt the password.
            servertime: Server time value from the prelogin response.
            nonce: Nonce value from the prelogin response.
            rsakv: RSA key version value from the prelogin response.

        Returns:
            A dict of form fields ready to be posted to ``self.login_url``.
        """
        sp = self.RSAEncoder(pubkey, servertime, nonce)
        post_data = {
            'entry': "account",
            'gateway': "1",
            'from': "null",
            'savestate': "30",
            'useticket': "0",
            'vsnf': "1",
            'su': su,
            'service': "account",
            'servertime': servertime,
            'nonce': nonce,
            'pwencode': "rsa2",
            'rsakv': rsakv,
            'sp': sp,
            'sr': "1280*720",
            'encoding': "UTF-8",
            'cdult': "3",
            'domain': "sina.com.cn",
            'prelt': "170",
            'returntype': "TEXT",
        }
        return post_data

    def login(self):
        """Run the whole login flow and print the outcome.

        Posts the login form, then requests the first ``https`` URL found in
        the response with the same session and prints that response's text.
        Prints ``failed to login.`` when the response contains no such URL.

        Raises:
            requests.RequestException: If the prelogin or login request fails
                or returns an error status.
        """
        login_params = {
            'client': "ssologin.js(v1.4.15)",
            '_': int(time.time() * 1000),  # millisecond timestamp
        }
        su, pubkey, servertime, nonce, rsakv = self.Prelogin()
        post_data = self.PostData(su, pubkey, servertime, nonce, rsakv)
        try:
            response = self.session.post(self.login_url, params=login_params, data=post_data)
            response.raise_for_status()
        except requests.RequestException:
            print("failed to login:")
            raise
        cross_url = self._extract_cross_url(response.text)
        if cross_url:
            print(f"redirecting to {cross_url}")
            response = self.session.get(cross_url)
            print(response.text)
        else:
            print("failed to login.")
if __name__ == '__main__':
    # Script entry point: replace the placeholders with real credentials.
    weibo_login = Login("yourusername", "yourpassword")
    weibo_login.login()
redirecting to https://passport.weibo.com/wbsso/login?ticket=ST-MTE4ODA5MTk3MQ%3D%3D-1597372265-tc-8B866F78C8DDA0E122A23B69D585454D-1&ssosavestate=1628908265
({"result":true,"userinfo":{"uniqueid":"xxxxxx","displayname":"xxxxxx"}});
参考资料
- https://blog.csdn.net/u010138758/article/details/80152151 J-Ombudsman
- https://www.cnblogs.com/zhuluqing/p/8832205.html moisiet
- https://www.runoob.com 菜鸟教程
- http://www.tulingxueyuan.com/ 北京图灵学院
- http://www.imooc.com/article/19184?block_id=tuijian_wz#child_5_1 两点水
- https://blog.csdn.net/weixin_44213550/article/details/91346411 python老菜鸟
- https://realpython.com/python-string-formatting/ Dan Bader
- https://www.liaoxuefeng.com/ 廖雪峰
- https://blog.csdn.net/Gnewocean/article/details/85319590 新海说
- https://www.cnblogs.com/Nicholas0707/p/9021672.html Nicholas
- https://www.cnblogs.com/dalaoban/p/9331113.html 超天大圣
- https://blog.csdn.net/zhubao124/article/details/81662775 zhubao124
- https://blog.csdn.net/z59d8m6e40/article/details/72871485 z59d8m6e40
- https://www.jianshu.com/p/2b04f5eb5785 MR_ChanHwang
- 《Python学习手册》Mark Lutz
- 《Python编程 从入门到实践》Eric Matthes
- 《Python3网络爬虫开发实战》崔庆才
本文作者:大师兄(superkmi)