一、查看登陆
先在浏览器中按 Ctrl + Shift + N 打开隐身模式,避免受到我们已登录账号的 cookie 信息干扰。
然后进入知乎登陆页 https://www.zhihu.com/#signin
按F12,点击 Network 。尝试随便输个手机号登陆下,可以看到所要登陆页的请求了
从headers中可以看到请求页和所需传送的data
即用手机号登陆请求页面的url为 https://www.zhihu.com/login/phone_num
phone_num 为手机号
password 为密码
那么 _xsrf 是什么呢?点击 Elements,按 Ctrl + F 查找 xsrf,可以在网页源代码中搜索到。不难猜出这其实是一个动态变化的校验参数(用于防范跨站请求伪造,即 CSRF),既然它就在网页中,那么我们同样可以轻松获得它。
那么可以写出一个登陆代码了
二、登陆代码
import requests
import re
# Zhihu has anti-crawler checks, so send a regular browser User-Agent.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/56.0.2924.87 Safari/537.36"
    ),
}
def get_xsrf():
    """Fetch the sign-in page and return the ``_xsrf`` token embedded in it.

    ``_xsrf`` is a parameter that changes dynamically, so it has to be read
    from the page right before logging in.

    Returns:
        The value of the first ``name="_xsrf"`` form field found in the page.

    Raises:
        IndexError: If the page contains no ``_xsrf`` field.
    """
    response = requests.get('https://www.zhihu.com/#signin', headers=headers)
    # findall returns a list of captured values; the first one is the token.
    tokens = re.findall(r'name="_xsrf" value="(.*?)"', response.text)
    return tokens[0]
def login(account, secret):
    """Log in to Zhihu with a phone number or an e-mail address.

    The account string selects the endpoint: an 11-digit number starting
    with ``1`` is posted to the phone-number login URL, and any other string
    containing ``@`` is posted to the e-mail login URL.  The HTTP status
    code, the raw response body and the ``msg`` field of the JSON reply are
    printed.

    Args:
        account: Phone number or e-mail address to log in with.
        secret: Password for the account.

    Returns:
        ``0`` when ``account`` is neither a phone number nor an e-mail
        address; otherwise ``None`` once the response has been printed.
    """
    # Classify the account before touching the network, so an invalid
    # account is rejected without fetching the sign-in page first.
    if re.match(r"^1\d{10}$", account):
        print('login by phone_num\n')
        login_url = 'https://www.zhihu.com/login/phone_num'
        account_field = 'phone_num'
    elif "@" in account:
        print('login by Email\n')
        login_url = 'https://www.zhihu.com/login/email'
        account_field = 'email'
    else:
        print('你的帐号有问题,请重新输入')
        return 0

    # Both branches share one form, so the token is always taken from
    # get_xsrf() (the e-mail branch used to reference an undefined name).
    formdata = {
        account_field: account,
        'password': secret,
        '_xsrf': get_xsrf(),
    }
    login_page = requests.post(url=login_url, data=formdata, headers=headers)
    print(login_page.status_code)  # HTTP status of the login request
    print(login_page.content)
    print(login_page.json()['msg'])  # server message telling whether login succeeded
if __name__ == "__main__":
    # Imported here because only the interactive entry point needs it.
    import getpass

    username = input('请输入你的用户名\n> ')
    # getpass reads the password without echoing it to the terminal.
    password = getpass.getpass("请输入你的密码\n> ")
    login(username, password)
三、使用cookie
接着,如果我们想用已登录的账号做点别的事情,就得把登录后返回的 cookie 保存下来,存进 headers 里就行了。
由于我上面代码的登陆封装在函数里,下面用交互操作演示一下后续步骤:
>>> import re
>>> import requests
>>> headers = {
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
}
>>> def get_xsrf():
#'''_xsrf 是一个动态变化的参数'''
index_url = 'https://www.zhihu.com/#signin'
#获取登录时要用到的_xsrf
index_page = requests.get(index_url, headers=headers)
html = index_page.text
pattern = r'name="_xsrf" value="(.*?)"'
#这里的_xsrf返回的是一个list
_xsrf = re.findall(pattern, html)
return _xsrf[0]
>>> xsrf = get_xsrf()
>>> formdata = {
'phone_num':'手机号', #输入手机号
'password':'密码', #输入密码
'_xsrf':xsrf
}
>>> url = 'https://www.zhihu.com/login/phone_num'
>>> r = requests.post(url=url, data=formdata)
>>> r.json()['msg']
'登录成功'
>>> r.headers['Set-Cookie']
'aliyungf_tc=AQAAANuoyleZZwEADpFq2sqbL1fysy78; Path=/; HttpOnly, q_c1=41d9412dc9d6431cb6849a754f41f7a0|1489474734000|1489474734000; Domain=zhihu.com; expires=Fri, 13 Mar 2020 06:58:54 GMT; Path=/, z_c0="QUJCS0pUaFBYZ2dYQUFBQVlRSlZUYTRoNzFpUE5ScjNaNk5oTXJ3Q29ZVW9lT1JlM245Yk93PT0=|1489474734|79a8a1792039b9f1bb14f7562b6cba8068d95721"; Domain=zhihu.com; expires=Thu, 13 Apr 2017 06:58:54 GMT; httponly; Path=/, _xsrf=; Domain=zhihu.com; expires=Mon, 14 Mar 2016 06:58:54 GMT; Path=/, r_cap_id="ZWEwNTVlMjAxYjliNDcyNTgwZjA4MjQ4Y2ZlNGRjMmU=|1489474734|f422359103c09d0b366fa17960ef06bdb214dbe4"; Domain=zhihu.com; expires=Thu, 13 Apr 2017 06:58:54 GMT; Path=/, cap_id="NTBjZWJjNGI1MjkwNDRhNGI5YmU3N2Q4ZWY0YTJiOTc=|1489474734|451abe100589202b9e67ed860b77dfc4d076e051"; Domain=zhihu.com; expires=Thu, 13 Apr 2017 06:58:54 GMT; Path=/, l_cap_id=; Domain=zhihu.com; expires=Mon, 14 Mar 2016 06:58:54 GMT; Path=/'
>>> headers["Cookie"] = r.headers['Set-Cookie']
>>> headers
{'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36', 'Cookie': 'aliyungf_tc=AQAAANuoyleZZwEADpFq2sqbL1fysy78; Path=/; HttpOnly, q_c1=41d9412dc9d6431cb6849a754f41f7a0|1489474734000|1489474734000; Domain=zhihu.com; expires=Fri, 13 Mar 2020 06:58:54 GMT; Path=/, z_c0="QUJCS0pUaFBYZ2dYQUFBQVlRSlZUYTRoNzFpUE5ScjNaNk5oTXJ3Q29ZVW9lT1JlM245Yk93PT0=|1489474734|79a8a1792039b9f1bb14f7562b6cba8068d95721"; Domain=zhihu.com; expires=Thu, 13 Apr 2017 06:58:54 GMT; httponly; Path=/, _xsrf=; Domain=zhihu.com; expires=Mon, 14 Mar 2016 06:58:54 GMT; Path=/, r_cap_id="ZWEwNTVlMjAxYjliNDcyNTgwZjA4MjQ4Y2ZlNGRjMmU=|1489474734|f422359103c09d0b366fa17960ef06bdb214dbe4"; Domain=zhihu.com; expires=Thu, 13 Apr 2017 06:58:54 GMT; Path=/, cap_id="NTBjZWJjNGI1MjkwNDRhNGI5YmU3N2Q4ZWY0YTJiOTc=|1489474734|451abe100589202b9e67ed860b77dfc4d076e051"; Domain=zhihu.com; expires=Thu, 13 Apr 2017 06:58:54 GMT; Path=/, l_cap_id=; Domain=zhihu.com; expires=Mon, 14 Mar 2016 06:58:54 GMT; Path=/'}
可以看到登陆后的 cookie 已经存入 headers 了
接着传入这个 headers 就可以保持登陆状态。
用requests库的session会是更方便的方法
四、使用session
import requests
import re
# A Session keeps track of cookies automatically across requests.
s = requests.Session()
# Update the session's header mapping instead of replacing it, so the
# default headers that requests sets on a new Session are kept alongside
# the browser User-Agent.
s.headers.update({
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/56.0.2924.87 Safari/537.36"
    ),
})
def get_xsrf():
    """Return the ``_xsrf`` token from the sign-in page, using session ``s``.

    ``_xsrf`` is a parameter that changes dynamically, so it is read from
    the page right before logging in.

    Returns:
        The value of the first ``name="_xsrf"`` form field found in the page.

    Raises:
        IndexError: If the page contains no ``_xsrf`` field.
    """
    page = s.get('https://www.zhihu.com/#signin')
    # findall returns a list of captured values; the first one is the token.
    matches = re.findall(r'name="_xsrf" value="(.*?)"', page.text)
    return matches[0]
# Endpoint for logging in with a phone number.
login_url = 'https://www.zhihu.com/login/phone_num'
xsrf = get_xsrf()
formdata = dict(
    phone_num='手机号',  # put the phone number here
    password='密码',  # put the password here
    _xsrf=xsrf,
)
# Post through the session so that it is the one receiving the response.
r = s.post(login_url, data=formdata)
print(r.json()['msg'])
print(r.headers)
以后通过 s 对象发出的请求,都会自动在 headers 中带上 cookie。