Python 爬虫——利用 session 处理登录状态:GitHub 登录实例


Cookie

Cookies 是服务器存储在本地机器上的小段文本,之后会随每一个请求发送至同一个服务器。

Session

session机制是一种服务器端的机制,服务器使用一种类似于散列表的结构(也可能就是使用散列表)来保存信息。

使用 Cookies 直接访问需要登录的页面获取数据
#!/usr/bin/python
# -*- coding: utf-8 -*-

import  requests


# 1. Fetch a login-protected page by sending a logged-in Cookie in the HTTP headers.

# A request that carries the Cookie obtained after logging in is treated as
# already logged in.
# NOTE(review): the Cookie below is a session credential hard-coded for
# demonstration; do not commit real session cookies in production code.
headers = {
    "Cookie": "_ga=GA1.2.1855430798.1461857641; _octo=GH1.1.783519559.1525492869; tz=Asia%2FShanghai; _gat=1; user_session=kYlCFIbmw-cQzLHcexbjA365OWA7ecKmWA2sGN4oXTCNx9ae; __Host-user_session_same_site=kYlCFIbmw-cQzLHcexbjA365OWA7ecKmWA2sGN4oXTCNx9ae; logged_in=yes; dotcom_user=czwspider; _gh_sess=cVhpRy95OXJNdE85NWkwMlJST0NkK0oza3A2WEJ0aGxqTWQ0dzFFNHdRMTZMNUxLaHIyMmE2anc4TDh1VFdzT1UycitCbzJ6RHg1U2diYVJkdjU1d1phejk4S1ZKelcrLzFxOUhvb2hOTHZROUZ3RUM5NVN3RDdySjUzeXJQNjNTbUZBc0ZNYW9QdzFmZWFDSnRmd2VnNzMyNzBCOTUyazJudmxWeDRveHRBPS0tTjQ1Y1JFWXZlRFFKbnc3Vko1V3RrZz09--f2e6c23defd0c0e6d470eb2dcb91c7cc2ed54dc4",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36",
}

# Accessing this page requires the Cookies of a successfully logged-in session.
setting_url = "https://github.com/settings/profile"

# requests has no default timeout; bound the wait so a stalled connection
# cannot hang the script forever.
response = requests.get(setting_url, headers=headers, timeout=10)
# Fail loudly on an HTTP 4xx/5xx instead of saving an error page to disk.
response.raise_for_status()

# Save the raw response body for inspection.
with open('github_login_01.html', 'wb') as f:
    f.write(response.content)
使用requests模拟登录获取数据
#!/usr/bin/python
# -*- coding: utf-8 -*-

# 模拟 github 登录

# 0. 分析爬虫
'''
登录地址
确定请求地址 https://github.com/session

确定请求方式 POST

确定请求的内容

commit: Sign in
utf8: ✓
authenticity_token: FKPt8/jlSD6VqqKbJqQUylCZaArCLMEhyIYWtA12LSzK47nyaPOs8IoIZ04o5AGJiQIc04jX9b0lsWuETzc8+g==
login: czwspider
password: qwer1234

当请求成功 获取 页面的 Cookie 值并且,保存下来以后发送的请求都携带这个Cookie值

'''

import requests
import requests.utils

# 1. Define the request parameters.
login_url = "https://github.com/session"

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36",
    "Cookie": "_ga=GA1.2.1855430798.1461857641; _octo=GH1.1.783519559.1525492869; tz=Asia%2FShanghai; logged_in=no; _gat=1; _gh_sess=TTZZNnVPOTRwTlI2YW1UOWFqTUZwa1JHVENFOVpjYWxoekJLbS9GTGFQQkR4ODZVSTIrV1NTWUFQS0dETUhURmg4YVR3bGxjV1hJT2dMZ2NIOEptZlREUUxrOXV0eG1EcG5kUi9adUZQamFpWmxmMHVhY04vckJVWkxXbkNVa3ZHNCtjekF2WEVnMGVzaElqMnBpVUIvVGVSNzJmdjNQMFFxYWpONE1HMks2eDhpVzZ0Wk9ZQUZLMVJOOTRsYXJWUjV6VUNmRFhyaHlYczUzdUozSWR5M210akh3dkcvaXRhY2ZmanRNRC9IbElUMm5OSmkzVDhtbEwvSEdGWFMvd0xySWIxcFRrbGZ0RDQwQit5eGUvaGZKdWp5U1dYSnZ0VzRRb1FqUThZZXlreTBNU1RicUhheGJGQjFRcDlnN1N2b2RXRXRsT21lRFB5Q3RFSHQ3V1FpR05QSlo2TVBic2o3R0hDaUZOSmhST0l2ZXdabHVzVEdBcGdMRWpiS2lXME5jOWV2WTFJckFGUXI2WHpjeTZ5ZXpXMUJvSXlpdmRpZWNONUhFejFUMD0tLVhyaTZJdlZBMUdNWnBNU0QrWDFFQ1E9PQ%3D%3D--ab7943f35d872df42bb86b8086b5cec50d3ef0ce",
}
# NOTE(review): the credentials and authenticity_token are hard-coded for
# demonstration. The token was copied from a browser session and is presumably
# tied to the `_gh_sess` Cookie above -- confirm it is still valid before running.
post_data = {
    "commit": "Sign in",
    "utf8": "✓",
    "authenticity_token": "UFKuL5RE8DTUXAc0cddcawtX3gWADuVQInNPqIacBfESUsUZwZ8jNQ24sYpQVHS6vqFXlci9FqeTV9aZ+wqa+Q==",
    "login": "czwspider",
    "password": "qwer1234",
}

# 2. Send the login request and get the response.
# Redirects are deliberately NOT followed: `response.cookies` only contains the
# cookies set by the response object we end up holding. If requests followed a
# post-login redirect, we would hold the final page's response and lose any
# cookies set by the login response itself.
response = requests.post(
    login_url,
    data=post_data,
    headers=headers,
    allow_redirects=False,
    timeout=10,
)

# 3. Take the Cookies from the login response and reuse them in all later requests.
# requests.utils.dict_from_cookiejar(response.cookies) converts the cookie jar
# into a plain dict, which is handy for inspecting what the server sent back.

# 4. Fetch the login-protected page.
setting_headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36",
}
setting_url = "https://github.com/settings/profile"
settting_resp = requests.get(
    setting_url,
    headers=setting_headers,
    cookies=response.cookies,
    timeout=10,
)

print(settting_resp.status_code)

# Save the raw response body for inspection.
with open('github_login_02.html', 'wb') as f:
    f.write(settting_resp.content)
使用session对象进行登录获取数据
#!/usr/bin/python
# -*- coding: utf-8 -*-

# 模拟 github 登录

# 0. 分析爬虫
'''
登录地址
确定请求地址 https://github.com/session

确定请求方式 POST

确定请求的内容

commit: Sign in
utf8: ✓
authenticity_token: FKPt8/jlSD6VqqKbJqQUylCZaArCLMEhyIYWtA12LSzK47nyaPOs8IoIZ04o5AGJiQIc04jX9b0lsWuETzc8+g==
login: czwspider
password: qwer1234

当请求成功 获取 页面的 Cookie 值并且,保存下来以后发送的请求都携带这个Cookie值

'''

import requests
import requests.utils

# 1. Define the request parameters.
login_url = "https://github.com/session"

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36",
    "Cookie": "_ga=GA1.2.1855430798.1461857641; _octo=GH1.1.783519559.1525492869; tz=Asia%2FShanghai; logged_in=no; _gat=1; _gh_sess=TTZZNnVPOTRwTlI2YW1UOWFqTUZwa1JHVENFOVpjYWxoekJLbS9GTGFQQkR4ODZVSTIrV1NTWUFQS0dETUhURmg4YVR3bGxjV1hJT2dMZ2NIOEptZlREUUxrOXV0eG1EcG5kUi9adUZQamFpWmxmMHVhY04vckJVWkxXbkNVa3ZHNCtjekF2WEVnMGVzaElqMnBpVUIvVGVSNzJmdjNQMFFxYWpONE1HMks2eDhpVzZ0Wk9ZQUZLMVJOOTRsYXJWUjV6VUNmRFhyaHlYczUzdUozSWR5M210akh3dkcvaXRhY2ZmanRNRC9IbElUMm5OSmkzVDhtbEwvSEdGWFMvd0xySWIxcFRrbGZ0RDQwQit5eGUvaGZKdWp5U1dYSnZ0VzRRb1FqUThZZXlreTBNU1RicUhheGJGQjFRcDlnN1N2b2RXRXRsT21lRFB5Q3RFSHQ3V1FpR05QSlo2TVBic2o3R0hDaUZOSmhST0l2ZXdabHVzVEdBcGdMRWpiS2lXME5jOWV2WTFJckFGUXI2WHpjeTZ5ZXpXMUJvSXlpdmRpZWNONUhFejFUMD0tLVhyaTZJdlZBMUdNWnBNU0QrWDFFQ1E9PQ%3D%3D--ab7943f35d872df42bb86b8086b5cec50d3ef0ce",
}
# NOTE(review): the credentials and authenticity_token are hard-coded for
# demonstration. The token was copied from a browser session and is presumably
# tied to the `_gh_sess` Cookie above -- confirm it is still valid before running.
post_data = {
    "commit": "Sign in",
    "utf8": "✓",
    "authenticity_token": "UFKuL5RE8DTUXAc0cddcawtX3gWADuVQInNPqIacBfESUsUZwZ8jNQ24sYpQVHS6vqFXlci9FqeTV9aZ+wqa+Q==",
    "login": "czwspider",
    "password": "qwer1234",
}

setting_headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36",
}
setting_url = "https://github.com/settings/profile"

# Make every request through a session object: it records the Cookies sent back
# by the server and replays them automatically, so the code never has to pass
# them around by hand. The `with` block closes the session (and its pooled
# connections) once both requests are done.
with requests.Session() as session:
    # 2. Send the login request and get the response.
    response = session.post(login_url, data=post_data, headers=headers, timeout=10)

    # 3. No manual Cookie handling is needed here; to inspect what was stored, use
    # requests.utils.dict_from_cookiejar(session.cookies).

    # 4. Fetch the login-protected page with the Cookies kept by the session.
    settting_resp = session.get(setting_url, headers=setting_headers, timeout=10)

print(settting_resp.status_code)

# Save the raw response body for inspection.
with open('github_login_03.html', 'wb') as f:
    f.write(settting_resp.content)

你可能感兴趣的:(爬虫)