graph TD
A[Url-list]-->B[发送请求获取响应]
B --> C[解析响应]
C --> |提取数据|D[保存数据]
C --> A
User-Agent 浏览器的信息
Referer 页面跳转处
Cookie 识别用户
Host 请求目的地址
Connection 保持长连接
Upgrade-Insecure-Requests 升级安全访问
Set-Cookie:服务器设置的用户cookie
所有的状态码都不可信,一切以是否从抓包得到的响应中获取到数据为准
response = requests.get(url)
import requests
# Basic GET request, followed by a dump of the main attributes of the response object.
url = 'http://www.baidu.com'
response = requests.get(url)
print(response.content.decode()) # raw bytes decoded to str; decode() defaults to UTF-8
print(response.url) # URL of the response; may differ from the `url` that was requested
print(response.status_code) # HTTP status code
print(response.headers) # headers of the response
print(response.request.headers) # headers of the request that was sent
print(response.cookies) # cookies the server set for the client
print(response.request._cookies) # cookies carried by the request (private attribute)
response = requests.get(url, headers= header, params= kw)
将参数设为字典传入
import requests
# Search Bing for "python": the query string is supplied through `params`
# instead of being concatenated onto the URL by hand, and the returned page
# is saved to disk.
url = 'https://cn.bing.com/search?'  # request URL
header = {  # request headers
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"
}
kw = {'q': 'python'}  # query parameters, passed as a dict
response = requests.get(url, headers=header, params=kw)
# Save the raw bytes ('wb') so the page is stored exactly as received.
with open('request_params.html', 'wb') as f:
    f.write(response.content)
cookie
import requests
# Request a GitHub profile page, sending the cookie as a plain `Cookie`
# request header, and save the returned HTML.
url = 'https://github.com/ahang1598'
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36',
    # The cookie value is elided in these notes.
    'Cookie': '_octo=GH1.1.33955........'
}
response = requests.get(url, headers=header)
# NOTE(review): the file name says "without_cookie" although a Cookie header
# is sent above -- confirm the intended name.
with open('request_without_cookie_github.html', 'wb') as f:
    f.write(response.content)
requests.get(url, headers=header, cookies=cookie)
import requests
# Build the `cookies` dict expected by requests.get(..., cookies=...) from a
# raw "name=value; name=value" Cookie header string.
url = 'https://github.com/ahang1598'
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36'
}
Cookie = '_octo=GH1.1.339554947.1578661733; ...'
cookie_list = Cookie.split('; ')  # one "name=value" fragment per cookie
cookies = {}
# Method 1: plain for loop.
for cookie in cookie_list:
    # Split on the FIRST '=' only, so values that themselves contain '='
    # are kept whole.
    name, sep, value = cookie.partition('=')
    # Fragments without '=' (such as the '...' placeholder above) are not
    # cookies; skip them instead of failing with IndexError.
    if sep:
        cookies[name] = value
# Method 2: the same result in a single expression.
# cookies = dict(c.split('=', 1) for c in cookie_list if '=' in c)
# Send the request with the cookies supplied as a dict through `cookies=`,
# then save the returned page as raw bytes.
response = requests.get(url, headers=header, cookies=cookies)
with open('request_cookie_params.html', 'wb') as f:
    f.write(response.content)
cookies_dict = requests.utils.dict_from_cookiejar(response.cookies)
import requests
...
response = requests.get(url, headers=header)
# Method 3: build the cookie dict from the response's cookie jar with requests.utils.dict_from_cookiejar
cookie_dict = requests.utils.dict_from_cookiejar(response.cookies)
print(cookie_dict)
timeout
requests.get(url, timeout=3)
# Send the request through a proxy; the dict maps a URL scheme to the proxy used for it.
proxies = {
'http':'http://x.x.x.x:17940',
# NOTE(review): many proxies are reached over plain HTTP even for https traffic -- confirm the 'https://' proxy scheme is intended.
'https':'https://x.x.x.x:17940'
}
requests.get(url, proxies= proxies, timeout=5) # timeout avoids a long wait when the proxy is unavailable
requests.get(url, verify=False)
https://sam.huat.edu.cn:8443/selfservice/
会提示 InsecureRequestWarning 警告
但是能继续访问