今日内容
1.requests 之 post
2.response相关操作
3.requests模块高级用法
4.selenium模块基本用法
1.requests 之 post
'''
requests 之 post
post 请求之 GitHub 登录
请求url:
https://github.com/session
请求方式:
post
请求头:
#上一次请求从哪里来
Referer: https://github.com/login?return_to=%2Fjasonim%2Febook%2Ftree%2Fmaster%2Fcoding
User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36
请求体:
commit: Sign in
utf8: ✓
authenticity_token: AJMT5QZPTZEDs8ETRUlgNVZQ44BjM3Pmnle6acTkUJxGC8RIs+ugkdGfy9vRwQt1SNKLJHRFKMI0s1h2foc4Cg==
login: nadai
password: 1234567
webauthn-support: supported
'''
# 一 访问login页面获取token信息
'''
请求url:
https://github.com/login?return_to=%2Fjasonim%2Febook%2Ftree%2Fmaster%2Fcoding
请求方式:
Get
响应头:
set-cookie //保存用于下次登录确认
请求头:
cookie
user-agent
正则:
正则写法:
'''
# 二 Step 2: send the POST request to the /session endpoint
import requests
import re

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}

# Step 1: GET the login page; it contains the hidden CSRF token and sets
# the session cookie that must be sent back with the POST.
response = requests.get(
    url='https://github.com/login?return_to=%2Fjasonim%2Febook%2Ftree%2Fmaster%2Fcoding',
    headers=headers)
# print(response.text)

# BUG FIX: the original pattern was the empty string '', so the token was
# never extracted.  Pull the hidden authenticity_token value out of the
# login form instead.
tokens = re.findall(r'name="authenticity_token" value="(.*?)"', response.text, re.S)
# BUG FIX: findall returns a list; the form field needs the token string.
authenticity_token = tokens[0] if tokens else ''
# print(authenticity_token)

# Convert the cookies set by the login page into a plain dict.
login_cookies = response.cookies.get_dict()

headers2 = {
    'Referer': 'https://github.com/login?return_to=%2Fjasonim%2Febook%2Ftree%2Fmaster%2Fcoding',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}

# Assemble the request body expected by GitHub's session endpoint.
form_data = {
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': authenticity_token,
    'login': 'Alicedorothy',
    'password': 'Githubmacpro2018',
    'webauthn-support': 'supported'
}

# POST to /session carrying the Referer, User-Agent and the login cookies.
response2 = requests.post(
    url='https://github.com/session',
    data=form_data,
    headers=headers2,
    cookies=login_cookies)
print(response2.status_code)
2.response相关操作
import requests

# Fetch the Baidu homepage and walk through the Response object's API.
resp = requests.get('https://baidu.com')

print(resp.status_code)         # HTTP status code of the reply
print(resp.url)                 # URL the response ultimately came from
print(resp.encoding)            # encoding guessed from the response headers
resp.encoding = 'utf-8'         # override the decoder used by .text
print(resp.content)             # raw body as bytes (not affected by .encoding)
print(resp.headers)             # response headers
print(resp.history)             # list of redirect responses, if any
print(resp.cookies)             # cookie jar returned by the server
print(resp.cookies.get_dict())  # same cookies as a plain dict
print(resp.cookies.items())     # same cookies as (name, value) pairs
print(resp.encoding)            # now reports the override set above
print(resp.elapsed)             # time taken between request and response
import requests

# Download a video with streaming so the whole file is never held in memory.
url = 'https://vd3.bdstatic.com/mda-ic4pfhh3ex32svqi/hd/mda-ic4pfhh3ex32svqi.mp4?auth_key=1557973824-0-0-bfb2e69bb5198ff65e18065d91b2b8c8&bcevod_channel=searchbox_feed&pd=wisenatural&abtest=all.mp4'
# stream=True defers downloading the body; it is consumed chunk by chunk
# through iter_content() instead.
response = requests.get(url, stream=True)
# BUG FIX: the original printed response.content first, which downloaded the
# entire body into memory and defeated the purpose of stream=True.
with open('love_for_GD.mp4', 'wb') as f:
    # BUG FIX: iter_content() without chunk_size yields one byte at a time;
    # 8 KiB chunks keep the copy fast while staying memory-friendly.
    for content in response.iter_content(chunk_size=8192):
        f.write(content)
3.requests模块高级用法
'''
SSL Cert Verification
超时 设置
使用代理(***)
认证设置
异常处理
上传文件
'''
# Certificate verification (most sites are https)
import requests
# For an https request the certificate is checked first; if it is invalid,
# requests raises an SSLError and the program terminates.
response = requests.get('https://www.xiaohuar.com')
print(response.status_code)
# Improvement 1: suppress the SSL error (a warning is still emitted)
import requests

# BUG FIX: in the original the call was split in the middle of the keyword
# ("verif" / "y=False"), which is a SyntaxError.  verify=False disables
# certificate verification for this request.
response = requests.get('https://www.xiaohuar.com', verify=False)
# No verification: urllib3 emits an InsecureRequestWarning, returns 200.
print(response.status_code)
# Improvement 2: suppress both the error and the warning message
import requests
import urllib3
urllib3.disable_warnings() # silence the InsecureRequestWarning
response = requests.get('https://www.xiaohuar.com', verify=False)
print(response.status_code)
# Improvement 3: supply a client certificate
# Many sites are https yet reachable without a certificate; in most cases
# carrying one is optional.
# Zhihu / Baidu etc. work either way.
# Some sites make it mandatory: only designated users who hold the
# certificate are allowed to access that particular site.
import requests
import urllib3
# urllib3.disable_warnings() # silence the warning
response = requests.get(
'https://www.xiaohuar.com',
# verify=False,
cert=('/path/server.crt', '/path/key'))# not a real path
print(response.status_code)
# Timeout settings
# Two timeout forms: float or tuple
# timeout=0.1       # timeout for receiving data
# timeout=(0.1,0.2) # 0.1 = connect timeout, 0.2 = receive-data timeout
import requests
# NOTE: 0.0001s is deliberately tiny -- this call is expected to raise
# a requests.exceptions.ConnectTimeout, demonstrating the parameter.
response = requests.get('https://www.baidu.com',
timeout=0.0001)
4.selenium模块基本用法
'''
1.什么是selenium?
自动化测试工具,可以使用它帮助我们驱动浏览器自动去执行某些定义好的操作,例如页面中执行的js代码,跳过登录验证
2.为什么使用?
优点:
使用requests模块登录需要分析大量的复杂通信流程,使用selenium可以轻松跳过登录验证
缺点:
浏览器会加载css,js、图片、视频....数据,爬虫效率相比requests模块要低
3.如何使用
#pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple selenium
'''
from selenium import webdriver
# BUG FIX: WebDriverWait, EC and By were used below without being imported,
# which raises NameError at runtime.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Path to the chromedriver binary is machine-specific -- adjust as needed.
chrome = webdriver.Chrome(r'/Users/nadia/Downloads/chromedriver')
try:
    chrome.get('https://www.baidu.com')
    time.sleep(3)

    # Explicit wait: poll up to 10 seconds for a condition to become true.
    wait = WebDriverWait(chrome, 10)

    # Visit JD and wait for the search box (id="key") to appear in the DOM.
    chrome.get('https://jd.com')
    input_tag = wait.until(EC.presence_of_element_located((By.ID, 'key')))
    input_tag.send_keys('唐诗三百首')
finally:
    # BUG FIX: the original called chrome.close() in the middle of the
    # script and then kept driving the closed browser; close at the end.
    chrome.close()
隐式等待
# Implicit wait: tell the driver to poll up to N seconds whenever an
# element lookup does not find its target immediately.
from selenium import webdriver
import time

driver = webdriver.Chrome(r'/Users/nadia/Downloads/chromedriver')
# driver = webdriver.Chrome()
try:
    # BUG FIX: the try/finally bodies had lost their indentation in the
    # original, which is a SyntaxError; restored here.
    # wait = WebDriverWait(driver, 10)  # explicit wait: wait for ONE element
    driver.get('https://china.nba.com/')
    # Implicit wait: applies to every element lookup on this driver.
    driver.implicitly_wait(10)
    news_tag = driver.find_element_by_class_name('nav-news')
    print(news_tag)
    print(news_tag.tag_name)
    time.sleep(10)
finally:
    # Always close the browser window, even if a lookup above fails.
    driver.close()