2019-06-17 python day-05

今日内容

    1.requests 之 post

    2.response相关操作

    3.requests模块高级用法

    4.selenium模块基本用法

1.requests 之 post

'''

    requests 之 post

    post 请求之 GitHub 登录

    请求url:

        https://github.com/session

    请求方式:

        post

    请求头:

        #上一次请求从哪里来

        Referer: https://github.com/login?return_to=%2Fjasonim%2Febook%2Ftree%2Fmaster%2Fcoding

        User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36

    请求体:

        commit: Sign in

        utf8: ✓

        authenticity_token: AJMT5QZPTZEDs8ETRUlgNVZQ44BjM3Pmnle6acTkUJxGC8RIs+ugkdGfy9vRwQt1SNKLJHRFKMI0s1h2foc4Cg==

        login: nadai

        password: 1234567

        webauthn-support: supported

'''

# 一 访问login页面获取token信息

'''

    请求url:

        https://github.com/login?return_to=%2Fjasonim%2Febook%2Ftree%2Fmaster%2Fcoding

    请求方式:

    GET

    响应头:

    set-cookie  // 保存用于下次登录确认

    请求头:

        cookie

        user-agent

正则:

正则写法:

'''

# Step 2: POST the login form to /session

import requests
import re

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}  # BUG FIX: the closing brace of this dict was missing, which made the whole script a syntax error

# GET the login page to harvest the CSRF token and the session cookie
response = requests.get(
    url='https://github.com/login?return_to=%2Fjasonim%2Febook%2Ftree%2Fmaster%2Fcoding',
    headers=headers)
# print(response.text)

# BUG FIX: the original pattern was an empty string, so nothing was extracted.
# GitHub embeds the CSRF token as <input name="authenticity_token" ... value="...">.
tokens = re.findall(r'name="authenticity_token".*?value="(.*?)"', response.text, re.S)
# BUG FIX: findall returns a list; the form needs the token as a single string.
authenticity_token = tokens[0] if tokens else ''
# print(authenticity_token)

# Convert the cookies returned by the login page into a plain dict
login_cookies = response.cookies.get_dict()

headers2 = {
    # Referer tells GitHub which page the request came from
    'Referer': 'https://github.com/login?return_to=%2Fjasonim%2Febook%2Ftree%2Fmaster%2Fcoding',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}

# Assemble the POST body (field names copied from the browser's login request)
form_data = {
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': authenticity_token,
    'login': 'Alicedorothy',        # NOTE(security): credentials hard-coded in source — move to env/config
    'password': 'Githubmacpro2018',
    'webauthn-support': 'supported'
}

# POST to the session endpoint, carrying Referer, User-Agent and the
# cookies obtained from the login page
response2 = requests.post(
    url='https://github.com/session',
    data=form_data,
    headers=headers2,
    cookies=login_cookies)
print(response2.status_code)

2.response相关操作

import requests

# Walk through the most commonly used attributes of a Response object.
resp = requests.get('https://baidu.com')

print(resp.status_code)   # HTTP status code
print(resp.url)           # final URL after redirects
print(resp.encoding)      # encoding guessed from the response headers

# Force the body to be decoded as UTF-8 from here on
resp.encoding = 'utf-8'

print(resp.content)             # raw body bytes
print(resp.headers)             # response headers
print(resp.history)             # redirect chain (list of intermediate responses)
print(resp.cookies)             # the cookie jar object
print(resp.cookies.get_dict())  # cookies as a plain dict
print(resp.cookies.items())     # cookies as (name, value) pairs
print(resp.encoding)            # now 'utf-8'
print(resp.elapsed)             # time between sending the request and the response arriving

import requests

url = 'https://vd3.bdstatic.com/mda-ic4pfhh3ex32svqi/hd/mda-ic4pfhh3ex32svqi.mp4?auth_key=1557973824-0-0-bfb2e69bb5198ff65e18065d91b2b8c8&bcevod_channel=searchbox_feed&pd=wisenatural&abtest=all.mp4'

# stream=True defers the download so the body can be consumed incrementally
response = requests.get(url, stream=True)

# BUG FIX: the original printed response.content here, which downloads the
# ENTIRE video into memory and defeats the purpose of stream=True.
print(response.status_code)

with open('love_for_GD.mp4', 'wb') as f:
    # BUG FIX: iter_content() without chunk_size yields one byte per
    # iteration, pathologically slow for a video; use 8 KiB chunks.
    for chunk in response.iter_content(chunk_size=8192):
        if chunk:  # skip keep-alive chunks
            f.write(chunk)

3.requests模块高级用法

'''

SSL Cert Verification

超时 设置

使用代理(***)

认证设置

异常处理

上传文件

'''

# Certificate verification (most sites use HTTPS)

import requests

# For an HTTPS request the certificate is checked first; an invalid
# certificate raises an SSLError and the program aborts.
target = 'https://www.xiaohuar.com'
response = requests.get(target)
print(response.status_code)

# Improvement 1: suppress the SSL error (a warning is still printed)

import requests

# BUG FIX: "verify=False" had been split across two lines ("verif" / "y=False"),
# a copy-paste artifact that made this snippet a syntax error.
response = requests.get('https://www.xiaohuar.com', verify=False)

# Certificate is not verified: a warning is emitted but the request returns 200
print(response.status_code)

# Improvement 2: suppress both the SSL error and the warning

import requests
import urllib3

# Silence the InsecureRequestWarning emitted for unverified HTTPS requests
urllib3.disable_warnings()

resp = requests.get('https://www.xiaohuar.com', verify=False)
print(resp.status_code)

# Improvement 3: supply a client certificate
# Many HTTPS sites can be visited with or without a certificate
# (Zhihu, Baidu, ... accept both). Some sites enforce it: only the
# designated users holding the certificate are allowed in.

import requests
import urllib3

# urllib3.disable_warnings()  # silence warnings (left disabled here)

cert_pair = ('/path/server.crt', '/path/key')  # placeholder paths, not real files
response = requests.get('https://www.xiaohuar.com',
                        # verify=False,
                        cert=cert_pair)
print(response.status_code)

# Timeout settings
# Two forms: a float or a tuple
#   timeout=0.1        -> timeout for receiving data
#   timeout=(0.1, 0.2) -> 0.1 s connect timeout, 0.2 s read timeout

import requests

# BUG FIX: timeout=0.0001 (virtually) always fires, and the uncaught
# exception killed the script before any later snippet could run.
# Catch it so the demo prints the failure instead of crashing.
try:
    response = requests.get('https://www.baidu.com',
                            timeout=0.0001)
    print(response.status_code)
except requests.exceptions.Timeout as exc:
    print('request timed out:', exc)

    4.selenium模块基本用法

'''

1.什么是selenium?

    自动化测试工具,可以使用它帮助我们驱动浏览器自动去执行某些定义好的操作,例如页面中执行的js代码,跳过登录验证

2.为什么使用?

    优点:

    使用requests模块登录需要分析大量的复杂通信流程,使用selenium可以轻松跳过登录验证

    缺点:

    浏览器会加载css,js、图片、视频....数据,爬虫效率相比requests模块要低

3.如何使用

    #pip3 install -i https://pypi.tsinghua.edu.cn/simple selenium

'''

from selenium import webdriver
# BUG FIX: these three imports were missing, so WebDriverWait / EC / By
# raised NameError as soon as the explicit-wait snippet below ran.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

chrome = webdriver.Chrome(r'/Users/nadia/Downloads/chromedriver')

chrome.get('https://www.baidu.com')
time.sleep(3)

# Explicit wait: poll up to 10 s for a specific element to appear
wait = WebDriverWait(chrome, 10)

# Visit JD and type into the search box once it is present
chrome.get('https://jd.com')
input_tag = wait.until(EC.presence_of_element_located((By.ID, 'key')))
input_tag.send_keys('唐诗三百首')

# BUG FIX: chrome.close() originally ran BEFORE the JD snippet above,
# so every later call operated on an already-closed browser window.
chrome.close()

隐式等待

from selenium import webdriver
import time

browser = webdriver.Chrome(r'/Users/nadia/Downloads/chromedriver')
# browser = webdriver.Chrome()

try:
    # Explicit-wait alternative: WebDriverWait(browser, 10) waits for ONE element
    browser.get('https://china.nba.com/')

    # Implicit wait: applies to every element lookup on this driver
    browser.implicitly_wait(10)

    nav_news = browser.find_element_by_class_name('nav-news')
    print(nav_news)
    print(nav_news.tag_name)

    time.sleep(10)
finally:
    # Always release the browser window, even if a lookup above fails
    browser.close()

你可能感兴趣的:(2019-06-17 python day-05)