import aiohttp
import asyncio
from lxml import etree
# Endpoints of the local test server that get downloaded below.
urls = [
    "http://127.0.0.1:8000/app_01/show/",
    "http://127.0.0.1:8000/app_01/job/",
    "http://127.0.0.1:8000/app_01/exec/",
]
# Coroutine function: sends the request and captures the response data.
# Rule of thumb: put `async` in front of every `with`, and `await` in front
# of every blocking operation.
async def get_request(url):
    """Download *url* with aiohttp and return the response body as text.

    Args:
        url: Address the GET request is sent to.

    Returns:
        The value of ``await response.text()`` for the response.
    """
    async with aiohttp.ClientSession() as s:
        # s.get() also accepts headers=, params= and proxy="http://ip:port".
        # The object returned by s.get() is itself an async context manager,
        # so it is entered directly rather than awaited first.
        async with s.get(url) as response:
            # response.read() would return the body as bytes instead.
            page_text = await response.text()
            return page_text
# Callback invoked with a finished task.
def parse(task):
    """Parse the HTML produced by *task* and print the matched table rows.

    Args:
        task: Finished task/future; ``task.result()`` must be the page text.
    """
    html_tree = etree.HTML(task.result())
    rows = html_tree.xpath('/html/body/table/tbody/tr')
    print(rows)
# Create the event loop explicitly: letting asyncio.ensure_future() /
# asyncio.get_event_loop() create one implicitly is deprecated in recent
# Python versions.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
    tasks = []
    for url in urls:
        task = loop.create_task(get_request(url))
        # parse() is called once the download for this url has finished.
        task.add_done_callback(parse)
        tasks.append(task)
    # asyncio.wait() raises ValueError when given an empty collection.
    if tasks:
        loop.run_until_complete(asyncio.wait(tasks))
finally:
    # Release the loop's resources even if a download step raised.
    loop.close()
# ---- Basic selenium usage ----
from selenium import webdriver
from time import sleep
# find_element(By.<strategy>, value) replaces the find_element_by_* helpers,
# which were removed in Selenium 4; this form also exists in Selenium 3.
from selenium.webdriver.common.by import By

# NOTE(review): executable_path is the Selenium 3 way of pointing at the
# driver binary; newer Selenium 4 releases expect a Service object instead.
bro = webdriver.Chrome(executable_path="chromedriver.exe")  # driver location
try:
    bro.get('https://www.jd.com/')
    sleep(1)
    # Locate the search box and type the query.
    search_input = bro.find_element(By.ID, "key")
    search_input.send_keys("Mac pro")
    btn = bro.find_element(By.XPATH, '//*[@id="search"]/div/div[2]/button')
    btn.click()  # submit the search
    sleep(2)
    # Run JavaScript in the page: scroll to the bottom.
    bro.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    sleep(2)
    page_text = bro.page_source
    print(page_text)
    sleep(2)
finally:
    # Always shut the browser down, even when a step above fails.
    bro.quit()
# ---- Using selenium action chains ----
from selenium import webdriver
from time import sleep
from selenium.webdriver import ActionChains
# find_element(By.<strategy>, value) replaces the find_element_by_* helpers,
# which were removed in Selenium 4; this form also exists in Selenium 3.
from selenium.webdriver.common.by import By

bro = webdriver.Chrome(executable_path="chromedriver.exe")
try:
    bro.get("https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable")
    # The draggable element lives inside this iframe.
    bro.switch_to.frame("iframeResult")
    div_tag = bro.find_element(By.ID, "draggable")
    # A drag is "press and hold" followed by a series of moves.
    action = ActionChains(bro)
    action.click_and_hold(div_tag)
    for _ in range(5):
        # perform() makes the action chain execute immediately.
        action.move_by_offset(15, 10).perform()
        sleep(0.5)
    # release() only queues the step; without perform() the mouse button
    # would never actually be released.
    action.release().perform()
    sleep(2)
finally:
    # Always shut the browser down, even when a step above fails.
    bro.quit()
# ---- Scraping a site whose data is loaded dynamically ----
from selenium import webdriver
from time import sleep
from lxml import etree
# find_element(By.<strategy>, value) replaces the find_element_by_* helpers,
# which were removed in Selenium 4; this form also exists in Selenium 3.
from selenium.webdriver.common.by import By

bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    bro.get('http://125.35.6.84:81/xk/')
    sleep(2)
    # Collect the rendered HTML of the first page plus three "next" pages.
    page_text = bro.page_source
    page_list = [page_text]
    for _ in range(3):
        bro.find_element(By.ID, "pageIto_next").click()
        sleep(2)  # give the page time to load the next batch of rows
        page_list.append(bro.page_source)
finally:
    # Always shut the browser down, even when a step above fails.
    bro.quit()

# Parse the collected pages offline, after the browser is closed.
for page in page_list:
    tree = etree.HTML(page)
    for li in tree.xpath('//*[@id="gzlist"]/li'):
        title = li.xpath("./dl/@title")[0]
        # Named item_id so the builtin id() is not shadowed.
        item_id = li.xpath("./ol/@title")[0]
        print(item_id, ":", title)
# ---- Simulated 12306 login: clicking the captcha image ----
from selenium import webdriver
from time import sleep
from PIL import Image
from chaojiying_Python.chaojiying import Chaojiying_Client
from selenium.webdriver import ActionChains
from selenium.webdriver import Chrome
from selenium.webdriver import ChromeOptions
def tranformImageData(img_path, t_type):
    """Upload an image through ``Chaojiying_Client`` and return its result.

    Args:
        img_path: Path of the image file to upload.
        t_type: Type code forwarded to ``PostPic``.

    Returns:
        The ``'pic_str'`` entry of the ``PostPic`` response.
    """
    chaojiying = Chaojiying_Client('xxx', 'xxx', '1004')
    # Context manager so the file handle is closed as soon as it is read.
    with open(img_path, 'rb') as img_file:
        im = img_file.read()
    return chaojiying.PostPic(im, t_type)['pic_str']
# find_element(By.<strategy>, value) replaces the find_element_by_* helpers,
# which were removed in Selenium 4; this form also exists in Selenium 3.
from selenium.webdriver.common.by import By

option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])
# Start a single browser and hand it the options built above. Previously a
# second, unused browser received them and was never closed.
bro = webdriver.Chrome(executable_path='chromedriver.exe', options=option)
bro.get("https://kyfw.12306.cn/otn/login/init")
sleep(1)
bro.save_screenshot("main.png")
code_img = bro.find_element(
    By.XPATH, '//*[@id="loginForm"]/div/ul[2]/li[4]/div/div/div[3]/img'
)
location = code_img.location  # x/y of the image's top-left corner
size = code_img.size  # width and height
# Crop box in Pillow's (left, upper, right, lower) order.
rangle = (
    int(location['x']),
    int(location['y']),
    int(location['x'] + size['width']),
    int(location['y'] + size['height']),
)
# Cut the captcha out of the full-page screenshot; the context manager
# closes the screenshot file afterwards.
with Image.open("main.png") as screenshot:
    frame = screenshot.crop(rangle)
    frame.save('code.png')
image_location = tranformImageData("code.png", 9004)
print(image_location)
# e.g. "114,58|185,82|259,66" -> [[114, 58], [185, 82], [259, 66]]
new_result = [
    [int(value) for value in point.split(",")]
    for point in image_location.split("|")
]
for x, y in new_result:
    # Move to the offset inside the captcha image, then click. A fresh chain
    # per click guarantees earlier steps are never replayed.
    ActionChains(bro).move_to_element_with_offset(code_img, x, y).click().perform()
    sleep(1)
# ---- Headless browser setup ----
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
# Configure Chrome to run headless (no visible window).
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
driver = webdriver.Chrome(executable_path='chromedriver.exe', options=chrome_options)
try:
    driver.get("https://www.baidu.com")
    print(driver.page_source)
finally:
    # A headless browser has no window to close by hand, so quit explicitly
    # to avoid leaving the browser process running.
    driver.quit()
# ---- Evading selenium detection ----
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
# Settings intended to work with both old and new Chrome versions.
options = webdriver.ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-automation'])
options.add_experimental_option("useAutomationExtension", False)
driver = webdriver.Chrome(executable_path="./chromedriver", options=options)
# Register (via a CDP command) a script that makes navigator.webdriver read
# as undefined, then open the page.
driver.execute_cdp_cmd(
    "Page.addScriptToEvaluateOnNewDocument",
    {
        "source": (
            "\n"
            "Object.defineProperty(navigator, 'webdriver', {\n"
            "get: () => undefined\n"
            "})\n"
        ),
    },
)
driver.get("http://www.baidu.com")