python爬虫---selenium模块、12306模拟登录

文章目录

  • python爬虫---selenium模块、12306模拟登录
    • selenium模块在爬虫中的使用
    • 动态获取数据
    • 模拟登录12306

python爬虫—selenium模块、12306模拟登录

csdn自动把你手写的标签里的style属性给过滤掉了,而博客园就不会!这些内容本不会显示的。
回顾:单线程+多任务,速度会很快。

server.py

import time
from flask import Flask, render_template

app = Flask(__name__)

# Seconds each route blocks before answering, so every request is "slow".
_DELAY_SECONDS = 2


def _render_after_delay():
    """Block for ``_DELAY_SECONDS`` and then render ``test.html``."""
    time.sleep(_DELAY_SECONDS)
    return render_template('test.html')


@app.route('/index')
def index():
    """Serve ``test.html`` on ``/index`` after the fixed delay."""
    return _render_after_delay()


@app.route('/index1')
def index1():
    """Serve ``test.html`` on ``/index1`` after the fixed delay."""
    return _render_after_delay()


@app.route('/index2')
def index2():
    """Serve ``test.html`` on ``/index2`` after the fixed delay."""
    return _render_after_delay()


if __name__ == '__main__':
    # Start Flask's development server with its default host and port.
    app.run()

jiexi.py

import aiohttp
import asyncio
import time

from lxml import etree

# Reference timestamp; the elapsed time is printed at the end of the script.
start = time.time()

# The three routes exposed by server.py on the local Flask server.
_BASE_URL = 'http://127.0.0.1:5000'
urls = [_BASE_URL + path for path in ('/index', '/index1', '/index2')]

# Coroutine function: sends the request and captures the response data.
# Detail 1: put ``async`` in front of every ``with``.
# Detail 2: put ``await`` in front of every blocking operation.
async def get_request(url):
    """Fetch *url* with aiohttp and return the response body as text.

    A new ``ClientSession`` is opened for the call and closed when the
    coroutine finishes.
    """
    async with aiohttp.ClientSession() as session:
        # Headers or a proxy can be supplied on the same call, e.g.
        # session.get(url, headers=..., proxy="http://ip:port")
        # ``session.get`` is entered directly with ``async with``; no extra
        # ``await`` is needed in front of it.
        async with session.get(url) as response:
            # text() yields str; read() would yield bytes.
            return await response.text()


# Done-callback attached to every request task.
def parse(task):
    """Print the text of every ``<li>`` found in the page fetched by *task*.

    *task* is the finished task; its result is the page source returned by
    ``get_request``.
    """
    document = etree.HTML(task.result())
    print(document.xpath('//li/text()'))


async def main():
    """Fetch every URL concurrently and parse each page as it completes.

    Tasks are created inside the running loop, so no implicit loop lookup
    (deprecated, and an error on recent Python versions) is needed.
    """
    tasks = []
    for url in urls:
        task = asyncio.create_task(get_request(url))
        # parse() runs as soon as this particular request has finished.
        task.add_done_callback(parse)
        tasks.append(task)
    await asyncio.wait(tasks)


# asyncio.run creates the event loop, runs main() to completion and closes it.
asyncio.run(main())

print(time.time() - start)

"""
['i am hero', 'i am superman', 'i am spider']
['i am hero', 'i am superman', 'i am spider']
['i am hero', 'i am superman', 'i am spider']
2.0393407344818115
"""

selenium模块在爬虫中的使用

  • 概念:是一个基于浏览器自动化的模块。
  • 与爬虫之间的关联:
    • 便捷的捕获到动态加载的数据。(可见即可得)
    • 实现模拟登录
  • 环境安装:pip install selenium
  • 基本使用:
    • 准备好某一款浏览器的驱动程序(谷歌):https://chromedriver.storage.googleapis.com/index.html 驱动版本要与本机谷歌浏览器的版本对应
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Selenium 4 receives the driver path through a Service object; the
# ``executable_path`` keyword of webdriver.Chrome is no longer accepted.
bro = webdriver.Chrome(service=Service(executable_path='chromedriver.exe'))

try:
    bro.get('https://www.jd.com')
    time.sleep(1)

    # Locate the search box and type the query.
    # find_element(By.<strategy>, value) replaces the find_element_by_* helpers.
    search_input = bro.find_element(By.ID, 'key')
    search_input.send_keys('mac pro')

    btn = bro.find_element(By.XPATH, '//*[@id="search"]/div/div[2]/button')
    btn.click()
    time.sleep(2)

    # Scroll to the bottom of the page by executing JavaScript.
    bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')

    time.sleep(2)

    # Grab the page source as currently rendered by the browser.
    page_text = bro.page_source
    print(page_text)

    time.sleep(2)
finally:
    # Always shut the browser down, even if one of the steps above raised.
    bro.quit()

动态获取数据

"""
爬取药监总局相关企业的详细信息:
http://scxk.nmpa.gov.cn:81/xk/
"""
from time import sleep

from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Selenium 4 receives the driver path through a Service object; the
# ``executable_path`` keyword of webdriver.Chrome is no longer accepted.
bro = webdriver.Chrome(service=Service(executable_path='chromedriver.exe'))

try:
    bro.get('http://scxk.nmpa.gov.cn:81/xk/')
    sleep(1)

    # Source of the first page, followed by the next three pages.
    page_text_list = [bro.page_source]

    for _ in range(3):
        # Click "next page"; find_element(By.ID, ...) replaces
        # the find_element_by_id helper.
        bro.find_element(By.ID, 'pageIto_next').click()
        sleep(1)
        page_text_list.append(bro.page_source)

    for page_text in page_text_list:
        tree = etree.HTML(page_text)
        for li in tree.xpath('//*[@id="gzlist"]/li'):
            titles = li.xpath('./dl/@title')
            nums = li.xpath('./ol/@title')
            # Skip list items that lack either attribute instead of
            # aborting the whole run with an IndexError.
            if not titles or not nums:
                continue
            print(titles[0] + ':' + nums[0])

    sleep(2)
finally:
    # Always shut the browser down, even if one of the steps above raised.
    bro.quit()

模拟登录12306

  • 动作链

    • 一系列连续的动作

    • from selenium.webdriver import ActionChains

    • 实现标签定位时,如果发现定位的标签是存在于iframe标签之中的,则在定位时必须先执行一个固定的操作:bro.switch_to.frame('iframe_id')

    • 拖动=点击+滑动

    • action =ActionChains(bro)
      action.click_and_hold(div_tag)
      
      for i in range(5):
          # perform 让动作链立即执行
          action.move_by_offset(17, 5).perform()
          
      action.release()
      

模拟登录

"""
https://www.cnblogs.com/bobo-zhang/articles/10815914.html


模拟登录12306:
https://kyfw.12306.cn/otn/login/init
"""
from time import sleep

from lxml import etree
from PIL import Image
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

from chaojiying_Python.chaojiying import Chaojiying_Client


def get_text(imgPath, imgType):
    """Submit a captcha image to Chaojiying and return its ``pic_str`` answer.

    Args:
        imgPath: Path of the local image file to upload.
        imgType: Captcha type code passed through to ``PostPic``.

    Returns:
        The ``'pic_str'`` entry of the ``PostPic`` response.
    """
    # Placeholder user name / password; the third argument is the software
    # ID, generated under "User Centre >> Software ID".
    chaojiying = Chaojiying_Client('用户名', '密码', '911057')
    # Open the path supplied by the caller and close the handle once the
    # bytes have been read.
    with open(imgPath, 'rb') as image_file:
        im = image_file.read()
    return chaojiying.PostPic(im, imgType)['pic_str']


def _parse_points(result):
    """Turn a coordinate string into a list of ``[x, y]`` integer pairs.

    ``'55,70|267,133'`` becomes ``[[55, 70], [267, 133]]`` and a single
    ``'55,70'`` becomes ``[[55, 70]]`` (``split('|')`` covers both cases).
    """
    points = []
    for pair in result.split('|'):
        fields = pair.split(',')
        points.append([int(fields[0]), int(fields[1])])
    return points


# Selenium 4 receives the driver path through a Service object; the
# ``executable_path`` keyword of webdriver.Chrome is no longer accepted.
bro = webdriver.Chrome(service=Service(executable_path='chromedriver.exe'))

bro.get('https://kyfw.12306.cn/otn/login/init')
sleep(1)

# Screenshot of the page
bro.save_screenshot('main.png')

# find_element(By.XPATH, ...) replaces the find_element_by_xpath helper.
code_img_tag = bro.find_element(By.XPATH, '//*[@id="loginForm"]/div/ul[2]/li[4]/div/div/div[3]/img')
location = code_img_tag.location  # coordinates of the top-left corner
size = code_img_tag.size  # width and height
print(location, size)
# Crop box for the captcha: (left, upper, right, lower)
rangle = (int(location['x']), int(location['y']), int(location['x']+size['width']), int(location['y']+size['height']))

code_img_name = 'code.png'
with Image.open('main.png') as screenshot:
    frame = screenshot.crop(rangle)
    frame.save(code_img_name)

result = get_text(code_img_name, 9004)
# e.g. '55,70|267,133'  ==  [[55,70], [267,133]]
all_list = _parse_points(result)
print(all_list)

# The returned coordinates are measured from the captcha's top-left corner.
# move_to_element() goes to the element's centre, so subtract half the size
# to express each point relative to that centre. NOTE(review): this avoids
# move_to_element_with_offset, whose offset origin appears to differ between
# Selenium releases (top-left vs. centre) -- confirm against the installed
# version.
half_width = size['width'] / 2
half_height = size['height'] / 2
for x, y in all_list:
    action = ActionChains(bro)
    action.move_to_element(code_img_tag).move_by_offset(
        int(x - half_width), int(y - half_height)
    ).click().perform()
    sleep(1)

# The account and password below are placeholder values.
bro.find_element(By.ID, 'username').send_keys('[email protected]')
bro.find_element(By.ID, 'password').send_keys('123456')
bro.find_element(By.ID, 'loginSub').click()

sleep(3)

# The browser is deliberately not closed here; call bro.quit() when done.

你可能感兴趣的:(爬虫)