爬虫学习笔记——selenium.webdriver

我爱学习

在学习爬虫的时候,经常使用selenium应对js动态渲染以及验证码等操作。作为初学者,自己写这个脚本是为了熟悉selenium,以及更好地学习!
我的这个脚本在自动播放视频的时候会有声音,即使用的是headless模式的Firefox,对此我很无奈。

参考了众多的代码

崔庆才的教程
selenium官方文档
关于datetime模块
还有这篇

url_manager 手动添加

'''python
import random

# "My points" page
My_Points = 'https://pc.xuexi.cn/points/my-points.html'

# Article list pages; one entry is picked at random when the module is imported
Articles_list = [
    'https://www.xuexi.cn/9ca612f28c9f86ad87d5daa34c588e00/9a3668c13f6e303932b5e0e100fc248b.html',
]
Articles_Url = random.choice(Articles_list)

# URL fragments for the video list page; one is picked at random and appended
# to the page URL when the module is imported
Videos_list = [
    '#1novbsbi47k-5',
]
Videos_Url = 'https://www.xuexi.cn/4426aa87b0b64ac671c96379a3a8bd26/db086044562a57b441c24f2af1c8e101.html' + random.choice(Videos_list)
'''

首先,扫码登录,获取登录后的cookies

网站只能用手机扫码登录,扫码后获取cookies并存储在一个json文件中

from selenium.webdriver import Firefox
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import WebDriverException
import json
from url_manager import My_Points


def log_in():
    """Open Firefox on the points page and return the session cookies.

    The function waits up to 120 seconds for the
    ``.my-points-card-detailBox`` elements to be present before reading the
    cookies (presumably they only appear once the user has logged in by
    scanning the QR code -- confirm against the live page).

    :return: the list returned by ``browser.get_cookies()``, or ``None`` when
        a ``WebDriverException`` occurred
    """
    print('获取cookies...')
    # Bound before the try block so the finally clause never touches an
    # unassigned name when Firefox() itself fails to start.
    browser = None
    try:
        browser = Firefox()
        browser.get(My_Points)
        # Scroll down 750px on the page
        browser.execute_script("var q=document.documentElement.scrollTop=750")
        WebDriverWait(browser, 120).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.my-points-card-detailBox'))
        )
        return browser.get_cookies()
    except WebDriverException as e:
        print(e.args)
        print("获取Cookies失败")
        return None
    finally:
        # Close the browser, but only if it was actually started
        if browser is not None:
            browser.quit()


def write_in():
    """Fetch cookies through ``log_in()`` and store them in COOKIES.json.

    The file is only opened after ``log_in()`` has returned, so an existing
    cookie file is not truncated while the login is still pending. When
    ``log_in()`` returns ``None`` the file is left untouched instead of being
    overwritten with ``null``.

    :return: None
    """
    cookies = log_in()
    if cookies is None:
        # log_in() prints its own failure message before returning None
        return
    with open('COOKIES.json', 'w', encoding='utf-8') as file:
        json.dump(cookies, file)


# When run as a script, fetch the cookies and write them to COOKIES.json.
if __name__ == '__main__':
    write_in()

传入cookies后,开始打开文章和视频

from selenium.webdriver import Firefox
from selenium.webdriver import FirefoxOptions
from selenium.common.exceptions import WebDriverException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import url_manager
import time
import re
import json
import easygui


class I_love_xuexi():
    """Open articles and videos on xuexi.cn in a headless Firefox session.

    The session is authenticated with the cookies stored in ``COOKIES.json``;
    the page URLs come from the ``url_manager`` module.
    """

    # Captures (title, score) pairs from the points page HTML.
    # NOTE(review): the HTML tags of this pattern were lost when the source
    # was extracted; the tag and class names below are a reconstruction and
    # must be verified against the live page. ``\d+`` replaces the original
    # single ``\d`` so multi-digit scores match as well.
    _POINTS_PATTERN = re.compile(
        r'<p class="my-points-card-title">(.*?)</p>'
        r'.*?'
        r'<div class="my-points-card-text">(\d+.?\/\d+.?)</div>',
        re.S,
    )

    def __init__(self):
        """Create a headless Firefox browser and a 10-second explicit wait."""
        options = FirefoxOptions()
        options.add_argument('-headless')
        self.browser = Firefox(options=options)
        self.wait = WebDriverWait(self.browser, 10)

    def head_less_firefox(self):
        """Load the cookies stored in COOKIES.json into the browser session."""
        print('正在添加cookies...')
        with open('COOKIES.json', 'r', encoding='utf-8') as file:
            # The file may contain ``null`` (e.g. written after a failed
            # login); treat that as "no cookies" instead of failing on
            # iteration.
            cookies = json.load(file) or []
        # Open the site first, then replace whatever cookies it set with the
        # stored ones.
        self.browser.get(url_manager.My_Points)
        self.browser.delete_all_cookies()
        for cookie in cookies:
            self.browser.add_cookie(cookie)

    def learn_articles(self):
        """Open up to six articles from the list page and scroll through each."""
        self.browser.get(url_manager.Articles_Url)  # open the article list page
        print('等待文章列表加载...\n')
        articles = self.wait.until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.text-wrap>.text'))
        )
        for article in articles[:6]:
            print("阅读文章", article.text, '\n')
            article.click()
            if len(self.browser.window_handles) < 2:
                # The click did not open the article in a new tab
                continue
            self.browser.switch_to.window(self.browser.window_handles[1])
            time.sleep(2)
            # Sleep and move the scroll bar step by step so that it looks as
            # if somebody is really reading the article
            for offset in range(0, 1000, 5):
                self.browser.execute_script(
                    "var q=document.documentElement.scrollTop={}".format(offset)
                )
                time.sleep(1)
            self.browser.close()  # close the article tab
            self.browser.switch_to.window(self.browser.window_handles[0])  # back to the list tab

    def learn_video(self):
        """Open up to six videos from the list page and wait for each to play."""
        self.browser.get(url_manager.Videos_Url)
        print('等待视频列表加载\n')
        videos = self.wait.until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.textWrapper'))
        )
        for video in videos[:6]:
            video.click()
            print("观看视频", video.text, '\n')
            if len(self.browser.window_handles) < 2:
                # The click did not open the video in a new tab
                continue
            try:
                self.browser.switch_to.window(self.browser.window_handles[1])

                # Wait for the triangular play button
                button = self.wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, '.outter'))
                )
                button.click()
                time.sleep(10)
                # Wait for the video duration to load
                timeline = self.wait.until(
                    EC.presence_of_element_located((By.XPATH, "//span[@class='time-bound']//following-sibling::span"))
                )
                print("视频时长", timeline.text, '\n')
                # Wait until the elapsed-time text shows the duration; the
                # explicit 300-second wait caps videos that are too long
                WebDriverWait(self.browser, 300).until(
                    EC.text_to_be_present_in_element((By.XPATH, "//span[@class='time-bound']//preceding-sibling::span"), timeline.text)
                )
            except TimeoutException:
                print("播放下一个视频")
            finally:
                self.browser.close()
                self.browser.switch_to.window(self.browser.window_handles[0])

    @classmethod
    def _format_points(cls, html):
        """Return one ``title + score`` line per points card found in *html*."""
        return ''.join(
            title + score + '\n'
            for title, score in cls._POINTS_PATTERN.findall(html)
        )

    def show_mark(self):
        """Show the current scores in a message box.

        :return: ``False`` when the points cards did not appear within the
            wait timeout, ``True`` after the message box was shown
        """
        self.browser.get(url_manager.My_Points)
        try:
            self.wait.until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.my-points-card-detailBox'))
            )
        except TimeoutException:
            return False
        summary = self._format_points(self.browser.page_source)
        easygui.msgbox(msg="当前分数:\n" + summary)
        return True

    def xuexi(self, a=1, v=1):
        """
        Dispatcher: add the cookies, check the login, then run the learning steps.

        :param a: article switch; a falsy value (e.g. 0) skips the articles
        :param v: video switch; a falsy value (e.g. 0) skips the videos
        :return: None
        """
        try:
            self.head_less_firefox()
            # show_mark() doubles as the check that the login succeeded
            if not self.show_mark():
                easygui.msgbox(msg='请重新扫码登陆')
                return
            if a:
                self.learn_articles()
            if v:
                # learn_video() fails intermittently for reasons the author
                # could not pin down; calling it again works, so allow up to
                # six attempts
                for _ in range(6):
                    try:
                        self.learn_video()
                        break
                    except WebDriverException:
                        continue
            self.show_mark()
        except Exception:
            easygui.exceptionbox()
        finally:
            self.browser.quit()


if __name__ == '__main__':
    I_love_xuexi().xuexi()

调用其它模块,开始学习

from learning import I_love_xuexi
from get_cookies import write_in
import easygui

# Interactive menu: keep prompting until the user picks anything other than
# one of the first three entries.
while True:
    print("-" * 9 + "程序正在启动,请稍后" + "-" * 9)
    selected = easygui.indexbox(
        msg='选择学习内容',
        choices=['扫码登陆', '学习文章', '学习视频', '退出程序'],
    )
    if selected not in (0, 1, 2):
        # '退出程序' (or no usable selection) ends the program
        break
    if selected == 0:
        # Fetch fresh cookies first, then run the full session
        write_in()
    # Keyword switches passed to xuexi(): v=0 skips videos, a=0 skips articles
    switches = {0: {}, 1: {'v': 0}, 2: {'a': 0}}[selected]
    I_love_xuexi().xuexi(**switches)

最后,欢迎交流,指出问题,提出意见!

你可能感兴趣的:(爬虫学习笔记——selenium.webdriver)