在学习爬虫的时候经常使用selenium应对js动态渲染,以及验证码的操作。初学者,自己写这个是为了熟悉selenium以及更好的学习!
我的这个脚本在自动播放视频的时候会有声音,即使是 headless(无头)模式的 Firefox 也一样,对此我很无奈。
崔庆才的教程
selenium官方文档
关于datetime模块
还有这篇
'''python
import random
# URL of the "my points" page.
My_Points = 'https://pc.xuexi.cn/points/my-points.html'

# Article list pages; one entry is chosen at random when this module is imported.
Articles_list = [
    'https://www.xuexi.cn/9ca612f28c9f86ad87d5daa34c588e00/9a3668c13f6e303932b5e0e100fc248b.html',
]
Articles_Url = random.choice(Articles_list)

# URL fragments appended to the video list page address; one entry is chosen
# at random when this module is imported.
Videos_list = [
    '#1novbsbi47k-5',
]
Videos_Url = (
    'https://www.xuexi.cn/4426aa87b0b64ac671c96379a3a8bd26/db086044562a57b441c24f2af1c8e101.html'
    + random.choice(Videos_list)
)
'''
网站只能通过手机扫码登录,扫码成功后获取cookies并存储在一个json文件(COOKIES.json)中
from selenium.webdriver import Firefox
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import WebDriverException
import json
from url_manager import My_Points
def log_in():
    """Open Firefox on the points page and return the browser's cookies.

    A visible (non-headless) Firefox window is started so the user can scan
    the QR code with a phone. The function then waits up to 120 seconds for
    the points cards to be present on the page before reading the cookies.

    Returns:
        The list of cookie dicts from ``browser.get_cookies()``, or ``None``
        when a ``WebDriverException`` occurred (this includes the wait timing
        out, since ``TimeoutException`` derives from ``WebDriverException``).
    """
    print('获取cookies...')
    # Stays None when Firefox() itself fails, so the cleanup below does not
    # trip over an unbound name and hide the original error.
    browser = None
    try:
        browser = Firefox()
        browser.get(My_Points)
        # Scroll the page down by 750px (presumably to bring the QR code into
        # view — TODO confirm).
        browser.execute_script("var q=document.documentElement.scrollTop=750")
        # The points cards are expected to appear only once the user is
        # logged in (TODO confirm); allow two minutes for the scan.
        WebDriverWait(browser, 120).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.my-points-card-detailBox'))
        )
        return browser.get_cookies()
    except WebDriverException as e:
        print(e.args)
        print("获取Cookies失败")
        return None
    finally:
        # Close the browser, but only if it was actually started.
        if browser is not None:
            browser.quit()
def write_in():
    """Log in through ``log_in`` and save the cookies to ``COOKIES.json``.

    The file is written only when ``log_in`` returned cookies. On a failed
    login (``None``) any previously saved ``COOKIES.json`` is left untouched
    instead of being overwritten with ``null``.
    """
    # Log in BEFORE opening the file: opening with mode 'w' truncates it, and
    # the login can take up to two minutes or fail altogether.
    cookies = log_in()
    if cookies is None:
        return
    with open('COOKIES.json', 'w', encoding='utf-8') as file:
        json.dump(cookies, file)


if __name__ == '__main__':
    write_in()
from selenium.webdriver import Firefox
from selenium.webdriver import FirefoxOptions
from selenium.common.exceptions import WebDriverException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import url_manager
import time
import re
import json
import easygui
class I_love_xuexi():
    """Automate article reading and video watching on xuexi.cn in headless Firefox.

    The session is authenticated with the cookies stored in ``COOKIES.json``.
    ``xuexi`` is the entry point: it runs the whole flow and always quits the
    browser when it is done.
    """

    # Captures (card title, "earned/total" score) pairs from the points page.
    # NOTE(review): the HTML tags of this pattern were missing from the copy
    # of the source under review (the literal was split across two lines and
    # did not parse). The tag and class names below are a reconstruction —
    # confirm them against the live points page.
    _POINTS_CARD_PATTERN = re.compile(
        r'<p class="my-points-card-title">(.*?)</p>'
        r'.*?<div class="my-points-card-text">(\d.?\/\d.?)</div>',
        re.S,
    )

    def __init__(self):
        """Create a headless Firefox browser and a 10-second explicit wait bound to it."""
        options = FirefoxOptions()
        options.add_argument('-headless')
        self.browser = Firefox(options=options)
        self.wait = WebDriverWait(self.browser, 10)

    def head_less_firefox(self):
        """Load the cookies saved in ``COOKIES.json`` into the browser session.

        Raises:
            OSError: if ``COOKIES.json`` cannot be opened.
            json.JSONDecodeError: if the file does not contain valid JSON.
        """
        print('正在添加cookies...')
        with open('COOKIES.json', 'r', encoding='utf-8') as file:
            cookies = json.load(file)
        # Selenium only accepts cookies for the domain currently loaded, so
        # open the site first and then swap in the saved cookies.
        self.browser.get(url_manager.My_Points)
        self.browser.delete_all_cookies()
        # The file may hold ``null`` (e.g. written after a failed login);
        # treat that as "no cookies" so the caller's login check reports it.
        for cookie in cookies or []:
            self.browser.add_cookie(cookie)

    def learn_articles(self):
        """Open up to six articles from the article list and scroll through each one."""
        self.browser.get(url_manager.Articles_Url)  # open the article list page
        print('等待文章列表加载...\n')
        articles = self.wait.until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.text-wrap>.text'))
        )
        for article in articles[:6]:  # only the first six entries are opened
            print("阅读文章", article.text, '\n')
            article.click()
            # A second window handle means the click opened the article in a new tab.
            if len(self.browser.window_handles) >= 2:
                self.browser.switch_to.window(self.browser.window_handles[1])
                time.sleep(2)
                # Scroll down 5px per second (200 steps) to imitate a person
                # reading the article.
                for i in range(0, 1000, 5):
                    self.browser.execute_script("var q=document.documentElement.scrollTop={}".format(i))
                    time.sleep(1)
                self.browser.close()  # close the article tab
                # Go back to the tab holding the article list.
                self.browser.switch_to.window(self.browser.window_handles[0])

    def learn_video(self):
        """Open up to six videos from the video list and wait while each one plays."""
        self.browser.get(url_manager.Videos_Url)
        print('等待视频列表加载\n')
        videos = self.wait.until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.textWrapper'))
        )
        for video in videos[:6]:
            video.click()
            print("观看视频", video.text, '\n')
            # A second window handle means the click opened the video in a new tab.
            if len(self.browser.window_handles) >= 2:
                try:
                    self.browser.switch_to.window(self.browser.window_handles[1])
                    # Wait for the triangular play button, then start playback.
                    button = self.wait.until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, '.outter'))
                    )
                    button.click()
                    time.sleep(10)
                    # The span after `time-bound` is read as the total duration.
                    timeline = self.wait.until(
                        EC.presence_of_element_located((By.XPATH, "//span[@class='time-bound']//following-sibling::span"))
                    )
                    print("视频时长", timeline.text, '\n')
                    # Wait (at most 300 s, so a long video cannot stall the
                    # run) until the span before `time-bound` shows the total
                    # duration — presumably the elapsed-time display.
                    WebDriverWait(self.browser, 300).until(
                        EC.text_to_be_present_in_element((By.XPATH, "//span[@class='time-bound']//preceding-sibling::span"), timeline.text)
                    )
                except TimeoutException:
                    print("播放下一个视频")
                finally:
                    # Always close the video tab and return to the list tab.
                    self.browser.close()
                    self.browser.switch_to.window(self.browser.window_handles[0])

    @classmethod
    def _format_points(cls, html):
        """Return one ``title + score`` line per points card found in ``html``."""
        return ''.join(
            title + score + '\n'
            for title, score in cls._POINTS_CARD_PATTERN.findall(html)
        )

    def show_mark(self):
        """Show the current points in a message box.

        Returns:
            ``True`` when the points cards loaded and the message box was
            shown; ``False`` when they did not appear within the wait timeout.
        """
        self.browser.get(url_manager.My_Points)
        try:
            self.wait.until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.my-points-card-detailBox'))
            )
        except TimeoutException:
            return False
        summary = self._format_points(self.browser.page_source)
        easygui.msgbox(msg="当前分数:\n" + summary)
        return True

    def xuexi(self, a=1, v=1):
        """Run the whole flow: load cookies, check the login, learn, show points.

        Any exception is reported through an easygui exception box, and the
        browser is always quit at the end.

        :param a: article switch; when a == 0 articles are skipped
        :param v: video switch; when v == 0 videos are skipped
        :return: None
        """
        try:
            self.head_less_firefox()
            # show_mark doubles as the login check: it returns False when the
            # points cards never load.
            if self.show_mark():
                if a:
                    self.learn_articles()
                if v:
                    # learn_video fails intermittently for unknown reasons and
                    # calling it again usually works, so allow six attempts.
                    for _ in range(6):
                        try:
                            self.learn_video()
                            break
                        except WebDriverException:
                            continue
                self.show_mark()
            else:
                easygui.msgbox(msg='请重新扫码登陆')
        except Exception:
            easygui.exceptionbox()
        finally:
            self.browser.quit()


if __name__ == '__main__':
    I_love_xuexi().xuexi()
from learning import I_love_xuexi
from get_cookies import write_in
import easygui
# Launcher loop: keep showing the menu until the user picks '退出程序' or
# closes the dialog.
while True:
    print("-"*9 + "程序正在启动,请稍后" + "-"*9)
    choice = easygui.indexbox(
        choices=['扫码登陆', '学习文章', '学习视频', '退出程序'],
        msg='选择学习内容',
    )
    # Anything other than the first three buttons ends the program.
    if choice not in (0, 1, 2):
        break
    if choice == 0:
        # Scan the QR code to refresh the saved cookies, then learn everything.
        write_in()
        switches = {}
    elif choice == 1:
        switches = {'v': 0}  # articles only: turn the video switch off
    else:
        switches = {'a': 0}  # videos only: turn the article switch off
    I_love_xuexi().xuexi(**switches)