到了开学季,图书馆的座位成了大家每天争抢的目标,尤其是信息学部的图书馆:几乎我刚一打开预订座位的网页,座位就已经被一抢而空。感叹学长学姐“手速之快”的同时,我也常常想自己写一个刷座位的脚本。然而当我打开登录页面的时候,却被类似这样的验证码阻拦在外面。我也想过先登录获取 sessionid,但是觉得这样没有什么意思,毕竟算不上是真正意义上的自动化。
直到最近在 GitChat 上看到了大佬的一篇文章《详解爬虫处理滑动验证的技术细节和思想以案例说明》,顺利地完成了京东登录的滑块验证码,突然间就有了信心,想着能不能也来试一下完成图书馆的自动化登录。
一开始也没有什么思路,直到看到了一篇文章《破解含语序问题的点击验证码》。卧槽,这不正是我想要的吗?真是雪中送炭啊。
踩坑经历:
主要代码如下:
这里使用了协程调用第三方打码平台
import aiohttp
import asyncio
import fake_useragent
import time
import hashlib
from ast import literal_eval
import os
from tqdm import tqdm
from contextvars import ContextVar
# Context variable that carries the shared concurrency limiter: main() stores
# an asyncio.Semaphore in it and api_from_ff() reads it back.
concurrent = ContextVar("concurrent")
# Directory holding the images that are waiting to be recognized
base_dir = "../single_chinese2"
# Credentials come from environment variables, e.g. set PD_ID="****"
PD_ID = os.getenv("PD_ID")
PD_KEY = os.getenv("PD_KEY")
def calc_sign(timestamp):
    """Build the request signature for *timestamp*.

    The signature is ``md5(PD_ID + timestamp + md5(timestamp + PD_KEY))``,
    where both digests are rendered as hexadecimal strings.

    Args:
        timestamp: The request timestamp; it is converted with ``str()``.

    Returns:
        The hexadecimal digest of the outer MD5.
    """
    stamp = str(timestamp)
    inner_digest = hashlib.md5((stamp + PD_KEY).encode()).hexdigest()
    return hashlib.md5((PD_ID + stamp + inner_digest).encode()).hexdigest()
# Check whether the text consists of Chinese characters
def check_result(word):
    """Report whether *word* FAILS the "Chinese characters only" check.

    Note the inverted sense, which the callers in this file rely on:
    ``False`` means the text is acceptable, ``True`` means it is not.

    Args:
        word: The value to check; it is converted with ``str()`` first.

    Returns:
        ``False`` when the text is non-empty and every character lies in
        the range U+4E00..U+9FFF, otherwise ``True``.
    """
    text = str(word)
    # Compare character by character. Comparing the whole string against the
    # range bounds is lexicographic, so it is decided almost entirely by the
    # first character and text such as "中a" used to be accepted.
    is_chinese = bool(text) and all("\u4e00" <= ch <= "\u9fff" for ch in text)
    return not is_chinese
def param_data(data_json, amount):
    """Validate one recognition response and save the recognized character.

    Args:
        data_json: The decoded response (a dict). ``RetCode == "0"`` marks a
            successful call and ``RspData`` is a JSON string whose ``result``
            entry holds the recognized text.
        amount: Number of the image the response belongs to; the result is
            written to ``<base_dir>/<amount>.txt``.

    Returns:
        None. The outcome is reported on stdout.
    """
    # Local import so the block does not depend on a new file-level import.
    import json

    failure_message = f"{amount}.jpeg: fail to recognize!"

    if data_json.get("RetCode") != "0":
        # Previously a failed call was dropped without any trace.
        print(failure_message)
        return

    # RspData is JSON, so parse it as JSON: literal_eval rejects the JSON
    # literals null/true/false and raises when RspData is missing.
    try:
        result = json.loads(data_json.get("RspData")).get("result")
    except (TypeError, ValueError, AttributeError):
        result = None

    # Only a single Chinese character is a legal result.
    if (
        not isinstance(result, str)
        or not result
        or len(result) > 1
        or check_result(result)
    ):
        print(failure_message)
        return

    write_path = os.path.join(base_dir, f"{amount}.txt")
    with open(write_path, "w", encoding="utf-8") as f:
        f.write(result)
    print(f"{amount}.jpeg: success => \'{result}\' saved in {write_path} ")
async def api_from_ff(timestamp, ua, img_path, amount):
    """Upload one image to the recognition service and store the answer.

    The request is sent while holding the semaphore stored in the
    ``concurrent`` context variable. Any error raised while sending the
    request or handling the response is printed instead of propagated, so
    one failing image does not abort the others.

    Args:
        timestamp: Timestamp placed in the request and used for the signature.
        ua: Value sent as the ``User-Agent`` header.
        img_path: Path of the image file to upload.
        amount: Number of the image, forwarded to ``param_data``.
    """
    # Local import so the block does not depend on a new file-level import.
    import json

    api_url = "http://pred.fateadm.com/api/capreg"
    # Only the User-Agent is set by hand. The body contains a file, so the
    # Content-Type (multipart, including its boundary) is left to aiohttp.
    # The original built a headers dict but never passed it to the request.
    headers = {"User-Agent": ua}
    sem = concurrent.get()
    try:
        async with sem:
            # Open the image only after the semaphore is acquired; opening it
            # up front meant every scheduled task held a file handle at once,
            # which defeated the limit the semaphore is meant to enforce.
            with open(img_path, "rb") as img_file:
                payload = {
                    "user_id": PD_ID,
                    "timestamp": str(timestamp),
                    "sign": calc_sign(timestamp),
                    "predict_type": "40100",
                    "up_type": "mt",
                    "img_data": img_file,
                }
                async with aiohttp.ClientSession() as session:
                    async with session.post(
                        url=api_url,
                        data=payload,
                        headers=headers,
                        timeout=aiohttp.ClientTimeout(total=300),
                    ) as resp:
                        body = await resp.text()
        # The response is JSON; literal_eval fails on null/true/false.
        param_data(json.loads(body), amount)
    except Exception as e:
        print(e)
async def main(amount):
    """Recognize the images numbered 1..amount that have no valid result yet.

    An image ``<n>.jpeg`` in ``base_dir`` is submitted when ``<n>.txt`` is
    missing or its content does not pass ``check_result``.

    Args:
        amount: Highest image number to look at.
    """
    # Limit the number of files that are open at the same time
    concurrent.set(asyncio.Semaphore(30))
    ua = fake_useragent.UserAgent()
    tasks = []
    for i in tqdm(range(amount)):
        # Images are stored under their running number
        number = i + 1
        txt_path = os.path.join(base_dir, f"{number}.txt")
        img_path = os.path.join(base_dir, f"{number}.jpeg")
        if not os.path.exists(img_path):
            continue
        if os.path.exists(txt_path):
            # Close the result file right away instead of leaking the handle.
            with open(txt_path, "r", encoding="utf8") as f:
                already_valid = not check_result(f.read())
            if already_valid:
                continue
        # NOTE(review): the timestamp is taken when the task is created, which
        # can be well before the request is actually sent - confirm that the
        # service accepts that.
        timestamp = int(time.time())
        print(img_path)
        tasks.append(
            asyncio.create_task(api_from_ff(timestamp, ua.random, img_path, number))
        )
    # asyncio.wait() raises ValueError when given an empty collection.
    if tasks:
        await asyncio.wait(tasks, return_when=asyncio.ALL_COMPLETED)
    print(f"{amount} 个图片已经识别完毕!")


if __name__ == "__main__":
    asyncio.run(main(20000))
主要的登录模块如下,有关 darknet 的使用方法可以参照上面知乎的链接。
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common import exceptions as SException
import requests
import fake_useragent
import re
import cv2
import base64
import time
import darknet
import os
import numpy as np
# 试过直接调用百度的接口,但是识别的准确率太低了
# class BaiduOcr:
# def __init__(self, api_key, secrect_key):
# self.api_key = api_key
# self.secrect_key = secrect_key
# self.access_token = self.get_access_token()
# def get_access_token(self):
# access_url = "https://aip.baidubce.com/oauth/2.0/token"
# headers = {
# "User-Agent": fake_useragent.UserAgent().random,
# }
# params = {
# "grant_type": "client_credentials",
# "client_id": self.api_key,
# "client_secret": self.secrect_key
# }
# r = requests.get(url=access_url, headers=headers, params=params)
# if r.status_code == 200:
# return r.json().get("access_token")
# else:
# return None
# def ocr_with_location(self, pic_path):
# api_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate"
# headers = {
# "User-Agent": fake_useragent.UserAgent().random,
# "Content-Type": "application/x-www-form-urlencoded",
# }
# params = {
# "access_token": self.access_token
# }
# payload = {
# "image": base64.b64encode(open(pic_path, "rb").read()),
# "probability": "true",
# # 定位单字符位置
# "recognize_granularity": "small",
# }
# r = requests.post(url=api_url, data=payload, params=params, headers=headers)
# print(r.json())
# def general_ocr(self, pic_path):
# api_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/general_basic"
# headers = {
# "User-Agent": fake_useragent.UserAgent().random,
# "Content-Type": "application/x-www-form-urlencoded",
# }
# params = {
# "access_token": self.access_token
# }
# payload = {
# "image": base64.b64encode(open(pic_path, "rb").read()),
# "probability": "true",
# }
# r = requests.post(url=api_url, data=payload, params=params)
# print(r.json())
# def accurate_ocr(self, pic_path):
# api_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic"
# headers = {
# "User-Agent": fake_useragent.UserAgent().random,
# "Content-Type": "application/x-www-form-urlencoded",
# }
# params = {
# "access_token": self.access_token
# }
# payload = {
# "image": base64.b64encode(open(pic_path, "rb").read()),
# "probability": "true",
# }
# r = requests.post(url=api_url, data=payload, params=params)
# print(r.json())
def param_directions(directions):
    """Convert one detection into the slice bounds of its bounding box.

    Args:
        directions: One detection whose third item is the box given as
            ``(x_center, y_center, width, height)``.

    Returns:
        ``(y0, y1, x0, x1)`` rounded to integers, ready for
        ``img[y0:y1, x0:x1]``.
    """
    x, y, x_width, y_width = directions[2][:4]
    bounds = (
        round(y - y_width / 2),
        round(y + y_width / 2),
        round(x - x_width / 2),
        round(x + x_width / 2),
    )
    # A box that overlaps the image edge would yield negative bounds, which
    # slicing interprets as "count from the end" and so crops the wrong
    # region. Clamp them to the image origin instead.
    return tuple(max(0, bound) for bound in bounds)
def resize_img(img_path, crop_size=(40, 50)):
    """Resize the image stored at *img_path* in place.

    The file is decoded from its raw bytes, resized with bicubic
    interpolation and written back to the same path as JPEG.

    Args:
        img_path: Path of the image; it is overwritten with the result.
        crop_size: Target size passed to ``cv2.resize`` as ``dsize``, i.e.
            ``(width, height)``. Defaults to ``(40, 50)``.

    Raises:
        ValueError: If the file cannot be decoded or re-encoded.
    """
    pic = cv2.imdecode(np.fromfile(img_path, dtype=np.uint8), -1)
    if pic is None:
        # imdecode signals failure by returning None rather than raising.
        raise ValueError(f"cannot decode image: {img_path}")
    pic_new = cv2.resize(pic, crop_size, interpolation=cv2.INTER_CUBIC)
    encoded_ok, encoded = cv2.imencode(".jpeg", pic_new)
    if not encoded_ok:
        raise ValueError(f"cannot encode image: {img_path}")
    encoded.tofile(img_path)
class LibraryLogin:
    """Log in to the library seat system through Selenium.

    The login form is protected by a captcha that asks the user to click
    given characters in a given order. The captcha image is downloaded, the
    characters are located with darknet, each crop is recognized by a local
    HTTP service, and the matching positions are then clicked in order.
    """

    def __init__(self, name=None, pwd=None, timeout=5, poll_frequency=0.5, img_dir="data/code_img/"):
        """Start Chrome and prepare the shared explicit wait.

        Args:
            name: Account name typed into the login form.
            pwd: Password typed into the login form.
            timeout: Timeout in seconds of the shared ``WebDriverWait``.
            poll_frequency: Polling interval of the shared ``WebDriverWait``.
            img_dir: Directory used for the captcha image and its crops.
        """
        self.name = name
        self.pwd = pwd
        self.img_dir = img_dir
        # Characters that have to be clicked, in order
        self.verify_text = None
        # Per the original author's note: excluding the enable-automation
        # switch makes window.navigator.webdriver look like a normal browser.
        options = webdriver.ChromeOptions()
        options.add_experimental_option("excludeSwitches",
                                        ["enable-automation"])
        # Headless mode
        # options.add_argument("--headless")
        self.driver = webdriver.Chrome(options=options)
        self.wait = WebDriverWait(self.driver, timeout=timeout, poll_frequency=poll_frequency)

    def _download_verify_pic(self, max_attempts=None):
        """Open the captcha, retry until it is recognized, then click it.

        Args:
            max_attempts: Optional upper bound on recognition attempts. The
                default ``None`` keeps retrying until one succeeds.

        Returns:
            ``True`` when the verify button reports "验证通过", else ``False``
            (also when ``max_attempts`` is exhausted).
        """
        button_verify = self.wait.until(EC.element_to_be_clickable((
            By.CSS_SELECTOR, "form dd input[class='verifyCode']"
        )))
        ActionChains(self.driver).click(button_verify).perform()
        # Switch into the captcha iframe
        iframe = self.wait.until(EC.presence_of_element_located((
            By.CSS_SELECTOR, "#layui-layer-iframe1"
        )))
        self.driver.switch_to.frame(iframe)
        # Number of attempts so far
        count = 1
        while True:
            print(f"第{count}次尝试")
            # Reset before every attempt so stale text cannot be reused
            self.verify_text = None
            detect_words_info = self._get_one_code()
            if detect_words_info:
                print(detect_words_info)
                print(self.verify_text)
                print(f"识别成功,尝试次数:{count}")
                break
            if max_attempts is not None and count >= max_attempts:
                return False
            refresh = self.wait.until(EC.element_to_be_clickable((
                By.CSS_SELECTOR, ".myCaptcha-btn-refresh"
            )))
            ActionChains(self.driver).click(refresh).perform()
            # Sleep 1s so the new characters have time to render
            time.sleep(1)
            count += 1
        self._perform_action(detect_words_info)
        # NOTE(review): the driver is still switched into the captcha iframe
        # here; the original author left switch_to.default_content() disabled.
        # Confirm that the elements looked up below resolve without it.
        # Check whether clicking the captcha succeeded
        try:
            verify_btn = WebDriverWait(self.driver, timeout=1).until(EC.visibility_of_element_located((
                By.CSS_SELECTOR, "#login dd input[type='button']"
            )))
        except SException.TimeoutException:
            return False
        return verify_btn.get_attribute("value") == "验证通过"

    @staticmethod
    def _targets_detected(verify_text, detect_words_info):
        """Return True when every character of *verify_text* was recognized."""
        # Membership is tested per target character. Counting detections that
        # occur in the target text let duplicates stand in for a missing
        # character and raised TypeError for detections without a value.
        detected = [info.get("value") for info in detect_words_info]
        return bool(verify_text) and all(ch in detected for ch in verify_text)

    def _get_one_code(self):
        """Read the current captcha and recognize its characters.

        Sets ``self.verify_text`` to the characters that must be clicked.

        Returns:
            The list of detected characters (dicts with ``x``, ``y`` and, when
            recognized, ``value``) if all required characters were found,
            otherwise ``False``.
        """
        # The tip text carries the characters and their order
        tip = self.wait.until(EC.visibility_of_element_located((
            By.CSS_SELECTOR, "div .myCaptcha-tip-box span"
        )))
        tip_text = tip.text
        # Make sure the tip is not empty
        if not tip_text:
            return False
        self.verify_text = "".join(tip_text.replace("请依次点击:", "").replace("\"", "").split())
        if not self.verify_text:
            return False
        pic_verify = self.wait.until(EC.presence_of_element_located((
            By.CSS_SELECTOR, ".myCaptcha-img-box img"
        )))
        pic_verify_url = pic_verify.get_attribute("src")
        headers = {
            "User-Agent": fake_useragent.UserAgent().random,
        }
        code_name = f"{int(time.time())}.jpeg"
        # The image directory may not exist yet on a fresh checkout.
        os.makedirs(self.img_dir, exist_ok=True)
        code_path = os.path.join(self.img_dir, code_name)
        # A timeout keeps a stalled download from hanging the whole login.
        r = requests.get(url=pic_verify_url, headers=headers, timeout=10)
        if r.status_code != 200:
            return False
        with open(code_path, "wb") as f:
            f.write(r.content)
        # Split the image and recognize every character
        detect_words_info = self._divide_and_save_img(code_path, self.img_dir)
        # Accept the result only when every required character was found
        if self._targets_detected(self.verify_text, detect_words_info):
            return detect_words_info
        return False

    def _divide_and_save_img(self, img_path, write_path):
        """Crop every detected character and have it recognized.

        Args:
            img_path: Path of the captcha image; removed once processed.
            write_path: Directory that receives the numbered crops.

        Returns:
            One dict per detection with the box centre as ``x``/``y`` and,
            when the recognition service answered with status 200, the
            recognized character as ``value``.
        """
        img = cv2.imread(img_path)
        directions = darknet.performDetect(img_path)
        url = "http://127.0.0.1:6000/b"
        # Recognized characters
        detect_words = []
        for count, direction in enumerate(directions, start=1):
            # Centre of the character inside the captcha image
            word_info = {
                "x": int(direction[2][0]),
                "y": int(direction[2][1]),
            }
            y0, y1, x0, x1 = param_directions(direction)
            cropped = img[y0:y1, x0:x1]
            if cropped.size == 0:
                # An empty crop cannot be written or recognized; skip it.
                continue
            save_path = os.path.join(write_path, f"{count}.jpeg")
            cv2.imwrite(save_path, cropped)
            # Bring the crop to the resolution (40, 50)
            resize_img(save_path)
            # Close the crop after the upload instead of leaking the handle.
            with open(save_path, "rb") as image_file:
                files = {
                    "image_file": ("image_file", image_file, "application")
                }
                r = requests.post(url=url, files=files, timeout=10)
            if r.status_code == 200:
                word_info["value"] = r.json().get("value")
            detect_words.append(word_info)
        # Delete the processed captcha image
        os.remove(img_path)
        return detect_words

    def _perform_action(self, detect_words_info):
        """Click the detected characters in the order of ``self.verify_text``.

        Args:
            detect_words_info: Detections as returned by
                ``_divide_and_save_img``.
        """
        # Locate the captcha element
        verify_img = self.wait.until(EC.visibility_of_element_located((
            By.CSS_SELECTOR, ".myCaptcha-image"
        )))
        # NOTE(review): x/y are pixel coordinates in the downloaded image.
        # This assumes they match the offset origin used by the installed
        # Selenium version and the on-page size of the element - confirm.
        clicked = set()
        for word in self.verify_text:
            for index, word_info in enumerate(detect_words_info):
                if index in clicked or word != word_info.get("value"):
                    continue
                x, y = word_info.get("x"), word_info.get("y")
                ActionChains(self.driver).move_to_element_with_offset(verify_img, x, y).perform()
                ActionChains(self.driver).click().perform()
                # One click per required character: duplicate detections of
                # the same character used to be clicked as well.
                clicked.add(index)
                break

    def login(self):
        """Run the whole login flow and return the session cookies.

        Returns:
            A dict mapping cookie names to values on success, ``None`` when
            any step raised. The browser is shut down in both cases.
        """
        try:
            self.driver.get("https://seat.lib.whu.edu.cn/login")
            # The result is not checked, as in the original; presumably a
            # failed captcha surfaces as a timeout on the URL wait below.
            self._download_verify_pic()
            username_input = self.wait.until(EC.element_to_be_clickable((
                By.CSS_SELECTOR, "dd input[class='input1']"
            )))
            ActionChains(self.driver).click(username_input).perform()
            ActionChains(self.driver).send_keys(self.name).perform()
            pwd_input = self.wait.until(EC.element_to_be_clickable((
                By.CSS_SELECTOR, "dd input[class='input2']"
            )))
            ActionChains(self.driver).click(pwd_input).perform()
            ActionChains(self.driver).send_keys(self.pwd).perform()
            login_btn = self.wait.until(EC.element_to_be_clickable((
                By.CSS_SELECTOR, "dd input[class='btn1']"
            )))
            ActionChains(self.driver).click(login_btn).perform()
            self.wait.until(EC.url_to_be("https://seat.lib.whu.edu.cn/"))
            return {
                raw_cookie.get("name"): raw_cookie.get("value")
                for raw_cookie in self.driver.get_cookies()
            }
        except Exception as e:
            print(e.args)
            return None
        finally:
            # quit() ends the whole driver session; close() only closes the
            # current window and leaves the driver process running.
            self.driver.quit()
if __name__ == "__main__":
    # Placeholder credentials: replace them with a real account before running
    library_login = LibraryLogin("*******", "*******")
    library_login.login()
中间的一部分过程写得并不是很详细:标记数据的时候弄了好几天,当时的想法也记得不是太清楚了,只能大概记录一下思路,也希望以后再碰见点击验证码的时候可以不用一筹莫展。