- 需要注意的基本都在注释里了
- 文章最后是我参考的文章
import datetime
import json
import os

import pytesseract
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from lxml import etree
from PIL import Image, ImageEnhance
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# --- Script configuration (placeholder values are redacted with asterisks) ---

# Page opened in the browser when an interactive login is needed.
url = "***************************************************"
# Endpoint used both to probe the saved cookies and to run the search.
urlsearch = "*****************************"

# File in which the session cookies are stored between runs.
cookiesUrl = '***'
# File the browser screenshot / cropped captcha image is written to.
imageUrl = '****'

# Value sent as 'classroom.building.id' and used as an index into the
# prefix table further down.
buildingID = 9
# First check whether the cookies saved locally by a previous run still work.
try:
    with open(cookiesUrl, 'r', encoding='utf-8') as f:
        listCookies = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
    # No usable cookie file yet (e.g. first run): probe the site without any
    # cookies. NOTE(review): this relies on the site answering an
    # unauthenticated request with its login page, the same way the check
    # below expects it to answer expired cookies -- confirm.
    listCookies = []

# Flatten the cookie list into a single "name=value; name=value" header.
cookie = [item["name"] + "=" + item["value"] for item in listCookies]
cookiestr = '; '.join(cookie)
headers = {
    'cookie': cookiestr,
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.41'
}

# Use an explicit timeout (same value as the search POST later in the script)
# so an unresponsive server cannot hang the script forever.
html = requests.get(url=urlsearch, headers=headers, timeout=1000)
demo = html.text
soup = BeautifulSoup(demo, "html.parser")

# Either element being present in the response is treated by the next step
# as "the saved cookies were rejected": a password input or the
# "errorMove" link.
soo = soup.find('input', id='password')
soo2 = soup.find("a", id="errorMove")
if soo is not None or soo2 is not None:
    # The saved cookies were rejected: log in through a real browser, solving
    # the captcha with OCR, then persist the fresh cookies for the next run.
    broswer = webdriver.Chrome()
    try:
        broswer.get(url)
        first_attempt = True
        while True:
            WebDriverWait(broswer, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'user')))
            # find_element(By.XPATH, ...) replaces find_element_by_xpath,
            # which no longer exists in current Selenium releases.
            user = broswer.find_element(By.XPATH, '//input[@id="username"]')
            pwd = broswer.find_element(By.XPATH, '//input[@id="password"]')
            invaild = broswer.find_element(By.XPATH, '//input[@id="captcha_response"]')

            # Screenshot the page and cut out the captcha. The pixel boxes
            # are hard-coded; a different box is used on retries than on the
            # first attempt.
            broswer.save_screenshot(imageUrl)
            if first_attempt:
                box = (675, 338, 759, 369)
            else:
                box = (675, 376, 760, 406)
            with Image.open(imageUrl) as screenshot:
                captcha = screenshot.crop(box)

            # Convert the captcha to greyscale and boost the contrast to
            # improve the OCR recognition rate. The processed image is still
            # written to imageUrl, as before, so it can be inspected.
            imgry = captcha.convert('L')
            sharp_img = ImageEnhance.Contrast(imgry).enhance(2.0)
            sharp_img.save(imageUrl)
            # NOTE(review): the OCR result is typed as returned, without
            # stripping; any trailing whitespace it contains is sent to the
            # captcha field too -- confirm this is intended.
            text = pytesseract.image_to_string(sharp_img, config=' --psm 7')

            user.click()
            user.send_keys('你的用户名')
            pwd.click()
            pwd.send_keys('你的密码')
            invaild.click()
            invaild.send_keys(text)
            try:
                broswer.find_element(
                    By.XPATH,
                    '//*[@id="loginForm"]/table/tbody/tr[5]/td/input').click()
            except WebDriverException:
                # Best effort: a failed click on the submit button is ignored,
                # exactly as before; only browser-side errors are swallowed.
                pass
            first_attempt = False

            try:
                # Only the presence of this message element matters: when it
                # can be read the loop tries again, otherwise the loop ends.
                textt = broswer.find_element(
                    By.XPATH,
                    '//*[@id="messages16741228231"]/div/div/span[2]').text
            except WebDriverException:
                break

        # Store the session cookies locally so the next run can reuse them.
        listCookies = broswer.get_cookies()
        os.makedirs("D:/code/python/爬虫/教务网站/", exist_ok=True)
        jsonCookies = json.dumps(listCookies)
        with open(cookiesUrl, 'w', encoding='utf-8') as f:
            f.write(jsonCookies)

        # Build the Cookie header value straight from the fresh cookies;
        # re-reading the file that was just written would give the same data.
        cookie = [item["name"] + "=" + item["value"] for item in listCookies]
        cookiestr = '; '.join(cookie)
    finally:
        # Always close the browser, even when a step above raised.
        broswer.quit()
# Lookup table indexed by buildingID; the entry is compared with the first
# character of each room name in the results.
# NOTE(review): presumably the leading digit of room names in that building
# -- confirm against the site.
a = [0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 2]
today = datetime.date.today()
headers = {
    'Cookie': cookiestr,
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.41'
}
# Search form: a single occurrence today between 14:00 and 18:00 in the
# configured building.
data = {
    'classroom.type.id': '2',
    'classroom.campus.id': '',
    'classroom.building.id': buildingID,
    'seats': '',
    'classroom.name': '',
    'cycleTime.cycleCount': '1',
    'cycleTime.cycleType': '1',
    # isoformat() is the same YYYY-MM-DD text that str(date) produced when
    # the date object itself was passed.
    'cycleTime.dateBegin': today.isoformat(),
    'cycleTime.dateEnd': today.isoformat(),
    'roomApplyTimeType': '1',
    'timeBegin': '14:00',
    'timeEnd': '18:00'
}
response = requests.post(url=urlsearch, data=data, headers=headers, timeout=1000)
response.encoding = 'utf-8'
demo = response.text
soup = BeautifulSoup(demo, "html.parser")
soo = soup.find('tbody', id='grid15320024301_data')
if soo is None:
    # find() returns None when the table is absent; stop with a clear
    # message instead of an AttributeError on the loop below.
    raise SystemExit("Result table 'grid15320024301_data' not found in the response.")

prefix = str(a[buildingID])
for row in soo.find_all('tr'):
    cells = row.find_all('td')
    if len(cells) < 2:
        # Rows without a second cell carry no room name; skip them.
        continue
    room_name = cells[1].text
    # startswith() also copes with an empty cell, where indexing [0] raised.
    if room_name.startswith(prefix):
        print(room_name)
References
- Tesseract-OCR-v5.0中文识别,训练自定义字库,提高图片的识别效果
- 在Python中使用Tesseract进行OCR识别
- Python Tesseract 图片识别-小操练
- python-识别图片-安装及配置:tesseract+pytesseract+Pillow
- python安装及配置:tesseract+pytesseract+Pillow
- OCR Tesseract, Empty page error?
- python pytesseract psm 选项参数
- Python+pytesseract+Tesseract-OCR图片文字识别(只适合新手)
- Python+Selenium+PIL+Tesseract真正自动识别验证码进行一键登录
- 关于selenium获取cookie然后实现免登陆
- XZ_Python之使用selenium加载动态网页和判断元素是否存在
- 使用selenium判断标签的元素值是否存在
- Python爬虫学习(十一)下载验证码图片
- python 自动化测试(1):获取验证码图片,实现自动登录