本文以知网注册页面的验证码为例,演示如何用 Selenium 截取验证码图片,并用 pytesseract 识别其中的字符。
1. 安装 Tesseract OCR,并将其安装目录添加到系统环境变量 PATH 中
下载地址:https://digi.bib.uni-mannheim.de/tesseract/
2. 安装pytesseract
pip install pytesseract
代码示例
# -*- coding: utf-8 -*-
import os
import time

import pytesseract
from PIL import Image
from selenium import webdriver
def get_captcha(driver):
    """Capture the captcha image from the current page and save it to disk.

    Takes a screenshot of the browser window, crops it to the bounding box
    of the element whose id is ``checkcode``, and writes the cropped image
    to ``./files/captcha.png``. The uncropped screenshot is kept at
    ``./files/screen_shot.png``.

    Args:
        driver: Selenium WebDriver with the target page already loaded.
    """
    # Create the output directory up front so the screenshot and the
    # cropped captcha can actually be written.
    os.makedirs('./files', exist_ok=True)
    driver.save_screenshot('./files/screen_shot.png')

    # find_element_by_id was removed in Selenium 4.3; the generic
    # find_element(by, value) form works on both Selenium 3 and 4
    # ('id' is the value of By.ID).
    check_code = driver.find_element('id', 'checkcode')
    loc = check_code.location  # top-left corner of the element: x, y
    size = check_code.size  # element dimensions: width, height

    # Crop box is (left, upper, right, lower).
    # NOTE(review): assumes screenshot pixels map 1:1 to page coordinates
    # (no display scaling, page not scrolled) - confirm on the target setup.
    rec = (
        loc['x'],
        loc['y'],
        loc['x'] + size['width'],
        loc['y'] + size['height'],
    )

    # Use a context manager so the screenshot file handle is released.
    with Image.open('./files/screen_shot.png') as image:
        captcha = image.crop(rec)
        captcha.save('./files/captcha.png')
def recognize_captcha(file, threshold=128):
    """Run OCR on a captcha image after converting it to pure black/white.

    The image is converted to grayscale, then every pixel darker than
    ``threshold`` becomes 0 (black) and every other pixel becomes 255
    (white) before the result is handed to pytesseract.

    Args:
        file: Path (or file object) of the captcha image to read.
        threshold: Grayscale cut-off in the range 0-255. Defaults to 128,
            the value the function previously had hard-coded.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    # Close the source file as soon as the grayscale copy exists.
    with Image.open(file) as img:
        gray = img.convert('L')

    # Image.point applies the mapping through a lookup table instead of a
    # Python-level loop over every pixel.
    binary = gray.point(lambda px: 0 if px < threshold else 255)
    return pytesseract.image_to_string(binary)
if __name__ == '__main__':
    # Demo: open the CNKI registration page, grab its captcha and print
    # the text recognised by OCR.
    url = 'https://my.cnki.net/Register/CommonRegister.aspx'
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        driver.maximize_window()
        driver.implicitly_wait(5)  # implicit wait of 5 s for element lookups
        get_captcha(driver)
        captcha = recognize_captcha('./files/captcha.png')
        print(captcha)
        # Leave the browser open for 10 s before closing (presumably so the
        # captcha on screen can be compared with the printed result).
        time.sleep(10)
    finally:
        # Always shut the browser down, even if a step above raised.
        driver.quit()
结果:
注:识别存在一定的错误率,实际使用时如果识别失败,可以重新获取验证码并再次识别。