解析:通过验证码的验证才可以访问页面
1.普通图形验证码
常见4位由数字和字母组成,利用OCR技术识别图形验证码,需要库tesserocr
# Recognise a simple image captcha with OCR through the tesserocr bindings.
import tesserocr
from PIL import Image
image=Image.open('code.jpg')
result=tesserocr.image_to_text(image) # convert the image to text
print(result) # shortcut form: print(tesserocr.file_to_text('image.png')) passes a file path directly
当验证码内有多余线条干扰图片识别时,需要先对图片做转灰度、二值化等处理:
image=image.convert('L') # convert the image to greyscale
image.show()
image=image.convert('1') # binarise the image (black/white) using the default threshold of 127
image.show()
# Convert the original image to greyscale first, then binarise it with an
# explicitly chosen threshold instead of the default one.
image = image.convert('L')
threshold = 80
# Lookup table for Image.point: grey levels below the threshold map to 0,
# every other level maps to 1.
# NOTE(review): the original loop was cut off after "if i"; it is restored
# here to match the complete copy of this snippet further down the notes.
table = [0 if level < threshold else 1 for level in range(256)]
image = image.point(table, '1')  # apply the table and binarise
image.show()
2.极验滑动验证码的识别
解析:需要拖动拼合滑块才可以完成验证
三步流程:1)模拟点击验证按钮 2)识别滑动缺口的位置 3)模拟拖动滑块
email='[email protected]'
password='123456'
class CrackGeetest():
    """Driver object for working through the Geetest slider captcha."""

    def __init__(self):
        """Start a Chrome session and store the login settings.

        Sets ``url``, ``browser`` (a new ``webdriver.Chrome()``), ``wait``
        (a ``WebDriverWait`` on that browser created with the value 20) and
        the ``email`` / ``password`` taken from the module-level values.
        """
        self.url = '*/login'
        self.browser = webdriver.Chrome()
        # Fixed: the class is spelled WebDriverWait (capital W).
        self.wait = WebDriverWait(self.browser, 20)
        self.email = email
        # Fixed: the attribute must be set on self (was "sele").
        self.password = password
模拟点击
def get_geetest_button(self):
    """Return the initial verification button.

    Blocks on ``self.wait`` until the element with class name
    ``geetest_radar_tip`` is clickable, then returns that element.
    """
    return self.wait.until(
        EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_radar_tip'))
    )
# Fetch the verification button and click it.
# NOTE(review): the click is issued twice here, whereas the later copy of this
# snippet in these notes clicks only once — confirm the second click is intended.
button=self.get_geetest_button()
button.click() # simulate the click
button.click() # simulate the click
识别缺口
解析:获取前后两张比对图片,二者不一致的地方即为缺口
def get_position(self):
    """Return the bounding box of the captcha image element.

    Waits for the element with class name ``geetest_canvas_img`` to be
    present, then derives the box from its ``location`` and ``size``.

    Returns:
        tuple: ``(top, bottom, left, right)``.
    """
    img = self.wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, 'geetest_canvas_img'))
    )
    # Pause before reading the geometry (presumably to let the captcha
    # finish rendering — TODO confirm).
    time.sleep(2)
    location = img.location
    size = img.size
    top = location['y']
    bottom = location['y'] + size['height']
    left = location['x']
    right = location['x'] + size['width']  # fixed typo: was "lcoation"
    return (top, bottom, left, right)
def get_geetest_image(self, name='captcha.png'):
    """Return the captcha cropped out of a page screenshot.

    Args:
        name: Not used by this block (presumably the file name the captcha
            is meant to be saved under — TODO confirm).

    Returns:
        The result of ``screenshot.crop((left, top, right, bottom))``, where
        the screenshot comes from ``self.get_screenshot()`` (a helper that is
        not shown in these notes) and the box from ``self.get_position()``.
    """
    top, bottom, left, right = self.get_position()
    # Fixed: a comma was missing between the label and the first value.
    print('验证码位置:', top, bottom, left, right)
    screenshot = self.get_screenshot()
    # crop() takes the box as (left, upper, right, lower).
    captcha = screenshot.crop((left, top, right, bottom))
    return captcha
def get_slider(self):
    """Return the slider handle.

    Blocks on ``self.wait`` until the element with class name
    ``geetest_slider_button`` is clickable, then returns that element.
    """
    return self.wait.until(
        EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_slider_button'))
    )
# Click the slider so that the image with the gap is brought up
slider=self.get_slider()
slider.click()
# 当前已获取到两张图片对象,获取两张图片对应像素点的RGB数据,
# 若当前两者的数据在一定范围内代表一致,否则为不同即为缺口位置
def is_pixel_equal(self, image1, image2, x, y):
    """Tell whether two images have (nearly) the same pixel at ``(x, y)``.

    Args:
        image1: First image; must provide ``load()`` returning an object
            indexable by ``[x, y]``.
        image2: Second image, same requirements.
        x: Pixel column.
        y: Pixel row.

    Returns:
        bool: True when each of the first three channels differs by less
        than the threshold (60), otherwise False.
    """
    # Read the pixel at the same coordinates from both images.
    pixel1 = image1.load()[x, y]
    pixel2 = image2.load()[x, y]
    threshold = 60
    # NOTE(review): the original condition was truncated after the first
    # channel comparison; the per-channel check over three channels and the
    # boolean return are reconstructed — confirm against the intended code.
    return all(
        abs(pixel1[channel] - pixel2[channel]) < threshold
        for channel in range(3)
    )
模拟拖动:
拖动速度不可匀速,随机拖动,一般先加速后减速的运动轨迹符合人为标准
公式:x = v0 * t + 0.5 * a * t * t,v = v0 + a * t(x 为位移,v0 为初速度,a 为加速度,t 为时间间隔)
def get_track(self, distance):
    """Build a list of per-step pixel moves that covers ``distance``.

    The motion accelerates (a = 2) until 4/5 of the distance is covered and
    decelerates (a = -3) afterwards, sampled every ``t = 0.2`` using
    ``x = v0 * t + 0.5 * a * t * t`` and ``v = v0 + a * t``.

    Args:
        distance: Total offset to cover.

    Returns:
        list[int]: Integer moves whose sum equals ``round(distance)``;
        an empty list when ``distance`` is not positive.
    """
    track = []  # integer move for every step
    current = 0  # exact (fractional) displacement so far
    emitted = 0  # integer displacement already written to the track
    mid = distance * 4 / 5  # switch from accelerating to decelerating here
    t = 0.2  # sampling interval
    v = 0  # initial speed
    while current < distance:
        a = 2 if current < mid else -3
        v0 = v
        v = v0 + a * t
        move = v0 * t + 0.5 * a * t * t
        # Clamp so the last step lands exactly on the target instead of
        # overshooting it.
        current = min(current + move, distance)
        # Emit the difference between rounded cumulative positions; rounding
        # each move on its own would lose fractional pixels and let the
        # track's total drift away from the requested distance.
        step = round(current) - emitted
        track.append(step)
        emitted += step
    return track
最后一步:按照运动轨迹拖动滑块
def move_to_gap(self, slider, tracks):
    """Drag the slider along the given track and release it.

    Args:
        slider: The slider element to press and hold.
        tracks: Iterable of horizontal offsets, applied one after another.
    """
    # ActionChains is not imported anywhere else in these notes, so bring it
    # into scope here.
    from selenium.webdriver import ActionChains

    # Press and hold the slider handle.
    ActionChains(self.browser).click_and_hold(slider).perform()
    for x in tracks:
        # Move horizontally by one track step.
        ActionChains(self.browser).move_by_offset(xoffset=x, yoffset=0).perform()
    # NOTE(review): indentation was lost in the original; the pause is placed
    # after the loop, just before releasing — confirm it was not meant to run
    # on every step.
    time.sleep(1)
    # Release the mouse button.
    ActionChains(self.browser).release().perform()
# tesserocr usage
# Recognition test: open the captcha image and run OCR on it
import tesserocr
from PIL import Image
image = Image.open('code.jpg')
result = tesserocr.image_to_text(image)
print(result)
# The same kind of recognition in one call, passing a file path to tesserocr
import tesserocr
print(tesserocr.file_to_text('image.png'))
# 迷惑验证码处理
import tesserocr
from PIL import Image
image = Image.open('code2.jpg')
image = image.convert('L')  # convert the image to greyscale
threshold = 80
# Lookup table for Image.point: grey levels below the threshold map to 0,
# every other level maps to 1.
table = [0 if level < threshold else 1 for level in range(256)]
image = image.point(table, '1')  # binarise with the explicit threshold
image.show()  # the interfering lines should be gone and the characters distinct
# Recognise the cleaned-up captcha
result = tesserocr.image_to_text(image)
print(result)
# 极验滑动验证码的识别
# 初始化
import time
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
EMAIL = '[email protected]'
PASSWORD = '123456'
class CrackGeetest():
    """Driver object for working through the Geetest slider captcha."""

    def __init__(self):
        """Start a Chrome session and store the login settings.

        Sets ``url``, ``browser`` (a new ``webdriver.Chrome()``), ``wait``
        (a ``WebDriverWait`` on that browser created with the value 20) and
        the ``email`` / ``password`` taken from ``EMAIL`` / ``PASSWORD``.
        """
        self.url = 'https://account.geetest.com/login'
        self.browser = webdriver.Chrome()
        self.wait = WebDriverWait(self.browser, 20)
        self.email = EMAIL
        self.password = PASSWORD
# 模拟点击
def get_geetest_button(self):
    """Return the initial verification button.

    Blocks on ``self.wait`` until the element with class name
    ``geetest_radar_tip`` is clickable, then returns that element.
    """
    return self.wait.until(
        EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_radar_tip'))
    )
# Fetch the verification button and click it
button = self.get_geetest_button()
button.click()
# 识别缺口
def get_position(self):
    """Return the bounding box of the captcha image element.

    Waits for the element with class name ``geetest_canvas_img`` to be
    present, then derives the box from its ``location`` and ``size``.

    Returns:
        tuple: ``(top, bottom, left, right)``.
    """
    img = self.wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, 'geetest_canvas_img'))
    )
    # Pause before reading the geometry (presumably to let the captcha
    # finish rendering — TODO confirm).
    time.sleep(2)
    location = img.location
    size = img.size
    top = location['y']
    bottom = location['y'] + size['height']
    left = location['x']
    right = location['x'] + size['width']
    return (top, bottom, left, right)
# 获取网页截图
def get_geetest_image(self, name='captcha.png'):
    """Return the captcha cropped out of a page screenshot.

    Args:
        name: Not used by this block (presumably the file name the captcha
            is meant to be saved under — TODO confirm).

    Returns:
        The result of ``screenshot.crop((left, top, right, bottom))``, where
        the screenshot comes from ``self.get_screenshot()`` (a helper that is
        not shown in these notes) and the box from ``self.get_position()``.
    """
    top, bottom, left, right = self.get_position()
    print('验证码位置', top, bottom, left, right)
    screenshot = self.get_screenshot()
    # crop() takes the box as (left, upper, right, lower).
    captcha = screenshot.crop((left, top, right, bottom))
    return captcha
def get_slider(self):
    """Return the slider handle.

    Blocks on ``self.wait`` until the element with class name
    ``geetest_slider_button`` is clickable, then returns that element.
    """
    return self.wait.until(
        EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_slider_button'))
    )
# Click the slider so that the image with the gap is brought up
slider = self.get_slider()
slider.click()
def is_pixel_equal(self, image1, image2, x, y):
    """Tell whether two images have (nearly) the same pixel at ``(x, y)``.

    Args:
        image1: First image; must provide ``load()`` returning an object
            indexable by ``[x, y]``.
        image2: Second image, same requirements.
        x: Pixel column.
        y: Pixel row.

    Returns:
        bool: True when each of the first three channels differs by less
        than the threshold (60), otherwise False.
    """
    # Read the pixel at the same coordinates from both images.
    pixel1 = image1.load()[x, y]
    pixel2 = image2.load()[x, y]
    threshold = 60
    # NOTE(review): the original condition was truncated in the middle of the
    # second channel comparison; the remaining comparison(s) and the boolean
    # return are reconstructed — confirm against the intended code.
    return all(
        abs(pixel1[channel] - pixel2[channel]) < threshold
        for channel in range(3)
    )
代码摘抄之《Python 3网络爬虫开发实战》