Python 3网络爬虫之验证码

解析:通过验证码的验证才可以访问页面

1.普通图形验证码

这类验证码通常由4位数字和字母组成,可以利用OCR技术进行识别,需要安装tesserocr库。

import tesserocr
from PIL import Image
# Open the captcha picture from disk.
image=Image.open('code.jpg')
result=tesserocr.image_to_text(image)  # run OCR on the image and get the recognised text
print(result)  # tesserocr.file_to_text(<path>) is the one-call form that takes a file path instead of an Image

当验证码内有多余线条干扰识别时,可以先将图片转为灰度图像,再做二值化处理:

# Keep one grayscale copy so every step below starts from the same pixels.
gray=image.convert('L')  # convert the picture to a grayscale image
gray.show()
# Mode '1' is 1-bit black/white using Pillow's built-in conversion (preview only).
# NOTE(review): the original note claims a default threshold of 127; Pillow
# dithers by default when converting to mode '1' - confirm before relying on it.
gray.convert('1').show()
# Custom binarisation: threshold the grayscale image (not the already
# binarised one) so the chosen cut-off actually has an effect.
threshold=80
# Lookup table for point(): gray levels below the threshold map to 0, the rest to 1.
table=[0 if level<threshold else 1 for level in range(256)]
image=gray.point(table,'1')
image.show()

2.极验滑动验证码的识别

解析:需要拖动拼合滑块才可以完成验证

三步流程:1)模拟点击验证按钮 2)识别滑动缺口的位置 3)模拟拖动滑块

# Login credentials copied onto each CrackGeetest instance.
email='[email protected]'
password='123456'
class CrackGeetest():
    """Holds the browser session and login data used by the Geetest slider steps."""
    def __init__(self):
        """Start a Chrome browser and prepare an explicit wait bound to it."""
        self.url='*/login'  # login page address as given in the source (looks like a placeholder)
        self.browser=webdriver.Chrome()
        # Explicit wait on this browser with a timeout argument of 20.
        self.wait=WebDriverWait(self.browser,20)
        self.email=email
        self.password=password

模拟点击

def get_geetest_button(self):
    """
    Return the initial verification button.

    Blocks on ``self.wait`` until the element with class name
    ``geetest_radar_tip`` is clickable, then returns that element.
    """
    button=self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME,'geetest_radar_tip')))
    return button
# Usage from inside a CrackGeetest method, where `self` is the instance.
button=self.get_geetest_button()
button.click() # simulate the click on the verification button (once)

识别缺口

解析:获取前后两张比对图片,二者不一致的地方即为缺口

def get_position(self):
    """
    Return the bounding box of the captcha image.

    Waits until the element with class name ``geetest_canvas_img`` is
    present, pauses for two seconds, then reads the element's ``location``
    and ``size``.

    :return: tuple ``(top, bottom, left, right)``
    """
    img=self.wait.until(EC.presence_of_element_located((By.CLASS_NAME,'geetest_canvas_img')))
    time.sleep(2)  # fixed pause before measuring; presumably lets the image finish rendering
    location=img.location
    size=img.size
    # Parenthesised so the four-value tuple can span two lines.
    top,bottom,left,right=(location['y'],location['y']+size['height'],
                           location['x'],location['x']+size['width'])
    return (top,bottom,left,right)

def get_geetest_image(self,name='captcha.png'):
    """
    Return the captcha picture cut out of a page screenshot.

    :param name: file name for the captcha picture; accepted but not used
        by this function
    :return: whatever ``crop()`` of the screenshot returns for the captcha box
    """
    top,bottom,left,right=self.get_position()
    print('验证码位置:',top,bottom,left,right)
    screenshot=self.get_screenshot()
    # crop() cuts the captcha region out; the box is passed as (left, top, right, bottom).
    captcha=screenshot.crop((left,top,right,bottom))
    return captcha
def get_slider(self):
    """
    Wait for the slider handle and return it.

    :return: the element with class name ``geetest_slider_button`` once it
        is clickable
    """
    locator = (By.CLASS_NAME, 'geetest_slider_button')
    clickable = EC.element_to_be_clickable(locator)
    return self.wait.until(clickable)
# Click the slider to bring up the picture that contains the gap
slider=self.get_slider()
slider.click()
# Two image objects are available at this point; read the RGB data of the matching pixel in each.
# If the two values are within a certain range the pixels count as the same; otherwise they differ, which marks the gap position
def is_pixel_equal(self,image1,image2,x,y):
    """
    Decide whether two images have (nearly) the same colour at one pixel.

    :param image1: first image; must provide ``load()`` returning an object
        indexable as ``[x, y]``
    :param image2: second image, same requirements
    :param x: first pixel coordinate passed to the pixel accessor
    :param y: second pixel coordinate passed to the pixel accessor
    :return: True when each of the first three channels differs by less
        than the threshold, otherwise False
    """
    # Fetch the pixel from each picture.
    pixel1=image1.load()[x,y]
    pixel2=image2.load()[x,y]
    threshold=60
    # Compare only the first three channels (R, G, B); any further channel is ignored.
    return all(abs(c1-c2)<threshold for c1,c2 in zip(pixel1[:3],pixel2[:3]))

模拟拖动:

拖动不能是匀速的,也不能是完全随机的速度;一般先加速后减速的运动轨迹才符合人的操作习惯。

公式:$x = v_0 t + \frac{1}{2} a t^2$,$v = v_0 + a t$

def get_track(self,distance):
    """
    Build the list of per-step moves for dragging the slider.

    The motion accelerates (a = 2) until four fifths of the distance is
    covered and decelerates (a = -3) afterwards, using
    x = v0 * t + 0.5 * a * t * t and v = v0 + a * t with t = 0.2.

    Each entry is the integer move for one step. The rounding remainder is
    carried over to the next step and the last step is clamped, so the
    entries add up to ``round(distance)`` instead of drifting away from it.

    :param distance: total offset to cover
    :return: list of integer moves; empty when ``distance`` is not positive
    """
    track = []  # moves emitted so far
    current = 0  # exact (unrounded) displacement
    emitted = 0  # sum of the integer moves already in track
    mid = distance * 4 / 5  # switch from accelerating to decelerating here
    t = 0.2  # time step
    v = 0  # current velocity
    while current < distance:
        a = 2 if current < mid else -3
        v0 = v
        v = v0 + a * t
        # Advance by this step's move, but never past the target.
        current = min(current + v0 * t + 0.5 * a * t * t, distance)
        # Emit the difference to the rounded position so rounding errors do not accumulate.
        step = round(current) - emitted
        emitted += step
        track.append(step)
    return track

最后一步:按照运动轨迹拖动滑块

def move_to_gap(self,slider,tracks):
    """
    Drag the slider along the given track and release it.

    :param slider: the slider element to press and hold
    :param tracks: iterable of horizontal offsets, applied one move at a time
    """
    # Local import: ActionChains is not imported at module level.
    from selenium.webdriver.common.action_chains import ActionChains

    ActionChains(self.browser).click_and_hold(slider).perform()  # press and hold the slider
    for x in tracks:
        ActionChains(self.browser).move_by_offset(xoffset=x,yoffset=0).perform()  # move horizontally only
    time.sleep(1)  # pause for one second before letting go
    ActionChains(self.browser).release().perform()  # release the mouse button


# tesserocr usage
# Recognition test
import tesserocr
from PIL import Image

image = Image.open('code.jpg')  # open the captcha picture
result = tesserocr.image_to_text(image)  # OCR the image into text
print(result)

# One-call form that takes a file path instead of an Image object.
# NOTE(review): this reads 'image.png' while the snippet above reads 'code.jpg' - confirm the intended file.

import tesserocr
print(tesserocr.file_to_text('image.png'))

# 迷惑验证码处理
import tesserocr
from PIL import Image

# Load the captcha and reduce it to grayscale.
grayscale = Image.open('code2.jpg').convert('L')

# Lookup table for an explicit cut-off: levels below it become 0, the rest 1.
threshold = 80
table = [0 if level < threshold else 1 for level in range(256)]

# Binarise with the table and display the result.
image = grayscale.point(table, '1')
image.show()

# Run OCR on the binarised picture.
result = tesserocr.image_to_text(image)
print(result)

# 极验滑动验证码的识别

# 初始化
import time

from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait

EMAIL = '[email protected]'
PASSWORD = '123456'


class CrackGeetest:
    """Browser session plus login data for the Geetest slider walkthrough."""

    def __init__(self):
        """Store the login data, open Chrome and attach an explicit wait to it."""
        self.email, self.password = EMAIL, PASSWORD
        self.url = 'https://account.geetest.com/login'
        browser = webdriver.Chrome()
        self.browser = browser
        # The wait is bound to this browser with a timeout argument of 20.
        self.wait = WebDriverWait(browser, 20)

# 模拟点击
def get_geetest_button(self):
    """Return the verification button.

    Waits on ``self.wait`` until the element with class name
    ``geetest_radar_tip`` is clickable and returns it.
    """
    button = self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME,'geetest_radar_tip')))
    return button
# Click the verification button (usage from inside a CrackGeetest method).
# Kept at module level: indented under the function it sat after `return`
# and could never run.
button = self.get_geetest_button()
button.click()
# 识别缺口
def get_position(self):
    """Locate the captcha image on the page.

    Waits for the element with class name ``geetest_canvas_img``, pauses
    two seconds, then derives the box from the element's location and size.

    :return: tuple ``(top, bottom, left, right)``
    """
    locator = (By.CLASS_NAME, 'geetest_canvas_img')
    img = self.wait.until(EC.presence_of_element_located(locator))
    time.sleep(2)
    origin, extent = img.location, img.size
    top = origin['y']
    left = origin['x']
    return (top, top + extent['height'], left, left + extent['width'])
    # 获取网页截图
def get_geetest_image(self, name='captcha.png'):
    """Cut the captcha picture out of a page screenshot.

    :param name: file name for the captcha picture; accepted but not used here
    :return: the result of cropping the screenshot to the captcha box
    """
    bounds = self.get_position()
    top, bottom, left, right = bounds
    print('验证码位置', top, bottom, left, right)
    # The crop box is ordered (left, top, right, bottom).
    box = (left, top, right, bottom)
    return self.get_screenshot().crop(box)

def get_slider(self):
    """Wait until the slider handle is clickable and return it."""
    locator = (By.CLASS_NAME, 'geetest_slider_button')
    clickable = EC.element_to_be_clickable(locator)
    return self.wait.until(clickable)
# Click the slider to bring up the picture that contains the gap
# (usage from inside a CrackGeetest method, where `self` is the instance)
slider = self.get_slider()
slider.click()

def is_pixel_equal(self,image1,image2,x,y):
    """Decide whether two images have (nearly) the same colour at one pixel.

    :param image1: first image; must provide ``load()`` returning an object
        indexable as ``[x, y]``
    :param image2: second image, same requirements
    :param x: first pixel coordinate passed to the pixel accessor
    :param y: second pixel coordinate passed to the pixel accessor
    :return: True when each of the first three channels differs by less
        than the threshold, otherwise False
    """
    # Fetch the pixel from each picture.
    pixel1 = image1.load()[x,y]
    pixel2 = image2.load()[x,y]
    threshold = 60
    # Compare only the first three channels (R, G, B); any further channel is ignored.
    return all(abs(c1 - c2) < threshold for c1, c2 in zip(pixel1[:3], pixel2[:3]))

代码摘抄之《Python 3网络爬虫开发实战》

你可能感兴趣的:(Python,3网络爬虫,python,爬虫)