爬虫高级应用05--验证码识别(打码平台的使用)

一、图形验证码

爬虫高级应用05--验证码识别(打码平台的使用)_第1张图片

1、tesserocr
tesseract a.jpg result -l eng && cat result.txt
识别率低

2、识别验证码平台(打码平台)

超级鹰
网站地址可以百度搜索

#!/usr/bin/env python
# coding:utf-8

import requests
from hashlib import md5

class Chaojiying_Client(object):
    """Small HTTP client for the Chaojiying captcha-recognition service.

    The account credentials are sent with every request as form fields
    ('user', 'pass2', 'softid'); the password is only ever stored and sent
    as its MD5 hex digest.
    """

    def __init__(self, username, password, soft_id, timeout=30):
        """
        username: account name, sent as 'user'
        password: plain-text password; hashed with MD5 before being stored
        soft_id: software ID, sent as 'softid'
        timeout: seconds handed to requests as the request timeout, so a
                 stalled server cannot block the caller forever
        """
        self.username = username
        self.password = md5(password.encode('utf8')).hexdigest()
        self.soft_id = soft_id
        self.timeout = timeout
        # Fields common to every API call; merged into each request's form data.
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """
        Upload a captcha image and return the decoded JSON response.

        im: image bytes
        codetype: question type, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        # The image is uploaded as a multipart file field named 'userfile'.
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post(
            'http://upload.chaojiying.net/Upload/Processing.php',
            data=params,
            files=files,
            headers=self.headers,
            timeout=self.timeout,
        )
        return r.json()

    def ReportError(self, im_id):
        """
        Report a wrongly recognized image and return the decoded JSON response.

        im_id: image ID of the question being reported
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post(
            'http://upload.chaojiying.net/Upload/ReportError.php',
            data=params,
            headers=self.headers,
            timeout=self.timeout,
        )
        return r.json()

def main1(urlstr):
    """Recognize the captcha image stored at *urlstr* and return its text.

    urlstr: path of a local image file (it is read with open(), not fetched
            over the network, despite the parameter name)
    Returns the 'pic_str' field of the service's JSON response.
    """
    # NOTE(review): credentials are hard-coded; consider loading them from
    # configuration or the environment instead.
    chaojiying = Chaojiying_Client('carmack', 'Vff635241', '96001')
    # Use a context manager so the file handle is closed deterministically.
    with open(urlstr, 'rb') as image_file:
        im = image_file.read()
    # 1902 is the codetype (question type) passed to PostPic.
    return chaojiying.PostPic(im, 1902)['pic_str']

if __name__ == '__main__':
    # Demo: send the local file pic_b.jpg for recognition and print the
    # full JSON response.
    chaojiying = Chaojiying_Client('carmack', 'Vff635241', '96001')
    # Context manager closes the image file once its bytes are read.
    with open('pic_b.jpg', 'rb') as image_file:
        im = image_file.read()
    print(chaojiying.PostPic(im, 1902))

二、滑动验证码

极验平台

step1. 模拟点击验证按钮

step2. 识别滑动缺口位置
遍历没有缺口和有缺口的两张图片,找出相同位置像素差距超过指定值的像素点,即缺口位置
(目前极验已经改进了算法)

step3. 模拟拖动滑块

爬虫高级应用05--验证码识别(打码平台的使用)_第2张图片

import random
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from lxml import etree
import time
from PIL import Image
from chaojiying import main1
from io import BytesIO


# Shared Selenium session used by the functions below. Note that the Chrome
# window is launched as soon as this module is executed or imported.
browser = webdriver.Chrome()
# Fixed window size — presumably so the screenshot/element coordinates used
# for cropping the captcha stay predictable (TODO confirm).
browser.set_window_size(1300, 600)
# Explicit-wait helper: wait.until(condition) gives up after 10 seconds.
wait = WebDriverWait(browser, 10)


def get_page():
    """Open the registration page in the shared browser and return its HTML source."""
    browser.get('http://bm.e21cn.com/log/reg.aspx')
    return browser.page_source


def get_msg(html):
    """Solve the captcha, then fill in and submit the registration form.

    The captcha is cropped out of a browser screenshot (saved as '1.png'),
    sent to the recognition service through main1(), and the returned text
    is typed into the form together with the fixed test account data.

    html: page source of the registration page. Kept for backward
          compatibility; the form is driven through the live browser, so
          the value is not inspected here.
    """
    username = 'lalala'
    password = '123456'
    tel = '18011405897'

    # Capture the captcha exactly as the browser currently displays it —
    # presumably because requesting the image URL again would produce a
    # different code (TODO confirm).
    img = get_geetest_image('1.png')
    print(img)
    check_msg = main1('1.png')
    print(check_msg)

    def find_input(css):
        # Wait (up to the shared timeout) until the element is in the DOM.
        return wait.until(
            expected_conditions.presence_of_element_located((By.CSS_SELECTOR, css))
        )

    input_username = find_input('input#username')
    input_password1 = find_input('input#pwd')
    input_password2 = find_input('input#pwd_Q')
    input_tel = find_input('input#tel')
    input_check = find_input('input#CheckCode')
    submit_button = wait.until(
        expected_conditions.element_to_be_clickable((By.CSS_SELECTOR, 'input#btn_login'))
    )

    input_username.send_keys(username)
    input_password1.send_keys(password)
    input_password2.send_keys(password)
    input_tel.send_keys(tel)
    input_check.send_keys(check_msg)
    # Short pause before submitting, as in the original flow.
    time.sleep(2)
    submit_button.click()


def get_position():
    """
    Locate the captcha image on the page.
    :return: tuple (top, bottom, left, right) of the captcha's edges
    """
    locator = (By.CSS_SELECTOR, '#imgCheckCode')
    element = wait.until(expected_conditions.presence_of_element_located(locator))
    # Pause before reading the geometry, as the original flow does.
    time.sleep(2)
    origin = element.location
    dimensions = element.size
    print(dimensions)
    top = origin['y']
    bottom = origin['y'] + dimensions['height']
    left = origin['x']
    right = origin['x'] + dimensions['width']
    return (top, bottom, left, right)


def get_screenshot():
    """
    Take a screenshot of the current browser window.
    :return: the screenshot as a PIL image object
    """
    png_bytes = browser.get_screenshot_as_png()
    # Wrap the raw PNG bytes in a file-like object so PIL can decode them.
    return Image.open(BytesIO(png_bytes))


def get_geetest_image(name):
    """
    Crop the captcha out of a page screenshot and save it to the path *name*.
    :return: the cropped captcha as an image object
    """
    edges = get_position()
    top, bottom, left, right = edges
    print('验证码位置', top, bottom, left, right)
    # crop() expects the box as (left, upper, right, lower).
    captcha = get_screenshot().crop((left, top, right, bottom))
    captcha.save(name)
    return captcha


def main():
    """Load the registration page, then fill in and submit the form."""
    get_msg(get_page())


# Run the registration flow only when this file is executed as a script.
if __name__ == '__main__':
    main()

你可能感兴趣的:(Python)