破解携程中文验证码爬取机票价格数据

国内机票预定APP携程处于垄断地位,但是携程有反爬虫策略,对于密集的查询请求会要求验证,验证操作有两次,一次是拖动验证,一次是点选中文,selenium+webdriver可以轻松绕过这一反爬虫设置。
破解携程中文验证码爬取机票价格数据_第1张图片
重点是中文点选问题,涉及到中文识别OCR技术,笔者曾经使用过阿里云中文识别API,识别率较高,但是比较贵,到开源社区逛了一圈后,发现chineseocr_lite这么一款轻量级OCR项目,实在是雪中送炭,所以本文的技术重点就是python3 + selenium + chromedriver + chineseocr_lite。
破解携程中文验证码爬取机票价格数据_第2张图片
整体项目代码,有需要的童鞋可自取:
https://github.com/ag-niemin/ctrip

对于拖动滑块的破解,很简单,网上有很多帖子可以参考,大多都是使用selenium模拟仿真操作;复杂的是中文点选验证,大致分为三步:
1)识别目标中文字符;
2)识别点选区的中文字符及坐标位置;
3)按照目标中文字符顺序,依次点击中文字符坐标位置;
废话不多说,先上破解代码:

# -*- coding: utf-8 -*-
import os
import sys
import time
import logging
from selenium.webdriver.common.action_chains import ActionChains
sys.path.append(os.getcwd())
from chineseocr_lite import ocr
import importlib
importlib.reload(sys)

logging.basicConfig(level=logging.INFO,
                    filename='selenium.log',
                    filemode='a')

# Crack Ctrip's slider captcha by dragging the handle all the way right.
def crack_slide_verification(browser, url):
    """Drag the slider verification button 280px to the right.

    Returns (driver, url) after the drag; returns None if the slider
    element is falsy (find_element_by_xpath normally raises instead).
    """
    driver = browser
    slider_xpath = '//*[@id="J_slider_verification_qwewq"]/div[1]/div[2]'
    handle = driver.find_element_by_xpath(slider_xpath)
    if not handle:
        return
    logging.info(url + u' drag slider button')
    # Press-and-hold first, then slide and release in a second gesture.
    chain = ActionChains(driver)
    chain.click_and_hold(handle).perform()
    chain.move_by_offset(280, 0).release(handle).perform()
    # driver.save_screenshot('screenshot-verify.png')

    return driver, url

# Crack Ctrip's Chinese character-click captcha via OCR.
def crack_ocr_verification(browser, url):
    """OCR the target strip and the candidate image, then map each target
    character to its click coordinates.

    Returns (driver, url, characters, characters_pos); when the two lists
    differ in length the caller must refresh the captcha and retry.
    """
    driver = browser

    # OCR the small strip that tells which characters to click, in order.
    dest_img_url = driver.find_element_by_xpath('//*[@id="J_slider_verification_qwewq-choose"]/div[2]/div[1]/img').get_attribute('src')
    dest_img_res = ocr.resultBase64(dest_img_url)
    # Fix: `characters` was only assigned inside the loop, raising NameError
    # when OCR returned no result; it also kept only the LAST OCR line.
    # Initialize it and accumulate every recognized line instead.
    characters = []
    for dest_img_character in dest_img_res:
        dest_img_characters = dest_img_character['word']
        logging.info(url + u' dest characters: ' + dest_img_characters)
        characters.extend(list(dest_img_characters))

    # OCR the big candidate image: collect each character and its position.
    sele_img_url = driver.find_element_by_xpath('//*[@id="J_slider_verification_qwewq-choose"]/div[2]/div[3]/img').get_attribute('src')
    sele_img_res = ocr.resultBase64(sele_img_url)
    sele_characters = []
    sele_characters_pos = []
    for sele_img_character in sele_img_res:
        sele_characters.append(sele_img_character['word'])
        sele_characters_pos.append(sele_img_character['pos'])
    logging.info(url + u' candidate characters: ' + ' '.join(sele_characters))

    # For each target character (in click order) collect every matching
    # candidate position; duplicates leave len mismatch -> caller refreshes.
    characters_pos = []
    for c in characters:
        for word, pos in zip(sele_characters, sele_characters_pos):
            if word == c:
                characters_pos.append(pos)

    return driver, url, characters, characters_pos

# Refresh Ctrip's character-click captcha until OCR resolves every target.
def fresh_verification(browser, url, characters, characters_pos):
    """Keep clicking the captcha's refresh link and re-running OCR until a
    position was found for every target character (list lengths match).

    Returns the same 4-tuple shape as crack_ocr_verification.
    """
    driver = browser
    refresh_xpath = '//*[@id="J_slider_verification_qwewq-choose"]/div[2]/div[4]/div/a'
    # Loop exits exactly when counts match — same condition the original
    # checked both before and inside its loop.
    while len(characters_pos) != len(characters):
        driver.find_element_by_xpath(refresh_xpath).click()
        driver, url, characters, characters_pos = crack_ocr_verification(driver, url)
        # driver.save_screenshot('screenshot-verify.png')
    return driver, url, characters, characters_pos

# Click the captcha characters at their OCR coordinates, then submit.
def click_verification(browser, url, characters, characters_pos):
    """Click each target character (in order) at its OCR position inside the
    big captcha image, then press submit.

    Returns the driver, or None when the character/position counts differ.
    """
    driver = browser

    chain = ActionChains(driver)
    # Guard: only proceed when every character has a resolved position.
    if len(characters_pos) != len(characters):
        return
    big_img = driver.find_element_by_class_name("cpt-big-img")
    for ch, pos in zip(characters, characters_pos):
        logging.info(url + u' click ' + ch + u' located (' + str(pos['x']) + ',' + str(pos['y']) + ')')
        # Anchor the pointer at the image origin, then offset to the glyph.
        chain.move_to_element_with_offset(big_img, 0, 0).perform()
        chain.move_by_offset(pos['x'], pos['y']).click().perform()
        time.sleep(2)
    # driver.save_screenshot('screenshot-click.png')

    # Submit the clicked selection for verification.
    submit_xpath = '//*[@id="J_slider_verification_qwewq-choose"]/div[2]/div[4]/a'
    driver.find_element_by_xpath(submit_xpath).click()
    # driver.save_screenshot('screenshot-submit.png')

    return driver

# Verify the character-click captcha passed; retry the whole
# OCR -> refresh -> click cycle until the success banner appears, then
# trigger a fresh search so the flight results reload.
def check_verification(browser, url):
    driver = browser
    # Success banner span; its .text is re-read on every loop pass, so the
    # loop observes DOM updates after each click_verification attempt.
    cpt_success_click = driver.find_element_by_xpath('//*[@id="J_slider_verification_qwewq"]/div[1]/div[3]/div/span')
    # NOTE(review): no retry cap — this loops forever if the captcha never
    # passes, and the banner element may go stale after a page update; confirm.
    while (u'校验成功' not in cpt_success_click.text):
        driver,url,characters,characters_pos = crack_ocr_verification(driver,url)
        driver,url,characters,characters_pos = fresh_verification(driver, url, characters, characters_pos)
        driver = click_verification(driver, url, characters, characters_pos)
    logging.info(url + ' ' + cpt_success_click.text)

    # Click the "search again" button so the results list reloads.
    research_btn = driver.find_element_by_xpath('//*[@id="app"]/div/div[2]/div/div[2]/div/div[2]/div/button')
    research_btn.click()
    # driver.save_screenshot('screenshot-search.png')
    time.sleep(2)
    return driver

接下来的重点是爬虫:将Chrome渲染后页面中的HTML元素用XPath提取出来并解析。这一步,写过scrapy和requests爬虫的童鞋们肯定非常熟悉,就不做过多赘述。
废话不多说,直接贴出爬虫代码,笔者列了大约15条航线18天的机票价格:

# -*- coding: utf-8 -*-
import time
import sys
import os
sys.path.append(os.getcwd())
import datetime
import logging
from lxml import etree
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from data import t_market_airticket_day
from OracleUtils import Oracle
import crack as crack
import importlib
importlib.reload(sys)


logging.basicConfig(level=logging.INFO,
                    filename='selenium.log',
                    filemode='a')


class selenium_ctrip(object):
    """Scrape Ctrip one-way flight fares with selenium + headless Chrome.

    get_ctrip_data() crawls every route/date combination configured below and
    returns {'scan_date', 'scan_hour', 'flights': [...]}; load_ctrip_data()
    replaces this scan's rows in the Oracle target table with that result.
    """

    BROWSER_PATH = os.path.dirname(__file__) + '/browser/chromedriver.exe'
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
    DATABASE = 'oracle://stg:[email protected]:1521/?service_name=db'

    # City code -> Chinese city name.
    city_dict_en = {
            'BJS': "北京",
            'SHA': "上海",
            'SZX': "深圳",
            'HGH': "杭州",
            'CTU': "成都",
            'SIA': "西安",
            'CAN': "广州"
        }
    # Reverse mapping: Chinese city name -> city code.
    city_dict_cn = {v: k for k, v in city_dict_en.items()}

    # Routes to scan, as "origin-destination" code pairs.
    city_list = [
            city_dict_cn["北京"] + '-' + city_dict_cn["上海"],
            city_dict_cn["北京"] + '-' + city_dict_cn["深圳"],
            city_dict_cn["北京"] + '-' + city_dict_cn["杭州"],
            city_dict_cn["北京"] + '-' + city_dict_cn["成都"],
            city_dict_cn["上海"] + '-' + city_dict_cn["深圳"],
            city_dict_cn["上海"] + '-' + city_dict_cn["成都"],
            city_dict_cn["上海"] + '-' + city_dict_cn["西安"],
            city_dict_cn["深圳"] + '-' + city_dict_cn["杭州"],
            city_dict_cn["深圳"] + '-' + city_dict_cn["成都"],
            city_dict_cn["深圳"] + '-' + city_dict_cn["西安"],
            city_dict_cn["北京"] + '-' + city_dict_cn["广州"],
            city_dict_cn["上海"] + '-' + city_dict_cn["广州"],
            city_dict_cn["成都"] + '-' + city_dict_cn["广州"],
            city_dict_cn["杭州"] + '-' + city_dict_cn["广州"],
            city_dict_cn["西安"] + '-' + city_dict_cn["广州"],
        ]

    # Days ahead to scan: 1-10 daily, then 15, 20, 30, 40, 50, 60, 120, 180.
    date_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 30, 40, 50, 60, 120, 180]

    # XPath matching one flight row in the rendered results page.
    FLIGHT_ROW_XPATH = '//div[@class="search_box search_box_tag search_box_light Label_Flight"]'

    @staticmethod
    def _scroll_to_bottom(driver):
        """Scroll until lazy-loading stops adding flight rows.

        Runs two bursts of 10 scrolls each, stopping once the second burst
        yields no more rows than the first. (Was duplicated verbatim in the
        try and except branches of get_ctrip_data.)
        """
        s = 0
        t = 1
        while s < t:
            for _ in range(10):  # first burst of scrolls
                driver.execute_script("var q=document.documentElement.scrollTop=10000")
            s = len(driver.find_elements_by_xpath(selenium_ctrip.FLIGHT_ROW_XPATH))
            for _ in range(10):  # second burst; compare row counts
                driver.execute_script("var q=document.documentElement.scrollTop=10000")
            t = len(driver.find_elements_by_xpath(selenium_ctrip.FLIGHT_ROW_XPATH))

    def get_ctrip_data(self):
        """Crawl every route/date combination and return the scraped fares.

        Returns:
            dict with 'scan_date' (YYYY-MM-DD str), 'scan_hour' (HH str) and
            'flights' — a flat list of item dicts for ALL urls scanned.
        """
        scan_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
        scan_hour = time.strftime('%H', time.localtime(time.time()))

        # Hour gate kept from the original scheduling logic (always true 0-23).
        if int(scan_hour) >= 0 and int(scan_hour) <= 23:
            request_urls = []
            today = datetime.date.today()
            for city_li in self.city_list:
                for i in self.date_list:
                    st_date = str(today + datetime.timedelta(days=i))[0:10]
                    request_urls.append(
                        "https://flights.ctrip.com/itinerary/oneway/" + city_li.lower() + "?date=" + st_date)

            options = Options()
            options.add_argument('--headless')  # run Chrome without a window
            options.add_argument('--no-sandbox')
            options.add_argument('--disable-gpu')  # disable GPU acceleration
            options.add_argument("--user-agent=%s" % self.USER_AGENT)  # spoof a desktop browser
            options.add_argument('--log-level=3')  # selenium is very chatty otherwise
            options.add_argument('--start-maximized')
            options.add_argument('--disable-infobars')  # hide "controlled by automation" bar
            # options.add_argument('--blink-settings=imagesEnabled=false') # skip images
            options.add_experimental_option('excludeSwitches', ['enable-logging'])

            driver = Chrome(executable_path=self.BROWSER_PATH, chrome_options=options)

            # BUGFIX: accumulate flights across ALL urls. The original reset
            # `items = []` inside the url loop, so the returned result only
            # ever contained the LAST url's flights.
            items = []
            try:
                for url in request_urls:
                    driver.get(url)
                    try:
                        # Crack the slider + character-click captcha when the
                        # slider is present; find_element raises when absent.
                        if driver.find_element_by_xpath('//*[@id="J_slider_verification_qwewq"]/div[1]/div[2]'):
                            driver, url = crack.crack_slide_verification(driver, url)
                            driver, url, characters, characters_pos = crack.crack_ocr_verification(driver, url)
                            driver, url, characters, characters_pos = crack.fresh_verification(driver, url, characters, characters_pos)
                            driver = crack.click_verification(driver, url, characters, characters_pos)
                            driver = crack.check_verification(driver, url)
                    except Exception:
                        # BUGFIX: was a bare `except:` silently swallowing
                        # everything; the page is scraped as-is below.
                        logging.info(url + ' no verification popup (or crack failed), scraping directly')

                    # Force all lazy-loaded flight rows to render.
                    self._scroll_to_bottom(driver)
                    driver.implicitly_wait(2)
                    # driver.save_screenshot('screenshot-result.png')
                    items.extend(self._parse_flights(driver.page_source, url, scan_date, scan_hour))
            finally:
                # BUGFIX: always release the browser, even when scraping raises.
                driver.quit()

            return {'scan_date': scan_date,
                    'scan_hour': scan_hour,
                    'flights': items}

    def _parse_flights(self, html, url, scan_date, scan_hour):
        """Parse one rendered results page into a list of flight item dicts.

        Returns an empty list (after logging) when no flight rows are found.
        """
        rbody = etree.HTML(html, parser=etree.HTMLParser(encoding='utf-8'))
        res = rbody.xpath(self.FLIGHT_ROW_XPATH)
        if not res:
            # driver.save_screenshot('screenshot-failure.png')
            logging.info(url + " selenium chrome failure, failure")
            return []
        logging.info(url + ' selenium chrome scraped %s records' % str(len(res)))

        # Route metadata is constant per page; compute once (hoisted out of
        # the per-row loop).
        st_date = url[-10:]
        city_li = url.replace('https://flights.ctrip.com/itinerary/oneway/', '')[0:7].upper()
        startcity = self.city_dict_en[city_li[0:city_li.index('-')]]
        stopcity = self.city_dict_en[city_li[city_li.index('-') + 1:]]

        items = []
        for r in res:
            startairport = r.xpath('./div[1]/div[1]/div[@class="inb right"]/div[@class="airport"]//text()')[0]
            starttime = r.xpath('./div[1]/div[1]/div[@class="inb right"]/div[@class="time_box"]/strong[1]/text()')[0]
            stopairport = r.xpath('./div[1]/div[1]/div[@class="inb left"]/div[@class="airport"]//text()')[0]
            stoptime = r.xpath('./div[1]/div[1]/div[@class="inb left"]/div[@class="time_box"]/strong[1]/text()')[0]
            airline = r.xpath('./div[1]/div[1]/div[@class="inb logo"]/div[1]/div[1]/span[1]/span[1]/strong[1]/text()')[0]
            airtype = r.xpath('./div[1]/div[1]/div[@class="inb logo"]/div[1]/div[1]/span[1]/span[1]/span[1]/text()')[0]

            # Lowest-price rows carry an extra CSS class; try that layout first.
            lowest = r.xpath('./div[1]/div[1]/div[@class="inb price child_price lowest_price"]/div[1]/span[@class="base_price02"]/text()')
            if lowest:
                price = lowest[0]
                class_discount = r.xpath('./div[1]/div[1]/div[@class="inb price child_price lowest_price"]/div[1]/div[@class="flight_price_tips"]/div[1]/span[1]/text()')[0]
            else:
                price = r.xpath('./div[1]/div[1]/div[@class="inb price child_price"]/div[1]/span[@class="base_price02"]/text()')[0]
                class_discount = r.xpath('./div[1]/div[1]/div[@class="inb price child_price"]/div[1]/div[@class="flight_price_tips"]/div[1]/span[1]/text()')[0]
            # e.g. "经济舱8.5折" -> grade "经济舱", discount "8.5折" ("全价" when none).
            classgrade = class_discount[0:class_discount.index(u'舱') + 1]
            discount = class_discount.replace(classgrade, '') or u'全价'

            item = {}
            item['scan_date'] = datetime.datetime.strptime(str(scan_date), '%Y-%m-%d')
            item['scan_hour'] = str(scan_hour)
            item['start_city'] = startcity
            item['stop_city'] = stopcity
            item['start_airport'] = startairport
            item['start_time'] = datetime.datetime.strptime(st_date + ' ' + starttime, '%Y-%m-%d %H:%M')
            item['stop_airport'] = stopairport
            # Overnight flights: arrival hour before departure hour => next day.
            stop_dt = datetime.datetime.strptime(st_date + ' ' + stoptime, '%Y-%m-%d %H:%M')
            if int(starttime[0:2]) > int(stoptime[0:2]):
                stop_dt += datetime.timedelta(days=1)
            item['stop_time'] = stop_dt
            item['airline'] = airline
            item['air_type'] = airtype
            item['source'] = url
            item['low_price'] = price
            item["discount"] = discount
            item["class_grade"] = classgrade
            items.append(item)
        return items

    def load_ctrip_data(self, seleres):
        """Replace this scan's rows in Oracle with the freshly scraped ones.

        Deletes rows matching (scan_date, scan_hour), then bulk-inserts every
        flight item from *seleres* (the output of get_ctrip_data()).
        """
        table = t_market_airticket_day()
        self.table_name = table.table_name
        self.column_list = table.column_list

        orcl = Oracle()
        scan_date = datetime.datetime.strptime(str(seleres['scan_date']), '%Y-%m-%d')
        scan_hour = seleres['scan_hour']
        deleteValues = [[scan_date, scan_hour]]

        # Item-dict keys in the exact column order of the insert statement.
        field_order = ['scan_date', 'scan_hour', 'start_city', 'stop_city',
                       'start_airport', 'start_time', 'stop_airport', 'stop_time',
                       'airline', 'air_type', 'class_grade', 'low_price',
                       'discount', 'source']
        insertValues = [[item[f] for f in field_order] for item in seleres['flights']]

        # Positional bind placeholders :1..:N matching self.column_list.
        value_orders = ','.join(':' + str(i) for i in range(1, len(self.column_list) + 1))
        insertsql = "insert into %s(%s) values(%s)" % (self.table_name, ','.join(self.column_list), value_orders)
        deletesql = "delete from %s where scan_date=:1 and scan_hour=:2" % (self.table_name)
        orcl.batchinsert_ex(deletesql, deleteValues, insertsql, insertValues)


# Entry point: crawl the fares, then load them into the database.
if __name__ == '__main__':
    crawler = selenium_ctrip()
    scraped = crawler.get_ctrip_data()
    crawler.load_ctrip_data(scraped)

主体代码已经完成,笔者整体项目是将爬取的数据直接载入oracle数据库,这个根据所需自行完成剩下代码。

chineseocr_lite对于分散字体的识别准确率并不是那么高,但在每次打开浏览器后成功识别一次即可,最终运行效果如下:

Connected to pydev debugger (build 192.6817.19)
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-14 drag slider button
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-14 dest characters: 香糊丽舍
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-14 candidate characters: 糊 E 香 含 畅 在
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-14 dest characters: 四川盆地
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-14 candidate characters: 所 州 盆 四 地 责 I
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-14 dest characters: 好阿灿歌
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-14 candidate characters: 展 法 京 区 西 系 岛
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-14 dest characters: 复口大学
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-14 candidate characters: 大 资 十 快友 巴 色 想
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-14 dest characters: 洛带古镇
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-14 candidate characters: 带 古 异 作 镇 洛 托
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-14 click 洛 located (67,164)
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-14 click 带 located (240,48)
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-14 click 古 located (127,48)
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-14 click 镇 located (206,124)
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-14 校验成功,通过!
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-14 selenium chrome scraped 26 records
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-15 selenium chrome scraped 25 records
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-16 selenium chrome scraped 23 records
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-17 selenium chrome scraped 25 records
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-18 selenium chrome scraped 37 records

你可能感兴趣的:(破解携程中文验证码爬取机票价格数据)