Crawling Zhilian Zhaopin and the National Enterprise Credit Information Publicity System with Selenium


title: Crawling Zhilian Zhaopin and the National Enterprise Credit Information Publicity System with Selenium
date: 2019-09-18 09:28:46
categories: Web scraping
tags:

  • scrapy
  • selenium

cover: https://www.github.com/OneJane/blog/raw/master/小书匠/1566388570600.png
This post covers breaking through encrypted and obfuscated JS files, IP blocking, captcha recognition (slider and word-order click captchas side by side), User-Agent checks, and cookies assembled across multiple URLs.

Zhilian Zhaopin (智联招聘)

The search API returns JSON from which the detail-page URLs are discovered; each detail page is then rendered and parsed with Selenium.
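
For orientation, a minimal sketch of the paginated fe-api call the spider is built around; the field names and query parameters are exactly those used in the spider below, though the endpoint may require extra headers in practice:

import requests

url = ("https://fe-api.zhaopin.com/c/i/sou?start=0&pageSize=90&cityId=639"
       "&salary=0,0&workExperience=-1&education=-1&companyType=-1"
       "&employmentType=-1&jobWelfareTag=-1&kw=游戏&kt=3")
data = requests.get(url).json()["data"]
print(data["count"])                      # total hit count, drives pagination
print(data["results"][0]["positionURL"])  # detail-page URL handed to Selenium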

import json
import platform
import time
from datetime import datetime

import scrapy
from fake_useragent import UserAgent
from lxml import etree
from scrapy import Request
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

# JobItem, null_if, remove_html and get_md5 are project-local helpers (see the sketch after the spider)


class ZhilianSpider(scrapy.Spider):
    name = 'zhilian'
    allowed_domains = ['zhaopin.com']
    start_urls = ['https://sou.zhaopin.com/']

    driver = None
    chrome_options = webdriver.ChromeOptions()
    # proxy_url = get_random_proxy()
    # print(proxy_url + " proxy is being used for crawling")
    # chrome_options.add_argument('--proxy-server=https://' + proxy_url.strip())
    prefs = {
        'profile.default_content_setting_values': {
            'images': 1,  # 1 = load images (2 would disable them)
        }
    }
    chrome_options.add_experimental_option("prefs", prefs)
    # prefs cannot change the UA; pass a random User-Agent through a command-line switch instead
    chrome_options.add_argument('--user-agent=' + UserAgent().random)
    if platform.system() == "Windows":
        driver = webdriver.Chrome('chromedriver.exe', chrome_options=chrome_options)
    elif platform.system() == "Linux":
        chrome_options.add_argument("--headless")
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--no-sandbox')
        driver = webdriver.Chrome(
            executable_path="/usr/bin/chromedriver",
            chrome_options=chrome_options)
    wait = WebDriverWait(driver, 15)

    def start_requests(self):
        data = ["游戏", "期货", "贷款"]
        for kw in data:
            yield Request(
                url="https://fe-api.zhaopin.com/c/i/sou?start=0&pageSize=90&cityId=639&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=" + kw + "&kt=3",
                meta={"kw": kw},
                callback=self.parse_pages)  # kw is read back from response.meta

    def parse_pages(self, response):
        numtotal = json.loads(response.text)["data"]["count"]
        kw = response.meta.get("kw", "游戏")
        for i in range(0, numtotal // 90 + 1):
            url = "https://fe-api.zhaopin.com/c/i/sou?start=" + str(
                90 * i) + "&pageSize=90&cityId=639&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=" + kw + "&kt=3"
            yield Request(
                url=url,
                meta={"kw": kw},
                callback=self.parse)  # kw is read back from response.meta

    def parse(self, response):
        job_list = json.loads(response.text)["data"]["results"]
        for job in job_list:
            yield Request(url=job["positionURL"], callback=self.parse_detail,
                          meta={'cookiejar': 'chrome', 'kw': response.meta.get("kw", "")})

    def parse_detail(self, response):
        print(response.url)
        self.driver.get(response.url)
        self.driver.refresh()
        time.sleep(2)
        self.driver.implicitly_wait(20)
        dom = etree.HTML(self.driver.page_source)
        item = JobItem()
        item['recruitment_position'] = null_if(dom.xpath('//*[@class="summary-plane__title"]'))
        item['salary'] = null_if(dom.xpath('//*[@class="summary-plane__salary"]'))
        item['company_name'] = dom.xpath('//*[@class="company__title"]')[0].text
        item['work_experience'] = dom.xpath('//ul[@class="summary-plane__info"]/li[2]')[0].text
        item['education_background'] = dom.xpath('//ul[@class="summary-plane__info"]/li[3]')[0].text
        item['job_requirements'] = remove_html(
            etree.tostring(dom.xpath('//div[@class="describtion__detail-content"]')[0], encoding="utf-8").decode(
                'utf-8'))
        item['company_info'] = null_if(dom.xpath('//div[@class="company__description"]'))
        item['company_address'] = remove_html(
            etree.tostring(dom.xpath('//span[@class="job-address__content-text"]')[0], encoding="utf-8").decode(
                'utf-8'))
        if len(dom.xpath('//div[@class="highlights__content"]')):
            item['company_welfare'] = remove_html(etree.tostring(dom.xpath('//div[@class="highlights__content"]')[0], encoding="utf-8").decode('utf-8'))
        else:
            item['company_welfare'] = '无'
        item['id'] = get_md5(self.driver.current_url)
        item['keyword'] = response.meta.get("kw", "")
        item['url'] = response.url
        item['crawl_date'] = datetime.now().strftime("%Y-%m-%d")
        yield item
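
The spider relies on a few project-local helpers (null_if, remove_html, get_md5) that the post does not show. A minimal sketch of what they might look like, with names and behavior inferred from the call sites above (treat these as assumptions, not the original implementations):

import hashlib
import re


def null_if(nodes):
    # text of the first matched node, or the "无" (none) placeholder when nothing matched
    return nodes[0].text if nodes else '无'


def remove_html(html):
    # crude tag stripper: flatten a rendered HTML fragment into plain text
    return re.sub(r'<[^>]+>', '', html).strip()


def get_md5(url):
    # stable primary key derived from the page URL
    return hashlib.md5(url.encode('utf-8')).hexdigest()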

National Enterprise Credit Information Publicity System (国家企业信用信息公示系统)

Obtaining the cookies

The site sits behind JiaSuLe (加速乐) anti-bot protection. A cookie-less request to the home page returns HTTP 521 with a __jsluid_h cookie in Set-Cookie plus an obfuscated JS payload; evaluating that payload through two decryption rounds yields the __jsl_clearance cookie. With both cookies and the exact same User-Agent, later requests receive normal 200 responses.

crack.py

import os
import re

import execjs
import requests


class Crack(object):
    """
    Symptoms of hitting the site too often from one IP:
        a normal 200 response that carries no result
        the first decryption round comes out wrong
    """
    def __init__(self, url, test_url):
        path = os.getcwd()
        with open(os.path.join(path, "wc_js.js"), encoding='utf-8') as f:
            wc_js = f.read()
        self.wc_js = execjs.compile(wc_js)
        self.url = url
        self.test_url = test_url

        # Pin the User-Agent: the backend ties the cookies to it, so all later visits must reuse it
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36'
        }

    def acquire_js(self):
        """
        不带cookies请求首页,获得返回的js
        :return:页面中的js,和set_cookies中的jsluid
        """
        response = requests.get(self.url, headers=self.headers)
        if response.status_code == 521:
            return response.text, response.headers['Set-Cookie'].split('=')[1].split(';')[0]
        else:
            print(response.text)
            print(self.headers)
            return None, None

    def first_decryption(self, first_js):
        """
        解密js,获得第二层加密的js
        :param first_js:
        :return:
        """
        x = re.findall('var x="(.*?)"', first_js)[0]
        y = re.findall(',y="(.*?)"', first_js)[0]
        second_js = self.wc_js.call('once_js', x, y)
        # second_js = self.wc_js.call('get_js', x, y, z)
        return second_js

    def regex(self, js):
        regex = r"!*window\[.*?\]"
        find = re.findall(regex, js)
        if find:
            for f in find:
                if '!' in f:
                    if len(re.findall('!', f)) % 2 == 0:
                        js = js.replace(f, 'false')
                    else:
                        js = js.replace(f, 'true')
                else:
                    js = js.replace(f, 'undefined')
        js = js.replace('window.headless', 'undefined')
        return js

    def replace_url(self, js):
        # Pull out the two variable names being assigned
        _3d = re.findall(r"(var .{0,5}=)document\.createElement\('div'\);", js)
        _2b = re.findall(r"(var .{0,5}=).{0,5}\.match\(/https\?:\\\/\\\//\)\[0\];", js)

        # Substitute in the URL actually being visited
        js = re.sub(r"var .{0,5}=document\.createElement\('div'\);", _3d[0] + f'"{self.url.replace("http://", "")}";',
                    js)
        js = re.sub(r"_.{0,5}\.innerHTML='';", "", js)
        js = re.sub(r"_.{0,5}=.{0,5}\.firstChild\.href;", "", js)
        js = re.sub(r"var .{0,5}=.{0,5}\.match\(/https\?:\\\/\\\//\)\[0\];", _2b[0] + '"http://";', js)
        js = re.sub(r"_.{0,5}=.{0,5}\.substr\(.{0,5}\.length\)\.toLowerCase\(\);", "", js)
        return js

    def second_decryption(self, second_js):
        """
        把第二层js准换成本地可以运行的js
        !!!此处可能会出错!!!
        :param second_js: 第一次解密的js
        :return: __jsl_clearance的值
        """
        # Undo the double escaping
        js = second_js.replace('\\\\', '\\')

        # Cut out just the document.cookie assignment
        js = 'cookie' + js.split('document.cookie')[1]
        js = js.split('GMT;Path=/;')[0] + "'"

        if re.findall(r"(var .{0,5}=)document\.createElement\('div'\);", js):
            js = self.replace_url(js)

        # Neutralize any window[...] probes that may appear
        js = self.regex(js)

        s = """
            function cook() {
            %s
            return cookie
            }
            """
        new_js = s % js
        ctx = execjs.compile(new_js)
        # Split __jsl_clearance out of the returned cookie string
        jsl = ctx.call('cook')
        jsl = jsl.split(';')[0]
        jsl_clearance = jsl.split('=')[1]
        return jsl_clearance

    def test_cookies(self, jsluid, jsl_clearance):
        """
        带cookies访问,测试拿到的是否正确
        :param jsluid:cookies中的参数
        :param jsl_clearance: cookies中的参数
        :return:
        """
        headers = self.headers.copy()
        headers['Cookie'] = f'__jsluid_h={jsluid}; __jsl_clearance={jsl_clearance};'
        response = requests.get(self.test_url, headers=headers)
        print(response.text)
        return response.status_code

    def run(self):
        while True:
            first_js, jsluid = self.acquire_js()
            second_js = self.first_decryption(first_js)
            try:
                jsl_clearance = self.second_decryption(second_js)
            except Exception:
                # print(second_js)
                continue
            else:
                code = self.test_cookies(jsluid, jsl_clearance)
                if code == 200:
                    return jsluid, jsl_clearance
                else:
                    print(code)
                    # print(second_js)
                    continue


if __name__ == '__main__':
    # Enterprise credit information publicity system
    url = "http://www.gsxt.gov.cn/index.html"
    test_url = "http://www.gsxt.gov.cn/index.html"

    # # 66ip proxy site
    # url = "http://www.66ip.cn/2.html"
    # test_url = "http://www.66ip.cn/2.html"

    # # Ministry of Public Security site
    # url = 'http://www.mps.gov.cn/'
    # test_url = 'http://www.mps.gov.cn/'

    ck = Crack(url, test_url)
    jsluid, jsl_clearance = ck.run()
    print('jsluid:', jsluid)
    print('jsl_clearance:', jsl_clearance)
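
Once run() succeeds, every subsequent request must reuse the same User-Agent together with both cookies, along the lines of test_cookies above. A quick sanity check (the hot-search URL below is the same one used as test_url in the spider section):

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36',
    'Cookie': f'__jsluid_h={jsluid}; __jsl_clearance={jsl_clearance};',
}
resp = requests.get("http://www.gsxt.gov.cn/corp-query-entprise-info-hot-search-list.html?province=100000", headers=headers)
print(resp.status_code)  # anything but 200 means the cookies were rejected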

Cracking the captchas with Chaojiying (超级鹰)

After the keyword search is submitted, gsxt raises a Geetest challenge that is either a slider or a word-order click captcha. Sliders are solved locally by diffing screenshots to locate the gap; click captchas are shipped to the Chaojiying coding platform (codetype 9005), which returns the coordinates to click separated by "|".

import platform
import random
import re
import time
from hashlib import md5
from io import BytesIO

import cv2
import numpy as np
import requests
from fake_useragent import UserAgent
from lxml import etree
from matplotlib import pyplot as plt
from PIL import Image
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from crack import Crack  # the jsl solver from the previous section


class SearchResultParse(object):
    '''Parse the search-result page
    '''

    def __init__(self, pagesource, base_url, parse_rule):
        self.selector = etree.HTML(pagesource)
        self.url_list = []
        self.base_url = base_url
        self.parse_rule = parse_rule['search_result_url']

    def search_result_parse(self):
        self.url_list = [self.base_url + i for i in self.selector.xpath(self.parse_rule)]
        return self.url_list


class PageDetailParse(object):
    '''Parse the detail page
    '''

    def __init__(self, pagesource, parse_rule):
        self.selector = etree.HTML(pagesource)
        self.parse_rule = parse_rule
        self.info_list = {}

    def search_result_parse(self, primary_info=None):
        if primary_info is None:
            primary_info = []
        for i in self.parse_rule['primaryinfo']:
            primary_info.append(
                self.selector.xpath(i).replace("\n", "").replace("\t", "").replace("\r", "").replace(" ", ""))
        self.info_list['primary_info'] = primary_info
        return self.info_list


class CookieRequest(object):
    '''Fetch the search results with cookies attached
    '''

    def __init__(self, url_list=None):
        '''Set up the cookie used by the requests session
        '''
        self.url_list = url_list
        self.session = requests.Session()
        self.result = []
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36'
        }

    def cookie_requests(self):
        '''Visit each result URL in turn with the cookies
        '''
        url = "http://www.gsxt.gov.cn/index.html"
        test_url = "http://www.gsxt.gov.cn/corp-query-entprise-info-hot-search-list.html?province=100000"
        ck = Crack(url, test_url)
        jsluid, jsl_clearance = ck.run()
        # NOTE: the detail pages also expect the JSESSIONID issued during the browser
        # search session; in practice it has to be read from the Selenium driver's
        # cookies and appended here (Crack.run itself returns only the two jsl values)
        self.headers['Cookie'] = f'__jsluid_h={jsluid}; __jsl_clearance={jsl_clearance};'
        for url in self.url_list:
            response = self.session.get(url=url, headers=self.headers)
            self.result.append(response.text)
            time.sleep(5)
        return self.result


class MaxEnterError(Exception):
    '''Raised when the keyword-entry retry limit is exceeded
    '''

    def __init__(self, ErrorInfo):
        super().__init__(self)  # initialize the parent class
        self.errorinfo = ErrorInfo

    def __str__(self):
        return self.errorinfo


class GtClickShot(object):

    def __init__(self, username, password, soft_id):
        '''Initialize the Chaojiying client
        args:
            username(str): Chaojiying account name
            password(str): Chaojiying password
            soft_id(str): software id issued by Chaojiying
        '''
        self.username = username
        self.password = md5(password.encode("utf-8")).hexdigest()
        self.soft_id = soft_id
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """发送图片至打码平台
        args:       
            im(Byte): 图片字节
            codetype(str): 题目类型 参考 http://www.chaojiying.com/price.html
        return(json):返回打码信息,包含坐标信息,坐标信息用“|”隔开
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files,
                          headers=self.headers)
        return r.json()

    def ReportError(self, im_id):
        """识别错误返回题分
        args:
            im_id(str):报错题目的图片ID
        return(str):报错反馈
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
        return r.json()


class CorpSearch(object):
    def __init__(self, init_url, index_url, headers, max_click):

        '''Initialize
        args:
            init_url: bootstrap url; the JiaSuLe anti-bot JS requires visiting it before the target site to obtain gt and challenge
            index_url: home page url of the target site
            headers: request header options
            max_click: maximum number of repeated clicks, a loop-and-check workaround for an unresponsive search button
            self.wait: default maximum explicit-wait time
            self.click_valitimes: count of click validations; when > 0 a score refund must be requested, when 0 it must not
        '''
        chrome_options = webdriver.ChromeOptions()
        prefs = {
            'profile.default_content_setting_values': {
                'images': 1,  # 1 = load images (required for the captcha screenshots)
            }
        }
        chrome_options.add_experimental_option("prefs", prefs)
        # prefs cannot change the UA; pass a random User-Agent through a command-line switch instead
        chrome_options.add_argument('--user-agent=' + UserAgent().random)
        self.init_url = init_url
        self.index_url = index_url
        if platform.system() == "Windows":
            self.driver = webdriver.Chrome('chromedriver.exe', chrome_options=chrome_options)
        elif platform.system() == "Linux":
            chrome_options.add_argument("--headless")
            chrome_options.add_argument('--disable-gpu')
            chrome_options.add_argument('--no-sandbox')
            self.driver = webdriver.Chrome(
                executable_path="/usr/bin/chromedriver",
                chrome_options=chrome_options)
        self.wait = WebDriverWait(self.driver, 50)
        self.max_entertimes = max_click
        self.click_valitimes = 0
        self.action = ActionChains(self.driver)
        self.gt_shot = GtClickShot("****", "*****", "901554")
        self.options = webdriver.ChromeOptions()
        self.headers = headers
        for option in self.headers:
            self.options.add_argument(option)  # note: built after the driver, so these header options are never actually applied

    # Bootstrap: bypass the JiaSuLe anti-bot check, obtain gt and challenge, then load the home page
    def init(self):

        '''
        Visit the bootstrap url, then enter the home page
        '''
        self.driver.get(self.init_url)
        self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "body > pre:nth-child(1)")))
        self.driver.get(self.index_url)

    # Load the home page, type the search keyword and click the query button
    # If the click does not register, press Enter again automatically, up to a maximum count; beyond that an exception aborts the program
    def input_query(self, keyword):

        '''Enter a keyword and run the query
        args:
            keyword: search keyword
        return:
            used only to exit the method
        '''
        enter_word = self.wait.until(EC.presence_of_element_located((By.ID, "keyword")))
        self.wait.until(EC.presence_of_element_located((By.ID, "btn_query")))
        time.sleep(random.randint(8, 15) / 10)
        enter_word.send_keys(keyword)
        time.sleep(random.randint(5, 10) / 10)
        enter_word.send_keys(Keys.ENTER)
        while True:
            if self.max_entertimes == 0:
                raise MaxEnterError('---Out of max times on the search enter---')
            gt_panel = self.driver.find_element_by_css_selector("body > div.geetest_panel.geetest_wind")
            style_value = gt_panel.value_of_css_property("display")
            if style_value.strip() == "block":
                break
            else:
                enter_word.send_keys(Keys.ENTER)
                time.sleep(random.randint(1, 5) / 10)
                self.max_entertimes -= 1
        return

    # Check whether the page contains an element; note the lookup is by class name
    def is_element_exist(self, class_name):

        '''Check whether an element exists
        args:
            class_name: the element's class attribute
        return:
            True if present, False otherwise
        '''

        try:
            self.driver.find_element_by_class_name(class_name)
            return True
        except NoSuchElementException:
            return False

    # Take a screenshot and keep it in memory to speed up the image processing
    def get_screenshot(self):

        '''Screenshot
        return:
            the screenshot as a PIL image
        '''

        screenshot = self.driver.get_screenshot_as_png()
        screenshot = Image.open(BytesIO(screenshot))
        return screenshot

    # Get the position of the captcha image, used for cropping
    def get_position(self, pos_img):

        '''Coordinates and size of the captcha image
        args:
            pos_img: the element used to locate the captcha
        return:
            the element's bounding box in PIL crop order: (left, top, right, bottom)
        '''

        location = pos_img.location
        size = pos_img.size
        top, bottom, left, right = location['y'], location['y'] + size['height'], location['x'], location['x'] + size[
            'width']
        return (left, top, right, bottom)

    # For slider captchas, screenshot the captcha both intact and with the gap shown
    def get_slide_images(self):

        '''Capture the images with and without the gap
        '''
        canvas_img = self.wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".geetest_canvas_img.geetest_absolute > div")))
        position = self.get_position(canvas_img)
        befor_screenshot = self.get_screenshot()
        befor_img = befor_screenshot.crop(position)
        befor_img.save("befor_click.png")

        btn_slide = self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, "geetest_slider_button")))
        self.action.click_and_hold(btn_slide).perform()
        after_screenshot = self.get_screenshot()
        after_img = after_screenshot.crop(position)
        after_img.save("after_click.png")

    # Locate the gap and compute the slide distance (grayscale, diff, threshold denoise, find the gap, compute the distance)
    def get_slide_distance(self):

        '''Compute the slide distance
        return:
            the distance to slide
        '''

        # the two crops saved by get_slide_images (relative paths, same working directory)
        befor_click_img = "befor_click.png"
        after_click_path = "after_click.png"
        befor_img = cv2.imread(befor_click_img)
        after_img = cv2.imread(after_click_path)

        befor_gray = cv2.cvtColor(befor_img, cv2.COLOR_BGR2GRAY)
        after_gray = cv2.cvtColor(after_img, cv2.COLOR_BGR2GRAY)
        img_diff = np.array(befor_gray) - np.array(after_gray)

        height, width = img_diff.shape

        for i in range(height):
            for j in range(width):
                if img_diff[i][j] > 245 or img_diff[i][j] < 60:
                    img_diff[i][j] = 0

        start_position = random.choice([4, 5, 6])
        reshape_img = img_diff.T
        sum_color = list(map(lambda x: sum(x), reshape_img))
        end_position = len(sum_color) - 1  # fallback in case no gap column is detected
        for i in range(1, len(sum_color)):
            if sum_color[i] > 1000 and i > 60:
                end_position = i
                break

        slide_distance = end_position - start_position
        return slide_distance

    # Simulate the mouse track: slow acceleration at the start (2), fast in the middle (5), slow again (2), then slow deceleration (1)
    # Returns x/y offsets plus sleep checkpoints; sleeps happen at the start, middle and end
    def get_track(self, distance, track_list=None):

        '''Build the slide track
        args:
            distance: distance to slide
        kargs:
            track_list: the track, initialized empty
        return:
            the track and the two breakpoint positions
        '''

        if track_list is None:
            track_list = []
        base = distance / 10
        x1 = round(base * 2)
        x2 = round(base * 5)
        x3 = x1
        x4 = distance - x1 - x2 - x3
        ynoise_num = random.randint(5, 10)
        y1 = [random.randint(-2, 2) for _ in range(ynoise_num)]
        yrdm = list(set(random.choice(range(distance)) for _ in range(ynoise_num)))
        x = [1] * distance
        y = [0] * distance
        for i, j in enumerate(yrdm):
            y[j] = y1[i]
        t1 = sorted([random.randint(8, 13) / 1000 for _ in range(x1)], reverse=True)
        t2 = sorted([random.randint(1, 8) / 1000 for _ in range(x2)], reverse=True)
        t3 = sorted([random.randint(8, 13) / 1000 for _ in range(x3)], reverse=True)
        t4 = sorted([random.randint(12, 20) / 1000 for _ in range(x4)])
        t = t1 + t2 + t3 + t4

        for i in (zip(x, y, t)):
            track_list.append(i)
        return (track_list, x1 + x2, x1 + x2 + x3)

    # For click captchas, capture the hint text and the clickable image, plus the captcha panel element
    def get_click_images(self):

        '''Capture the images needed for clicking
        return:
            the image containing the click targets,
            the hint image (used to count the clicks when debugging locally),
            the captcha element (used to anchor the mouse and compute relative coordinates)
        '''

        click_img_element = self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, "geetest_widget")))
        self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, "geetest_item_img")))
        time.sleep(random.randint(1, 5) / 10)
        click_position = self.get_position(click_img_element)
        all_screenshot = self.get_screenshot()
        click_img = all_screenshot.crop(click_position)
        click_img.save("click_img.png")

        tip_img = self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, "geetest_tip_img")))
        tip_position = self.get_position(tip_img)
        tip_img = all_screenshot.crop(tip_position)
        tip_img.save("tip_img.png")

        return (click_img, tip_img, click_img_element)

    # Count the characters to click: grayscale, inverse binarize, transpose, sum along Y per X column, count the split points
    def cal_char_num(self, char_img_path):

        '''Count the characters that must be clicked
        args:
            char_img_path: path of the hint image
        return:
            number of clicks
        '''

        flag = 0
        origin_img = cv2.imread(char_img_path)
        gray_img = cv2.cvtColor(origin_img, cv2.COLOR_BGR2GRAY)
        ret, thresh1 = cv2.threshold(gray_img, 127, 255, cv2.THRESH_BINARY_INV)
        transpos_img = np.array(thresh1).T
        result = list(map(lambda x: sum(x), transpos_img))
        for i in range(len(result) - 3):
            if result[i] == 0 and result[i + 1] == 0 and result[i + 2] > 0:
                flag += 1
        return flag

    # Return the captcha characters' coordinates as integers, one per click point
    def char_absolute_coord(self, img, num, coord=None):

        '''Debug helper: click the captcha image manually and return integer coordinates
        args:
            img: captcha image
            num: number of clicks
        kargs:
            coord: character coordinates
        return:
            the character coordinates
        '''
        if coord is None:
            coord = []
        img = Image.open(img)
        plt.imshow(img)
        points = plt.ginput(num)
        plt.close()
        for i in points:
            x_co, y_co = i
            coord.append((round(x_co), round(y_co)))
        return coord

    # Return the relative offsets from the start through each character to click, as [(xoffset, yoffset), ...]
    def get_offset_coord(self, absolute_coord, click_track=None):

        '''Relative offsets between consecutive click targets, used for mouse moves
        args:
            absolute_coord: absolute coordinates of the captcha characters
        kargs:
            click_track: relative offsets between the characters to click
        return:
            the relative offsets
        '''

        if click_track is None:
            click_track = []
        for i, j in enumerate(absolute_coord):
            if i == 0:
                click_track.append(j)
            else:
                click_track.append((j[0] - absolute_coord[i - 1][0], j[1] - absolute_coord[i - 1][1]))
        return click_track

    # Solve the click captcha: count the targets, then click each character in order using the computed relative offsets
    # The captcha image is sent to the Chaojiying coding platform, which returns the coordinates
    def click_captcha_validate(self):

        '''Validate using the coordinates returned by the coding platform
        return:
            used only to exit the method
        '''
        click_img, tip_img, click_img_element = self.get_click_images()

        bytes_array = BytesIO()
        click_img.save(bytes_array, format="PNG")
        coord_result = self.gt_shot.PostPic(bytes_array.getvalue(), "9005")
        print(coord_result)
        pic_str = coord_result.get("pic_str")
        if not pic_str:
            raise RuntimeError("coding platform timed out")
        groups = pic_str.split('|')
        pic_id = coord_result.get("pic_id")
        points = [[int(num) for num in group.split(',')] for group in groups]

        #        tip_img_path="D:\\Anaconda3\\Lib\\captcha\\gt_validate\\tip_img.png"
        #        click_img_path="D:\\Anaconda3\\Lib\\captcha\\gt_validate\\click_img.png"

        #        num=self.cal_char_num(tip_img_path)
        #        points=self.char_absolute_coord(click_img_path,num)

        mouse_track = self.get_offset_coord(points)
        print(mouse_track)
        self.action.move_to_element_with_offset(click_img_element, 0, 0)
        for position in mouse_track:
            self.action.move_by_offset(position[0], position[1])
            self.action.click()
            self.action.pause(random.randint(3, 7) / 10)
        self.action.perform()
        time.sleep(random.randint(4, 6) / 10)
        click_submit_btn = self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'geetest_commit_tip')))
        click_submit_btn.click()
        self.action.reset_actions()
        self.valide_process(pic_id=pic_id)
        return

    # Solve the slider captcha: compute distance and track, with random pauses at the start, middle and end
    def slide_captcha_validate(self):

        '''Slider captcha validation
        return:
            used only to exit the method
        '''

        self.get_slide_images()
        distance = self.get_slide_distance()
        track, p1, p2 = self.get_track(distance)
        time.sleep(random.randint(3, 7) / 10)
        for i, j in enumerate(track):
            if i == p1 or i == p2:
                time.sleep(random.randint(3, 7) / 10)
            self.action.move_by_offset(j[0], j[1])
            time.sleep(j[2])
        time.sleep(random.randint(3, 7) / 10)
        self.action.release()
        self.valide_process()
        return

    # Check whether the crack succeeded, with a retry mechanism
    # Past the maximum validation count, the "click here to retry" prompt must be clicked
    def valide_process(self, pic_id=None):

        '''Validation flow
        1> if the Geetest panel disappears and the result box appears, validation succeeded, stop;
        2> if step 1 fails, it timed out;
        3> timeout causes: the Geetest panel did not disappear (go to step 4) or the result box did not appear (go to step 6);
        4> panel still there: if the maximum validation count was exceeded, click retry and go to step 7, otherwise go to step 5;
        5> otherwise, detect the captcha type, call the matching validation method, and go back to step 1;
        6> if the result box never appeared, quit and close the browser;
        7> on retry, quit the browser on a blank response, otherwise detect the captcha type, call the matching method, and go back to step 1.
        args:
            pic_id: image id of a click captcha
        return:
            either validation succeeds or the browser is quit
        '''

        try:
            WebDriverWait(self.driver, 3).until_not(
                EC.visibility_of_element_located((By.CSS_SELECTOR, "body > div.geetest_panel")))
            WebDriverWait(self.driver, 10).until(EC.visibility_of_element_located((By.ID, "advs")))
            print("Validate Successful")
            return
        except TimeoutException:
            try:
                gt_panel_error = self.driver.find_element_by_css_selector(
                    "body > div.geetest_panel.geetest_wind > div.geetest_panel_box > div.geetest_panel_error")
                error_display = gt_panel_error.value_of_css_property("display")

                if error_display.strip() == "block":
                    gt_panel_error_content = self.driver.find_element_by_css_selector(
                        ".geetest_panel_error > div.geetest_panel_error_content")
                    self.action.move_to_element(gt_panel_error_content).click().perform()
                    self.action.reset_actions()
                    try:
                        WebDriverWait(self.driver, 3).until_not(
                            EC.visibility_of_element_located((By.CSS_SELECTOR, "body > div.geetest_panel")))
                        WebDriverWait(self.driver, 10).until(lambda x: x.find_element_by_id('advs').is_displayed())
                        print("Validate Successful")
                        return
                    except TimeoutException:
                        self.slide_orclick_validate(pic_id)
                else:
                    self.slide_orclick_validate(pic_id)

            except Exception:
                print('error occurred')
                return

    # Decide whether to run the click or the slider validation
    def slide_orclick_validate(self, pic_id=None):

        '''Decide whether the next step is slider validation, click validation, or giving up
        args:
            pic_id: image id of a click captcha
        return:
            slider validation, click validation, or None
        '''

        try:
            WebDriverWait(self.driver, 3).until(EC.presence_of_element_located((By.CLASS_NAME, "geetest_close")))
            print('Validate Failed,retry again')
            if self.is_element_exist("geetest_canvas_img"):
                print('captcha type is slide')
                return self.slide_captcha_validate()
            else:
                print('captcha type is click')
                if self.click_valitimes > 0:
                    self.gt_shot.ReportError(pic_id)
                self.click_valitimes += 1
                return self.click_captcha_validate()
        except TimeoutException:
            print("Directly no click or slide validate")
            return

    # Switch back to the home page with cookies intact to keep searching
    def switch_hmpg(self):

        '''Switch from the result page back to the home page
        return: used only to exit the method
        '''
        self.wait.until(EC.presence_of_element_located((By.ID, "advs")))
        hmpg_btn = self.driver.find_element_by_css_selector(
            "body > div.container > div.header_box > div > div > a:nth-child(1)")
        self.action.move_to_element(hmpg_btn).click().perform()
        self.action.reset_actions()
        self.wait.until(lambda x: x.find_element_by_id('btn_query').is_displayed())
        return

    # Crawl steps, whether entering via the index page or continuing from the home-page link
    def main(self, keyword, start_pg=None):

        '''Main driver routine
        args:
            keyword: search keyword
        kargs:
            start_pg: whether the JiaSuLe bootstrap visit is needed; by default it is
        '''

        if start_pg == "homepage":
            self.switch_hmpg()
        else:
            self.init()
        self.input_query(keyword)
        self.slide_orclick_validate()

    # Save the result page source, to be parsed and fetched further with requests
    def to_dict(self):

        '''Save the query-result page source for requests-based detail parsing
        '''

        htmlpage = self.driver.page_source

        return {
                'page': htmlpage
                }


if __name__ == '__main__':
    init_url = "http://www.gsxt.gov.cn/SearchItemCaptcha"
    index_url = "http://www.gsxt.gov.cn/index.html"
    base_url = 'http://www.gsxt.gov.cn'
    result_parse_rule = {'search_result_url': '//*[@id="advs"]/div/div[2]/a/@href'}
    detail_parse_rule = {
        'primaryinfo': ['string(//*[@id="primaryInfo"]/div/div[@class="overview"]/dl[{}])'.format(i) for i in
                        range(15)], }
    max_click = 10
    chm_headers = ['Host="www.gsxt.gov.cn"',
                   'Connection="keep-alive"',
                   'User-Agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"',
                   'Upgrade-Insecure-Requests=1',
                   'Accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"',
                   'Accept-Encoding="gzip, deflate"',
                   'Accept-Language="zh-CN,zh;q=0.9"']

    search = CorpSearch(init_url, index_url, chm_headers, max_click)
    search.main("腾讯")
    cookie_html = search.to_dict()
    search_result = SearchResultParse(cookie_html['page'], base_url, result_parse_rule)
    url_list = search_result.search_result_parse()

    detail_request = CookieRequest(url_list=url_list)
    detail_result = detail_request.cookie_requests()
    for pg in detail_result:
        pg_detail = PageDetailParse(pg, detail_parse_rule)
        detail = pg_detail.search_result_parse()
        m = re.findall(r'\[(.*?)\]', str(detail))
        info_list = m[0].replace('\'', '').split(', ')
        sql = "insert into company(code,name,type,start,end,) values(%s,%s,%s,%s.%s)"
        count, rt_list = MysqlConnection.execute_sql(sql, (info_list[0],info_list[1],info_list[2],info_list[3]))

Spider implementation

import re

import scrapy
from scrapy import Request

# CorpSearch, SearchResultParse and CookieRequest come from the module above;
# CompanyItem and company_info are project-local helpers (company_info sketched below)


class EnterPriseSpider(scrapy.Spider):
    name = 'enterprise'
    allowed_domains = ['gsxt.gov.cn']
    start_urls = ['http://www.gsxt.gov.cn/index.html']

    def __init__(self, word=None, *args, **kwargs):
        super(EnterPriseSpider, self).__init__(*args, **kwargs)
        self.word = word

    def start_requests(self):
        init_url = "http://www.gsxt.gov.cn/SearchItemCaptcha"
        index_url = "http://www.gsxt.gov.cn/index.html"
        base_url = 'http://www.gsxt.gov.cn'
        result_parse_rule = {'search_result_url': '//*[@id="advs"]/div/div[2]/a/@href'}


        max_click = 10
        chm_headers = ['Host="www.gsxt.gov.cn"',
                       'Connection="keep-alive"',
                       'User-Agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"',
                       'Upgrade-Insecure-Requests=1',
                       'Accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"',
                       'Accept-Encoding="gzip, deflate"',
                       'Accept-Language="zh-CN,zh;q=0.9"']

        search = CorpSearch(init_url, index_url, chm_headers, max_click)
        search.main(self.word)
        cookie_html = search.to_dict()
        search_result = SearchResultParse(cookie_html['page'], base_url, result_parse_rule)
        url_list = search_result.search_result_parse()


        # dummy request whose only purpose is to hand control back to Scrapy's callback chain
        yield Request(url="https://www.baidu.com/", callback=self.parse,
                      meta={'url_list': url_list})

    def parse(self, response):
        detail_parse_rule = {
            'primaryinfo': ['string(//*[@id="primaryInfo"]/div/div[@class="overview"]/dl[{}])'.format(i) for i in
                            range(15)], }
        url_list = response.meta.get("url_list", "")
        detail_request = CookieRequest(url_list=url_list)
        detail_result = detail_request.cookie_requests()
        for pg in detail_result:
            pg_detail = PageDetailParse(pg, detail_parse_rule)
            detail = pg_detail.search_result_parse()
            m = re.findall(r'\[(.*?)\]', str(detail))
            info_list = m[0].replace('\'', '').split(', ')
            item = CompanyItem()
            item['name'] = company_info(info_list, "企业名称:")
            item['code'] = company_info(info_list, "统一社会信用代码:")
            item['type'] = company_info(info_list, "类型:")

            start = company_info(info_list, "营业期限自:")
            partner_start = company_info(info_list, "合伙期限自:")
            item['start'] = start if "无" == partner_start else partner_start
            end = company_info(info_list, "营业期限至:")
            partner_end = company_info(info_list, "合伙期限至:")
            item['end'] = end if "无" == partner_end else partner_end

            item['capital'] = company_info(info_list, "注册资本:")
            item['owner'] = company_info(info_list, "法定代表人:")
            item['establish'] = company_info(info_list, "成立日期:")
            item['registration'] = company_info(info_list, "登记机关:")
            item['check'] = company_info(info_list, "核准日期:")
            item['status'] = company_info(info_list, "登记状态:")
            residence = company_info(info_list, "住所:")
            premises = company_info(info_list, "主要经营场所:")
            item['address'] = residence if "无" == premises else premises
            item['scope'] = company_info(info_list, "经营范围:")
            item['partner'] = company_info(info_list, "执行事务合伙人:")
            yield item
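
company_info is another project-local helper the post leaves out. A plausible sketch, assuming it looks up the entry starting with the given label and falls back to "无" (the sentinel the spider compares against):

def company_info(info_list, label):
    # return the value that follows the label in the flattened detail list, or "无" if absent
    for entry in info_list:
        if entry.startswith(label):
            return entry[len(label):]
    return '无'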

main.py

from scrapy.cmdline import execute

import sys
import os

sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# execute(["scrapy", "crawl", "enterprise","-a","word=百度"])
execute(["scrapy", "crawl", "zhilian"])
