【python爬虫】设计自己的爬虫 1. request封装

通过requests.session().request 封装request方法
考虑到请求HTTP/2.0
同时封装httpx 来处理HTTP/2.0的请求

封装requests

# 请求失败时自动重试:最多尝试 5 次,每次重试前等待 2 秒
# Retry on failure: up to 5 attempts, waiting 2000 ms between attempts.
# retry_on_result fires when the function returns None; a raised exception
# (re-raised in the except below) also triggers a retry.
@retry(stop_max_attempt_number=5, retry_on_result=lambda re_data: re_data is None, wait_fixed=2000)
def requests_request(self, method, url, params=None, data=None, json=None, headers=None, files=None, verify=False,
                     cert=None, timeout=None, proxies=None, proxy=None, **kwargs):
    """
    Wrap ``requests.Session().request()``: take the HTTP method, URL,
    parameters, headers, etc. as arguments and return the response.

    Note: ``verify`` (SSL certificate verification switch) defaults to
    False here, which skips certificate checks — insecure; ``cert`` is a
    local SSL client certificate. Drop both if SSL handling is not needed.

    :param proxies: ready-made requests-style proxy mapping; takes
                    precedence over ``proxy`` when supplied.
    :param proxy: "host:port" shorthand, expanded into ``proxies`` only
                  when ``proxies`` is not given.
    :return: ``requests.Response`` on success; raises on failure so that
             ``@retry`` can attempt the request again.
    """
    try:
        # Expand the shorthand only when the caller did not pass an
        # explicit mapping (the original clobbered `proxies` with None).
        if proxies is None and proxy:
            # requests matches proxies by bare scheme key ('http'/'https'),
            # not the httpx-style 'http://' keys the original used.
            proxies = {
                'http': 'http://' + proxy,
                'https': 'https://' + proxy,
            }
        # Use the Session as a context manager so its connection pool is
        # closed deterministically (the original leaked a fresh session
        # per call, which also defeats cross-request state keeping).
        with requests.Session() as session:
            re_data = session.request(method, url, params=params, data=data, json=json, headers=headers,
                                      files=files, cert=cert, timeout=timeout, verify=verify,
                                      proxies=proxies, **kwargs)
    except Exception as e:
        # Surface the concrete failure, then re-raise to trigger @retry.
        print("请求失败:{0}".format(e))
        logger.error("Error occurred: %s", str(e), exc_info=True)
        raise
    # Return the response object to the caller
    return re_data

封装httpx

# Retry on failure: up to 5 attempts, waiting 2000 ms between attempts.
@retry(stop_max_attempt_number=5, retry_on_result=lambda re_data: re_data is None, wait_fixed=2000)
def httpx_request(self, method, url, is_http2=False, content=None, data=None, files=None, json=None, params=None,
                  headers=None, cookies=None, timeout=None, extensions=None, proxy=None, **kwargs):
    """
    Wrap ``httpx.Client().request()`` so HTTP/2 can be used when
    ``is_http2`` is True.

    ``method.upper()`` normalizes the HTTP method to upper case, as httpx
    treats method names case-sensitively.

    :param is_http2: enable HTTP/2 support on the client.
    :param proxy: "host:port" shorthand expanded into an httpx proxy map.
    :return: ``httpx.Response`` on success; raises on failure so that
             ``@retry`` can attempt the request again.
    """
    try:
        # Build the proxy mapping from the shorthand if one was given.
        proxies = None
        if proxy:
            # httpx expects scheme-prefix keys ending in '://'.
            proxies = {
                'http://': 'http://' + proxy,
                'https://': 'https://' + proxy,
            }
        # Use the client as a context manager so its connection pool is
        # closed deterministically (the original leaked the client).
        with httpx.Client(http2=is_http2, proxies=proxies) as client:
            re_data = client.request(method.upper(), url, content=content, data=data, files=files, json=json,
                                     params=params, headers=headers, cookies=cookies, timeout=timeout,
                                     extensions=extensions, **kwargs)
    except Exception as e:
        # Surface the concrete failure, then re-raise to trigger @retry.
        print("请求失败:{0}".format(e))
        logger.error("Error occurred: %s", str(e), exc_info=True)
        raise
    # Return the response object to the caller
    return re_data

将两个请求封装在一个方法里

# Retry on failure: up to 5 attempts, waiting 2000 ms between attempts.
@retry(stop_max_attempt_number=5, retry_on_result=lambda re_data: re_data is None, wait_fixed=2000)
def request(self, method, url, is_http2=False, params=None, data=None, json=None, headers=None, files=None,
            verify=False, cert=None, timeout=None, proxies=None, content=None, cookies=None, extensions=None,
            proxy=None, **kwargs):
    """
    Unified entry point: dispatch to the httpx backend (HTTP/2 capable)
    when ``is_http2`` is True, otherwise to the requests backend.

    :param proxy: "host:port" shorthand forwarded to both backends
                  (new optional parameter — backward compatible).
    :param proxies: requests-style proxy mapping; forwarded to the
                    requests backend only (httpx uses ``proxy``).
    :return: the backend's response object; raises on failure so that
             ``@retry`` can attempt the request again.

    NOTE(review): ``verify``/``cert`` are not forwarded on the httpx
    path — confirm whether SSL options are needed for HTTP/2 requests.
    """
    try:
        if is_http2:
            # HTTP/2 path via httpx; method normalized to upper case.
            re_data = self.httpx_request(method=method.upper(), url=url, is_http2=is_http2, content=content,
                                         data=data, files=files, json=json, params=params, headers=headers,
                                         cookies=cookies, timeout=timeout, extensions=extensions, proxy=proxy,
                                         **kwargs)
        else:
            # Default path via requests.
            re_data = self.requests_request(method=method, url=url, params=params, data=data, json=json,
                                            headers=headers, files=files, cert=cert, timeout=timeout,
                                            verify=verify, proxies=proxies, proxy=proxy, **kwargs)
    except Exception as e:
        # Surface the concrete failure, then re-raise to trigger @retry.
        print("请求失败:{0}".format(e))
        logger.error("Error occurred: %s", str(e), exc_info=True)
        raise
    # Return the response object to the caller
    return re_data

通过is_http2来区分

测试代码如下

if __name__ == '__main__':

    # requests_request: plain request via the requests backend.
    # NOTE(review): `request_main` is defined elsewhere in the file —
    # presumably an instance of the wrapper class; confirm before running.
    request_data = request_main.requests_request("get", 'https://spa16.scrape.center/')
    if request_data:
        print(request_data.text)
        print(request_data.status_code)

    # httpx: HTTP/2.0 request
    # response = re.httpx_request('GET', 'https://spa16.scrape.center/', True)
    # httpx: ordinary (HTTP/1.1) request
    # headers = {'User-Agent': 'my-app/0.0.1'}
    # response = re.httpx_request('get', 'https://www.httpbin.org/get',params={'name': 'germey'})
    # print(response.text)
    # print(response.status_code)

    # Print the finishing timestamp of the test run
    print(datetime.datetime.now())

你可能感兴趣的:(python爬虫,python,爬虫,开发语言)