Scrapy boilerplate spider (with cookie testing)
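
This is a reusable Scrapy spider skeleton built around cookie handling: it keeps a CookieJar, persists cookies to a local JSON file between requests, and ships with a __main__ entry point so it can run as a standalone script.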

For testing only:

import os, sys
import time
import re
import json
import random

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.crawler import CrawlerRunner
from scrapy.spiders.init import InitSpider
from scrapy.http import Request, FormRequest
from scrapy.linkextractors import LinkExtractor
from scrapy.exceptions import CloseSpider
from scrapy.spiders import Rule
from scrapy.selector import Selector
from scrapy.http.cookies import CookieJar



class SpiderSpider(InitSpider):
    name = 'spider'
    # allowed_domains = ['www.amazon.com']
    handle_httpstatus_list = [403]

    cookie_jar = CookieJar()
    cookie_save_type = 'file'  # only file-based persistence is implemented below
    cookie_file = "cookies.json"
    try_login_max_time = 3

    start_urls = []

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
        # 32 distinct characters sampled from the pool, used as a stand-in CSRF token
        self.csrftoken = ''.join(random.sample(self.chars, 32))
        print(self.csrftoken)

    def init_request(self):
        # InitSpider runs this before any requests from start_urls; those are
        # only dispatched after a callback returns self.initialized().
        url = "xxxxxx"  # usually the site's homepage URL
        headers = {}
        cookies = self.get_cookie()
        yield Request(url=url, callback=self.parse, headers=headers, cookies=cookies)

    def parse(self, response):
        url = "xxxxxxxxxxxxxxx"  # URL of the page to request
        headers = {}
        cookies = self.merge_cookie(response)
        yield Request(url=url, callback=self.get_product_url, headers=headers, cookies=cookies)

    def get_product_url(self, response):
        self.logger.debug(response.text)

    def get_cookie(self):
        """Load previously saved cookies as a plain name -> value dict."""
        try:
            cookies = {}
            if self.cookie_save_type == 'file':
                with open(self.cookie_file, 'r') as f:
                    data = f.read()
                    if data:
                        cookies = json.loads(data)
            return cookies
        except OSError:
            # No cookie file yet (e.g. first run): start empty
            return {}
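
    # cookies.json stores a flat name -> value map (e.g. {"session": "abc"}),
    # which is the same shape that scrapy.Request(cookies=...) accepts.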

    def merge_cookie(self, response):
        """Fold the cookies set by this response into the persisted dict."""
        cookie_jar = self.cookie_jar
        cookie_jar.extract_cookies(response, response.request)
        cookies_final = self.get_cookie()

        # Iterating the jar yields http.cookiejar.Cookie objects; read
        # name/value directly instead of regex-parsing their repr.
        for item in cookie_jar:
            cookies_final[item.name] = item.value

        if self.cookie_save_type == 'file':
            with open(self.cookie_file, 'w') as f:
                f.write(json.dumps(cookies_final))
        return cookies_final
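
    # Note: the saved dict only ever grows; cookies extracted from each
    # response are merged over the file's contents, so later runs resume
    # with the accumulated cookie state.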

if __name__ == "__main__":
    script_path = os.path.dirname(os.path.realpath(sys.argv[0]))
    os.chdir(script_path)
    start_at = time.time()

    # To take credentials from the CLI, uncomment this block and add "import argparse" at the top:
    # parser = argparse.ArgumentParser(description='Arguments')
    # parser.add_argument('--email', help='Email', required=True)
    # parser.add_argument('--password', help='Password', required=True)
    # parser.add_argument('--check_phone', help='Enter phone valide code by hand', required=False, default='n')
    # args = vars(parser.parse_args())
    # email = args['email']
    # password = args['password']
    # check_phone = args['check_phone']
    # params = {'email':email,'password':password,'check_phone':check_phone}
    params = {}

    CrawlSettings = {
        'BOT_NAME': 'spider',  # BOT_NAME is the project name; the UA string belongs in USER_AGENT
        'DOWNLOAD_TIMEOUT': 60,
        'DOWNLOAD_DELAY': 1,
        'DEFAULT_REQUEST_HEADERS': {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en',
        },
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
    }
    process = CrawlerProcess(CrawlSettings)
    process.crawl(SpiderSpider, **params)
    process.start()

    end_at = time.time()
    print(end_at - start_at)  # elapsed seconds
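
Save it as, say, spider.py and run python spider.py directly; once the "xxxxxx" placeholder URLs are filled in, cookies returned by the site accumulate in cookies.json across runs.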
