Spider 动态页面的抓取方式

 重写下载中间件:首次登录时使用有头(可见窗口)浏览器模拟,保存 cookies 之后,再改用无头浏览器模拟。

       示例:

class SeleniumDownloaderMiddleware(object):
    """Scrapy downloader middleware that renders a Taobao page through Selenium.

    Behaviour, as implemented below:

    * No ``cookies.json`` on disk: the browser opens Taobao, the middleware
      sleeps 30 seconds (time for a manual login), the browser cookies are
      written to ``cookies.json`` and the browser window is closed.
    * ``cookies.json`` exists: the saved cookies are injected into the
      browser, a fixed search is driven through the UI and the final page is
      returned to Scrapy as an ``HtmlResponse``.

    Only requests whose ``meta['page']`` is the string ``'0'`` are handled;
    for every other request ``process_request`` returns ``None``.

    NOTE(review): nothing in this class shuts the driver down when the crawl
    ends -- confirm whether that is handled elsewhere.
    """

    def __init__(self):
        """Start the Chrome driver and initialise the middleware state."""
        # Headless options are built but NOT passed to the driver below, so
        # the browser window is visible (needed for the manual login).
        self.chrome_options = Options()
        self.chrome_options.add_argument('--headless')
        self.chrome_options.add_argument('--disable-gpu')
        self.url = 'http://www.taobao.com'
        path = r'E:\LiuLanQi\chromedriver.exe'
        # Headless alternative (e.g. once cookies.json has been saved):
        #   webdriver.Chrome(executable_path=path, chrome_options=self.chrome_options)
        self.driver = webdriver.Chrome(executable_path=path)
        # True once the cookies from cookies.json have been added to the driver.
        self.load_cookies = False

    def process_request(self, request, spider):
        """Render the request through Selenium when ``meta['page'] == '0'``.

        Args:
            request: the Scrapy request being downloaded; its ``meta`` mapping
                is inspected for the ``'page'`` key.
            spider: the spider that issued the request (unused).

        Returns:
            An ``HtmlResponse`` built from the browser's page source when the
            saved-cookie flow ran; ``None`` when the request is not page
            ``'0'`` or when this call only recorded the login cookies.
        """
        # .get() instead of ['page']: a request that carries no 'page' key is
        # simply not ours and must pass through rather than raise KeyError.
        if request.meta.get('page') != '0':
            return None

        self.driver.get(url=self.url)

        if not os.path.exists('cookies.json'):
            self._save_login_cookies()
            return None

        if not self.load_cookies:
            self._restore_login_cookies()
            self.load_cookies = True

        return self._render_first_result(request)

    def _save_login_cookies(self):
        """Wait for the manual login, then dump the browser cookies to disk."""
        time.sleep(30)
        browser_cookies = self.driver.get_cookies()
        # Serialise before opening the file so a serialisation error cannot
        # leave an empty cookies.json behind.
        serialized = json.dumps(browser_cookies)
        # After the login has completed, save the cookies to a local file.
        with open('cookies.json', 'w') as f:
            f.write(serialized)
        # NOTE(review): the window is closed here, so a later page '0' request
        # in the same run would hit a closed browser -- presumably the crawl
        # is meant to be restarted after this step; confirm.
        self.driver.close()

    def _restore_login_cookies(self):
        """Replace the browser's cookies with the ones saved in cookies.json."""
        # Drop the cookies created by the first, anonymous page load.
        self.driver.delete_all_cookies()
        # Read back the cookies stored at login time.
        with open('cookies.json', 'r') as f:
            saved_cookies = json.loads(f.read())
        for cookie in saved_cookies:
            self.driver.add_cookie({
                'domain': '.taobao.com',  # the domain needs the leading dot
                'name': cookie['name'],
                'value': cookie['value'],
                'path': '/',
                # NOTE(review): Selenium documents this key as 'expiry';
                # 'expires' is kept unchanged from the original -- confirm.
                'expires': None
            })

    def _render_first_result(self, request):
        """Drive the search UI to the first item's reviews and wrap the page."""
        driver = self.driver

        # Visiting the page again with the cookies in place skips the login.
        driver.get('https://www.taobao.com')
        time.sleep(5)

        # Type the keyword into the search box and submit the form.
        search_box = driver.find_element_by_css_selector('#q')
        search_box.send_keys('口红')
        time.sleep(3)
        driver.find_element_by_css_selector(
            '#J_TSearchForm > div.search-button > button').click()
        time.sleep(5)

        # Narrow the results by clicking the link with this exact text.
        driver.find_element_by_link_text('纪梵希').click()
        time.sleep(10)

        # Open the first entry of the result list.
        driver.find_element_by_css_selector(
            '#mainsrp-itemlist > div > div > div:nth-child(1) > div:nth-child(1)').click()
        time.sleep(20)

        # Switch to the second tab of the tab bar (the reviews).
        driver.find_element_by_css_selector('#J_TabBar > li:nth-child(2)').click()
        time.sleep(5)

        # The first review is looked up only so that a missing review list
        # still raises here, as it did originally; its text is not used.
        driver.find_element_by_css_selector('#J_Reviews > div > div.rate-grid > table > tbody > tr:nth-child(1) > td.tm-col-master > div.tm-rate-content > div.tm-rate-fulltxt')

        current_url = driver.current_url
        page_source = driver.page_source

        # Pause kept from the original; the page source is already captured.
        time.sleep(5)
        # Build the HtmlResponse from the rendered source. The body is text,
        # so the character encoding is given explicitly.
        return HtmlResponse(url=current_url, body=page_source, encoding='utf-8', request=request)

 

你可能感兴趣的:(spider)