python Selenium 借助浏览器抓包

安装
pip install selenium

from selenium import webdriver
# Address of the page to open (Taobao flash-sale page).
url = 'https://qiang.taobao.com/'

# Start a Chrome session and load the page in it.
browser = webdriver.Chrome()
browser.get(url)
from selenium import webdriver

# DesiredCapabilities is used right below, so it has to be imported here;
# without this import the snippet stops with a NameError.
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# Work on a copy of the stock Chrome capabilities and add a "userAgent" entry.
# NOTE(review): presumably meant to override the browser's User-Agent header;
# confirm that chromedriver actually honours this capability key.
dcap = dict(DesiredCapabilities.CHROME)
dcap["userAgent"] = (
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1295.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.5 WindowsWechat")

# Chrome profile path, written as a --user-data-dir command-line switch
user_data_dir = (r'--user-data-dir=C:\Users\Administrator\AppData\Local\Google\Chrome\User Data\Default')
# Hand the profile switch to Chrome through the options object
option = webdriver.ChromeOptions()
option.add_argument(user_data_dir)
browser = webdriver.Chrome("chromedriver.exe",
                           options=option,  # start with the existing browser profile
                           desired_capabilities=dcap  # capabilities carrying the custom user agent
                           )

Selenium 启动浏览器时,默认会使用一个全新的用户配置,不会加载原有的配置以及插件。但有些时候我们可能需要加载默认配置,这时可以像上面的示例那样,通过 --user-data-dir 参数指定个人资料路径。


# 导入selenium模块中的web引擎
from selenium import webdriver


# Locator strategies are passed to find_element()/find_elements() as By.* constants.
from selenium.webdriver.common.by import By

# Create the browser object (a Chrome session).
browser = webdriver.Chrome()

# Everything after the launch runs inside try/finally so the browser and its
# driver process are shut down even when a step fails.
try:
    # URL to visit
    url = 'https://www.baidu.com'

    # Open the URL
    browser.get(url)

    # Let element lookups wait up to 3 seconds for elements to appear,
    # which gives the page's scripts time to finish loading
    browser.implicitly_wait(3)

    # Locate the search box
    text = browser.find_element(By.ID, 'kw')

    # Remove any text already in the search box
    text.clear()

    # Type the search term
    text.send_keys('python')

    # Locate the submit button
    button = browser.find_element(By.ID, 'su')

    # Submit the search request through the button
    button.submit()

    # Show the current page title
    print(browser.title)

    # Save a screenshot of the page for inspection
    browser.save_screenshot('text.png')

    # Collect the result entries as a list
    results = browser.find_elements(By.CLASS_NAME, 't')

    # Print the title and link target of every result
    for result in results:
        link = result.find_element(By.TAG_NAME, 'a')
        print('标题:{} 超链接:{}'.format(result.text,
                                    link.get_attribute('href')))
finally:
    # Close the browser and end the driver session
    browser.quit()

问题
1、Error message: “'chromedriver' executable needs to be available in the path”
从 https://sites.google.com/a/chromium.org/chromedriver/downloads 下载 chromedriver
将 chromedriver.exe 放到 Python 脚本所在的文件夹下面
或者webdriver.Chrome() 参数中指定全路径

文档

http://selenium-python-zh.readthedocs.io/en/latest/page-objects.html

与scrapy配合使用
https://github.com/clemfromspace/scrapy-selenium

"""This module contains the ``SeleniumMiddleware`` scrapy middleware"""

from importlib import import_module
from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.http import HtmlResponse
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
"""This module contains the ``SeleniumRequest`` class"""

from scrapy import Request


class SeleniumRequest(Request):
    """Scrapy ``Request`` that also carries selenium-specific options"""

    def __init__(self, url, wait_time=None, wait_until=None, screenshot=False, *args, **kwargs):
        """Create the request and record the selenium options on it

        Parameters
        ----------
        url:
            Handed to ``Request.__init__`` as its first argument.
        wait_time: int
            How many seconds to wait.
        wait_until: method
            A condition from "selenium.webdriver.support.expected_conditions";
            the response is returned once this condition is fulfilled.
        screenshot: bool
            When True, a screenshot of the page is taken and its data is
            returned in the response "meta" attribute.
        *args, **kwargs
            Forwarded unchanged to ``Request.__init__``.

        """

        # Let the base class set up the regular request first
        super().__init__(url, *args, **kwargs)

        # Selenium-only options are kept as plain attributes on the request
        self.screenshot = screenshot
        self.wait_until = wait_until
        self.wait_time = wait_time




class SeleniumMiddleware:
    """Scrapy downloader middleware that loads ``SeleniumRequest`` pages with selenium"""

    def __init__(self):
        """Start the Chrome driver that serves every selenium request"""
        self.driver = webdriver.Chrome()

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and hook its shutdown to the crawler

        ``spider_closed`` is connected to the ``spider_closed`` signal so the
        driver is quit when the spider closes.
        """
        middleware = cls()
        crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
        return middleware

    def process_request(self, request, spider):
        """Load the request in the selenium driver if it is a ``SeleniumRequest``

        Parameters
        ----------
        request:
            The request being processed.
        spider:
            The spider the request belongs to (not used here).

        Returns
        -------
        None
            For any request that is not a ``SeleniumRequest``, so that Scrapy
            keeps processing it through the normal download path.
        HtmlResponse
            Built from the driver's current URL and page source for a
            ``SeleniumRequest``.
        """

        if not isinstance(request, SeleniumRequest):
            # Returning the request object would make Scrapy reschedule it
            # instead of downloading it; None hands it on untouched.
            return None

        self.driver.get(request.url)

        # Cookies are set on the driver after the initial page load.
        for cookie_name, cookie_value in request.cookies.items():
            self.driver.add_cookie(
                {
                    'name': cookie_name,
                    'value': cookie_value
                }
            )

        if request.wait_until:
            # NOTE(review): wait_time is passed straight through as the timeout;
            # a request with wait_until set but wait_time left at None
            # presumably fails here; confirm and document for callers.
            WebDriverWait(self.driver, request.wait_time).until(
                request.wait_until
            )

        if request.screenshot:
            request.meta['screenshot'] = self.driver.get_screenshot_as_png()

        # Encode explicitly as UTF-8 to match the encoding declared on the response
        body = self.driver.page_source.encode('utf-8')

        # Expose the driver via the "meta" attribute
        request.meta.update({'driver': self.driver})

        return HtmlResponse(
            self.driver.current_url,
            body=body,
            encoding='utf-8',
            request=request
        )

    def spider_closed(self):
        """Shutdown the driver when spider is closed"""

        self.driver.quit()

在 spider 中通过 return SeleniumRequest(url) 返回请求
在 settings.py 的 DOWNLOADER_MIDDLEWARES 中添加该 SeleniumMiddleware

你可能感兴趣的:(python Selenium 借助浏览器抓包)