Requesting dynamic web pages with Selenium

1. Installation

pip install selenium
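
A chromedriver binary matching the local Chrome version is also required. As a quick smoke test that both pieces work (the driver path is the one used throughout these notes, so adjust it for your machine; the target URL is arbitrary):

from selenium.webdriver import Chrome

# Path is illustrative; point it at your local chromedriver binary
browser = Chrome(executable_path="/home/mata/Tools/driver/chromedriver")
browser.get("https://www.baidu.com")
print(browser.title)
browser.quit()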

2. Requesting a dynamic page

from selenium.webdriver import Chrome
from scrapy.selector import Selector

# Load the Chrome driver
browser = Chrome(executable_path="/home/mata/Tools/driver/chromedriver")
browser.get(
    "https://detail.tmall.com/item.htm?spm=a220m.1000858.1000725.1.45766a1570CTei&id=566735119832&skuId=3599500084427&standard=1&user_id=268451883&cat_id=2&is_b=1&rn=b4f8b93029030636b209199c95921f38")

# browser.page_source holds the full rendered page, including dynamically generated data
# print(browser.page_source)

# For data extraction, scrapy's Selector is recommended
t_selector = Selector(text=browser.page_source)
price = t_selector.xpath('//span[@class="tm-price"]/text()').extract()
print(price)
browser.quit()
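
As a sketch of an alternative to reading page_source straight away, an explicit wait can block until the price node has actually been rendered (run this before browser.quit(); the XPath is the same as above and the 10-second timeout is an arbitrary choice):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 seconds for the dynamically rendered price node to appear
wait = WebDriverWait(browser, 10)
price_span = wait.until(EC.presence_of_element_located((By.XPATH, '//span[@class="tm-price"]')))
print(price_span.text)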

3. Simulating a login to Zhihu

browser.get("https://www.zhihu.com/signin")
# Sleep 3 seconds so the page finishes loading; otherwise some of the elements below may not be found
import time
time.sleep(3)
browser.find_element_by_xpath('//div[contains(@class,"SignFlow-accountInput")]/input').send_keys("13981826640")
browser.find_element_by_xpath('//div[@class="Input-wrapper"]/input').send_keys("545462004GYP")

browser.find_element_by_css_selector('div.Login-content button.SignFlow-submitButton').click()
browser.quit()
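
Once the login has gone through (and before browser.quit()), the session cookies can be pulled out of the browser and reused in other HTTP clients; a minimal sketch:

import time
time.sleep(3)  # crude wait for the login redirect to finish

# Collect the cookies Selenium now holds; they can be handed to requests or Scrapy
cookies = {c["name"]: c["value"] for c in browser.get_cookies()}
print(cookies)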

4. Simulating scrolling down to load more content

browser.get("https://www.oschina.net/blog")
import time

time.sleep(5)

for i in range(10):
    # Execute a JavaScript snippet: scroll to the bottom and return the new page height
    browser.execute_script(
        """
                window.scrollTo(0, document.body.scrollHeight);
                var lenOfPage = document.body.scrollHeight;
                return lenOfPage;
        """
    )

    time.sleep(3)
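
A common variant, sketched below, keeps scrolling until the page height stops growing instead of looping a fixed number of times:

# Scroll until document.body.scrollHeight stops changing, i.e. no more content is loaded
last_height = browser.execute_script("return document.body.scrollHeight;")
while True:
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3)
    new_height = browser.execute_script("return document.body.scrollHeight;")
    if new_height == last_height:
        break
    last_height = new_height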

5. Configuring chromedriver not to load images

from selenium import webdriver

# Configure chromedriver to skip images and only load JS and HTML
chrome_opt = webdriver.ChromeOptions()
prefs = {"profile.managed_default_content_settings.images": 2}
chrome_opt.add_experimental_option("prefs", prefs)

browser = webdriver.Chrome(executable_path="/home/mata/Tools/driver/chromedriver", chrome_options=chrome_opt)

browser.get("https://www.taobao.com")
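
Other startup flags can be added to the same ChromeOptions object with add_argument; for example, running Chrome headless (these are standard Chrome command-line switches, shown here only as a sketch):

chrome_opt.add_argument("--headless")
chrome_opt.add_argument("--disable-gpu")

browser = webdriver.Chrome(executable_path="/home/mata/Tools/driver/chromedriver",
                           chrome_options=chrome_opt)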

6. Integrating Selenium into Scrapy

In middlewares.py:

from scrapy.http import HtmlResponse


class JSageMiddleware(object):
    # Request dynamic pages through Chrome
    def process_request(self, request, spider):
        if spider.name == "myspider":
            # This turns the request into a synchronous call and hurts performance;
            # making it asynchronous requires rewriting the downloader (search GitHub for "scrapy downloader")
            spider.browser.get(request.url)
            import time
            time.sleep(3)

            # Return an HtmlResponse to the spider directly; the downloader will not be called again for this request.
            return HtmlResponse(url=spider.browser.current_url, body=spider.browser.page_source, encoding="utf-8",
                                request=request)
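
For the middleware to take effect it also has to be registered in settings.py; a minimal sketch (the project module name ArticleSpider is only a placeholder):

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'ArticleSpider.middlewares.JSageMiddleware': 1,
}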

In myspider.py:

# Dispatcher
from scrapy.xlib.pydispatch import dispatcher
# Signals
from scrapy import signals

    def __init__(self):
        # A single browser instance can be shared instead of opening one per URL
        self.browser = webdriver.Chrome(executable_path="/home/mata/Tools/driver/chromedriver")
        super(JobboleSpider, self).__init__()

        # When the spider_closed signal fires, call the handler below
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        # Close Chrome when the spider exits
        print("spider closed")
        self.browser.close()
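
Downstream, parse() receives the Selenium-rendered HtmlResponse built by the middleware and can select dynamically generated nodes like any ordinary response; a sketch (the CSS selector is only an illustrative placeholder):

    def parse(self, response):
        # response here is the rendered page returned by the middleware,
        # so JS-generated nodes are already present
        for title in response.css("a.archive-title::text").extract():
            print(title)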

7. Running Chrome without a visible display

from pyvirtualdisplay import Display
from selenium.webdriver import Chrome

# Start a virtual X display (Xvfb) so Chrome can run on a machine without a GUI
display = Display(visible=0, size=(800, 600))
display.start()
browser = Chrome(executable_path="/home/mata/Tools/driver/chromedriver")
browser.get(target_url)

print(browser.page_source)
....


easyprocess.EasyProcessCheckInstalledError: cmd=['Xvfb', '-help'] OSError=[Errno 2] No such file or directory [Solution]

sudo apt-get install xvfb
pip install xvfbwrapper
Source: https://stackoverflow.com/questions/32173839/easyprocess-easyprocesscheckinstallederror-cmd-xvfb-help-oserror-errno
