七、学习爬虫框架WebMagic(三)---webmagic+Selenium爬取动态页面

一、添加依赖

		<!-- selenium-java客户端段 -->
		<dependency>
			<groupId>org.seleniumhq.selenium</groupId>
			<artifactId>selenium-java</artifactId>
			<version>3.141.59</version>
		</dependency>
		 <!--webmagic 核心包-->
		<dependency>
			<groupId>us.codecraft</groupId>
			<artifactId>webmagic-core</artifactId>
			<version>0.7.3</version>
			<exclusions>
				<exclusion>
					<groupId>commons-collections</groupId>
					<artifactId>commons-collections</artifactId>
				</exclusion>
			</exclusions>
		</dependency>
		<!-- webmagic 扩展包 -->
		<dependency>
			<groupId>us.codecraft</groupId>
			<artifactId>webmagic-extension</artifactId>
			<version>0.7.3</version>
			<exclusions>
				<exclusion>
					<groupId>org.slf4j</groupId>
					<artifactId>slf4j-log4j12</artifactId>
				</exclusion>
			</exclusions>
		</dependency>
		<!-- webmagic-selenium -->
		<dependency>
			<groupId>us.codecraft</groupId>
			<artifactId>webmagic-selenium</artifactId>
			<version>0.7.3</version>
		</dependency>
		<!--&lt;!&ndash; commons-collections &ndash;&gt;-->
		<dependency>
			<groupId>commons-collections</groupId>
			<artifactId>commons-collections</artifactId>
			<version>3.2.1</version>
		</dependency>

备注:Maven仓库里的webmagic-core包有点问题,需要直接去 github clone修复后的 webmagic-core ,然后设置 commons-collections 的版本号为 3.2.1,然后重新打成Jar包(会打成webmagic-core-0.7.3.jar、webmagic-core-0.7.3-javadoc.jar、webmagic-core-0.7.3-sources.jar),然后替代本地仓库的Jar包(推荐使用!!!)。

二、修改 webmagic-selenium 包中的WebDriverPool 和 SeleniumDownloader

修改1:WebDriverPool中 private static final String DEFAULT_CONFIG_FILE = "/data/webmagic/webmagic-selenium/config.ini";将 Selenium 配置文件路径写死了,需要改变配置路径:private static final String DEFAULT_CONFIG_FILE = “selenium.properties”;,同时修改读取配置文件的方式:sConfig.load(Thread.currentThread().getContextClassLoader().getResourceAsStream(configFile));
修改2:Selenium在调用Chrome浏览器时,Chrome浏览器默认会弹出界面,可以同时设置 setHeadless() 来避免弹出Chrome浏览器。

(一)修改后的 WebDriverPool

package org.pc.webmagic.update;

import org.apache.log4j.Logger;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriverService;
import org.openqa.selenium.remote.DesiredCapabilities;
import org.openqa.selenium.remote.RemoteWebDriver;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.BlockingDeque;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * @author [email protected] 
* Date: 13-7-26
* Time: 下午1:41
*/
class WebDriverPool { private Logger logger = Logger.getLogger(getClass()); private final static int DEFAULT_CAPACITY = 5; private final int capacity; private final static int STAT_RUNNING = 1; private final static int STAT_CLODED = 2; private AtomicInteger stat = new AtomicInteger(STAT_RUNNING); /* * new fields for configuring phantomJS */ private WebDriver mDriver = null; private boolean mAutoQuitDriver = true; private static final String DEFAULT_CONFIG_FILE = "selenium.properties"; private static final String DRIVER_FIREFOX = "firefox"; private static final String DRIVER_CHROME = "chrome"; private static final String DRIVER_PHANTOMJS = "phantomjs"; protected static Properties sConfig; protected static DesiredCapabilities sCaps; /** * Configure the GhostDriver, and initialize a WebDriver instance. This part * of code comes from GhostDriver. * https://github.com/detro/ghostdriver/tree/master/test/java/src/test/java/ghostdriver * * @author [email protected] * @throws IOException */ public void configure() throws IOException { // Read config file sConfig = new Properties(); String configFile = DEFAULT_CONFIG_FILE; if (System.getProperty("selenuim_config")!=null){ configFile = System.getProperty("selenuim_config"); } sConfig.load(Thread.currentThread().getContextClassLoader().getResourceAsStream(configFile)); // sConfig.load(new FileReader(configFile)); // Prepare capabilities sCaps = new DesiredCapabilities(); sCaps.setJavascriptEnabled(true); sCaps.setCapability("takesScreenshot", false); String driver = sConfig.getProperty("driver", DRIVER_PHANTOMJS); // Fetch PhantomJS-specific configuration parameters if (driver.equals(DRIVER_PHANTOMJS)) { // "phantomjs_exec_path" if (sConfig.getProperty("phantomjs_exec_path") != null) { sCaps.setCapability( PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY, sConfig.getProperty("phantomjs_exec_path")); } else { throw new IOException( String.format( "Property '%s' not set!", PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY)); } // "phantomjs_driver_path" if (sConfig.getProperty("phantomjs_driver_path") != null) { System.out.println("Test will use an external GhostDriver"); sCaps.setCapability( PhantomJSDriverService.PHANTOMJS_GHOSTDRIVER_PATH_PROPERTY, sConfig.getProperty("phantomjs_driver_path")); } else { System.out .println("Test will use PhantomJS internal GhostDriver"); } } // Disable "web-security", enable all possible "ssl-protocols" and // "ignore-ssl-errors" for PhantomJSDriver // sCaps.setCapability(PhantomJSDriverService.PHANTOMJS_CLI_ARGS, new // String[] { // "--web-security=false", // "--ssl-protocol=any", // "--ignore-ssl-errors=true" // }); ArrayList<String> cliArgsCap = new ArrayList<String>(); cliArgsCap.add("--web-security=false"); cliArgsCap.add("--ssl-protocol=any"); cliArgsCap.add("--ignore-ssl-errors=true"); sCaps.setCapability(PhantomJSDriverService.PHANTOMJS_CLI_ARGS, cliArgsCap); // Control LogLevel for GhostDriver, via CLI arguments sCaps.setCapability( PhantomJSDriverService.PHANTOMJS_GHOSTDRIVER_CLI_ARGS, new String[] { "--logLevel=" + (sConfig.getProperty("phantomjs_driver_loglevel") != null ? sConfig .getProperty("phantomjs_driver_loglevel") : "INFO") }); // String driver = sConfig.getProperty("driver", DRIVER_PHANTOMJS); // Start appropriate Driver if (isUrl(driver)) { sCaps.setBrowserName("phantomjs"); mDriver = new RemoteWebDriver(new URL(driver), sCaps); } else if (driver.equals(DRIVER_FIREFOX)) { mDriver = new FirefoxDriver(sCaps); } else if (driver.equals(DRIVER_CHROME)) { ChromeOptions options = new ChromeOptions(); options.setHeadless(true); mDriver = new ChromeDriver(options); } else if (driver.equals(DRIVER_PHANTOMJS)) { mDriver = new PhantomJSDriver(sCaps); } } /** * check whether input is a valid URL * * @author [email protected] * @param urlString urlString * @return true means yes, otherwise no. */ private boolean isUrl(String urlString) { try { new URL(urlString); return true; } catch (MalformedURLException mue) { return false; } } /** * store webDrivers created */ private List<WebDriver> webDriverList = Collections .synchronizedList(new ArrayList<WebDriver>()); /** * store webDrivers available */ private BlockingDeque<WebDriver> innerQueue = new LinkedBlockingDeque<WebDriver>(); public WebDriverPool(int capacity) { this.capacity = capacity; } public WebDriverPool() { this(DEFAULT_CAPACITY); } /** * * @return * @throws InterruptedException */ public WebDriver get() throws InterruptedException { checkRunning(); WebDriver poll = innerQueue.poll(); if (poll != null) { return poll; } if (webDriverList.size() < capacity) { synchronized (webDriverList) { if (webDriverList.size() < capacity) { // add new WebDriver instance into pool try { configure(); innerQueue.add(mDriver); webDriverList.add(mDriver); } catch (IOException e) { e.printStackTrace(); } // ChromeDriver e = new ChromeDriver(); // WebDriver e = getWebDriver(); // innerQueue.add(e); // webDriverList.add(e); } } } return innerQueue.take(); } public void returnToPool(WebDriver webDriver) { checkRunning(); innerQueue.add(webDriver); } protected void checkRunning() { if (!stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) { throw new IllegalStateException("Already closed!"); } } public void closeAll() { boolean b = stat.compareAndSet(STAT_RUNNING, STAT_CLODED); if (!b) { throw new IllegalStateException("Already closed!"); } for (WebDriver webDriver : webDriverList) { logger.info("Quit webDriver" + webDriver); webDriver.quit(); webDriver = null; } } }

(二)修改后的 SeleniumDownloader

修改1:将所引用到的 WebDriverPool 全部替换为 修改后的WebDriverPool

package org.pc.webmagic.update;

import org.apache.log4j.Logger;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.selector.PlainText;

import java.io.Closeable;
import java.io.IOException;
import java.util.Map;

/**
 * 使用Selenium调用浏览器进行渲染。目前仅支持chrome。
* 需要下载Selenium driver支持。
* * @author [email protected]
* Date: 13-7-26
* Time: 下午1:37
*/
public class SeleniumDownloader implements Downloader, Closeable { private volatile WebDriverPool webDriverPool; private Logger logger = Logger.getLogger(getClass()); private int sleepTime = 0; private int poolSize = 1; private static final String DRIVER_PHANTOMJS = "phantomjs"; /** * 新建 * * @param chromeDriverPath chromeDriverPath */ public SeleniumDownloader(String chromeDriverPath) { System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath); } /** * Constructor without any filed. Construct PhantomJS browser * * @author [email protected] */ public SeleniumDownloader() { // System.setProperty("phantomjs.binary.path", // "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs"); } /** * set sleep time to wait until load success * * @param sleepTime sleepTime * @return this */ public SeleniumDownloader setSleepTime(int sleepTime) { this.sleepTime = sleepTime; return this; } @Override public Page download(Request request, Task task) { checkInit(); WebDriver webDriver; try { webDriver = webDriverPool.get(); } catch (InterruptedException e) { logger.warn("interrupted", e); return null; } logger.info("downloading page " + request.getUrl()); webDriver.get(request.getUrl()); try { Thread.sleep(sleepTime); } catch (InterruptedException e) { e.printStackTrace(); } WebDriver.Options manage = webDriver.manage(); Site site = task.getSite(); if (site.getCookies() != null) { for (Map.Entry<String, String> cookieEntry : site.getCookies() .entrySet()) { Cookie cookie = new Cookie(cookieEntry.getKey(), cookieEntry.getValue()); manage.addCookie(cookie); } } /* * TODO You can add mouse event or other processes * * @author: [email protected] */ WebElement webElement = webDriver.findElement(By.xpath("/html")); String content = webElement.getAttribute("outerHTML"); Page page = new Page(); page.setRawText(content); // page.setHtml(new Html(content, request.getUrl())); page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); webDriverPool.returnToPool(webDriver); return page; } private void checkInit() { if (webDriverPool == null) { synchronized (this) { webDriverPool = new WebDriverPool(poolSize); } } } @Override public void setThread(int thread) { this.poolSize = thread; } @Override public void close() throws IOException { webDriverPool.closeAll(); } }

三、添加 Selenium 配置文件
selenium.properties

# What WebDriver to use for the tests
#driver=phantomjs
#driver=firefox
driver=chrome
#driver=http://localhost:8910
#driver=http://localhost:4444/wd/hub

# PhantomJS specific config (change according to your installation)
#phantomjs_exec_path=/Users/Bingo/bin/phantomjs-qt5
#phantomjs_exec_path=d:/phantomjs.exe
#chrome_exec_path=E:\\demo\\crawler\\chromedriver.exe
#phantomjs_driver_path=/Users/Bingo/Documents/workspace/webmagic/webmagic-selenium/src/main.js
#phantomjs_driver_loglevel=DEBUG
chrome_driver_loglevel=DEBUG

四、使用案例

目的:获取头条台海网主页的文章。文章是动态加载的

package org.pc.webmagic;

import org.pc.webmagic.update.SeleniumDownloader;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

import java.util.List;

/**
 * 访问台海网头条主页
 * @author 咸鱼
 * @date 2018/12/30 18:54
 */
public class SeleniumProcessor implements PageProcessor {
    private Site site;
    @Override
    public void process(Page page) {
        List<String> titles = page.getHtml().xpath("//a[@class='link title']/text()").all();
        page.putField("title", titles);

    }

    @Override
    public Site getSite() {
        if (site == null) {
            site = Site.me().setDomain("https://www.toutiao.com")
                    .setSleepTime(1000);
        }
        return site;
    }

    public static void main(String[] args) {
        Spider.create(new SeleniumProcessor())
                .addUrl("https://www.toutiao.com/c/user/50502347096/#mid=50502347096")
                .thread(5)
                .addPipeline(new ConsolePipeline())
                /*
                 * 为 SeleniumDownloader 设置休眠时间:
                 *     当动态加载页面时,可能还存在部分数据没有加载完毕,为它设置休眠时间后,可保证有足够的时间,加载完
                 */
                .setDownloader(new SeleniumDownloader("E:\\demo\\crawler\\chromedriver.exe").setSleepTime(1000))
                .run();
    }
}

问题:在测试时,发现有时候页面并没有动态加载完毕,爬虫就结束了,导致并没有爬取到元素?
解决办法:为 SeleniumDownloader 增加休眠时间,给动态页面充裕的时间去加载。

五、使用Selenium模拟下拉刷新

(一)改动一:WebDriverPool#configure()
容许浏览器加载JS脚本

 if (isUrl(driver)) {
            sCaps.setBrowserName("phantomjs");
            mDriver = new RemoteWebDriver(new URL(driver), sCaps);
        } else if (driver.equals(DRIVER_FIREFOX)) {
            mDriver = new FirefoxDriver(sCaps);
        } else if (driver.equals(DRIVER_CHROME)) {
            ChromeOptions options = new ChromeOptions();
            //不弹窗
            options.setHeadless(true);
            mDriver = new ChromeDriver(options);
        } else if (driver.equals(DRIVER_PHANTOMJS)) {
            mDriver = new PhantomJSDriver(sCaps);
        }

(二)改动二:SeleniumDownloader#download()

@Override
    public Page download(Request request, Task task) {
        checkInit();
        WebDriver webDriver;
        try {
            webDriver = webDriverPool.get();
        } catch (InterruptedException e) {
            logger.warn("interrupted", e);
            return null;
        }
        logger.info("downloading page " + request.getUrl());
        webDriver.get(request.getUrl());
        try {
            Thread.sleep(sleepTime);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        WebDriver.Options manage = webDriver.manage();
        Site site = task.getSite();
        if (site.getCookies() != null) {
            for (Map.Entry<String, String> cookieEntry : site.getCookies()
                    .entrySet()) {
                Cookie cookie = new Cookie(cookieEntry.getKey(),
                        cookieEntry.getValue());
                manage.addCookie(cookie);
            }
        }

		/*
		 * TODO You can add mouse event or other processes
		 *
		 * @author: [email protected]
		 */

        //模拟下拉,刷新页面
        String js = "";
        for (int i=0; i < 20; i++){
            System.out.println("休眠1s");
            try {
                //滚动到最底部
                ((JavascriptExecutor)webDriver).executeScript("window.scrollTo(0,document.body.scrollHeight)");
                //休眠,等待加载页面
                Thread.sleep(2000);
                //往回滚一点,否则不加载
                ((JavascriptExecutor)webDriver).executeScript("window.scrollBy(0,-300)");
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }

        WebElement webElement = webDriver.findElement(By.xpath("/html"));
        String content = webElement.getAttribute("outerHTML");
        Page page = new Page();
        page.setRawText(content);
        page.setUrl(new PlainText(request.getUrl()));
        page.setRequest(request);
        webDriverPool.returnToPool(webDriver);
        return page;
    }

你可能感兴趣的:(java爬虫)