爬虫框架WebMagic源码分析之Selenium

WebMagic 有一个 selenium 模块,其中实现了一个 SeleniumDownloader,但我感觉它的灵活性不够,所以参考它自己实现了一个。

首先是 WebDriverPool,用来管理 WebDriver 池:

import java.util.ArrayList;
import java.util.concurrent.BlockingDeque;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

import org.openqa.selenium.WebDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriverService;
import org.openqa.selenium.remote.DesiredCapabilities;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import net.xby1993.common.util.FileUtil;

/**
 * A bounded pool of PhantomJS-backed {@link WebDriver} instances.
 *
 * <p>Drivers are created lazily, up to the configured capacity. Callers obtain a
 * driver with {@link #get()} and must hand it back with exactly one of
 * {@link #returnToPool(WebDriver)} (driver is still usable) or
 * {@link #close(WebDriver)} (driver is broken and must be discarded); otherwise
 * the slot is lost and later {@link #get()} calls may block indefinitely.
 *
 * @author taojw
 */
public class WebDriverPool {
    private final Logger logger = LoggerFactory.getLogger(getClass());

    /** Pool size used by the no-argument constructor. */
    private static final int DEFAULT_CAPACITY = 5;
    /**
     * How long a blocked {@link #get()} waits for a returned driver before it
     * re-checks whether a slot has been freed by {@link #close(WebDriver)}.
     */
    private static final long WAIT_SLICE_MILLIS = 500L;
    // Not referenced anywhere inside this class.
    private static final String DRIVER_PHANTOMJS = "phantomjs";

    /** Maximum number of drivers that may be alive at the same time. */
    private final int capacity;
    /** Number of drivers created by this pool and not yet closed. */
    private final AtomicInteger refCount = new AtomicInteger(0);

    /**
     * store webDrivers available
     */
    private final BlockingDeque<WebDriver> innerQueue;

    private static String PHANTOMJS_PATH;
    private static DesiredCapabilities caps = DesiredCapabilities.phantomjs();
    static {
        // Executable location is read from the "phantomjs.path" common property.
        PHANTOMJS_PATH = FileUtil.getCommonProp("phantomjs.path");
        caps.setJavascriptEnabled(true);
        caps.setCapability(
                PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY,
                PHANTOMJS_PATH);
        caps.setCapability("takesScreenshot", false);
        caps.setCapability(
                PhantomJSDriverService.PHANTOMJS_PAGE_CUSTOMHEADERS_PREFIX
                        + "User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36");
        ArrayList<String> cliArgsCap = new ArrayList<String>();
        // See http://phantomjs.org/api/command-line.html
        cliArgsCap.add("--web-security=false");
        cliArgsCap.add("--ssl-protocol=any");
        cliArgsCap.add("--ignore-ssl-errors=true");
        cliArgsCap.add("--load-images=false"); // do not load images
        caps.setCapability(PhantomJSDriverService.PHANTOMJS_CLI_ARGS,
                cliArgsCap);
        caps.setCapability(
                PhantomJSDriverService.PHANTOMJS_GHOSTDRIVER_CLI_ARGS,
                new String[] {"--logLevel=INFO"});
    }

    /** Creates a pool holding at most {@value #DEFAULT_CAPACITY} drivers. */
    public WebDriverPool() {
        this(DEFAULT_CAPACITY);
    }

    /**
     * Creates a pool holding at most {@code poolsize} drivers.
     *
     * @param poolsize maximum number of simultaneously alive drivers
     * @throws IllegalArgumentException if {@code poolsize} is less than 1
     *         (raised by {@link LinkedBlockingDeque})
     */
    public WebDriverPool(int poolsize) {
        this.capacity = poolsize;
        this.innerQueue = new LinkedBlockingDeque<WebDriver>(poolsize);
    }

    /**
     * Obtains a driver, creating one if the pool is below capacity and blocking
     * otherwise until a driver is returned or a slot is freed.
     *
     * @return an idle or newly created driver, never {@code null}
     * @throws InterruptedException if the calling thread is interrupted while waiting
     */
    public WebDriver get() throws InterruptedException {
        while (true) {
            WebDriver idle = innerQueue.poll();
            if (idle != null) {
                return idle;
            }
            if (refCount.get() < capacity) {
                synchronized (innerQueue) {
                    // Re-check under the lock so concurrent callers cannot exceed capacity.
                    if (refCount.get() < capacity) {
                        WebDriver created = createDriver();
                        refCount.incrementAndGet();
                        // Hand the new driver straight to its creator; enqueueing it first
                        // would let another waiting thread take it.
                        return created;
                    }
                }
            }
            // Wait in bounded slices: close() frees a slot without putting anything in
            // the queue, so an unbounded take() could wait forever.
            idle = innerQueue.poll(WAIT_SLICE_MILLIS, TimeUnit.MILLISECONDS);
            if (idle != null) {
                return idle;
            }
        }
    }

    /**
     * Puts a still-usable driver back into the idle queue.
     *
     * @param webDriver a driver previously obtained from {@link #get()}
     * @throws IllegalStateException if the idle queue is already full
     * @throws NullPointerException if {@code webDriver} is {@code null}
     */
    public void returnToPool(WebDriver webDriver) {
        innerQueue.add(webDriver);
    }

    /**
     * Quits a driver and frees its slot so that a replacement can be created.
     * A {@code null} argument is ignored so the live-driver count stays accurate.
     *
     * @param webDriver the driver to discard; may be {@code null}
     */
    public void close(WebDriver webDriver) {
        if (webDriver == null) {
            return;
        }
        refCount.decrementAndGet();
        webDriver.quit();
    }

    /**
     * Quits every driver currently idle in the pool. Drivers that are checked out
     * at the time of the call are not touched. A failure to quit one driver is
     * logged and does not prevent the remaining ones from being closed.
     */
    public void shutdown() {
        WebDriver driver;
        while ((driver = innerQueue.poll()) != null) {
            try {
                close(driver);
            } catch (Exception e) {
                logger.warn("webdriverpool关闭失败", e);
            }
        }
    }

    /** Starts a PhantomJS driver and applies the page-load timeout; quits it if setup fails. */
    private WebDriver createDriver() {
        WebDriver driver = new PhantomJSDriver(caps);
        try {
            // Tentative workaround for https://github.com/ariya/phantomjs/issues/11526
            driver.manage().timeouts()
                    .pageLoadTimeout(60, TimeUnit.SECONDS);
        } catch (RuntimeException e) {
            try {
                driver.quit();
            } catch (RuntimeException quitFailure) {
                logger.warn("failed to quit half-initialized WebDriver", quitFailure);
            }
            throw e;
        }
        return driver;
    }
}

之后便是 SeleniumDownloader:

import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.UrlUtils;

import java.util.Map;

/**
 * A WebMagic {@link Downloader} that renders pages with a pooled Selenium
 * {@link WebDriver} and lets callers run custom {@link SeleniumAction}s on the
 * loaded page before its HTML is captured.
 *
 * <p>Two hooks are supported: a downloader-wide action (constructor or
 * {@link #setOperator(SeleniumAction)}) and a per-request action stored in the
 * request extras under the key {@code "action"}. The downloader-wide action
 * runs first.
 *
 * @author taojw
 */
public class SeleniumDownloader implements Downloader {
    private static final Logger log = LoggerFactory.getLogger(SeleniumDownloader.class);
    /** Milliseconds to wait after navigation before the page is processed (default 3s). */
    private int sleepTime = 3000;
    /** Downloader-wide action; {@code null} means none. */
    private SeleniumAction action = null;
    private WebDriverPool webDriverPool = new WebDriverPool();

    /** Creates a downloader with a default pool, a 3 second wait and no global action. */
    public SeleniumDownloader() {
    }

    /**
     * @param sleepTime milliseconds to wait after navigation
     * @param pool      driver pool to use; {@code null} keeps the default pool
     */
    public SeleniumDownloader(int sleepTime, WebDriverPool pool) {
        this(sleepTime, pool, null);
    }

    /**
     * @param sleepTime milliseconds to wait after navigation
     * @param pool      driver pool to use; {@code null} keeps the default pool
     * @param action    action run on every downloaded page; may be {@code null}
     */
    public SeleniumDownloader(int sleepTime, WebDriverPool pool, SeleniumAction action) {
        this.sleepTime = sleepTime;
        this.action = action;
        if (pool != null) {
            webDriverPool = pool;
        }
    }

    /**
     * Sets the post-navigation wait.
     *
     * @param sleepTime milliseconds to wait after navigation
     * @return this downloader, for chaining
     */
    public SeleniumDownloader setSleepTime(int sleepTime) {
        this.sleepTime = sleepTime;
        return this;
    }

    /**
     * Replaces the downloader-wide action.
     *
     * @param action action run on every downloaded page; {@code null} disables it
     */
    public void setOperator(SeleniumAction action) {
        this.action = action;
    }

    /**
     * Loads {@code request}'s URL in a pooled driver, applies the site cookies and
     * the configured actions, and captures the resulting document.
     *
     * <p>The driver is always given back: returned to the pool on success, closed
     * (freeing its slot) on any failure. Failures after the page has loaded are
     * rethrown to the caller once the driver has been closed.
     *
     * @param request the request to download
     * @param task    the task supplying the {@link Site} configuration
     * @return the populated page; a page marked as skipped if navigation failed;
     *         or {@code null} if the thread was interrupted while waiting for a driver
     */
    @Override
    public Page download(Request request, Task task) {
        WebDriver webDriver;
        try {
            webDriver = webDriverPool.get();
        } catch (InterruptedException e) {
            // Keep the interrupt visible to the caller instead of swallowing it.
            Thread.currentThread().interrupt();
            log.warn("interrupted", e);
            return null;
        }
        log.info("downloading page " + request.getUrl());
        Page page = new Page();
        boolean interrupted = false;
        try {
            webDriver.get(request.getUrl());
            Thread.sleep(sleepTime);
        } catch (InterruptedException e) {
            // The page is already loaded, so finish extracting it; the interrupt
            // status is restored once the driver has been handed back.
            interrupted = true;
            log.warn("interrupted while waiting for " + request.getUrl() + " to render", e);
        } catch (Exception e) {
            log.warn("failed to load " + request.getUrl() + ", discarding its WebDriver", e);
            discardQuietly(webDriver);
            page.setSkip(true);
            return page;
        }

        boolean extracted = false;
        try {
            fillPage(page, request, task, webDriver);
            extracted = true;
            return page;
        } finally {
            if (extracted) {
                webDriverPool.returnToPool(webDriver);
            } else {
                // The driver may be in an unknown state; close it so its pool slot is
                // freed rather than leaked. The original exception still propagates.
                discardQuietly(webDriver);
            }
            if (interrupted) {
                Thread.currentThread().interrupt();
            }
        }
    }

    /** No-op: concurrency is bounded by the {@link WebDriverPool}, not by this setting. */
    @Override
    public void setThread(int thread) {
    }

    /** Applies cookies and actions to the loaded page, then copies its HTML and URL into {@code page}. */
    private void fillPage(Page page, Request request, Task task, WebDriver webDriver) {
        WebDriver.Options manage = webDriver.manage();
        Site site = task.getSite();
        if (site.getCookies() != null) {
            for (Map.Entry<String, String> cookieEntry : site.getCookies().entrySet()) {
                manage.addCookie(new Cookie(cookieEntry.getKey(), cookieEntry.getValue()));
            }
        }
        manage.window().maximize();
        if (action != null) {
            action.execute(webDriver);
        }
        SeleniumAction reqAction = (SeleniumAction) request.getExtra("action");
        if (reqAction != null) {
            reqAction.execute(webDriver);
        }

        WebElement webElement = webDriver.findElement(By.xpath("/html"));
        String content = webElement.getAttribute("outerHTML");
        String currentUrl = webDriver.getCurrentUrl();

        page.setRawText(content);
        page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, currentUrl)));
        page.setUrl(new PlainText(currentUrl));
        page.setRequest(request);
    }

    /** Closes a driver through the pool, logging instead of throwing if the close itself fails. */
    private void discardQuietly(WebDriver webDriver) {
        try {
            webDriverPool.close(webDriver);
        } catch (Exception e) {
            log.warn("failed to close WebDriver", e);
        }
    }

}

这里的扩展性主要体现在:我加入了 SeleniumAction 接口,既可以在 SeleniumDownloader 初始化时配置一个全局的 SeleniumAction,也可以为每个 Request 单独配置对应的 SeleniumAction。SeleniumAction 接口如下:

/**
 * Callback that receives a {@link WebDriver} so that arbitrary Selenium
 * operations can be performed on it.
 */
public interface SeleniumAction {
    /**
     * Performs this action using the supplied driver.
     *
     * @param driver the WebDriver instance to operate on
     */
    void execute(WebDriver driver);
}

它会获得一个WebDriver实例,你可以在里面进行任意的Selenium操作。

本部分到此结束。

你可能感兴趣的:(java,webmagic,网页爬虫)