Nutch爬取Ajax请求的动态网页

利用开源插件 nutch-htmlunit（基于 HtmlUnit）：

https://github.com/xautlx/nutch-htmlunit

把插件导入到 Nutch 环境中。

但是在执行过程中会出现各种错误，原因是 lib-htmlunit 的 HttpWebClient 有问题，因此作了如下修改：

package org.apache.nutch.protocol.htmlunit;

import org.apache.hadoop.conf.Configuration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.URL;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlInput;
import com.gargoylesoftware.htmlunit.WebRequest;
import com.gargoylesoftware.htmlunit.AjaxController;
import com.gargoylesoftware.htmlunit.BrowserVersion;

/**
 * Htmlunit WebClient Helper
 * Use one WebClient instance per thread by ThreadLocal to support multiple threads execution
 */
public class HttpWebClient {

    private static final Logger LOG = LoggerFactory.getLogger("org.apache.nutch.protocol");

    private static ThreadLocal<WebClient> threadWebClient = new ThreadLocal<WebClient>();

    public static HtmlPage getHtmlPage(String url, Configuration conf) {
        try {
            WebClient webClient = threadWebClient.get();
            if (webClient == null) {
                LOG.info("Initing web client for thread: {}", Thread.currentThread().getId());
            AjaxController ajaxController = new NicelyResynchronizingAjaxController();
            webClient = new WebClient(BrowserVersion.FIREFOX_17);
            webClient.getOptions().setCssEnabled(false);
            webClient.getOptions().setJavaScriptEnabled(true);
            webClient.setAjaxController(ajaxController);    
            webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
            webClient.getOptions().setThrowExceptionOnScriptError(false);
            webClient.getOptions().setPrintContentOnFailingStatusCode(false);
            webClient.getOptions().setRedirectEnabled(true);
            webClient.getOptions().setPopupBlockerEnabled(true);
            webClient.setCache(new ExtHtmlunitCache());
                // Enhanced WebConnection based on urlfilter

//百度云盘基本都是Ajax实现的,提供了账号密码方式

      HtmlPage loginPage = webClient.getPage("http://yun.baidu.com");
        loginPage.getElementById("TANGRAM__PSP_4__userName").setAttribute("value","280889189");
        loginPage.getElementById("TANGRAM__PSP_4__password").setAttribute("value","123578951");
        loginPage = ((HtmlInput)loginPage.getElementById("TANGRAM__PSP_4__submit")).click();
            webClient.setWebConnection(new RegexHttpWebConnection(webClient,conf));
            threadWebClient.set(webClient);
            }
            HtmlPage page = webClient.getPage(url);
//            webClient.closeAllWindows();
            return page;
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    public static HtmlPage getHtmlPage(String url) {
        return getHtmlPage(url, null);
    }
}

你可能感兴趣的:(Ajax,Nutch,htmlunit)