从头学习爬虫(二十二)重构篇----WebMagic框架分析之downloader

这系列文章主要分析分析webmagic框架,没有实战内容,如有实战问题可以讨论,也可以提供技术支持。


欢迎加群313557283(刚创建),小白互相学习~


Downloader

我们先来看看接口

package us.codecraft.webmagic.downloader;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;

/**
 * Downloader is the part that downloads web pages and store in Page object. 
* Downloader has {@link #setThread(int)} method because downloader is always the bottleneck of a crawler, * there are always some mechanisms such as pooling in downloader, and pool size is related to thread numbers. * * @author [email protected]
* @since 0.1.0 */ public interface Downloader { /** * Downloads web pages and store in Page object. * * @param request request * @param task task * @return page */ public Page download(Request request, Task task); /** * Tell the downloader how many threads the spider used. * @param threadNum number of threads */ public void setThread(int threadNum); }

提供了两个方法,一个是线程数控制,还有个是下载方法

我们再来看看默认调用实现downloader的那个类HttpClientDownloader

package us.codecraft.webmagic.downloader;

import org.apache.commons.io.IOUtils;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.ProxyProvider;
import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.CharsetUtils;
import us.codecraft.webmagic.utils.HttpClientUtils;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;


/**
 * The http downloader based on HttpClient.
 *
 * @author [email protected] 
* @since 0.1.0 */ public class HttpClientDownloader extends AbstractDownloader { private Logger logger = LoggerFactory.getLogger(getClass()); private final Map httpClients = new HashMap(); //下载方法包装类 private HttpClientGenerator httpClientGenerator = new HttpClientGenerator(); private HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter(); //代理提供类 private ProxyProvider proxyProvider; private boolean responseHeader = true; public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) { this.httpUriRequestConverter = httpUriRequestConverter; } public void setProxyProvider(ProxyProvider proxyProvider) { this.proxyProvider = proxyProvider; } //通过site 构造 CloseableHttpClient private CloseableHttpClient getHttpClient(Site site) { if (site == null) { return httpClientGenerator.getClient(null); } String domain = site.getDomain(); CloseableHttpClient httpClient = httpClients.get(domain); if (httpClient == null) { synchronized (this) { httpClient = httpClients.get(domain); if (httpClient == null) { httpClient = httpClientGenerator.getClient(site); httpClients.put(domain, httpClient); } } } return httpClient; } @Override public Page download(Request request, Task task) { //如果要重写主要改造下面直到page = handleResponse(); if (task == null || task.getSite() == null) { throw new NullPointerException("task or site can not be null"); } CloseableHttpResponse httpResponse = null; //解析site 里面的header 以及传递参数 主要调用的是HttpClientGenerator CloseableHttpClient httpClient = getHttpClient(task.getSite()); //判断有没有代理 Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(task) : null; HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy); //初始化page Page page = Page.fail(); try { //发送请求HttpUriRequest 方法,HttpUriRequest 内容 httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext()); //包装page page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task); onSuccess(request); logger.info("downloading page success {}", request.getUrl()); return page; } catch (IOException e) { logger.warn("download page {} error", request.getUrl(), e); onError(request); return page; } finally { if (httpResponse != null) { //ensure the connection is released back to pool EntityUtils.consumeQuietly(httpResponse.getEntity()); } if (proxyProvider != null && proxy != null) { //未实现 proxyProvider.returnProxy(proxy, page, task); } } } @Override public void setThread(int thread) { httpClientGenerator.setPoolSize(thread); } protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { byte[] bytes = IOUtils.toByteArray(httpResponse.getEntity().getContent()); String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue(); Page page = new Page(); page.setBytes(bytes); if (!request.isBinaryContent()){ if (charset == null) { charset = getHtmlCharset(contentType, bytes); } page.setCharset(charset); page.setRawText(new String(bytes, charset)); } page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); page.setStatusCode(httpResponse.getStatusLine().getStatusCode()); page.setDownloadSuccess(true); if (responseHeader) { page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders())); } return page; } private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException { String charset = CharsetUtils.detectCharset(contentType, contentBytes); if (charset == null) { charset = Charset.defaultCharset().name(); logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset()); } return charset; } }



比较主要的还有个类HttpClientGenerator,主要是提供下载参数构造的

这边不解析了 有疑问老规矩

package us.codecraft.webmagic.downloader;

import org.apache.http.HttpException;
import org.apache.http.HttpRequest;
import org.apache.http.HttpRequestInterceptor;
import org.apache.http.client.CookieStore;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.config.SocketConfig;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.DefaultHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.*;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.protocol.HttpContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Site;

import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import java.io.IOException;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.Map;

/**
 * @author [email protected] 
* @since 0.4.0 */ public class HttpClientGenerator { private transient Logger logger = LoggerFactory.getLogger(getClass()); private PoolingHttpClientConnectionManager connectionManager; public HttpClientGenerator() { Registry reg = RegistryBuilder.create() .register("http", PlainConnectionSocketFactory.INSTANCE) .register("https", buildSSLConnectionSocketFactory()) .build(); connectionManager = new PoolingHttpClientConnectionManager(reg); connectionManager.setDefaultMaxPerRoute(100); } private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { try { return new SSLConnectionSocketFactory(createIgnoreVerifySSL(), new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2"}, null, new DefaultHostnameVerifier()); // 优先绕过安全证书 } catch (KeyManagementException e) { logger.error("ssl connection fail", e); } catch (NoSuchAlgorithmException e) { logger.error("ssl connection fail", e); } return SSLConnectionSocketFactory.getSocketFactory(); } private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException { // 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法 X509TrustManager trustManager = new X509TrustManager() { @Override public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException { } @Override public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException { } @Override public X509Certificate[] getAcceptedIssuers() { return null; } }; SSLContext sc = SSLContext.getInstance("SSLv3"); sc.init(null, new TrustManager[] { trustManager }, null); return sc; } public HttpClientGenerator setPoolSize(int poolSize) { connectionManager.setMaxTotal(poolSize); return this; } public CloseableHttpClient getClient(Site site) { return generateClient(site); } private CloseableHttpClient generateClient(Site site) { HttpClientBuilder httpClientBuilder = HttpClients.custom(); httpClientBuilder.setConnectionManager(connectionManager); if (site.getUserAgent() != null) { httpClientBuilder.setUserAgent(site.getUserAgent()); } else { httpClientBuilder.setUserAgent(""); } if (site.isUseGzip()) { httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() { public void process( final HttpRequest request, final HttpContext context) throws HttpException, IOException { if (!request.containsHeader("Accept-Encoding")) { request.addHeader("Accept-Encoding", "gzip"); } } }); } //解决post/redirect/post 302跳转问题 httpClientBuilder.setRedirectStrategy(new CustomRedirectStrategy()); SocketConfig.Builder socketConfigBuilder = SocketConfig.custom(); socketConfigBuilder.setSoKeepAlive(true).setTcpNoDelay(true); socketConfigBuilder.setSoTimeout(site.getTimeOut()); SocketConfig socketConfig = socketConfigBuilder.build(); httpClientBuilder.setDefaultSocketConfig(socketConfig); connectionManager.setDefaultSocketConfig(socketConfig); httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true)); generateCookie(httpClientBuilder, site); return httpClientBuilder.build(); } private void generateCookie(HttpClientBuilder httpClientBuilder, Site site) { if (site.isDisableCookieManagement()) { httpClientBuilder.disableCookieManagement(); return; } CookieStore cookieStore = new BasicCookieStore(); for (Map.Entry cookieEntry : site.getCookies().entrySet()) { BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue()); cookie.setDomain(site.getDomain()); cookieStore.addCookie(cookie); } for (Map.Entry> domainEntry : site.getAllCookies().entrySet()) { for (Map.Entry cookieEntry : domainEntry.getValue().entrySet()) { BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue()); cookie.setDomain(domainEntry.getKey()); cookieStore.addCookie(cookie); } } httpClientBuilder.setDefaultCookieStore(cookieStore); } }

扩展

目前还写了两个downloader,主要用于模拟浏览器,个人比较推荐selenium 用googledriver,目前比较重视自动化测试,更新很快。

直接贴代码

PhantomJSDownloader

package us.codecraft.webmagic.downloader;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.selector.PlainText;

import java.io.*;

/**
 * this downloader is used to download pages which need to render the javascript
 *
 * @author [email protected]
 * @version 0.5.3
 */
public class PhantomJSDownloader extends AbstractDownloader {

    private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);
    private static String crawlJsPath;
    private static String phantomJsCommand = "phantomjs"; // default

    private int retryNum;
    private int threadNum;

    public PhantomJSDownloader() {
        this.initPhantomjsCrawlPath();
    }
    
    /**
     * 添加新的构造函数,支持phantomjs自定义命令
     * 
     * example: 
     *    phantomjs.exe 支持windows环境
     *    phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误
     *    /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException
     *   
     * @param phantomJsCommand phantomJsCommand
     */
    public PhantomJSDownloader(String phantomJsCommand) {
        this.initPhantomjsCrawlPath();
        PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
    }
    
    /**
     * 新增构造函数,支持crawl.js路径自定义,因为当其他项目依赖此jar包时,runtime.exec()执行phantomjs命令时无使用法jar包中的crawl.js
     * 
     * crawl.js start --
     * 
     *   var system = require('system');
     *   var url = system.args[1];
     *   
     *   var page = require('webpage').create();
     *   page.settings.loadImages = false;
     *   page.settings.resourceTimeout = 5000;
     *   
     *   page.open(url, function (status) {
     *       if (status != 'success') {
     *           console.log("HTTP request failed!");
     *       } else {
     *           console.log(page.content);
     *       }
     *   
     *       page.close();
     *       phantom.exit();
     *   });
     *   
     * -- crawl.js end
     * 
* 具体项目时可以将以上js代码复制下来使用 * * example: * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js"); * * @param phantomJsCommand phantomJsCommand * @param crawlJsPath crawlJsPath */ public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) { PhantomJSDownloader.phantomJsCommand = phantomJsCommand; PhantomJSDownloader.crawlJsPath = crawlJsPath; } private void initPhantomjsCrawlPath() { PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js "; } @Override public Page download(Request request, Task task) { if (logger.isInfoEnabled()) { logger.info("downloading page: " + request.getUrl()); } String content = getPage(request); if (content.contains("HTTP request failed")) { for (int i = 1; i <= getRetryNum(); i++) { content = getPage(request); if (!content.contains("HTTP request failed")) { break; } } if (content.contains("HTTP request failed")) { //when failed Page page = new Page(); page.setRequest(request); return page; } } Page page = new Page(); page.setRawText(content); page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); page.setStatusCode(200); return page; } @Override public void setThread(int threadNum) { this.threadNum = threadNum; } protected String getPage(Request request) { try { String url = request.getUrl(); Runtime runtime = Runtime.getRuntime(); Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url); InputStream is = process.getInputStream(); BufferedReader br = new BufferedReader(new InputStreamReader(is)); StringBuffer stringBuffer = new StringBuffer(); String line; while ((line = br.readLine()) != null) { stringBuffer.append(line).append("\n"); } return stringBuffer.toString(); } catch (IOException e) { e.printStackTrace(); } return null; } public int getRetryNum() { return retryNum; } public PhantomJSDownloader setRetryNum(int retryNum) { this.retryNum = retryNum; return this; } }


SeleniumDownloader

package us.codecraft.webmagic.downloader.selenium;

import org.apache.log4j.Logger;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;

import java.io.Closeable;
import java.io.IOException;
import java.util.Map;

/**
 * 使用Selenium调用浏览器进行渲染。目前仅支持chrome。
* 需要下载Selenium driver支持。
* * @author [email protected]
* Date: 13-7-26
* Time: 下午1:37
*/ public class SeleniumDownloader implements Downloader, Closeable { private volatile WebDriverPool webDriverPool; private Logger logger = Logger.getLogger(getClass()); private int sleepTime = 0; private int poolSize = 1; private static final String DRIVER_PHANTOMJS = "phantomjs"; /** * 新建 * * @param chromeDriverPath chromeDriverPath */ public SeleniumDownloader(String chromeDriverPath) { System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath); } /** * Constructor without any filed. Construct PhantomJS browser * * @author [email protected] */ public SeleniumDownloader() { // System.setProperty("phantomjs.binary.path", // "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs"); } /** * set sleep time to wait until load success * * @param sleepTime sleepTime * @return this */ public SeleniumDownloader setSleepTime(int sleepTime) { this.sleepTime = sleepTime; return this; } @Override public Page download(Request request, Task task) { checkInit(); WebDriver webDriver; try { webDriver = webDriverPool.get(); } catch (InterruptedException e) { logger.warn("interrupted", e); return null; } logger.info("downloading page " + request.getUrl()); webDriver.get(request.getUrl()); try { Thread.sleep(sleepTime); } catch (InterruptedException e) { e.printStackTrace(); } WebDriver.Options manage = webDriver.manage(); Site site = task.getSite(); if (site.getCookies() != null) { for (Map.Entry cookieEntry : site.getCookies() .entrySet()) { Cookie cookie = new Cookie(cookieEntry.getKey(), cookieEntry.getValue()); manage.addCookie(cookie); } } /* * TODO You can add mouse event or other processes * * @author: [email protected] */ WebElement webElement = webDriver.findElement(By.xpath("/html")); String content = webElement.getAttribute("outerHTML"); Page page = new Page(); page.setRawText(content); page.setHtml(new Html(content, request.getUrl())); page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); webDriverPool.returnToPool(webDriver); return page; } private void checkInit() { if (webDriverPool == null) { synchronized (this) { webDriverPool = new WebDriverPool(poolSize); } } } @Override public void setThread(int thread) { this.poolSize = thread; } @Override public void close() throws IOException { webDriverPool.closeAll(); } }


总结

如果想要改下载器 可以写个类实现下这个接口downloader

然后形式不变采取其他组件下载或者配置动态代理

因为这部分不是很完整。

你可能感兴趣的:(网络爬虫)