Java网络爬虫crawler4j学习笔记 PageFetcher类

简介

PageFetcher类主要是对Apache HttpClient库的运用，阅读代码前需要先了解HttpClient的API。

代码

package edu.uci.ics.crawler4j.fetcher;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.security.cert.X509Certificate;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import javax.net.ssl.SSLContext;

import edu.uci.ics.crawler4j.crawler.Configurable;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
//import edu.uci.ics.crawler4j.crawler.*;
import edu.uci.ics.crawler4j.crawler.authentication.AuthInfo;
import edu.uci.ics.crawler4j.crawler.authentication.BasicAuthInfo;
import edu.uci.ics.crawler4j.crawler.authentication.FormAuthInfo;
import edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException;
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
import edu.uci.ics.crawler4j.url.WebURL;

import org.apache.http.*;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLContexts;
import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.impl.client.*;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * @author Yasser Ganjisaffar [lastname at gmail dot com]
 */
public class PageFetcher extends Configurable {

  protected static final Logger logger = LoggerFactory.getLogger(PageFetcher.class);

  // HttpClient连接池
  protected PoolingHttpClientConnectionManager connectionManager;
  // httpClient对象
  protected CloseableHttpClient httpClient;
  protected final Object mutex = new Object();
  protected long lastFetchTime = 0;
  protected IdleConnectionMonitorThread connectionMonitorThread = null;

  public PageFetcher(CrawlConfig config) {
    super(config);

    RequestConfig requestConfig = RequestConfig.custom()
        .setExpectContinueEnabled(false)
        .setCookieSpec(CookieSpecs.BROWSER_COMPATIBILITY)
        .setRedirectsEnabled(false)     // 不允许redirect
        .setSocketTimeout(config.getSocketTimeout())    // socket超时
        .setConnectTimeout(config.getConnectionTimeout())   // connection超时
        .build();

    RegistryBuilder connRegistryBuilder = RegistryBuilder.create();
    connRegistryBuilder.register("http", PlainConnectionSocketFactory.INSTANCE);
    if (config.isIncludeHttpsPages()) {
      try { // Fixing: https://code.google.com/p/crawler4j/issues/detail?id=174
        // By always trusting the ssl certificate
        SSLContext sslContext = SSLContexts.custom()
            .loadTrustMaterial(null, new TrustStrategy() {
              @Override
              public boolean isTrusted(final X509Certificate[] chain, String authType) {
                return true;
              }
            }).build();
        SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(
            sslContext, SSLConnectionSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER);
        connRegistryBuilder.register("https", sslsf);
      } catch (Exception e) {
        logger.warn("Exception thrown while trying to register https");
        logger.debug("Stacktrace", e);
      }
    }

    Registry connRegistry = connRegistryBuilder.build();
    connectionManager = new PoolingHttpClientConnectionManager(connRegistry);
    connectionManager.setMaxTotal(config.getMaxTotalConnections()); // 最大连接数
    connectionManager.setDefaultMaxPerRoute(config.getMaxConnectionsPerHost()); // 每个route的最大连接

    HttpClientBuilder clientBuilder = HttpClientBuilder.create();
    clientBuilder.setDefaultRequestConfig(requestConfig);
    clientBuilder.setConnectionManager(connectionManager);
    clientBuilder.setUserAgent(config.getUserAgentString());

    // 设置代理
    if (config.getProxyHost() != null) {
      if (config.getProxyUsername() != null) {
        BasicCredentialsProvider credentialsProvider = new BasicCredentialsProvider();
        credentialsProvider.setCredentials(
            new AuthScope(config.getProxyHost(), config.getProxyPort()),
            new UsernamePasswordCredentials(config.getProxyUsername(), config.getProxyPassword()));
        clientBuilder.setDefaultCredentialsProvider(credentialsProvider);
      }

      HttpHost proxy = new HttpHost(config.getProxyHost(), config.getProxyPort());
      clientBuilder.setProxy(proxy);
      logger.debug("Working through Proxy: {}", proxy.getHostName());
    }

    httpClient = clientBuilder.build();
    if (config.getAuthInfos() != null && !config.getAuthInfos().isEmpty()) {
      doAuthetication(config.getAuthInfos());
    }

    if (connectionMonitorThread == null) {
      connectionMonitorThread = new IdleConnectionMonitorThread(connectionManager);
    }
    connectionMonitorThread.start();
  }

  private void doAuthetication(List authInfos) {
    for (AuthInfo authInfo : authInfos) {
      if (authInfo.getAuthenticationType().equals(AuthInfo.AuthenticationType.BASIC_AUTHENTICATION)) {
        doBasicLogin((BasicAuthInfo) authInfo);
      } else {
        doFormLogin((FormAuthInfo) authInfo);
      }
    }
  }

  /**
   * BASIC authentication
* Official Example: https://hc.apache.org/httpcomponents-client-ga/httpclient/examples/org/apache/http/examples/client/ClientAuthentication.java * */
private void doBasicLogin(BasicAuthInfo authInfo) { logger.info("BASIC authentication for: " + authInfo.getLoginTarget()); HttpHost targetHost = new HttpHost(authInfo.getHost(), authInfo.getPort(), authInfo.getProtocol()); CredentialsProvider credsProvider = new BasicCredentialsProvider(); credsProvider.setCredentials( new AuthScope(targetHost.getHostName(), targetHost.getPort()), new UsernamePasswordCredentials(authInfo.getUsername(), authInfo.getPassword())); httpClient = HttpClients.custom() .setDefaultCredentialsProvider(credsProvider) .build(); } /** * FORM authentication
* Official Example: https://hc.apache.org/httpcomponents-client-ga/httpclient/examples/org/apache/http/examples/client/ClientFormLogin.java * */
private void doFormLogin(FormAuthInfo authInfo) { logger.info("FORM authentication for: " + authInfo.getLoginTarget()); String fullUri = authInfo.getProtocol() + "://" + authInfo.getHost() + ":" + authInfo.getPort() + authInfo.getLoginTarget(); HttpPost httpPost = new HttpPost(fullUri); List formParams = new ArrayList<>(); formParams.add(new BasicNameValuePair(authInfo.getUsernameFormStr(), authInfo.getUsername())); formParams.add(new BasicNameValuePair(authInfo.getPasswordFormStr(), authInfo.getPassword())); try { UrlEncodedFormEntity entity = new UrlEncodedFormEntity(formParams, "UTF-8"); httpPost.setEntity(entity); httpClient.execute(httpPost); logger.debug("Successfully Logged in with user: " + authInfo.getUsername() + " to: " + authInfo.getHost()); } catch (UnsupportedEncodingException e) { logger.error("Encountered a non supported encoding while trying to login to: " + authInfo.getHost(), e); } catch (ClientProtocolException e) { logger.error("While trying to login to: " + authInfo.getHost() + " - Client protocol not supported", e); } catch (IOException e) { logger.error("While trying to login to: " + authInfo.getHost() + " - Error making request", e); } } public PageFetchResult fetchPage(WebURL webUrl) throws InterruptedException, IOException, PageBiggerThanMaxSizeException { // Getting URL, setting headers & content PageFetchResult fetchResult = new PageFetchResult(); String toFetchURL = webUrl.getURL(); HttpGet get = null; try { get = new HttpGet(toFetchURL); // Applying Politeness delay synchronized (mutex) { long now = (new Date()).getTime(); if (now - lastFetchTime < config.getPolitenessDelay()) { Thread.sleep(config.getPolitenessDelay() - (now - lastFetchTime)); } lastFetchTime = (new Date()).getTime(); } HttpResponse response = httpClient.execute(get); fetchResult.setEntity(response.getEntity()); fetchResult.setResponseHeaders(response.getAllHeaders()); // Setting HttpStatus int statusCode = response.getStatusLine().getStatusCode(); // If Redirect ( 
3xx ) if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY || statusCode == HttpStatus.SC_MULTIPLE_CHOICES || statusCode == HttpStatus.SC_SEE_OTHER || statusCode == HttpStatus.SC_TEMPORARY_REDIRECT || statusCode == 308) { // todo follow https://issues.apache.org/jira/browse/HTTPCORE-389 Header header = response.getFirstHeader("Location"); if (header != null) { String movedToUrl = URLCanonicalizer.getCanonicalURL(header.getValue(), toFetchURL); fetchResult.setMovedToUrl(movedToUrl); } } else if (statusCode == HttpStatus.SC_OK) { // is 200, everything looks ok fetchResult.setFetchedUrl(toFetchURL); String uri = get.getURI().toString(); if (!uri.equals(toFetchURL)) { if (!URLCanonicalizer.getCanonicalURL(uri).equals(toFetchURL)) { fetchResult.setFetchedUrl(uri); } } // Checking maximum size if (fetchResult.getEntity() != null) { long size = fetchResult.getEntity().getContentLength(); if (size > config.getMaxDownloadSize()) { throw new PageBiggerThanMaxSizeException(size); } } } fetchResult.setStatusCode(statusCode); return fetchResult; } finally { // occurs also with thrown exceptions if (fetchResult.getEntity() == null && get != null) { get.abort(); } } } public synchronized void shutDown() { if (connectionMonitorThread != null) { connectionManager.shutdown(); connectionMonitorThread.shutdown(); } } }

你可能感兴趣的:(网络爬虫,crawler4j,网络爬虫,crawler4j)