crawler4j 源码解读之配置文件configurable

/**
 * Holds the tunable settings of a crawl (storage location, limits, politeness,
 * HTTP connection parameters and optional proxy settings) together with a
 * {@link #validate()} method that rejects inconsistent values.
 *
 * <p>Every field is initialised to the default shown on its declaration.
 */
public class CrawlConfig {

    /**
     * The folder which will be used by the crawler for storing the intermediate
     * crawl data. The content of this folder should not be modified manually.
     * Has no default; {@link #validate()} fails while it is {@code null}.
     */
    private String crawlStorageFolder;

    /**
     * If this feature is enabled, a previously stopped/crashed crawl can be
     * resumed. However, it makes crawling slightly slower.
     */
    private boolean resumableCrawling = false;

    /**
     * Maximum depth of crawling. For unlimited depth this parameter should be
     * set to -1.
     */
    private int maxDepthOfCrawling = -1;

    /**
     * Maximum number of pages to fetch. For an unlimited number of pages this
     * parameter should be set to -1.
     */
    private int maxPagesToFetch = -1;

    /**
     * User-agent string that is used for representing the crawler to web
     * servers.
     *
     * NOTE(review): the default literal ends with an unbalanced "(" and looks
     * truncated; it is kept exactly as found - confirm the intended value.
     */
    private String userAgentString = "crawler4j (";

    /**
     * Politeness delay in milliseconds (delay between sending two requests to
     * the same host).
     */
    private int politenessDelay = 200;

    /**
     * Should we also crawl https pages?
     */
    private boolean includeHttpsPages = false;

    /**
     * Should we fetch binary content such as images, audio, ...?
     */
    private boolean includeBinaryContentInCrawling = false;

    /**
     * Maximum connections per host.
     */
    private int maxConnectionsPerHost = 100;

    /**
     * Maximum total connections.
     */
    private int maxTotalConnections = 100;

    /**
     * Socket timeout in milliseconds.
     */
    private int socketTimeout = 20000;

    /**
     * Connection timeout in milliseconds.
     */
    private int connectionTimeout = 30000;

    /**
     * Max number of outgoing links which are processed from a page.
     */
    private int maxOutgoingLinksToFollow = 5000;

    /**
     * Max allowed size of a page. Pages larger than this size will not be
     * fetched. (Presumably expressed in bytes, i.e. 1 MiB by default - TODO
     * confirm the unit.)
     */
    private int maxDownloadSize = 1048576;

    /**
     * Should we follow redirects?
     */
    private boolean followRedirects = true;

    /**
     * If the crawler should run behind a proxy, this parameter can be used for
     * specifying the proxy host.
     */
    private String proxyHost = null;

    /**
     * If the crawler should run behind a proxy, this parameter can be used for
     * specifying the proxy port.
     */
    private int proxyPort = 80;

    /**
     * If the crawler should run behind a proxy and user/pass is needed for
     * authentication in the proxy, this parameter can be used for specifying
     * the username.
     */
    private String proxyUsername = null;

    /**
     * If the crawler should run behind a proxy and user/pass is needed for
     * authentication in the proxy, this parameter can be used for specifying
     * the password.
     */
    private String proxyPassword = null;

    /**
     * Creates a configuration in which every setting has its declared default.
     */
    public CrawlConfig() {
    }

    /**
     * Validates the configs specified by this instance.
     *
     * <p>The checks run in a fixed order and the first violation is reported:
     * storage folder set, politeness delay non-negative, crawl depth not
     * below -1, crawl depth not above {@link Short#MAX_VALUE}.
     *
     * @throws Exception if the crawl storage folder is {@code null}, the
     *         politeness delay is negative, or the maximum crawl depth is
     *         smaller than -1 or larger than {@link Short#MAX_VALUE}
     */
    public void validate() throws Exception {
        if (crawlStorageFolder == null) {
            throw new Exception("Crawl storage folder is not set in the CrawlConfig.");
        }
        if (politenessDelay < 0) {
            throw new Exception("Invalid value for politeness delay: " + politenessDelay);
        }
        // -1 is the "unlimited" sentinel, so only values below it are rejected.
        if (maxDepthOfCrawling < -1) {
            throw new Exception("Maximum crawl depth should be either a positive number or -1 for unlimited depth.");
        }
        if (maxDepthOfCrawling > Short.MAX_VALUE) {
            throw new Exception("Maximum value for crawl depth is " + Short.MAX_VALUE);
        }
    }
}


