crawler4j 源码解读之配置文件configurable

/**
 * Holds the tunable settings of a crawl: storage location, crawl limits,
 * HTTP client behaviour and optional proxy details.
 *
 * <p>Call {@link #validate()} to check the settings for consistency.
 */
public class CrawlConfig {

    // ------------------------------------------------------------------
    // Storage and resumability
    // ------------------------------------------------------------------

    /**
     * Folder in which the crawler keeps its intermediate crawl data. The
     * contents of this folder should not be edited by hand. There is no
     * default: {@link #validate()} fails while this is {@code null}.
     */
    private String crawlStorageFolder;

    /**
     * When enabled, a previously stopped or crashed crawl can be resumed, at
     * the cost of slightly slower crawling. Disabled ({@code false}) by
     * default.
     */
    private boolean resumableCrawling;

    // ------------------------------------------------------------------
    // Crawl limits
    // ------------------------------------------------------------------

    /**
     * Maximum crawl depth. {@code -1} (the default) means unlimited depth.
     */
    private int maxDepthOfCrawling = -1;

    /**
     * Maximum number of pages to fetch. {@code -1} (the default) means an
     * unlimited number of pages.
     */
    private int maxPagesToFetch = -1;

    // ------------------------------------------------------------------
    // HTTP behaviour
    // ------------------------------------------------------------------

    /**
     * User-agent string that identifies this crawler to web servers. See
     * http://en.wikipedia.org/wiki/User_agent for background.
     */
    private String userAgentString = "crawler4j (http://code.google.com/p/crawler4j/)";

    /**
     * Politeness delay in milliseconds, i.e. the pause between two requests
     * sent to the same host. Defaults to 200.
     */
    private int politenessDelay = 200;

    /**
     * Whether https pages are crawled as well (in addition to the other
     * pages, not instead of them). Disabled ({@code false}) by default.
     */
    private boolean includeHttpsPages;

    /**
     * Whether binary content such as images or audio is fetched. Disabled
     * ({@code false}) by default.
     */
    private boolean includeBinaryContentInCrawling;

    /**
     * Maximum number of connections per host. Defaults to 100.
     */
    private int maxConnectionsPerHost = 100;

    /**
     * Maximum number of connections in total. Defaults to 100.
     */
    private int maxTotalConnections = 100;

    /**
     * Socket timeout in milliseconds. Defaults to 20000.
     */
    private int socketTimeout = 20000;

    /**
     * Connection timeout in milliseconds. Defaults to 30000.
     */
    private int connectionTimeout = 30000;

    /**
     * Maximum number of outgoing links of a single page that are processed.
     * Defaults to 5000.
     */
    private int maxOutgoingLinksToFollow = 5000;

    /**
     * Maximum allowed size of a page; larger pages will not be fetched.
     * Defaults to 1048576 (1024 * 1024).
     * NOTE(review): the unit is presumably bytes - confirm against the fetcher.
     */
    private int maxDownloadSize = 1048576;

    /**
     * Whether redirects are followed. Enabled ({@code true}) by default.
     */
    private boolean followRedirects = true;

    // ------------------------------------------------------------------
    // Proxy settings (only relevant when the crawler runs behind a proxy)
    // ------------------------------------------------------------------

    /**
     * Host name of the proxy to use. Unset ({@code null}) by default.
     */
    private String proxyHost;

    /**
     * Port of the proxy to use. Defaults to 80.
     */
    private int proxyPort = 80;

    /**
     * User name for the proxy, for proxies that require user/password
     * authentication. Unset ({@code null}) by default.
     */
    private String proxyUsername;

    /**
     * Password for the proxy, for proxies that require user/password
     * authentication. Unset ({@code null}) by default.
     */
    private String proxyPassword;

    /**
     * Creates a configuration in which every setting has its default value.
     */
    public CrawlConfig() {
        // Nothing to do: all defaults come from the field initializers.
    }

    /**
     * Checks the settings of this instance and reports the first problem
     * found. The checks run in this order:
     * <ol>
     * <li>the crawl storage folder must be set (non-null);</li>
     * <li>the politeness delay must not be negative;</li>
     * <li>the maximum crawl depth must not be below -1 (note that 0 passes
     * this check even though the error message speaks of a positive
     * number);</li>
     * <li>the maximum crawl depth must not exceed {@link Short#MAX_VALUE}.</li>
     * </ol>
     *
     * @throws Exception
     *             describing the first setting that failed its check
     */
    public void validate() throws Exception {
        final String problem;

        if (crawlStorageFolder == null) {
            problem = "Crawl storage folder is not set in the CrawlConfig.";
        } else if (politenessDelay < 0) {
            problem = "Invalid value for politeness delay: " + politenessDelay;
        } else if (maxDepthOfCrawling < -1) {
            problem = "Maximum crawl depth should be either a positive number or -1 for unlimited depth.";
        } else if (maxDepthOfCrawling > Short.MAX_VALUE) {
            problem = "Maximum value for crawl depth is " + Short.MAX_VALUE;
        } else {
            // Every check passed.
            return;
        }

        throw new Exception(problem);
    }

}

你可能感兴趣的:(java,开源,爬虫)