Heritrix 3.1.0 源码解析(二十九)

本文接下来分析CrawlServer类和CrawlHost类,两者都实现了IdentityCacheable接口(可缓存对象接口)

CrawlServer对象代表服务器,里面存储了服务器的相关信息,包括服务名、端口、robots信息、Credential集合及相关操作等

private static final long serialVersionUID = 3L;

    /**
     * Sentinel stored in {@code robotsFetched} while no robots.txt fetch has
     * been recorded; {@code isRobotsExpired} treats it as "expired".
     */
    public static final long ROBOTS_NOT_FETCHED = -1;

    /**
     * Retry threshold for robots.txt: {@code updateRobots} keeps reporting
     * "no valid robots yet" without further checks while the fetch has
     * produced nothing and fewer than this many attempts have been made.
     */
    public static final long MIN_ROBOTS_RETRIES = 3;

    /** Server identifier; actually host+port in the https case. */
    private String server; // actually, host+port in the https case
    /**
     * Port parsed by the constructor from the text after the last ':' in
     * {@code server}; -1 when there is no ':' or the suffix is not a number.
     */
    private int port;
    /** Robots rules currently in effect; assigned by {@code updateRobots}. */
    protected Robotstxt robotstxt;
    /**
     * Wall-clock time (ms) of the most recent {@code updateRobots} call, or
     * {@link #ROBOTS_NOT_FETCHED} if it has never been called.
     */
    long robotsFetched = ROBOTS_NOT_FETCHED;
    /** Whether {@code updateRobots} last ended with usable robots information. */
    boolean validRobots = false;
    // NOTE(review): presumably per-server fetch statistics; usage is not
    // visible in this excerpt -- confirm.
    FetchStats substats = new FetchStats();

    // how many consecutive connection errors have been encountered;
    // used to drive exponentially increasing retry timeout or decision
    // to 'freeze' entire class (queue) of URIs
    protected int consecutiveConnectionErrors = 0;

    /**
     * Set of credentials. Starts as null and is created on first
     * {@code addCredential} call; declared transient, so it is not part of
     * the default serialized form.
     */
    private transient Set<Credential> credentials =  null;

String server表示站点服务器的标识,其构造方法如下(初始化站点服务器的标识和端口)

/**
     * Creates a new CrawlServer object for the given server string.
     *
     * <p>The string is stored as-is. The port is taken from whatever follows
     * the last ':' in it; if there is no ':' or that suffix is not a valid
     * integer, the port is recorded as -1.
     *
     * @param h the host string for the server (optionally ending in ":port").
     */
    public CrawlServer(String h) {
        // TODO: possibly check for illegal host string
        server = h;

        // Start from "no port" and only override on a successful parse.
        int parsedPort = -1;
        final int separator = server.lastIndexOf(":");
        if (separator >= 0) {
            final String suffix = server.substring(separator + 1);
            try {
                parsedPort = Integer.parseInt(suffix);
            } catch (NumberFormatException ignored) {
                // Suffix is not numeric: keep the "no port" default.
            }
        }
        port = parsedPort;
    }

下面的方法是有关Robotstxt robotstxt对象操作的

/**
     * Returns the robots rules currently held for this server.
     *
     * @return the current {@code Robotstxt}; may be null if none has been
     *         assigned yet.
     */
    public Robotstxt getRobotstxt() {
        return this.robotstxt;
    }

    

    /**
     * Update the robots state of this server from a robots.txt fetch result.
     *
     * <p>Always stamps {@code robotsFetched} with the current time, then:
     * <ul>
     * <li>if the fetch produced nothing usable, leaves {@code validRobots}
     *     false (after retries are exhausted, an empty-response connection
     *     loss is first re-labelled as S_DEEMED_NOT_FOUND on the curi);</li>
     * <li>if the status is outside the 2xx range, installs
     *     {@code Robotstxt.NO_ROBOTS} and marks robots valid;</li>
     * <li>otherwise parses the recorded response body into a new
     *     {@code Robotstxt}; on an IOException it falls back to
     *     {@code Robotstxt.NO_ROBOTS}, logs a warning and records the
     *     exception in the curi's non-fatal failures.</li>
     * </ul>
     * No checked exception escapes this method; IOException is handled
     * internally.
     *
     * @param curi the crawl URI containing the fetched robots.txt; its fetch
     *        status and non-fatal-failure list may be modified.
     */
   public synchronized void updateRobots(CrawlURI curi) {

       // Record the attempt time unconditionally, even if the fetch failed.
       robotsFetched = System.currentTimeMillis();

       // "Got something" = an HTTP GET that ended with a positive status
       // or was already deemed not-found.
       boolean gotSomething = curi.getFetchType() == HTTP_GET 
           && (curi.getFetchStatus() > 0 || curi.getFetchStatus() == S_DEEMED_NOT_FOUND);

       if (!gotSomething && curi.getFetchAttempts() < MIN_ROBOTS_RETRIES) {
           // robots.txt lookup failed, still trying, no reason to consider IGNORE yet
           validRobots = false;
           return;
       }

       // special deeming for a particular kind of connection-lost (empty server response)
        if (curi.getFetchStatus() == S_CONNECT_LOST
                && CollectionUtils.exists(curi.getNonFatalFailures(),
                        PredicateUtils.instanceofPredicate(NoHttpResponseException.class))) {
            // Side effect on the curi: the status is rewritten, so the
            // status check further below sees the deemed value.
            curi.setFetchStatus(S_DEEMED_NOT_FOUND);
            gotSomething = true;
        }

       if (!gotSomething) {
           // robots.txt fetch failed and exceptions (ignore/deeming) don't apply; no valid robots info yet
           validRobots = false;
           return;
       }

       int fetchStatus = curi.getFetchStatus();
       if (fetchStatus < 200 || fetchStatus >= 300) {
           // Not found or anything but a status code in the 2xx range is
           // treated as giving access to all of a site's content.
           // This is the prevailing practice of Google, since 4xx
           // responses on robots.txt are usually indicative of a 
           // misconfiguration or blanket-block, not an intentional
           // indicator of partial blocking. 
           // TODO: consider handling server errors, redirects differently
           // NOTE(review): S_DEEMED_NOT_FOUND presumably also lands here
           // (i.e. is outside 200..299) -- confirm against its definition.
           robotstxt = Robotstxt.NO_ROBOTS;
           validRobots = true;
           return;
       }

       // 2xx response: parse the recorded body as robots.txt.
       InputStream contentBodyStream = null;
       try {
           BufferedReader reader;
           contentBodyStream = curi.getRecorder().getContentReplayInputStream();

           // NOTE(review): InputStreamReader without an explicit charset
           // decodes with the platform default charset.
           reader = new BufferedReader(new InputStreamReader(contentBodyStream));
           robotstxt = new Robotstxt(reader); 
           validRobots = true;
       } catch (IOException e) {
           // Unreadable body: fall back to NO_ROBOTS but still count the
           // robots info as valid, and keep the exception on the curi.
           robotstxt = Robotstxt.NO_ROBOTS;
           logger.log(Level.WARNING,"problem reading robots.txt for "+curi,e);
           validRobots = true;
           curi.getNonFatalFailures().add(e);
       } finally {
           // Only the underlying stream is closed; the reader wraps it.
           IOUtils.closeQuietly(contentBodyStream);
       }
   }    

/**
     * Reports whether usable robots information is currently held.
     *
     * <p>The flag is false until {@code updateRobots} has completed with a
     * usable outcome, and is reset to false when an update finds that the
     * robots.txt fetch produced nothing.
     *
     * @return the current value of the validRobots flag.
     */
    public synchronized boolean isValidRobots() {
        return this.validRobots;
    }

/**
     * Tells whether the robots policy held for this server has expired.
     *
     * <p>Also returns true if no robots.txt fetch has been recorded yet for
     * this server.
     *
     * @param validityDuration how long fetched robots information stays
     *        valid, in seconds; zero means it never expires.
     * @return true if the robots policy is expired (or was never fetched).
     */
    public synchronized boolean isRobotsExpired(int validityDuration) {
        // Never fetched: report as expired so a fetch gets scheduled.
        if (robotsFetched == ROBOTS_NOT_FETCHED) {
            return true;
        }

        // Seconds to milliseconds, computed in long arithmetic.
        final long lifetimeMillis = validityDuration*1000L;

        // A zero lifetime means "valid forever".
        if (lifetimeMillis == 0) {
            return false;
        }

        // Expired once the expiry instant lies strictly in the past.
        final long expiresAt = robotsFetched + lifetimeMillis;
        return expiresAt < System.currentTimeMillis();
    }

下面是操作Set<Credential> credentials(凭证集合)的相关方法

/**
     * Returns the credentials attached to this server.
     *
     * @return the internal credential set itself (not a copy), or null if
     *         no credential has been added.
     */
    public Set<Credential> getCredentials() {
        return credentials;
    }



    /**
     * Reports whether at least one credential is attached to this instance.
     *
     * @return true if the credential set exists and is non-empty.
     */
    public boolean hasCredentials() {
        if (this.credentials == null) {
            return false;
        }
        return !this.credentials.isEmpty();
    }



    /**
     * Attaches a credential to this server, creating the backing set on
     * first use.
     *
     * @param cred credential to add to the set of credentials.
     */
    public void addCredential(Credential cred) {
        Set<Credential> current = this.credentials;
        if (current == null) {
            // Lazily allocate: the field stays null until first needed.
            current = new HashSet<Credential>();
            this.credentials = current;
        }
        current.add(cred);
    }

根据UURI uuri对象生成key的静态方法(用于站点服务器标识)

/**
     * Builds the key used to look up server instances for a URI.
     *
     * <p>This key is distinct from the class key; it is meant to be the
     * same for all URLs served by the same server. It is the URI's
     * authority without user info. When there is no authority (e.g. 'dns:'
     * URIs, which carry the domain in the path), the current hierarchical
     * path is used instead, provided it consists only of word characters,
     * dots, colons, dashes and underscores. For https URIs whose key does
     * not already end in ":digits", {@code UURIFactory.HTTPS_PORT} is
     * appended so that https and http servers get different keys.
     *
     * @param uuri URI to derive the server key from.
     * @return String to use as server key, or null if none can be derived.
     * @throws URIException if the URI components cannot be read.
     */
    public static String getServerKey(UURI uuri) throws URIException {
        // TODO: evaluate if this is really necessary -- why not
        // make the server of a dns CandidateURI the looked-up domain,
        // also simplifying FetchDNS?
        String serverKey = uuri.getAuthorityMinusUserinfo();

        if (serverKey == null) {
            // Fallback for cases where getAuthority() fails (eg 'dns:'.
            // DNS UURIs have the 'domain' in the 'path' parameter, not
            // in the authority).
            final String path = uuri.getCurrentHierPath();
            // Keep the path only if it is purely word chars, dots,
            // colons, dashes and underscores; otherwise throw it away.
            final boolean usable = path != null && path.matches("[-_\\w\\.:]+");
            serverKey = usable ? path : null;
        }

        // Nothing usable: there is no key, and the scheme is not consulted.
        if (serverKey == null) {
            return null;
        }

        // If https and no port specified, add default https port to
        // distinguish https from http server without a port.
        final boolean isHttps = uuri.getScheme().equals(UURIFactory.HTTPS);
        if (isHttps && !serverKey.matches(".+:[0-9]+")) {
            serverKey += UURIFactory.HTTPS_PORT;
        }
        return serverKey;
    }

CrawlHost对象代表主机,里面存储了主机标识(域名)、IP地址、抓取时间、国家代码等信息

/** Flag value indicating always-valid IP */
    public static final long IP_NEVER_EXPIRES = -1;
    /** Flag value indicating an IP has not yet been looked up */
    public static final long IP_NEVER_LOOKED_UP = -2;
    /** Host name this object was created for. */
    private String hostname;
    /** Country code supplied at construction; may be null. */
    private String countryCode;
    /** Address most recently passed to {@code setIP}; may be null. */
    private InetAddress ip;
    /**
     * Wall-clock time (ms) of the most recent {@code setIP} call, or
     * {@link #IP_NEVER_LOOKED_UP} if it has never been called.
     */
    private long ipFetched = IP_NEVER_LOOKED_UP;
    // NOTE(review): presumably per-host fetch statistics; usage is not
    // visible in this excerpt -- confirm.
    protected FetchStats substats = new FetchStats(); 
    /**
     * TTL gotten from dns record.
     *
     * From RFC 1035:
     * <pre>
     * TTL       a 32 bit unsigned integer that specifies the time
     *           interval (in seconds) that the resource record may be
     *           cached before it should be discarded.  Zero values are
     *           interpreted to mean that the RR can only be used for the
     *           transaction in progress, and should not be cached.
     * </pre>
     *
     * Holds {@link #IP_NEVER_LOOKED_UP} until {@code setIP} is called, and
     * {@link #IP_NEVER_EXPIRES} when the constructor sets the IP itself.
     */
    private long ipTTL = IP_NEVER_LOOKED_UP;

    // Used when bandwidth constraints are used
    private long earliestNextURIEmitTime = 0;

构造方法初始化主机标识

/**
     * Creates a CrawlHost for the given host name with no country code.
     *
     * <p>Delegates to the two-argument constructor, passing null as the
     * country code.
     *
     * @param hostname the host name for this host.
     */
    public CrawlHost(String hostname) {
        this(hostname, null);
    }



    /**
     * Creates a CrawlHost for the given host name and country code.
     *
     * <p>If {@code InetAddressUtil.getIPHostAddress} yields an address for
     * the host name, that address is installed right away with a
     * never-expiring TTL (presumably the case where the host name is
     * itself a numeric IP -- confirm against InetAddressUtil).
     *
     * @param hostname the host name for this host.
     * @param countryCode the country code for this host.
     */
    public CrawlHost(String hostname, String countryCode) {
        this.hostname = hostname;
        this.countryCode = countryCode;

        final InetAddress directAddress = InetAddressUtil.getIPHostAddress(hostname);
        if (directAddress == null) {
            // No address available up front; leave the IP un-looked-up.
            return;
        }
        setIP(directAddress, IP_NEVER_EXPIRES);
    }

下面的方法用于设置IP地址

/**
     * Reports whether an IP lookup has been recorded for this host.
     *
     * <p>A lookup that failed (IP set to null) still counts as looked up.
     *
     * @return true if the IP for this host has been looked up.
     */
    public boolean hasBeenLookedUp() {
        final boolean neverLookedUp = (ipFetched == IP_NEVER_LOOKED_UP);
        return !neverLookedUp;
    }



    /**
     * Sets the IP address for this host and stamps the lookup time.
     *
     * <p>According to the original notes this is called with the result of
     * the FetchDNS processor's resolution.
     *
     * @param address the resolved address; may be null (a lookup is still
     *        considered to have happened).
     * @param ttl the TTL from the dns record in seconds or -1 if it should
     *        live forever (is a numeric IP).
     */
    public void setIP(InetAddress address, long ttl) {
        this.ip = address;
        // Assume that a lookup has occurred by the time
        // a caller decides to set this (even to null)
        this.ipFetched = System.currentTimeMillis();
        this.ipTTL = ttl;

        if (!logger.isLoggable(Level.FINE)) {
            return;
        }
        final String shownAddress;
        if (address == null) {
            shownAddress = "null";
        } else {
            shownAddress = address.toString();
        }
        logger.fine(hostname + ": " + shownAddress);
    }

---------------------------------------------------------------------------

本系列Heritrix 3.1.0 源码解析系本人原创

转载请注明出处 博客园 刺猬的温驯

本文链接 http://www.cnblogs.com/chenying99/archive/2013/04/29/3050940.html

你可能感兴趣的:(Heritrix)