Heritrix 3.1.0 源码解析(三十二)

本文要分析的是FetchDNS处理器。该处理器的功能是解析CrawlURI curi对象所属主机的DNS记录(即域名对应的IP地址),它采用dnsjava-2.0.3.jar组件完成DNS解析(我们也可以参考本文代码,使用dnsjava-2.0.3.jar组件的API解析DNS)。

FetchDNS处理器的重要成员变量

// Defaults.

    // DNS class passed to the dnsjava Lookup created in innerProcess.
    private short ClassType = DClass.IN;

    // DNS record type passed to the dnsjava Lookup created in innerProcess.
    private short TypeType = Type.A;

    // NOTE(review): not referenced by any method visible in this excerpt;
    // presumably an optional explicit DNS server address - confirm usage.
    protected InetAddress serverInetAddr = null;



 /**

     * Server cache consulted during DNS lookups: innerProcess asks it for the

     * CrawlHost that belongs to the host name being resolved.

     */

    protected ServerCache serverCache;

    /** Returns the server cache held by this processor. */
    public ServerCache getServerCache() {

        return this.serverCache;

    }

    /**
     * Replaces the server cache held by this processor.
     * Annotated with {@code @Autowired}, so presumably supplied by the
     * dependency-injection container - confirm in the crawl configuration.
     */
    @Autowired

    public void setServerCache(ServerCache serverCache) {

        this.serverCache = serverCache;

    }

    

    /**

     * Whether or not to perform an on-the-fly digest hash of retrieved

     * content-bodies. The instance initializer below sets the flag to

     * {@code true}; the value is stored under the "digestContent" key of

     * {@code kp}.

     */

    {

        // NOTE(review): kp is declared outside this excerpt and must already
        // be initialized when this initializer runs - confirm.
        setDigestContent(true);

    }

    /** Returns the "digestContent" flag read from {@code kp}, cast to Boolean. */
    public boolean getDigestContent() {

        return (Boolean) kp.get("digestContent");

    }

    /** Stores the "digestContent" flag in {@code kp}. */
    public void setDigestContent(boolean digest) {

        kp.put("digestContent",digest);

    }



    /**

     * Which algorithm (for example MD5 or SHA-1) to use to perform an 

     * on-the-fly digest hash of retrieved content-bodies.

     * Initialized to "sha1"; recordDNS hands the value to the recorder's

     * setDigest call when digesting is enabled.

     */

    String digestAlgorithm = "sha1"; 

    /** Returns the name of the configured digest algorithm. */
    public String getDigestAlgorithm() {

        return digestAlgorithm;

    }

    /** Sets the name of the digest algorithm to use. */
    public void setDigestAlgorithm(String digestAlgorithm) {

        this.digestAlgorithm = digestAlgorithm;

    }

处理器void innerProcess(CrawlURI curi)方法

/**
     * Looks up the DNS records for the host referenced by {@code curi}.
     * <p>
     * The URI is marked {@code S_UNFETCHABLE_URI} when no host name can be
     * extracted from it. A dotted-quad host is handled by
     * {@code isQuadAddress} without any lookup. Otherwise a dnsjava
     * {@code Lookup} is run; when it yields no records,
     * {@code InetAddress.getByName} is tried for "localhost" or whenever
     * {@code getAcceptNonDnsResolves()} is true, and the host is marked
     * unresolvable if that is not allowed or also fails.
     *
     * @param curi the URI whose referenced host should be resolved
     */
    protected void innerProcess(CrawlURI curi) {
        String host = null;
        try {
            host = curi.getUURI().getReferencedHost();
        } catch (URIException e) {
            logger.log(Level.SEVERE, "Failed parse of dns record " + curi, e);
        }
        if (host == null) {
            curi.setFetchStatus(S_UNFETCHABLE_URI);
            return;
        }

        final CrawlHost crawlHost = getServerCache().getHostFor(host);
        // A literal IPv4 address needs no lookup; the helper already updated
        // the host and the fetch status.
        if (isQuadAddress(curi, host, crawlHost)) {
            return;
        }

        // The real DNS lookup starts here.
        curi.setFetchBeginTime(System.currentTimeMillis());

        // Query the absolute (dot-terminated) form of the name.
        // TODO: Bug #935119 concerns potential hang here
        final String absoluteName = host.endsWith(".") ? host : host + ".";
        Record[] answers;
        try {
            answers = new Lookup(absoluteName, TypeType, ClassType).run();
        } catch (TextParseException e) {
            // An unparsable name is treated like an empty answer.
            answers = null;
        }
        curi.setContentType("text/dns");

        if (answers == null) {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("Failed find of recordset for " + absoluteName);
            }
            if (!getAcceptNonDnsResolves() && !"localhost".equals(host)) {
                setUnresolvable(curi, crawlHost);
            } else {
                // Fall back to the JDK resolver, bypassing dnsjava.
                InetAddress nativeAddress;
                try {
                    nativeAddress = InetAddress.getByName(host);
                } catch (UnknownHostException e) {
                    nativeAddress = null;
                }
                if (nativeAddress == null) {
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("Failed find of address for " + host +
                            " using native dns.");
                    }
                    setUnresolvable(curi, crawlHost);
                } else {
                    crawlHost.setIP(nativeAddress,
                        DEFAULT_TTL_FOR_NON_DNS_RESOLVES);
                    curi.setFetchStatus(S_GETBYNAME_SUCCESS);
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("Found address for " + host +
                            " using native dns.");
                    }
                }
            }
        } else {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("Found recordset for " + absoluteName);
            }
            // Updates the host's IP and records the answer on the URI.
            storeDNSRecord(curi, host, crawlHost, answers);
        }
        curi.setFetchCompletedTime(System.currentTimeMillis());
    }

相关调用方法如下(dnsjava-2.0.3.jar组件的API) 

/**
     * Applies a non-empty lookup result: sets the IP and TTL of
     * {@code targetHost} from the first A record, then records the whole
     * record set on {@code curi} and marks the fetch as
     * {@code S_DNS_SUCCESS}. If recording fails with an IOException the
     * failure is logged and the host is marked unresolvable instead.
     *
     * @param curi       the URI being processed
     * @param dnsName    the host name that was looked up (used in messages)
     * @param targetHost the host whose IP is updated
     * @param rrecordSet the records returned by the lookup
     * @throws NullPointerException if the record set contains no A record
     */
    protected void storeDNSRecord(final CrawlURI curi, final String dnsName,
            final CrawlHost targetHost, final Record[] rrecordSet) {
        // Only the first A record supplies the IP and TTL; there may be
        // several (e.g. www.washington.edu).
        final ARecord firstA = getFirstARecord(rrecordSet);
        if (firstA == null) {
            throw new NullPointerException("Got null arecord for " +
                dnsName);
        }

        targetHost.setIP(firstA.getAddress(), firstA.getTTL());

        try {
            // Write the answer through the URI's recorder.
            recordDNS(curi, rrecordSet);
            curi.setFetchStatus(S_DNS_SUCCESS);
            curi.setDNSServerIPLabel(
                ResolverConfig.getCurrentConfig().server());
        } catch (IOException e) {
            logger.log(Level.SEVERE,
                "Failed store of DNS Record for " + curi.toString(), e);
            setUnresolvable(curi, targetHost);
        }
    }

    /**
     * Handles host names that are already a dotted-quad IPv4 address.
     * <p>
     * When {@code dnsName} matches {@code InetAddressUtil.IPV4_QUADS}, the
     * four captured groups are converted into an InetAddress that is stored
     * on {@code targetHost} with {@code CrawlHost.IP_NEVER_EXPIRES}, and the
     * URI is marked {@code S_DNS_SUCCESS}. If the address cannot be built,
     * the host is marked unresolvable.
     *
     * @param curi       the URI being processed
     * @param dnsName    the host name to test
     * @param targetHost the host whose IP is set when the name is numeric
     * @return {@code true} if {@code dnsName} is a dotted-quad address (even
     *         when storing it failed), {@code false} if a lookup is needed
     */
    protected boolean isQuadAddress(final CrawlURI curi, final String dnsName,
            final CrawlHost targetHost) {
        Matcher matcher = InetAddressUtil.IPV4_QUADS.matcher(dnsName);
        // Not a numeric address: the caller has to do a real lookup.
        if (matcher == null || !matcher.matches()) {
            return false;
        }

        // Ideally this branch would never be reached: no CrawlURI
        // would be created for numerical IPs
        if (logger.isLoggable(Level.WARNING)) {
            logger.warning("Unnecessary DNS CrawlURI created: " + curi);
        }
        try {
            // Integer.parseInt replaces the deprecated new Integer(String)
            // boxing; the narrowing cast to byte is unchanged.
            final byte[] octets = new byte[4];
            for (int i = 0; i < octets.length; i++) {
                octets[i] = (byte) Integer.parseInt(matcher.group(i + 1));
            }
            targetHost.setIP(InetAddress.getByAddress(dnsName, octets),
                    CrawlHost.IP_NEVER_EXPIRES); // Never expire numeric IPs
            curi.setFetchStatus(S_DNS_SUCCESS);
        } catch (UnknownHostException e) {
            logger.log(Level.SEVERE, "Should never be " + e.getMessage(), e);
            setUnresolvable(curi, targetHost);
        }
        return true;
    }

    /**
     * Writes the DNS answer through the recorder of {@code curi}.
     * <p>
     * The record set is serialized by {@code getDNSRecord}, wrapped in the
     * recorder's input stream and read to the end so that the recorder
     * captures it. The content size is then set on the URI and, when
     * digesting is enabled, the content digest as well.
     *
     * @param curi       the URI whose recorder receives the data
     * @param rrecordSet the records to store
     * @throws IOException if serializing or reading the record fails
     */
    protected void recordDNS(final CrawlURI curi, final Record[] rrecordSet)
            throws IOException {
        // Serialized form: fetch date line followed by one line per record.
        final byte[] dnsRecord = getDNSRecord(curi.getFetchBeginTime(),
                rrecordSet);

        Recorder rec = curi.getRecorder();
        // Shall we get a digest on the content downloaded?
        boolean digestContent = getDigestContent();
        String algorithm = null;
        if (digestContent) {
            algorithm = getDigestAlgorithm();
            rec.getRecordedInput().setDigest(algorithm);
        } else {
            rec.getRecordedInput().setDigest((MessageDigest)null);
        }

        // Use the recorder obtained above rather than fetching it again.
        InputStream is = rec.inputWrap(new ByteArrayInputStream(dnsRecord));

        // Reading from the wrapped stream, behind the scenes, will write
        // files into scratch space
        try {
            if (digestContent) {
                rec.getRecordedInput().startDigest();
            }
            while (is.read(this.reusableBuffer) != -1) {
                continue;
            }
        } finally {
            // Close the recorders even if closing the stream itself fails.
            try {
                is.close();
            } finally {
                rec.closeRecorders();
            }
        }
        curi.setContentSize(dnsRecord.length);

        if (digestContent) {
            curi.setContentDigest(algorithm,
                rec.getRecordedInput().getDigestValue());
        }
    }

    /**
     * Serializes a DNS answer to bytes: the 14-digit fetch date on the first
     * line, followed by the {@code toString()} form of every record, each
     * terminated by a newline. Text is encoded with the platform default
     * charset (no-argument {@code getBytes()}).
     *
     * @param fetchStart time the lookup started, formatted by
     *                   {@code ArchiveUtils.get14DigitDate}
     * @param rrecordSet the records to serialize; may be {@code null}, in
     *                   which case only the date line is written
     * @return the serialized record
     * @throws IOException if writing to the in-memory buffer fails
     */
    protected byte [] getDNSRecord(final long fetchStart,
            final Record[] rrecordSet)
    throws IOException {
        final byte[] newline = "\n".getBytes();
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        // Start the record with a 14-digit date per RFC 2540
        baos.write(ArchiveUtils.get14DigitDate(fetchStart).getBytes());
        // Don't forget the newline
        baos.write(newline);
        if (rrecordSet != null) {
            for (Record record : rrecordSet) {
                baos.write(record.toString().getBytes());
                // Add the newline between records back in
                baos.write(newline);
            }
        }
        return baos.toByteArray();
    }

    

    /**
     * Marks a failed resolution: sets the host's IP to {@code null} with a
     * TTL argument of 0 and gives the URI the fetch status
     * {@code S_DOMAIN_UNRESOLVABLE}.
     *
     * @param curi the URI that could not be resolved
     * @param host the host whose IP is reset
     */
    protected void setUnresolvable(CrawlURI curi, CrawlHost host) {

        host.setIP(null, 0);

        curi.setFetchStatus(S_DOMAIN_UNRESOLVABLE); 

    }

    /**
     * Finds the first record of type {@code Type.A} in a record set.
     *
     * @param rrecordSet the records to search; may be {@code null} or empty
     * @return the first A record, or {@code null} when the set is
     *         {@code null}, empty, or contains no A record
     */
    protected ARecord getFirstARecord(Record[] rrecordSet) {
        if (rrecordSet == null || rrecordSet.length == 0) {
            if (logger.isLoggable(Level.FINEST)) {
                logger.finest("rrecordSet is null or zero length: " +
                    rrecordSet);
            }
            return null;
        }

        int position = 0;
        for (Record candidate : rrecordSet) {
            if (candidate.getType() == Type.A) {
                return (ARecord) candidate;
            }
            // Skip anything that is not an A record, noting its position.
            if (logger.isLoggable(Level.FINEST)) {
                logger.finest("Record " + Integer.toString(position) +
                    " is not A type but " + candidate.getType());
            }
            position++;
        }
        return null;
    }

FetchDNS处理器和后面的FetchHTTP处理器涉及到消息摘要算法MessageDigest digest 对象,我这里转自网上的一篇文章供参考 

转自 http://huangyunbin.iteye.com/blog/1123442

MessageDigest的功能及用法

MessageDigest 类为应用程序提供信息摘要算法的功能,如 MD5 或 SHA 算法。信息摘要是安全的单向哈希函数,它接收任意大小的数据,并输出固定长度的哈希值。 

MessageDigest 对象开始被初始化。该对象通过使用 update()方法处理数据。任何时候都可以调用 reset()方法重置摘要。一旦所有需要更新的数据都已经被更新了,应该调用digest() 方法之一完成哈希计算。 

对于给定数量的更新数据,digest 方法只能被调用一次。在调用 digest 之后,MessageDigest 对象被重新设置成其初始状态。 

1、public static MessageDigest getInstance(String algorithm) 
                                 throws NoSuchAlgorithmException 

   返回实现指定摘要算法的 MessageDigest 对象。 

   algorithm - 所请求算法的名称 

2、public static MessageDigest getInstance(String algorithm, 
                                        String provider) 
                                 throws NoSuchAlgorithmException, 
                                        NoSuchProviderException 

  返回实现指定摘要算法的 MessageDigest 对象。 

  algorithm - 所请求算法的名称 

  provider - 提供者的名称。 

3、public void update(byte[] input) 

  使用指定的 byte 数组更新摘要。 

4、public byte[] digest() 

  通过执行诸如填充之类的最终操作完成哈希计算。在调用此方法之后,摘要被重置。 

5、public static boolean isEqual(byte[] digesta, 
                              byte[] digestb) 

    比较两个摘要的相等性。做简单的字节比较。 


注意:Provider可以通过 java.security.Security.getProviders() 方法获取已注册提供者列表。比较常用的有“SUN” 

SUN提供的常用算法名称有:MD2、MD5、SHA-1、SHA-256、SHA-384、SHA-512

Code举例: 

import java.security.*; 

/**
 * Small demonstration of {@code java.security.MessageDigest}: computes a
 * SHA-1 digest of a message, recomputes it as a receiver would, and compares
 * the two digests.
 */
public class myDigest {

  /** Entry point: runs the digest demonstration once. */
  public static void main(String[] args) {
    myDigest my = new myDigest();
    my.testDigest();
  }

  /**
   * Digests a fixed message with SHA-1, prints the digest in hex, then
   * digests the same message again and prints whether both digests match.
   * Prints an error line if the SHA-1 algorithm is unavailable.
   */
  public void testDigest() {
    try {
      String myinfo = "我的测试信息";
      // Encode with an explicit charset so that sender and receiver digest
      // the same bytes regardless of their platform default encoding.
      byte[] payload = myinfo.getBytes(java.nio.charset.Charset.forName("UTF-8"));

      MessageDigest alga = MessageDigest.getInstance("SHA-1");
      alga.update(payload);
      byte[] digesta = alga.digest();
      System.out.println("本信息摘要是:" + byte2hex(digesta));

      // The message (myinfo) and its digest (digesta) are handed to another
      // party, who recomputes the digest to detect modification or
      // transmission errors.
      MessageDigest algb = MessageDigest.getInstance("SHA-1");
      algb.update(payload);
      // isEqual is a static method, so call it on the class.
      if (MessageDigest.isEqual(digesta, algb.digest())) {
        System.out.println("信息检查正常");
      } else {
        System.out.println("摘要不相同");
      }
    } catch (NoSuchAlgorithmException ex) {
      System.out.println("非法摘要算法");
    }
  }

  /**
   * Converts bytes to an upper-case hex string, two digits per byte, with
   * the bytes separated by colons (for example {@code 0A:FF}).
   *
   * @param b the bytes to format; must not be {@code null}
   * @return the formatted string, empty for an empty array
   */
  public String byte2hex(byte[] b) {
    StringBuilder hex = new StringBuilder(b.length * 3);
    for (int n = 0; n < b.length; n++) {
      if (n > 0) {
        hex.append(':');
      }
      int value = b[n] & 0xFF; // treat the byte as unsigned
      if (value < 0x10) {
        hex.append('0'); // pad to two digits
      }
      hex.append(Integer.toHexString(value));
    }
    return hex.toString().toUpperCase();
  }

}

关于Java加密的更多信息:http://www.ibm.com/developerworks/cn/java/l-security/

--------------------------------------------------------------------------

本系列Heritrix 3.1.0 源码解析系本人原创

转载请注明出处 博客园 刺猬的温驯

本文链接 http://www.cnblogs.com/chenying99/archive/2013/04/30/3052411.html

你可能感兴趣的:(Heritrix)