An Enhanced HTTP Spider Based on Spindle

Reposted from: http://www.iteye.com/news/1731

Usable open-source Java spiders built on top of Lucene are few and far between. spindle has gone a long time without updates and its feature set is incomplete, so I took its source code as a reference and rewrote an extensible WebCrawler. In the spirit of open source and shared progress I am publishing it here, hoping for your criticism and corrections; feel free to email me with any comments or suggestions ([email protected]).

The code below is built against lucene-2.3.1, htmlparser-1.6, je-analysis-1.5.3, and my own patched cpdetector-1.0.5. Download locations:
htmlparser: http://sourceforge.net/project/showfiles.php?group_id=24399
je-analysis: http://www.jesoft.cn/je-analysis-1.5.3.jar
Lucene needs no introduction; the patched cpdetector-1.0.5 is in the attachment.
spindle's official site: http://www.bitmechanic.com/projects/spindle/
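Assuming the jars keep the names they ship with (illustrative, not verified), compiling the class should look roughly like:

javac -cp lucene-core-2.3.1.jar:htmlparser.jar:je-analysis-1.5.3.jar:cpdetector-1.0.5.jar SiteCapturer.java

(On Windows, use ; instead of : as the classpath separator.)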

Java code:
package com.huizhi.kanine.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.UnknownHostException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;

import jeasy.analysis.MMAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.BaseHrefTag;
import org.htmlparser.tags.FrameTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.util.EncodingChangeException;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;

import cpdetector.io.ASCIIDetector;
import cpdetector.io.CodepageDetectorProxy;
import cpdetector.io.JChardetFacade;
import cpdetector.io.ParsingDetector;
import cpdetector.io.UnicodeDetector;

/**
 * @author 张波
 * E-mail: [email protected]
 * Created On: 2008-03-30
 */
public class SiteCapturer implements Runnable {

    /* Base (starting) URL */
    protected URL mSource;

    /* Location where the index files are stored */
    protected String mTarget;

    /**
     * Queue of URLs waiting to be parsed; every newly discovered link
     * is added here and taken out in First-In First-Out (FIFO) order.
     */
    protected ArrayList mPages;

    /* URLs already parsed, kept to avoid fetching a link twice */
    protected HashSet mFinished;

    protected Parser mParser;

    /* Buffer size for StringBuffer */
    protected final int TRANSFER_SIZE = 4096;

    /* Line separator of the current platform */
    protected static String lineSep = System.getProperty("line.separator");

    /* Number of worker threads, 2 by default */
    protected int mthreads;

    protected ArrayList threadList;

    /* IndexWriter backed by the file system */
    protected IndexWriter FSDWriter;

    /* IndexWriter backed by RAM */
    protected IndexWriter RAMWriter;

    protected IndexSearcher indexSearcher;

    protected RAMDirectory ramDirectory;

    /* Analyzer used to tokenize page content */
    protected Analyzer luceneAnalyzer;

    /* Character encoding used when parsing pages */
    protected String charset;

    /* Count of pages captured so far */
    protected int count = 0;

    /* Port of the base URL */
    protected int mPort;

    /* Host of the base URL */
    protected String mHost;

    /* Whether to check the index for the current URL before fetching, to avoid duplicates */
    protected boolean mCheck;

    /* Write lock shared by all index operations */
    public static final Object indexLock = new Object();

    public SiteCapturer() {
        mSource = null;
        mTarget = null;
        mthreads = 2;
        mCheck = false;
        mPages = new ArrayList();
        mFinished = new HashSet();
        mParser = new Parser();
        PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
        factory.registerTag(new LocalLinkTag());
        factory.registerTag(new LocalFrameTag());
        factory.registerTag(new LocalBaseHrefTag());
        mParser.setNodeFactory(factory);
    }

    public String getSource() {
        return mSource.toString();
    }

    public void setSource(String source) {
        if (source.endsWith("/"))
            source = source.substring(0, source.length() - 1);
        try {
            mSource = new URL(source);
        } catch (MalformedURLException e) {
            /* Report the raw string; mSource is still null at this point */
            System.err.println("Invalid URL : " + source);
        }
    }

    public String getTarget() {
        return (mTarget);
    }

    public void setTarget(String target) {
        mTarget = target;
    }

    public int getThreads() {
        return (mthreads);
    }

    public void setThreads(int threads) {
        mthreads = threads;
    }

    public boolean isMCheck() {
        return mCheck;
    }

    public void setMCheck(boolean check) {
        mCheck = check;
    }

    /**
     * Entry point. Initializes mPages and the IndexWriters, coordinates
     * the worker threads that crawl the site, and merges all index
     * segments into one afterwards to optimize retrieval.
     */
    public void capture() {

        mPages.clear();
        mPages.add(getSource());

        int responseCode = 0;
        String contentType = "";

        try {
            HttpURLConnection uc = (HttpURLConnection) mSource.openConnection();
            responseCode = uc.getResponseCode();
            contentType = uc.getContentType();
            if (contentType == null)
                /* Some servers send no Content-Type header */
                contentType = "";
        } catch (MalformedURLException mue) {
            System.err.println("Invalid URL : " + getSource());
        } catch (IOException ie) {
            if (ie instanceof UnknownHostException) {
                System.err.println("UnknownHost : " + getSource());
            } else if (ie instanceof SocketException) {
                System.err.println("Socket Error : " + ie.getMessage() + " "
                        + getSource());
            } else
                ie.printStackTrace();
        }

        if (responseCode == HttpURLConnection.HTTP_OK
                && contentType.startsWith("text/html")) {

            mPort = mSource.getPort();
            mHost = mSource.getHost();
            charset = autoDetectCharset(mSource);

            /* Directory that holds the index files */
            File indexDir = new File(mTarget);
            /* true means the index is built from scratch */
            boolean flag = true;
            if (!indexDir.exists()) {
                /* Create the directory if it does not exist */
                indexDir.mkdir();
            } else if (IndexReader.indexExists(mTarget)) {
                /* An index already exists, so append to it */
                flag = false;
                File lockfile = new File(mTarget + File.separator + "write.lock");
                if (lockfile.exists())
                    lockfile.delete();
            }
            luceneAnalyzer = new MMAnalyzer();
            ramDirectory = new RAMDirectory();

            try {
                FSDWriter = new IndexWriter(indexDir, luceneAnalyzer, flag);
                RAMWriter = new IndexWriter(ramDirectory, luceneAnalyzer, true);

                /* Was "while (mCheck)" in the original, which loops forever */
                if (mCheck) {
                    IndexReader indexReader = IndexReader.open(mTarget);
                    indexSearcher = new IndexSearcher(indexReader);
                }

                long start = System.currentTimeMillis();
                threadList = new ArrayList();

                for (int i = 0; i < mthreads; i++) {
                    Thread t = new Thread(this, "K-9 Spider Thread #" + (i + 1));
                    t.start();
                    threadList.add(t);
                }
                while (threadList.size() > 0) {
                    Thread child = (Thread) threadList.remove(0);
                    try {
                        child.join();
                    } catch (InterruptedException e) {
                        e.printStackTrace();
                    }
                }
                long elapsed = System.currentTimeMillis() - start;

                RAMWriter.close();
                FSDWriter.addIndexes(new Directory[] { ramDirectory });
                FSDWriter.optimize();
                FSDWriter.close();

                System.out.println("Finished in " + (elapsed / 1000)
                        + " seconds");
                System.out.println("The Count of the Links Captured is "
                        + count);
            } catch (CorruptIndexException cie) {
                cie.printStackTrace();
            } catch (LockObtainFailedException lofe) {
                lofe.printStackTrace();
            } catch (IOException ie) {
                ie.printStackTrace();
            }
        }
    }

    public void run() {
        String url;
        while ((url = dequeueURL()) != null) {
            if (isToBeCaptured(url))
                process(url);
        }
        mthreads--;
    }

    /**
     * Decides whether an extracted link qualifies for parsing: its host
     * and port must match the base URL and its content type must be
     * text/html or text/plain.
     */
    public boolean isToBeCaptured(String url) {
        boolean flag = false;

        HttpURLConnection uc = null;
        int responseCode = 0;
        String contentType = "";
        String host = "";
        int port = 0;

        try {
            URL source = new URL(url);
            String protocol = source.getProtocol();
            if (protocol != null && protocol.equals("http")) {
                host = source.getHost();
                port = source.getPort();
                uc = (HttpURLConnection) source.openConnection();
                uc.setConnectTimeout(8000);
                responseCode = uc.getResponseCode();
                contentType = uc.getContentType();
                if (contentType == null)
                    contentType = "";
            }
        } catch (MalformedURLException mue) {
            System.err.println("Invalid URL : " + url);
        } catch (IOException ie) {
            if (ie instanceof UnknownHostException) {
                System.err.println("UnknownHost : " + url);
            } else if (ie instanceof SocketException) {
                System.err.println("Socket Error : " + ie.getMessage() + " "
                        + url);
            } else if (ie instanceof SocketTimeoutException) {
                System.err.println("Socket Connection Time Out : " + url);
            } else if (ie instanceof FileNotFoundException) {
                /* The original cast ie.getCause(), which is null here */
                System.err.println("broken link " + url + " ignored");
            } else
                ie.printStackTrace();
        }

        if (port == mPort
                && responseCode == HttpURLConnection.HTTP_OK
                && host.equals(mHost)
                && (contentType.startsWith("text/html") || contentType
                        .startsWith("text/plain")))
            flag = true;
        return flag;
    }

    /* Takes a single URL off the queue mPages */
    public synchronized String dequeueURL() {
        while (true) {
            if (mPages.size() > 0) {
                String url = (String) mPages.remove(0);
                mFinished.add(url);

                if (isToBeCaptured(url)) {
                    int bookmark;
                    NodeList list;
                    NodeList robots;
                    MetaTag robot;
                    String content;
                    try {
                        bookmark = mPages.size();
                        /* Collect every node of the page */
                        mParser.setURL(url);
                        try {
                            list = new NodeList();
                            for (NodeIterator e = mParser.elements(); e
                                    .hasMoreNodes();)
                                list.add(e.nextNode());
                        } catch (EncodingChangeException ece) {
                            /* The declared encoding changed mid-parse; retry */
                            mParser.reset();
                            list = new NodeList();
                            for (NodeIterator e = mParser.elements(); e
                                    .hasMoreNodes();)
                                list.add(e.nextNode());
                        }
                        /*
                         * Handle the robots meta tag as described at
                         * http://www.robotstxt.org/wc/meta-user.html
                         */
                        robots = list
                                .extractAllNodesThatMatch(
                                        new AndFilter(new NodeClassFilter(
                                                MetaTag.class),
                                                new HasAttributeFilter("name",
                                                        "robots")), true);
                        if (0 != robots.size()) {
                            robot = (MetaTag) robots.elementAt(0);
                            content = robot.getAttribute("content")
                                    .toLowerCase();
                            if ((-1 != content.indexOf("none"))
                                    || (-1 != content.indexOf("nofollow")))
                                /*
                                 * Remove from the tail so no element is
                                 * skipped (the original looped forward,
                                 * skipping every other entry)
                                 */
                                for (int i = mPages.size() - 1; i >= bookmark; i--)
                                    mPages.remove(i);
                        }
                    } catch (ParserException pe) {
                        pe.printStackTrace();
                    }
                }
                return url;
            } else {
                mthreads--;
                if (mthreads > 0) {
                    try {
                        wait();
                        mthreads++;
                    } catch (InterruptedException ie) {
                        ie.printStackTrace();
                    }
                } else {
                    notifyAll();
                    return null;
                }
            }
        }
    }

    /**
     * Processes a single URL: parses the page and adds it to the Lucene
     * index. The page encoding is auto-detected so the crawl is not
     * derailed by garbled characters.
     */
    protected void process(String url) {

        String result[];
        String content = null;
        String title = null;

        /* This check costs one search per URL, so it is off by default */
        if (mCheck) {
            try {
                TermQuery query = new TermQuery(new Term("url", url));
                Hits hits = indexSearcher.search(query);
                if (hits.length() > 0) {
                    System.out.println("The URL : " + url
                            + " has already been captured");
                } else {
                    result = parseHtml(url, charset);
                    content = result[0];
                    title = result[1];
                }
            } catch (IOException ie) {
                ie.printStackTrace();
            }
        } else {
            result = parseHtml(url, charset);
            content = result[0];
            title = result[1];
        }

        if (content != null && content.trim().length() > 0) {

            Document document = new Document();
            document.add(new Field("content", content, Field.Store.YES,
                    Field.Index.TOKENIZED,
                    Field.TermVector.WITH_POSITIONS_OFFSETS));
            document.add(new Field("url", url, Field.Store.YES,
                    Field.Index.UN_TOKENIZED));
            document.add(new Field("title", title, Field.Store.YES,
                    Field.Index.TOKENIZED,
                    Field.TermVector.WITH_POSITIONS_OFFSETS));
            document.add(new Field("date", DateTools.timeToString(new Date()
                    .getTime(), DateTools.Resolution.DAY), Field.Store.YES,
                    Field.Index.UN_TOKENIZED));

            synchronized (indexLock) {
                try {
                    RAMWriter.addDocument(document);
                    /*
                     * Flush the in-memory index to disk once it grows past
                     * 512 KB. Buffering in RAM avoids an IO operation per
                     * document and speeds up index creation considerably.
                     */
                    if (RAMWriter.ramSizeInBytes() > 512 * 1024) {
                        RAMWriter.close();
                        FSDWriter.addIndexes(new Directory[] { ramDirectory });
                        RAMWriter = new IndexWriter(ramDirectory,
                                luceneAnalyzer, true);
                    }
                    count++;
                    System.out.println(Thread.currentThread().getName()
                            + ": Finished Indexing URL: " + url);
                } catch (CorruptIndexException cie) {
                    cie.printStackTrace();
                } catch (IOException ie) {
                    ie.printStackTrace();
                }
            }
        }
    }

    /**
     * Link tag that rewrites the HREF.
     * The HREF is changed to a local target if it matches the source.
     */
    class LocalLinkTag extends LinkTag {
        public void doSemanticAction() {

            String link = getLink();
            if (link.endsWith("/"))
                link = link.substring(0, link.length() - 1);
            int pos = link.indexOf("#");
            if (pos != -1)
                link = link.substring(0, pos);

            /* Enqueue the link for processing */
            if (!(mFinished.contains(link) || mPages.contains(link)))
                mPages.add(link);

            setLink(link);
        }
    }

    /**
     * Frame tag that rewrites the SRC URLs. The SRC URLs are mapped to local
     * targets if they match the source.
     */
    class LocalFrameTag extends FrameTag {
        public void doSemanticAction() {

            String link = getFrameLocation();
            if (link.endsWith("/"))
                link = link.substring(0, link.length() - 1);
            int pos = link.indexOf("#");
            if (pos != -1)
                link = link.substring(0, pos);

            /* Enqueue the link for processing */
            if (!(mFinished.contains(link) || mPages.contains(link)))
                mPages.add(link);

            setFrameLocation(link);
        }
    }

    /**
     * Base tag that doesn't show. The toHtml() method is overridden to return
     * an empty string, effectively shutting off the base reference.
     */
    class LocalBaseHrefTag extends BaseHrefTag {

        public String toHtml() {
            return ("");
        }
    }

    /* Auto-detects the page encoding so Chinese text is not garbled */
    protected String autoDetectCharset(URL url) {

        CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
        /*
         * ParsingDetector inspects the encoding of HTML, XML and similar
         * files or character streams; the constructor argument controls
         * whether detection details are printed (false = silent).
         */
        detector.add(new ParsingDetector(false));
        detector.add(JChardetFacade.getInstance());
        detector.add(ASCIIDetector.getInstance());
        detector.add(UnicodeDetector.getInstance());

        Charset charset = null;
        try {
            charset = detector.detectCodepage(url);
        } catch (MalformedURLException mue) {
            mue.printStackTrace();
        } catch (IOException ie) {
            ie.printStackTrace();
        }
        if (charset == null)
            charset = Charset.defaultCharset();
        return charset.name();
    }

    /* Parses a standard HTML page with the given encoding, preparing it for indexing */
    protected String[] parseHtml(String url, String charset) {

        String result[] = null;
        String content = null;

(The original listing breaks off here, in the middle of parseHtml.)
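Since the post is cut off, the rest of the method is missing. Judging from the otherwise-unused imports at the top of the file (BufferedReader, InputStreamReader, UnsupportedEncodingException, HtmlPage), a plausible reconstruction is sketched below; every line of it is my assumption, not the author's original code:

        String title = null;
        try {
            /* Read the raw page with the detected encoding (assumed approach) */
            URL source = new URL(url);
            InputStream in = source.openStream();
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(in, charset));
            StringBuffer html = new StringBuffer(TRANSFER_SIZE);
            String line;
            while ((line = reader.readLine()) != null)
                html.append(line).append(lineSep);
            reader.close();

            /* Let the HtmlPage visitor split the page into title and body */
            Parser parser = Parser.createParser(html.toString(), charset);
            HtmlPage page = new HtmlPage(parser);
            parser.visitAllNodesWith(page);

            title = page.getTitle();
            if (title == null)
                title = "";

            /* Concatenate the plain text of all body nodes */
            StringBuffer text = new StringBuffer(TRANSFER_SIZE);
            for (NodeIterator e = page.getBody().elements(); e.hasMoreNodes();)
                text.append(e.nextNode().toPlainTextString());
            content = text.toString();

            result = new String[] { content, title };
        } catch (UnsupportedEncodingException uee) {
            uee.printStackTrace();
        } catch (ParserException pe) {
            pe.printStackTrace();
        } catch (IOException ie) {
            ie.printStackTrace();
        }
        if (result == null)
            /* process() indexes only non-null content, so {null, null} is safe */
            result = new String[] { null, null };
        return result;
    }
}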
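Finally, a minimal driver (not part of the original post) showing how the class is wired up; the URL and index path are placeholders:

package com.huizhi.kanine.util;

public class CrawlerDemo {
    public static void main(String[] args) {
        SiteCapturer capturer = new SiteCapturer();
        capturer.setSource("http://www.example.com"); // site to crawl (placeholder)
        capturer.setTarget("c:/kanine/index");        // index directory (placeholder)
        capturer.setThreads(4);                       // worker threads; the default is 2
        capturer.setMCheck(false);                    // leave the costly duplicate check off
        capturer.capture();                           // blocks until the crawl completes
    }
}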