lucene2.0+heritrix示例补充

  由于《lucene2.0+heritrix》一书示例所用的网站(http://mobile.pconline.com.cn/、http://mobile.163.com/)改版了,书上的实例已不能运行,我又找了一个网站 http://mobile.younet.com/ 进行开发并成功实现了示例。希望感兴趣的同学尽快实践,如果此网站也改版了,就又得改extractor了,哈哈!
search的Extractor代码如下(与书上的实例不同),供大家参考;附件里有完整代码:

<!--<br /> <br /> Code highlighting produced by Actipro CodeHighlighter (freeware)<br /> http://www.CodeHighlighter.com/<br /> <br /> --> package  com.luceneheritrixbook.extractor.younet;

import  java.io.BufferedWriter;
import  java.io.File;
import  java.io.FileWriter;
import  java.io.IOException;
import  java.util.Date;

import  org.htmlparser.Node;
import  org.htmlparser.NodeFilter;
import  org.htmlparser.Parser;
import  org.htmlparser.filters.AndFilter;
import  org.htmlparser.filters.HasAttributeFilter;
import  org.htmlparser.filters.HasChildFilter;
import  org.htmlparser.filters.TagNameFilter;
import  org.htmlparser.tags.ImageTag;
import  org.htmlparser.util.NodeIterator;
import  org.htmlparser.util.NodeList;

import  com.luceneheritrixbook.extractor.Extractor;
import  com.luceneheritrixbook.util.StringUtils;

/**
 * <p></p>
 * 
@author  [email protected]
 * @date   Feb 6, 2009 
 
*/

public   class  ExtractYounetMoblie  extends  Extractor {

    @Override
    
public   void  extract() {
        BufferedWriter bw 
=   null ;
        NodeFilter title_filter 
=   new  AndFilter( new  TagNameFilter( " div " ),  new  HasAttributeFilter( " class " " mo_tit " ));
        NodeFilter attribute_filter 
=   new  AndFilter( new  TagNameFilter( " p " ),  new  HasChildFilter( new  AndFilter( new  TagNameFilter( " span " ),  new  HasAttributeFilter( " class " " gn_sp1 blue1 " ))));
        NodeFilter img_filter 
=   new  AndFilter( new  TagNameFilter( " span " ),  new  HasChildFilter( new  TagNameFilter( " img " )));
        
        
// 提取标题信息
         try  {
            
// Parser根据过滤器返回所有满足过滤条件的节点
            
//  迭代逐渐查找
            NodeList nodeList = this .getParser().parse(title_filter);
            NodeIterator it 
=  nodeList.elements();
            StringBuffer title 
=   new  StringBuffer();
            
while  (it.hasMoreNodes()) {
                Node node 
=  (Node) it.nextNode();
                String[] names 
=  node.toPlainTextString().split( "   " );
                
for ( int  i  =   0 ; i  <  names.length; i ++ )
                    title.append(names[i]).append(
" - " );
                title.append(
new  Date().getTime());
                
// 创建要生成的文件
                bw  =   new  BufferedWriter( new  FileWriter( new  File( this .getOutputPath()  +  title  +   " .txt " )));
                
// 获取当前提取页的完整URL地址
                 int  startPos  =   this .getInuputFilePath().indexOf( " mirror " +   6 ;
                String url_seg 
=   this .getInuputFilePath().substring(startPos);
                url_seg 
=  url_seg.replaceAll( " \\\\ " " / " );
                String url 
=   " http:/ "   +  url_seg;
                
// 写入当前提取页的完整URL地址
                bw.write(url  +  NEWLINE);
                bw.write(names[
0 +  NEWLINE);
                bw.write(names[
1 +  NEWLINE);
                
            }
            
//  重置Parser
             this .getParser().reset();
            Parser attNameParser 
=   null ;
            Parser attValueParser 
=   null ;
            
// Parser parser=new Parser(" http://www.sina.com.cn ");
            NodeFilter attributeName_filter  =   new  AndFilter( new  TagNameFilter( " span " ),  new  HasAttributeFilter( " class " " gn_sp1 blue1 " ));
            NodeFilter attributeValue_filter 
=   new  AndFilter( new  TagNameFilter( " span " ),  new  HasAttributeFilter( " class " " gn_sp2 " ));
            String attName 
=   "" ;
            String attValue 
=   "" ;
            
//  迭代逐渐查找
            nodeList = this .getParser().parse(attribute_filter);
            it 
=  nodeList.elements();
            
while  (it.hasMoreNodes()) {                
                Node node 
=  (Node) it.nextNode();
                attNameParser 
=   new  Parser();
                attNameParser.setEncoding(
" GB2312 " );
                attNameParser.setInputHTML(node.toHtml());
                NodeList attNameNodeList 
=  attNameParser.parse(attributeName_filter);
                attName 
=  attNameNodeList.elements().nextNode().toPlainTextString();
                
                attValueParser 
=   new  Parser();
                attValueParser.setEncoding(
" GB2312 " );
                attValueParser.setInputHTML(node.toHtml());
                NodeList attValueNodeList 
=  attValueParser.parse(attributeValue_filter);
                attValue 
=  attValueNodeList.elements().nextNode().toPlainTextString();
                bw.write(attName.trim() 
+  attValue.trim());
                bw.newLine();
            }
            
//  重置Parser
             this .getParser().reset();
            String imgUrl 
=   "" ;
            String fileType 
= "" ;
            
//  迭代逐渐查找
            nodeList = this .getParser().parse(img_filter);
            it 
=  nodeList.elements();
            
while  (it.hasMoreNodes()) {                
                Node node 
=  (Node) it.nextNode();
                
                ImageTag imgNode 
=  (ImageTag)node.getChildren().elements().nextNode();
                imgUrl 
=  imgNode.getAttribute( " src " );                
                fileType 
=  imgUrl.trim().substring(imgUrl
                        .lastIndexOf(
" . " +   1 );
                
// 生成新的图片的文件名
                String new_iamge_file  =  StringUtils.encodePassword(imgUrl, HASH_ALGORITHM)  +   " . "   +  fileType;
                
// imgUrl = new HtmlPaserFilterTest().replace(new_iamge_file, "+", " ");
                
// 利用miorr目录下的图片生成的新的图片
                 this .copyImage(imgUrl, new_iamge_file);
                bw.write(SEPARATOR 
+  NEWLINE);
                bw.write(new_iamge_file 
+  NEWLINE);
            }
            
            
        } 
catch (Exception e) {
            e.printStackTrace();
        } 
finally  {
            
try {
                
if  (bw  !=   null )
                    bw.close();
            }
catch (IOException e){
                e.printStackTrace();
            }
        }
        
    }
}

运行书上的heritrix实例,并按书上的默认设置进行抓取如下URI:(请自己分析整理)

<!--<br /> <br /> Code highlighting produced by Actipro CodeHighlighter (freeware)<br /> http://www.CodeHighlighter.com/<br /> <br /> --> http://mobile.younet.com/files/list_1.html
http://mobile.younet.com/files/list_2.html
http://mobile.younet.com/files/list_3.html

你可能感兴趣的:(html,mobile)