lucene入门到项目开发

需要加入以下 jar 包:lucene-core-2.4.0.jar、je-analysis-1.4.0.jar、lucene-highlighter-2.4.1.jar、lucene-analyzers-2.4.1.jar

先准备下工具类

Java代码 
package com.cs.lucene.utils; 
 
import java.io.BufferedReader; 
import java.io.File; 
import java.io.FileInputStream; 
import java.io.FileNotFoundException; 
import java.io.InputStreamReader; 
 
import org.apache.lucene.document.Document; 
import org.apache.lucene.document.Field; 
import org.apache.lucene.document.NumberTools; 
import org.apache.lucene.document.Field.Index; 
import org.apache.lucene.document.Field.Store; 
 
public class File2DocumentUtiles { 
 
    /**
     *文件到document的转换
     * @param filepath
     * @return
     */ 
    public static Document file2Document(String filepath) { 
         
        File file = new File(filepath) ; 
         
        Document doc = new Document(); 
        doc.add(new Field("name",file.getName(),Store.YES,Index.ANALYZED)) ; //索引并分词 
        doc.add(new Field("content",readFileContent(file),Store.YES,Index.ANALYZED)) ; //索引并分词 
        doc.add(new Field("size",NumberTools.longToString(file.length()),Store.YES,Index.NOT_ANALYZED)) ; //索引不分词 
        doc.add(new Field("path",file.getPath(),Store.YES,Index.NO)) ; //不索引 
         
        return doc; 
    } 
/**
* 根据文件读取文件内容
* @param file
* @return
*/ 
    private static String readFileContent(File file) { 
         
        try { 
            BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file))); 
            StringBuffer content = new StringBuffer(); 
             
            for(String line=null; (line = reader.readLine())!=null ;){ 
                content.append(line).append("\n") ; 
            } 
            return content.toString() ; 
        } catch (Exception e) { 
            // TODO Auto-generated catch block 
            e.printStackTrace(); 
        } 
        return null; 
    } 
     
    public static void printDocInfo(Document doc){ 
        System.out.println("--------------------------"); 
        System.out.println("name          =" + doc.get("name")); 
        System.out.println("content       =" + doc.get("content")); 
        System.out.println("size          =" + NumberTools.stringToLong(doc.get("size"))); 
        System.out.println("path          =" + doc.get("path")); 
    } 
     
 



先了解下分词器
Java代码 
package com.cs.lucene.analyzer; 
 
import java.io.StringReader; 
 
import jeasy.analysis.MMAnalyzer; 
 
import org.apache.lucene.analysis.Analyzer; 
import org.apache.lucene.analysis.Token; 
import org.apache.lucene.analysis.TokenStream; 
import org.junit.Test; 
 
public class AnalyzerTest { 
      // Sample input handed to the analyzer by testAnalyze().
      String text = "资源来自互联网吴朝辉wwwa的a-b放到" ; 
      // Analyzer under test (MMAnalyzer from the imported jeasy.analysis package).
      Analyzer analyzer = new MMAnalyzer() ; 
       
      @Test 
      public void testAnalyze() throws Exception{ 
          analyze(analyzer,text); 
      } 
 
    private void analyze(Analyzer analyzer2, String text2) throws Exception { 
        System.out.println("----------分词器-------------------"); 
        TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(text)) ; 
        for(Token token = new Token();(token = tokenStream.next(token))!=null;){ 
            System.out.println(token); 
        } 
    } 
       
       



现在看看FSDirectory和RAMDirectory
Java代码 
package com.cs.lucene.directory; 
 
import jeasy.analysis.MMAnalyzer; 
 
import org.apache.lucene.analysis.Analyzer; 
import org.apache.lucene.document.Document; 
import org.apache.lucene.index.IndexWriter; 
import org.apache.lucene.index.IndexWriter.MaxFieldLength; 
import org.apache.lucene.store.Directory; 
import org.apache.lucene.store.FSDirectory; 
import org.apache.lucene.store.RAMDirectory; 
import org.junit.Test; 
 
import com.cs.lucene.utils.File2DocumentUtiles; 
 
public class DirectoryTest { 
    // Path of the source file that gets indexed by the tests below.
    String filePath = "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceDataSource\\资源来自互联网,版权归原创作者或原单位公司所有.txt"; 
    // Directory in which the file-system index is stored.
    String indexPath = "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceIndex" ; 
    // Analyzer used by every IndexWriter in this class.
    Analyzer analyzer = new MMAnalyzer(); // JE analyzer
     
     
    /**
     * 利用FSDirectory 创建索引
     * FSDirectory:在文件系统上存放
     * @throws Exception
     */ 
    @Test 
    public void testFSDirectory() throws Exception{  
          //测试文件系统目录   
          Directory dir =  FSDirectory.getDirectory(indexPath) ; 
          Document doc = File2DocumentUtiles.file2Document(filePath); 
           //参数true表示是否删除原来的索引后再重新创建,MaxFieldLength.LIMITED:表示只对前10000个字做索引 
           IndexWriter indexWriter = new IndexWriter(dir,analyzer,MaxFieldLength.LIMITED) ; //没有参数true,添加索引 
           indexWriter.addDocument(doc) ; 
            
           indexWriter.close() ; 
    } 
    /**
     * 利用RAMDirectory 创建索引
     * RAMDirectory:在内存中存放
     * 优点:读取快
     * 缺点:重新开机,索引没了
     * @throws Exception
     */ 
    @Test 
    public void testRAMDirectory() throws Exception{ 
          //测试文件系统目录 
          Directory dir =  new RAMDirectory() ; 
          Document doc = File2DocumentUtiles.file2Document(filePath); 
           //参数true表示是否删除原来的索引后再重新创建,MaxFieldLength.LIMITED:表示只对前10000个字做索引 
           IndexWriter indexWriter = new IndexWriter(dir,analyzer,MaxFieldLength.LIMITED) ; //没有参数true,添加索引 
           indexWriter.addDocument(doc) ; 
            
           indexWriter.close() ; 
    } 
    /**
     * 实际应用中,FSDirectory和RAMDirectory联合起来用
     * 操控内存的索引要快,所以在运行时操作RAMDirectory,
     * 但退出时必须保存到到文件系统上,所以退出时操控FSDirectory
     * @throws Exception
     */ 
    @Test 
    public void testRAMDirectoryAndFSDirectory() throws Exception{ 
        //整个过程:从文件系统上读取所以到内存,运行时添加索引,此时的全部索引都在内存中, 
        //退出时再把全部保存到文件系统上 
         
        Directory fsDir = FSDirectory.getDirectory(indexPath) ; 
         //1.启动时读取 
         Directory ramDir = new RAMDirectory(fsDir) ; 
         //运行时操作ramDir 
         IndexWriter ramIndexWriter = new IndexWriter(ramDir,analyzer,MaxFieldLength.LIMITED); 
         //添加document 
         Document doc = File2DocumentUtiles.file2Document(filePath) ; 
         ramIndexWriter.addDocument(doc) ; 
         ramIndexWriter.close() ;//一定要关闭再合并,因为有缓存 
          
         //2.退出时保存 
         //参数true表示把以前的索引删掉,全部重写 (默认为false) 
         IndexWriter fsIndexWriter = new IndexWriter(fsDir,analyzer,true,MaxFieldLength.LIMITED); 
         //new Directory[]{ramDir}:要合并的目录 
         //addIndexesNoOptimize:表示不做优化,做优化检索时相对要慢,但占用的存储空间小 
         fsIndexWriter.addIndexesNoOptimize(new Directory[]{ramDir}) ; 
         fsIndexWriter.flush() ; //优化之前一定要先刷新缓存 
         fsIndexWriter.optimize() ; //优化一定要在关闭之前做,优化可以提高检索的速度 
         fsIndexWriter.close() ; 
    } 
    @Test 
    public void testOptimize() throws Exception{ 
         
        Directory fsDir = FSDirectory.getDirectory(indexPath) ; 
        IndexWriter fsIndexWriter = new IndexWriter(fsDir,analyzer,MaxFieldLength.LIMITED); 
     
        fsIndexWriter.optimize() ;  
        fsIndexWriter.close() ; 
    } 



现在来测测索引如何建立以及搜索
Java代码 
package com.cs.lucene.lucene; 
 
import java.io.File; 
import java.util.ArrayList; 
import java.util.HashMap; 
import java.util.List; 
import java.util.Map; 
 
import jeasy.analysis.MMAnalyzer; 
 
import org.apache.lucene.analysis.Analyzer; 
import org.apache.lucene.document.Document; 
import org.apache.lucene.index.IndexWriter; 
import org.apache.lucene.index.IndexWriter.MaxFieldLength; 
import org.apache.lucene.queryParser.MultiFieldQueryParser; 
import org.apache.lucene.queryParser.QueryParser; 
import org.apache.lucene.search.Filter; 
import org.apache.lucene.search.IndexSearcher; 
import org.apache.lucene.search.Query; 
import org.apache.lucene.search.ScoreDoc; 
import org.apache.lucene.search.Sort; 
import org.apache.lucene.search.SortField; 
import org.apache.lucene.search.TopDocs; 
import org.apache.lucene.search.highlight.Formatter; 
import org.apache.lucene.search.highlight.Fragmenter; 
import org.apache.lucene.search.highlight.Highlighter; 
import org.apache.lucene.search.highlight.QueryScorer; 
import org.apache.lucene.search.highlight.Scorer; 
import org.apache.lucene.search.highlight.SimpleFragmenter; 
import org.apache.lucene.search.highlight.SimpleHTMLFormatter; 
 
import com.cs.lucene.utils.File2DocumentUtiles; 
 
public class IndexDao { 
    // Directory that holds the index; passed to IndexSearcher and IndexWriter.
    private String indexPath; 
    private Analyzer analyzer = null; // Analyzer used for query parsing, highlighting and indexing
 
    public IndexDao() { 
        this.indexPath = "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceIndex"; 
        this.analyzer = new MMAnalyzer(); // je分词器对中文支持很好 
 
    } 
 
    public IndexDao(Analyzer analyzer, String indexPath) { 
        this.analyzer = analyzer; 
        this.indexPath = indexPath; 
    } 
 
    /**
     * 接受一个QuerString字符串 搜索索引并返回结果
     * 
     */ 
    public QueryResult search(String queryString, int firstResult, 
            int maxResults) throws Exception { 
        // 1.把要搜索的fields解析为Query 
        String[] fields = { "name", "content" }; 
        // boosts:需要的理由,标题和内容中出现关键字的得分不一样,在标题中出现时的得分理应高些 
        Map<String, Float> boosts = new HashMap<String, Float>(); 
        boosts.put("name", 3.0f); 
        boosts.put("content", 1.0f); // 默认值 
 
        QueryParser queryParser = new MultiFieldQueryParser(fields, analyzer, 
                boosts);// 多field搜索 
        Query query = queryParser.parse(queryString); 
 
        return search(query, firstResult, maxResults); 
    } 
 
    /*
     * 接受一个Query对象 搜索索引并返回结果
     */ 
    public QueryResult search(Query query, int firstResult, int maxResults) 
            throws Exception { 
        IndexSearcher indexSearcher = null; 
        // 2.进行查询 
        indexSearcher = new IndexSearcher(indexPath); 
        Filter filter = null; // 搜索时的过滤器 
        /** ********过滤器************* */ 
        // 过滤器:把结果再过滤一遍,效率会很低 
        // filter = new 
        // RangeFilter("size",NumberTools.longToString(200),NumberTools.longToString(500),true,true); 
        /** ************************* */ 
        Sort sort = new Sort(); 
        // 默认是按升序排序,参数true:排序结果改为按降序排列 
        sort.setSort(new SortField[] { new SortField("size", true) }); 
        TopDocs topDocs = indexSearcher.search(query, filter, 10000, sort); 
 
        int recordCount = topDocs.totalHits; 
 
        /** ***********准备高亮器******************** */ 
        Formatter formatter = new SimpleHTMLFormatter("<font color='red'>", 
                "</font>"); 
        Scorer scorer = new QueryScorer(query); 
        Highlighter highlighter = new Highlighter(formatter, scorer); 
 
        // 50表示只显示50个字符 这里的50个字符是有关键字的左右部分(称之为最佳部分) 这里只是测试用 
        Fragmenter fragmenter = new SimpleFragmenter(500); 
        highlighter.setTextFragmenter(fragmenter); 
        /** ************************************ */ 
         
        // 3.取出当前的数据 
        List<Document> recordList = new ArrayList<Document>(); 
        int end = Math.min(firstResult + maxResults, recordCount); 
        for (int i = firstResult; i < end; i++) { 
            ScoreDoc scoreDoc = topDocs.scoreDocs[i]; 
 
            int docSn = scoreDoc.doc; 
            Document doc = indexSearcher.doc(docSn); 
 
            // 使用高亮器 
            String hc = highlighter.getBestFragment(analyzer, "content", doc 
                    .get("content")); 
 
            // 如果content中没有搜索的关键字,则截取content的前200个字符 
            if (hc == null) { 
                String content = doc.get("content"); 
                int endIndex = Math.min(200, content.length()); 
                hc = content.substring(0, endIndex); 
            } 
            doc.getField("content").setValue(hc); 
 
            recordList.add(doc); 
        } 
        // 打开结果 
        /*
         * for(ScoreDoc scoreDoc :topDocs.scoreDocs){ int docSn = scoreDoc.doc ;
         * //文档内部编号 Document doc = indexSearcher.doc(docSn); //根据编号查找相应的文档
         * File2DocumentUtiles.printDocInfo(doc) ; }
         */ 
        // 4.返回结果 
        return new QueryResult(recordCount, recordList); 
    } 
 
    /*
     * 建立索引并保存
     */ 
    public void save(String filePath) throws Exception { 
        Document doc = File2DocumentUtiles.file2Document(filePath); 
        // 在添加doc的时候,可以设定文档的分数,不过不建议这样做 
        // doc.setBoost(1.0f); //默认值 
 
        // 参数true表示是否删除原来的索引后再重新创建,MaxFieldLength.LIMITED:表示只对前10000个字做索引 
        IndexWriter indexWriter = new IndexWriter(indexPath, analyzer, false, 
                MaxFieldLength.LIMITED); 
        indexWriter.addDocument(doc); 
        indexWriter.commit(); 
        indexWriter.optimize(); 
        indexWriter.close(); 
 
    } 
     
    public void save(File file) throws Exception { 
        save(file.getAbsolutePath()) ; 
    } 
 
    /*
     * 建立索引并保存 可以直接传入的是目录
     */ 
    public void saveDirectory(File file) throws Exception { 
        if (file.isFile()) { // 如果是文件就建索引并保存 
            save(file.getAbsolutePath()); 
            return; 
        } 
        File[] childs = file.listFiles(); 
        for (int i = 0; i < childs.length; i++) { 
            File f = childs[i]; 
            if (f.isDirectory()) {// 如果是目录就递归调用 
                saveDirectory(f); 
            } else { 
                save(f.getAbsolutePath()); 
            } 
        } 
    } 
 
    /**
     * 测试递归
     */ 
    public void save(File file, int pointer) throws Exception { 
        StringBuffer str = new StringBuffer(); 
        for (int i = 0; i < pointer; i++) { 
            str.append("--"); 
        } 
        if (file.isFile()) { // 如果是文件就建索引并保存 
            System.out.println(str + file.getName()); 
            return; 
        } 
        File[] childs = file.listFiles(); 
        for (int i = 0; i < childs.length; i++) { 
            File f = childs[i]; 
            if (f.isDirectory()) {// 如果是目录就递归调用 
                System.out.println(str + f.getName()); 
                save(f, pointer + 1); 
            } else { 
                System.out.println(str + f.getName()); 
            } 
 
        } 
    } 



Java代码 
package com.cs.lucene.lucene; 
 
import java.util.ArrayList; 
import java.util.List; 
 
import org.apache.lucene.document.Document; 
 
public class QueryResult { 
    // Total number of hits reported for the query.
    private int recordCount = 0; 
    // The documents carried by this result.
    private List<Document> recordResults = new ArrayList<Document>(); 
 
    public QueryResult(int recordCount, List<Document> recordResults) { 
        this.recordCount = recordCount; 
        this.recordResults = recordResults; 
    } 
 
    public int getRecordCount() { 
        return recordCount; 
    } 
 
    public void setRecordCount(int recordCount) { 
        this.recordCount = recordCount; 
    } 
 
    public List<Document> getRecordResults() { 
        return recordResults; 
    } 
 
    public void setRecordResults(List<Document> recordResults) { 
        this.recordResults = recordResults; 
    } 
 




测试索引
Java代码 
package com.cs.lucene.lucene; 
 
import java.io.File; 
 
import jeasy.analysis.MMAnalyzer; 
 
import org.apache.lucene.analysis.Analyzer; 
import org.apache.lucene.document.Document; 
import org.junit.Test; 
 
import com.cs.lucene.utils.File2DocumentUtiles; 
 
public class IndexDaoTest { 
 
 
    // DAO under test, created with its default index directory and analyzer.
    private IndexDao indexDao = new IndexDao() ; 
     
    /*
     *搜索索引库,并返回结果
     */ 
    @Test 
    public void testSearch() throws Exception{ 
        String queryString = "www*" ; 
        QueryResult queryResults = indexDao.search(queryString ,0, 10) ; 
         //测试结果 
         System.out.println("总共有【"+queryResults.getRecordCount()+"】条匹配结果"); 
          
          for(int i =0 ; i<queryResults.getRecordResults().size();i++){ 
              Document doc = queryResults.getRecordResults().get(i) ; 
              File2DocumentUtiles.printDocInfo(doc) ; 
          } 
    } 
    /*
     * 测试索引源文件并保存到索引库
     */ 
    @Test 
    public void testSave() throws Exception{ 
        String filePath2 = "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceDataSource\\readme2.txt"; 
        //源文件 
        //String filePath = "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceDataSource\\资源来自互联网,版权归原创作者或原单位公司所有吴朝辉.txt"; 
         
          indexDao.save(filePath2); 
    } 
    /**
     * 用来给目录建索引并保存到索引库
     */ 
    @Test 
    public  void  testSaveDir() throws Exception{ 
        String filepath = "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceDataSource\\www" ; 
        File file = new File(filepath) ; 
        indexDao.saveDirectory(file); 
    } 



最后我们来看看lucene的查询功能
Java代码 
package com.cs.lucene.query; 
 
import java.util.Date; 
 
import org.apache.lucene.document.DateTools; 
import org.apache.lucene.document.Document; 
import org.apache.lucene.document.NumberTools; 
import org.apache.lucene.document.DateTools.Resolution; 
import org.apache.lucene.index.Term; 
import org.apache.lucene.search.BooleanQuery; 
import org.apache.lucene.search.PhraseQuery; 
import org.apache.lucene.search.Query; 
import org.apache.lucene.search.RangeQuery; 
import org.apache.lucene.search.TermQuery; 
import org.apache.lucene.search.WildcardQuery; 
import org.apache.lucene.search.BooleanClause.Occur; 
import org.junit.Test; 
 
import com.cs.lucene.lucene.IndexDao; 
import com.cs.lucene.lucene.QueryResult; 
import com.cs.lucene.utils.File2DocumentUtiles; 
 
public class QueryTest { 
     
    // DAO used to execute every query in this class (default index directory and analyzer).
    IndexDao indexDao = new IndexDao() ; 
     
    /*
     * 关键词查询
     */ 
    @Test 
    public void testTermQuery() throws Exception{ 
        Term term = new Term("name","资源"); 
        Query query = new TermQuery(term); 
     
        //查询打印结果 
         QueryAndPrintResult(query) ; 
    } 
    /*
     * 范围索引
     * 数字在query中都是字符串,所以要借助NumberTools工具类做转换
     */ 
    @Test 
    public void testRangeQuery() throws Exception{ 
        Term lowerTerm = new Term("size",NumberTools.longToString(200)); 
        Term upperTerm = new Term("size",NumberTools.longToString(500)); 
        //true表示是否包含边界 
        Query query = new RangeQuery(lowerTerm,upperTerm,true) ; 
         
        /*
        Term lowerTerm2 = new Term("size","200");
        Term upperTerm2 = new Term("size","500");
        Query query = new RangeQuery(lowerTerm2,upperTerm2,true) ; //true表示是否包含边界
    */ 
        //查询打印结果 
         QueryAndPrintResult(query) ; 
    } 
     
    /*
     * 测试NumberTools和DateTools
     */ 
    @Test 
    public void testNumberToolsAndDateTools() throws Exception{ 
         
    System.out.println("数字测试:");     
         System.out.println(NumberTools.longToString(200)); 
         System.out.println(NumberTools.longToString(500)); 
         System.out.println(NumberTools.stringToLong("000000000000dw")); 
         
    System.out.println("日期测试:");     
         System.out.println(DateTools.dateToString(new Date(), Resolution.SECOND)); 
         System.out.println(DateTools.dateToString(new Date(), Resolution.DAY)); 
         System.out.println(DateTools.stringToDate("20101005080855")); 
    } 
     
    /*
     * 通配符查询
     * ?:代表一个字符,*:代表0个或多个字符
     */ 
    @Test 
    public void testWildcardQuery() throws Exception{ 
        Term term = new Term("name","*me"); 
        Query query = new WildcardQuery(term) ;  
     
        //查询打印结果 
         QueryAndPrintResult(query) ; 
    } 
    /*
     * 短语查询:查询包含多个短语的query
     */ 
    @Test 
    public void testPhraseQuery() throws Exception{ 
        PhraseQuery phraseQuery = new PhraseQuery() ;  
        phraseQuery.add(new Term("name","资源")) ; 
        phraseQuery.add(new Term("name","作者")) ; 
         
        //setSlop:用来设置两个短语之间的最多可以隔多少个字符 
        phraseQuery.setSlop(20); 
     
        //查询打印结果 
         QueryAndPrintResult(phraseQuery) ; 
    } 
    /**
     * 布尔查询:非常重要
     * 三种关系:
     * 1.MUST和MUST:取得两个查询子句的交集。
     * 2.MUST和MUST_NOT:包含MUST但并且查询结果中不包含MUST_NOT的检索结果。
     * 3.SHOULT和SHOULT:表示"或"关系,最终检索结果为所有检索子句的并集。
     * 注意:有些组合是没有意义的
     * @throws Exception
     */ 
    @Test 
    public void testBooleanQuery() throws Exception{ 
        //条件1 
        PhraseQuery phraseQuery = new PhraseQuery() ;  
        phraseQuery.add(new Term("name","资源")) ; 
        phraseQuery.add(new Term("name","作者")) ; 
        phraseQuery.setSlop(20); 
         
        //条件2 
        Term lowerTerm2 = new Term("size","200"); 
        Term upperTerm2 = new Term("size","500"); 
        Query rangeQuery = new RangeQuery(lowerTerm2,upperTerm2,true) ; //true表示是否包含边界 
     
        //合并两个查询 
        BooleanQuery booleanQuery = new BooleanQuery() ; 
        booleanQuery.add(phraseQuery, Occur.MUST) ; 
        booleanQuery.add(rangeQuery,Occur.MUST) ; 
     
        //查询打印结果 
        QueryAndPrintResult(booleanQuery) ; 
    } 
     
     
     
    private void QueryAndPrintResult(Query query) throws Exception{ 
         
        System.out.println("相对应的查询字符串:"+query); 
          QueryResult qr = indexDao.search(query, 0, 100) ; 
         System.out.println("总共有【"+qr.getRecordCount()+"】条匹配结果"); 
         
        //打印结果 
          for(int i =0 ; i<qr.getRecordResults().size();i++){ 
              Document doc = qr.getRecordResults().get(i) ; 
              File2DocumentUtiles.printDocInfo(doc) ; 
          } 
    } 

你可能感兴趣的:(Lucene)