lucene3.6全文检索word2007

最近做了一个项目,有个需求是要全文检索上传的word文档的内容,经过经理推荐使用lucene,我下载了最新版的lucene,它是apache的一个项目.大体的实现分为两部分,一部分是创建索引,另一部分是利用索引查询.

创建索引的代码:

package test;  
  
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;

/**
 * Builds a Lucene 3.6 full-text index from the Word 2007 (.docx) files found
 * directly inside a directory.
 *
 * <p>The text of each document is extracted with Apache POI and stored in the
 * analyzed {@code contents} field; a stored, non-indexed {@code contractId}
 * field is attached to every document.
 *
 * @author KingViker
 */
public class LuceneIndex {

    /** File-name suffix of the Word 2007 documents that get indexed. */
    private static final String DOCX_SUFFIX = ".docx";

    /**
     * Rebuilds the index from scratch and prints the number of indexed
     * documents followed by the elapsed time in milliseconds.
     *
     * @param args unused
     * @throws IOException if the data directory cannot be listed, a document
     *                     cannot be read, or the index cannot be written
     */
    public static void main(String[] args) throws IOException {
        // Directory where the index files are written.
        String indexDir = "F:\\indexDir";
        // Directory holding the Word files to be indexed.
        String dataDir = "F:\\contract";

        // listFiles() returns null when the path is missing or is not a directory.
        // Check this before opening the writer: OpenMode.CREATE wipes any existing index.
        File[] files = new File(dataDir).listFiles();
        if (files == null) {
            throw new IOException("Cannot list files in directory: " + dataDir);
        }

        long start = System.currentTimeMillis();
        Directory dir = new SimpleFSDirectory(new File(indexDir));
        try {
            IndexWriterConfig config = new IndexWriterConfig(
                    Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36));
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
            IndexWriter indexWriter = new IndexWriter(dir, config);
            try {
                for (int i = 0; i < files.length; i++) {
                    if (isWord2007File(files[i])) {
                        indexWriter.addDocument(toDocument(files[i]));
                    }
                }
                // Report how many documents the writer now holds.
                System.out.println("numDocs" + indexWriter.numDocs());
            } finally {
                // Always release the index write lock, even when a document fails.
                indexWriter.close();
            }
        } finally {
            dir.close();
        }
        System.out.println(System.currentTimeMillis() - start);
    }

    /**
     * Tells whether a directory entry is a regular .docx file worth parsing.
     * Sub-directories and the {@code ~$} lock files Word creates next to an
     * open document are rejected.
     */
    private static boolean isWord2007File(File file) {
        String name = file.getName();
        int suffixLength = DOCX_SUFFIX.length();
        return file.isFile()
                && !name.startsWith("~$")
                // Case-insensitive suffix test; a too-short name yields false.
                && name.regionMatches(true, name.length() - suffixLength,
                        DOCX_SUFFIX, 0, suffixLength);
    }

    /**
     * Extracts the text of one Word 2007 file and wraps it in a Lucene document.
     *
     * @param file the .docx file to read
     * @return a document with the {@code contents} and {@code contractId} fields
     * @throws IOException if the file cannot be opened or parsed
     */
    private static Document toDocument(File file) throws IOException {
        InputStream in = new FileInputStream(file);
        try {
            XWPFWordExtractor extractor = new XWPFWordExtractor(new XWPFDocument(in));
            Document doc = new Document();
            // Full text of the Word document: stored and analyzed for searching.
            doc.add(new Field("contents", extractor.getText(),
                    Field.Store.YES, Field.Index.ANALYZED));
            // Contract id: stored only, not searchable. Every document receives
            // the same constant value "1" here.
            doc.add(new Field("contractId", "1", Field.Store.YES, Field.Index.NO));
            return doc;
        } finally {
            // Close the stream so file handles are not leaked across the loop.
            in.close();
        }
    }

}
代码里写了注释,这里使用到了poi来解析word,获得word里面全部的文本来创建索引.关于poi操作word,我的另一篇博客有一个简单的小例子,是向word2007中写入自己定义的文本的(http://blog.csdn.net/qq413041153/article/details/7854074),有兴趣的童鞋可以看看.

下面是查询的代码:

package test;  
  
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
/**
 * Searches the Lucene 3.6 index for a fixed query and prints the
 * {@code contractId} values of one page of hits.
 *
 * @author KingViker
 */
public class LuceneSearch {

    /** Number of hits shown per result page. */
    private static final int PAGE_SIZE = 25;

    /** Upper bound on the number of ranked hits requested from the searcher. */
    private static final int MAX_HITS = 10000;

    /**
     * Runs the query against the {@code contents} field, prints the total hit
     * count, then prints the {@code contractId} values of the first page.
     *
     * @param args unused
     * @throws IOException    if the index cannot be opened or read
     * @throws ParseException if the query string cannot be parsed
     */
    public static void main(String[] args) throws IOException, ParseException {
        // Directory where the index files are stored.
        String indexDir = "F:\\indexDir";
        Directory dir = new SimpleFSDirectory(new File(indexDir));
        try {
            IndexReader indexReader = IndexReader.open(dir);
            try {
                IndexSearcher indexSearch = new IndexSearcher(indexReader);
                try {
                    // Arguments: Lucene version, default field to search, analyzer
                    // used to tokenize the query text.
                    QueryParser queryParser = new QueryParser(Version.LUCENE_36,
                            "contents", new StandardAnalyzer(Version.LUCENE_36));
                    Query query = queryParser.parse("盖章");

                    // TopDocs.scoreDocs holds the ranked hits actually returned.
                    TopDocs hits = indexSearch.search(query, MAX_HITS);
                    // totalHits counts all matches, including any beyond MAX_HITS.
                    System.out.println("找到了" + hits.totalHits + "个");

                    int pageIndex = 0;
                    System.out.println(collectPage(indexSearch, hits.scoreDocs, pageIndex));
                } finally {
                    indexSearch.close();
                }
            } finally {
                // The searcher was built on an external reader, so the reader is
                // closed explicitly here.
                indexReader.close();
            }
        } finally {
            dir.close();
        }
    }

    /**
     * Loads the stored {@code contractId} of every hit on the requested page.
     *
     * @param searcher  searcher used to load the stored documents
     * @param scoreDocs ranked hits returned by the search
     * @param pageIndex zero-based page number
     * @return the contract ids of the page, in ranking order; empty when the
     *         page lies past the last hit
     * @throws IOException if a stored document cannot be read
     */
    private static ArrayList<String> collectPage(IndexSearcher searcher,
            ScoreDoc[] scoreDocs, int pageIndex) throws IOException {
        ArrayList<String> contractIds = new ArrayList<String>();
        // Pages start at offset 0 so the best-ranked hit is included.
        int startIndex = pageIndex * PAGE_SIZE;
        // Bound by the returned array, not totalHits, which may be larger.
        int endIndex = Math.min(startIndex + PAGE_SIZE, scoreDocs.length);
        for (int i = startIndex; i < endIndex; i++) {
            Document document = searcher.doc(scoreDocs[i].doc);
            contractIds.add(document.getFieldable("contractId").stringValue());
        }
        return contractIds;
    }
}
查询的代码也做了注释.lucene里面我暂时还不知道有什么好的分页方法,只能自己手动地判断需要的条数.如果有谁知道,还望不吝赐教.

例子下载地址:http://download.csdn.net/detail/qq413041153/4509738

发原创帖子不易,实验方法更不易,还望转载时注明出处!


你可能感兴趣的:(Lucene)