最近做了一个项目,有个需求是要全文检索上传的word文档的内容,经经理推荐使用lucene,于是我下载了最新版的lucene,它是apache的一个项目.大体的实现分为两部分,一部分是创建索引,另一部分是利用索引查询.
创建索引的代码:
package test;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
/**
 * Builds a Lucene (3.x API) full-text index from the Word documents found in a directory.
 *
 * <p>The text of each file is extracted with Apache POI's {@code XWPFWordExtractor} and stored in
 * an analyzed {@code contents} field, next to a stored-only {@code contractId} field.
 *
 * @author KingViker
 */
public class LuceneIndex {

    /**
     * Rebuilds the index from scratch and prints the number of indexed documents followed by the
     * elapsed time in milliseconds.
     *
     * @param args unused
     * @throws IOException if the data directory cannot be listed, a document cannot be read, or
     *         the index cannot be written
     */
    public static void main(String[] args) throws IOException {
        // Where the index files are stored.
        String indexDir = "F:\\indexDir";
        // Where the Word files to be indexed are located.
        String dataDir = "F:\\contract";

        // listFiles() returns null when the path is not a listable directory. Check this before
        // opening the writer, because OpenMode.CREATE replaces whatever index already exists.
        File[] files = new File(dataDir).listFiles();
        if (files == null) {
            throw new IOException("Cannot list data directory: " + dataDir);
        }

        Directory dir = new SimpleFSDirectory(new File(indexDir));
        try {
            IndexWriterConfig iwf = new IndexWriterConfig(Version.LUCENE_36,
                    new StandardAnalyzer(Version.LUCENE_36));
            iwf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
            IndexWriter indexWriter = new IndexWriter(dir, iwf);
            long start = System.currentTimeMillis();
            try {
                for (int i = 0; i < files.length; i++) {
                    // Sub-directories cannot be opened as a stream; skip them.
                    if (!files[i].isFile()) {
                        continue;
                    }
                    indexWriter.addDocument(toDocument(files[i]));
                }
                // Report how many documents the writer now holds.
                System.out.println("numDocs" + indexWriter.numDocs());
            } finally {
                // Always release the writer, even when a document fails to parse.
                indexWriter.close();
            }
            System.out.println(System.currentTimeMillis() - start);
        } finally {
            dir.close();
        }
    }

    /**
     * Extracts the text of one Word file and wraps it in a Lucene document.
     *
     * @param file the Word file to read
     * @return a document with a {@code contents} field and a {@code contractId} field
     * @throws IOException if the file cannot be opened or parsed
     */
    private static Document toDocument(File file) throws IOException {
        InputStream in = new FileInputStream(file);
        try {
            XWPFWordExtractor extractor = new XWPFWordExtractor(new XWPFDocument(in));
            Document doc = new Document();
            // Full text of the Word document: stored and analyzed so it is searchable.
            doc.add(new Field("contents", extractor.getText(),
                    Field.Store.YES, Field.Index.ANALYZED));
            // Contract id: stored but not indexed.
            // NOTE(review): every document gets the same id "1", so search results cannot be
            // told apart — presumably a placeholder for the real contract id; confirm.
            doc.add(new Field("contractId", "1", Field.Store.YES, Field.Index.NO));
            return doc;
        } finally {
            // The original never closed this stream, leaking one file handle per document.
            in.close();
        }
    }
}
代码里写了注释,这里使用到了poi来解析word,获得word里面全部的文本来创建索引.关于poi操作word,我的另一篇博客有一个简单的小例子,是向word2007中写入自己定义的文本的(http://blog.csdn.net/qq413041153/article/details/7854074),有兴趣的童鞋可以看看.
下面是查询的代码:
package test;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
/**
 * Searches the {@code contents} field of a Lucene (3.x API) index and prints one page of the
 * matching documents' {@code contractId} values.
 *
 * @author KingViker
 */
public class LuceneSearch {

    /** Number of hits shown per page. */
    private static final int PAGE_SIZE = 25;

    /**
     * Runs a fixed query against the index, prints the total hit count, then prints the
     * {@code contractId} values of the hits on the first page.
     *
     * @param args unused
     * @throws IOException if the index cannot be opened or read
     * @throws ParseException if the query string cannot be parsed
     */
    public static void main(String[] args) throws IOException, ParseException {
        // Where the index files are stored.
        String indexDir = "F:\\indexDir";
        Directory dir = new SimpleFSDirectory(new File(indexDir));
        try {
            IndexReader indexReader = IndexReader.open(dir);
            try {
                IndexSearcher indexSearch = new IndexSearcher(indexReader);
                try {
                    // Arguments: Lucene version, default field to search, analyzer for the query.
                    QueryParser queryParser = new QueryParser(Version.LUCENE_36,
                            "contents", new StandardAnalyzer(Version.LUCENE_36));
                    Query query = queryParser.parse("盖章");

                    // Zero-based page number; scoreDocs is zero-based too, so page 0 starts at
                    // index 0 (the original started at 1 and dropped the best hit).
                    int pageIndex = 0;
                    int startIndex = pageIndex * PAGE_SIZE;
                    int endIndex = startIndex + PAGE_SIZE;

                    // Only the top endIndex hits are needed to show this page; totalHits still
                    // reports the full number of matches.
                    TopDocs hits = indexSearch.search(query, endIndex);
                    System.out.println("找到了"+hits.totalHits+"个");
                    ScoreDoc[] scoreDocs = hits.scoreDocs;

                    // contractId values of the hits on the requested page.
                    ArrayList<String> list = new ArrayList<String>();
                    // Bound by scoreDocs.length, not totalHits: the array holds at most the
                    // number of hits requested, which can be fewer than totalHits.
                    int last = Math.min(endIndex, scoreDocs.length);
                    for (int i = startIndex; i < last; i++) {
                        Document document = indexSearch.doc(scoreDocs[i].doc);
                        list.add(document.getFieldable("contractId").stringValue());
                    }
                    System.out.println(list);
                } finally {
                    indexSearch.close();
                }
            } finally {
                // A searcher built from an existing reader does not close that reader.
                indexReader.close();
            }
        } finally {
            dir.close();
        }
    }
}
查询的代码也做了注释.lucene里面我暂时还不知道有什么好的分页方法,只能自己手动地判断需要的条数.如果有谁知道,还望不吝赐教.
例子下载地址:http://download.csdn.net/detail/qq413041153/4509738
发原创帖子不易,实验方法更不易,还望转载时注明出处!