Lucene是一个全文检索工具包,它可以对关键词和部分数据建立索引,从而优化查询效率
这里需要引入Lucene的jar包(version 3.5)和JUnit的jar包(version 4.0)
学习Lucene最重要的是以下几个部分:
索引部分
分词器
搜索部分
v1.下面先了解索引部分
v1.建立索引
首先建立一个IndexUtil类
//创建索引
/**
 * Builds an on-disk Lucene index under F:/lunece from the regular files found
 * directly inside F:/lunecetest1.
 *
 * Each file becomes one Document with three fields: "content" (read from the
 * file through a Reader), "filename" and "path" (both stored and indexed as a
 * single un-analyzed term). Sub-directories are skipped, and a missing or
 * unreadable source directory results in an empty run instead of a
 * NullPointerException. I/O and index errors are reported with
 * printStackTrace(); the writer is always closed.
 */
public void index() {
    IndexWriter writer = null;
    try {
        // 1. Create the Directory: decides where the index lives (memory or disk).
        //    Here it is kept on disk; a RAMDirectory would keep it in memory instead.
        Directory indexDir = FSDirectory.open(new File("F:/lunece"));
        // 2. Create the IndexWriter that writes the index.
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35));
        writer = new IndexWriter(indexDir, iwc);
        // 3. Create Document objects; a Document is comparable to a table row
        //    in a database, or to one file on disk.
        File sourceDir = new File("F:/lunecetest1");
        File[] files = sourceDir.listFiles();
        if (files == null) {
            // listFiles() returns null when the path does not exist, is not a
            // directory, or cannot be read.
            System.err.println("Cannot list files in " + sourceDir.getAbsolutePath());
            return;
        }
        for (File file : files) {
            if (!file.isFile()) {
                // FileReader cannot open a directory; skipping it keeps one
                // sub-directory from aborting the whole indexing run.
                continue;
            }
            Document doc = new Document();
            // 4. Add Fields to the Document; a Field is comparable to a table
            //    column and selects what gets indexed.
            doc.add(new Field("content", new FileReader(file)));
            /*
             * Field.Store.YES / NO (store option):
             *   YES - the field value is stored in the index in full, so the
             *         original text can be restored later (doc.get).
             *   NO  - the value is not stored; it can still be indexed, but the
             *         original text cannot be restored through doc.get.
             */
            doc.add(new Field("filename", file.getName(), Field.Store.YES, Field.Index.NOT_ANALYZED));
            /*
             * Field.Index (index option):
             *   ANALYZED              - tokenize and index; suits titles, body text, etc.
             *   NOT_ANALYZED          - index as one term without tokenizing; suits
             *                           exact-match values such as ID numbers or names.
             *   ANALYZED_NO_NORMS     - tokenize but do not store norms (norms carry
             *                           index-time boost / length normalization).
             *   NOT_ANALYZED_NO_NORMS - neither tokenize nor store norms.
             */
            doc.add(new Field("path", file.getAbsolutePath(), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
            // 5. Add the document to the index through the IndexWriter.
            writer.addDocument(doc);
        }
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (LockObtainFailedException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        try {
            // 6. The writer must be closed, otherwise the index may not be written correctly.
            if (writer != null) {
                writer.close();
            }
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
v3.搜索索引
//搜索索引
/**
 * Searches the on-disk index under f:/lunece for documents whose "content"
 * field matches the query "java" and prints "filename[path]" for up to the
 * top 10 hits.
 *
 * Any failure is reported with printStackTrace() instead of being silently
 * discarded, and the IndexReader is closed even when the search fails.
 */
public void search(){
    IndexReader reader = null;
    try {
        // 1. Open the directory that holds the index.
        Directory directory = FSDirectory.open(new File("f:/lunece"));
        // 2. Read the index.
        reader = IndexReader.open(directory);
        // 3. Create an IndexSearcher on top of the reader.
        IndexSearcher searcher = new IndexSearcher(reader);
        // 4. Create the Query. The parser's second argument is the default
        //    field to search; the query text is analyzed with StandardAnalyzer.
        QueryParser parse = new QueryParser(Version.LUCENE_35, "content", new StandardAnalyzer(Version.LUCENE_35));
        // Matches documents whose default field contains "java".
        Query query = parse.parse("java");
        // 5. Run the search and get the TopDocs (at most 10 hits).
        TopDocs tds = searcher.search(query, 10);
        // 6. Get the ScoreDoc entries from the TopDocs.
        ScoreDoc[] sds = tds.scoreDocs;
        for (ScoreDoc sd : sds) {
            // 7. Load the concrete Document for this hit.
            Document d = searcher.doc(sd.doc);
            // 8. Read the stored values from the Document.
            System.out.println(d.get("filename")+"["+d.get("path")+"]");
        }
    } catch (Exception e) {
        // Broad catch kept from the original (covers I/O and query-parse
        // failures), but the error is now reported rather than swallowed.
        e.printStackTrace();
    } finally {
        // Close the reader on every path so a failed search does not leak it.
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
//查询索引
/**
 * Prints document statistics of the index held in {@code directory}:
 * the number of live documents, the maxDoc value, and the number of
 * documents currently marked as deleted.
 *
 * I/O failures are reported with printStackTrace().
 */
public void query() {
    try {
        IndexReader indexReader = IndexReader.open(directory);

        // The reader exposes the document counts directly.
        int liveDocs = indexReader.numDocs();
        System.out.println("numDocs:" + liveDocs);

        int maxDoc = indexReader.maxDoc();
        System.out.println("maxDocs:" + maxDoc);

        int deletedDocs = indexReader.numDeletedDocs();
        System.out.println("deleteDocs:" + deletedDocs);

        indexReader.close();
    } catch (IOException ex) {
        // CorruptIndexException is an IOException, so one handler covers both.
        ex.printStackTrace();
    }
}
//删除索引
例如我有如下数据:
// Sample data used by index(): element i of each array describes document i.

// Value of the "id" field; delete() removes the document whose id is "1".
private String[] ids = {"1","2","3","4","5","6"};
// Value of the "email" field; the text after the last '@' is used to pick the document boost.
// NOTE(review): these values look like redaction placeholders rather than real addresses - confirm.
private String[] emails = {"[email protected]","[email protected]","[email protected]","[email protected]","[email protected]","[email protected]"};
// Text of the analyzed (tokenized, not stored) "content" field.
private String[] contents = {
        "welcome to visited the space,I like book",
        "hello boy, I like pingpeng ball",
        "my name is cc I like game",
        "I like football",
        "I like football and I like basketball too",
        "I like movie and swim"
};
// Values for the numeric "date" field. Left null here although index() reads dates[i];
// NOTE(review): it must be assigned before index() runs - the assignment is not visible in this part of the file, confirm.
private Date[] dates = null;
// Values for the numeric "attach" field.
private int[] attachs = {2,3,1,4,5,5};
// Value of the "name" field.
private String[] names = {"zhangsan","lisi","john","jetty","mike","jake"};
// In-memory index shared by the methods of this class.
private Directory directory = new RAMDirectory();
/**
 * Rebuilds the sample index in {@code directory}: clears all existing
 * documents, then adds one document per entry of {@code ids}.
 *
 * Each document gets the fields "id", "email" (added twice: the sample address
 * and a generated "test<i>@test.com" one), "content", "name", and the numeric
 * fields "attach" and "date". The document boost is looked up in
 * {@code scores} by the part of the email after the last '@', defaulting to
 * 0.5 when no entry exists. I/O failures are reported with printStackTrace();
 * the writer is always closed.
 */
public void index() {
    IndexWriter indexWriter = null;
    try {
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_35);
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_35, analyzer);
        indexWriter = new IndexWriter(directory, config);
        // Start from an empty index.
        indexWriter.deleteAll();

        for (int idx = 0; idx < ids.length; idx++) {
            Document document = new Document();
            document.add(new Field("id", ids[idx], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
            document.add(new Field("email", emails[idx], Field.Store.YES, Field.Index.NOT_ANALYZED));
            document.add(new Field("email", "test" + idx + "@test.com", Field.Store.YES, Field.Index.NOT_ANALYZED));
            document.add(new Field("content", contents[idx], Field.Store.NO, Field.Index.ANALYZED));
            document.add(new Field("name", names[idx], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));

            // Store a number.
            NumericField attachField = new NumericField("attach", Field.Store.YES, true);
            document.add(attachField.setIntValue(attachs[idx]));

            // Store a date as its epoch-millisecond value.
            NumericField dateField = new NumericField("date", Field.Store.YES, true);
            document.add(dateField.setLongValue(dates[idx].getTime()));

            // Everything after the last '@' selects the boost.
            String address = emails[idx];
            String domain = address.substring(address.lastIndexOf("@") + 1);
            System.out.println(domain);
            if (!scores.containsKey(domain)) {
                document.setBoost(0.5f);
            } else {
                document.setBoost(scores.get(domain));
            }

            indexWriter.addDocument(document);
        }
    } catch (IOException ex) {
        // CorruptIndexException and LockObtainFailedException are IOExceptions.
        ex.printStackTrace();
    } finally {
        if (indexWriter != null) {
            try {
                indexWriter.close();
            } catch (IOException ex) {
                ex.printStackTrace();
            }
        }
    }
}
/**
 * Deletes every document whose "id" field is exactly "1" from the index in
 * {@code directory} and commits the change.
 *
 * I/O failures are reported with printStackTrace(); the writer is always closed.
 */
public void delete() {
    IndexWriter indexWriter = null;
    try {
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_35);
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_35, analyzer);
        indexWriter = new IndexWriter(directory, config);

        // deleteDocuments accepts either a Query or a Term; a Term is an exact-match value.
        // The matching documents are not removed outright at this point: they are only
        // marked as deleted (a kind of recycle bin) and can still be restored.
        Term idTerm = new Term("id", "1");
        indexWriter.deleteDocuments(idTerm);
        indexWriter.commit();
    } catch (IOException ex) {
        // CorruptIndexException and LockObtainFailedException are IOExceptions.
        ex.printStackTrace();
    } finally {
        if (indexWriter != null) {
            try {
                indexWriter.close();
            } catch (IOException ex) {
                ex.printStackTrace();
            }
        }
    }
}
然后建立一个TestLunece测试类
/** Builds the index by calling index() on a fresh IndexUtil. */
@Test
public void testIndex() {
    new IndexUtil().index();
}
/** Exercises delete() on a fresh IndexUtil. */
@Test
public void testDelete() {
    new IndexUtil().delete();
}
//恢复索引
/**
 * Restores all documents that are currently marked as deleted in the index
 * held in {@code directory}.
 *
 * The restore is done through an IndexReader, which must be opened with
 * readOnly set to false. The reader is closed in a finally block so that a
 * failure in undeleteAll() does not leave the writable reader open.
 * Failures are reported with printStackTrace().
 */
public void undelete() {
    IndexReader reader = null;
    try {
        // readOnly must be false, otherwise the reader cannot modify the index.
        reader = IndexReader.open(directory,false);
        reader.undeleteAll();
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (StaleReaderException e) {
        e.printStackTrace();
    } catch (LockObtainFailedException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Always release the reader, even when undeleteAll() failed.
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
v4.搜索部分
建立SearcherUtil