本篇目录:
1.索引建立的步骤
2.域的存储选项和索引选项
3.索引的增删查改
4.IndexReader的单例模式
5.域为数字和日期时的处理
6.IndexReader的生命周期
1.索引建立的步骤
建立索引就是为了在检索时从索引文件中进行快速查找。
先创建Directory和IndexWriter,然后创建文档Document,之后为文档添加域Field,域的存储选项Field.Store和索引选项Field.Index均需要设置;
文档相当于表中的每一条记录,域相当于表中的每一个字段。
最后通过IndexWriter将文档添加到索引中;
2.域的存储选项和索引选项
| Field.Index的值 | 是否索引 | 是否分词 | 是否加权 |
|---|---|---|---|
| NO | X | | |
| NOT_ANALYZED_NO_NORMS | V | X | X |
| ANALYZED | V | V | V |
| ANALYZED_NO_NORMS | V | V | X |
| NOT_ANALYZED | V | X | V |
3.索引的增删查改
下面的代码来自一门网络视频课程,授课者好像是一位大学老师。直接贴在这里,以备查用。
0、查询索引的基本信息:
通过IndexReader加载索引文件就可以直接获取文档的数量了(numDocs和maxDoc)
1、删除
2、恢复删除
3、强制删除
4、优化和合并索引
5、更新索引
package org.itat.index;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.StaleReaderException;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

/**
 * Demonstration helper that builds a small in-memory Lucene 3.5 index from six
 * sample documents and exposes delete / undelete / merge / update / query /
 * search operations on it.
 *
 * <p>Every operation reports I/O failures with {@code printStackTrace()} and
 * then returns normally. {@link CorruptIndexException},
 * {@link StaleReaderException} and {@link LockObtainFailedException} are all
 * subclasses of {@link IOException}, so a single {@code catch (IOException e)}
 * covers them.
 *
 * <p>NOTE(review): {@code reader} is static while {@code directory} is
 * per-instance, so constructing a second {@code IndexUtil} replaces the shared
 * reader without closing the previous one.
 */
public class IndexUtil {
    private String[] ids = {"1", "2", "3", "4", "5", "6"};
    // NOTE(review): these literals look like e-mail addresses that were redacted
    // when the page was captured. They contain no '@', so the domain lookup in
    // index() never matches a key in `scores` and every document gets the
    // fallback boost. Restore the real addresses before relying on boosting.
    private String[] emails = {"[email protected]", "[email protected]", "[email protected]",
            "[email protected]", "[email protected]", "[email protected]"};
    private String[] contents = {
        "welcome to visited the space,I like book",
        "hello boy, I like pingpeng ball",
        "my name is cc I like game",
        "I like football",
        "I like football and I like basketball too",
        "I like movie and swim"
    };
    private Date[] dates = null;
    private int[] attachs = {2, 3, 1, 4, 5, 5};
    private String[] names = {"zhangsan", "lisi", "john", "jetty", "mike", "jake"};
    private Directory directory = null;
    /** Document boost per e-mail domain; domains not listed get 0.5 in index(). */
    private Map<String, Float> scores = new HashMap<String, Float>();
    /** Shared reader handed to every searcher created by getSearcher(). */
    private static IndexReader reader = null;

    /**
     * Fills the sample dates and boost table, creates the in-memory directory,
     * indexes the sample documents and opens the shared read-write reader.
     */
    public IndexUtil() {
        try {
            setDates();
            scores.put("itat.org", 2.0f);
            scores.put("zttc.edu", 1.5f);
            // An on-disk index can be used instead, e.g.
            // FSDirectory.open(new File("d:/lucene/index02")).
            directory = new RAMDirectory();
            index();
            reader = IndexReader.open(directory, false);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns a searcher over the shared reader, opening the reader on first
     * use and swapping in a fresh one when the index has changed.
     *
     * @return a new searcher, or {@code null} if the reader could not be opened
     */
    public IndexSearcher getSearcher() {
        try {
            if (reader == null) {
                reader = IndexReader.open(directory, false);
            } else {
                // openIfChanged yields null when the current reader is still up to date.
                IndexReader changed = IndexReader.openIfChanged(reader);
                if (changed != null) {
                    reader.close();
                    reader = changed;
                }
            }
            return new IndexSearcher(reader);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /** Populates {@code dates} with one fixed sample date per document. */
    private void setDates() {
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
        try {
            dates = new Date[ids.length];
            dates[0] = sdf.parse("2010-02-19");
            dates[1] = sdf.parse("2012-01-11");
            dates[2] = sdf.parse("2011-09-19");
            dates[3] = sdf.parse("2010-12-22");
            dates[4] = sdf.parse("2012-01-01");
            dates[5] = sdf.parse("2011-05-19");
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }

    /** Restores all documents that were deleted but not yet merged away. */
    public void undelete() {
        IndexReader restoring = null;
        try {
            // Restoring is done through an IndexReader, which must be opened
            // with readOnly set to false.
            restoring = IndexReader.open(directory, false);
            restoring.undeleteAll();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close even when undeleteAll() fails so the reader is not leaked.
            closeReader(restoring);
        }
    }

    /** Merges the index down to two segments. */
    public void merge() {
        IndexWriter writer = null;
        try {
            writer = openWriter();
            // Merges the index into 2 segments; deleted documents in those
            // segments are purged. Note: since Lucene 3.5 this is discouraged
            // because it is very expensive; Lucene merges on its own as needed.
            writer.forceMerge(2);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeWriter(writer);
        }
    }

    /** Forces the merging of segments that contain deleted documents. */
    public void forceDelete() {
        IndexWriter writer = null;
        try {
            writer = openWriter();
            writer.forceMergeDeletes();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeWriter(writer);
        }
    }

    /** Deletes the document whose {@code id} term is "1" through an IndexWriter. */
    public void delete() {
        IndexWriter writer = null;
        try {
            writer = openWriter();
            // The argument selects what to delete: either a Query or a Term
            // (a Term is an exact-match value). The document is not removed
            // completely at this point; it sits in a "recycle bin" and can
            // still be restored.
            writer.deleteDocuments(new Term("id", "1"));
            writer.commit();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeWriter(writer);
        }
    }

    /** Deletes the document whose {@code id} term is "1" through the shared reader. */
    public void delete02() {
        try {
            reader.deleteDocuments(new Term("id", "1"));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Replaces the document with {@code id} "1" by a new document with id "11". */
    public void update() {
        IndexWriter writer = null;
        try {
            writer = openWriter();
            // Lucene has no in-place update; updateDocument is really a
            // delete followed by an add.
            Document doc = new Document();
            doc.add(new Field("id", "11", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
            doc.add(new Field("email", emails[0], Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.add(new Field("content", contents[0], Field.Store.NO, Field.Index.ANALYZED));
            doc.add(new Field("name", names[0], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
            writer.updateDocument(new Term("id", "1"), doc);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeWriter(writer);
        }
    }

    /** Prints the live, maximum and deleted document counts of the index. */
    public void query() {
        IndexReader counting = null;
        try {
            counting = IndexReader.open(directory);
            // The reader gives direct access to the document counts.
            System.out.println("numDocs:" + counting.numDocs());
            System.out.println("maxDocs:" + counting.maxDoc());
            System.out.println("deleteDocs:" + counting.numDeletedDocs());
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeReader(counting);
        }
    }

    /** Clears the index and adds one document per sample record. */
    public void index() {
        IndexWriter writer = null;
        try {
            writer = openWriter();
            writer.deleteAll();
            for (int i = 0; i < ids.length; i++) {
                Document doc = new Document();
                doc.add(new Field("id", ids[i], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
                doc.add(new Field("email", emails[i], Field.Store.YES, Field.Index.NOT_ANALYZED));
                // A second value under the same field name; the search methods print it.
                doc.add(new Field("email", "test" + i + "@test.com", Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.add(new Field("content", contents[i], Field.Store.NO, Field.Index.ANALYZED));
                doc.add(new Field("name", names[i], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
                // Store a number.
                doc.add(new NumericField("attach", Field.Store.YES, true).setIntValue(attachs[i]));
                // Store a date as its epoch-millisecond value.
                doc.add(new NumericField("date", Field.Store.YES, true).setLongValue(dates[i].getTime()));
                // Boost the document according to the part after the last '@'.
                String et = emails[i].substring(emails[i].lastIndexOf("@") + 1);
                System.out.println(et);
                if (scores.containsKey(et)) {
                    doc.setBoost(scores.get(et));
                } else {
                    doc.setBoost(0.5f);
                }
                writer.addDocument(doc);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeWriter(writer);
        }
    }

    /** Runs an exact-match query on {@code email} with a freshly opened reader and prints the hits. */
    public void search01() {
        IndexReader searching = null;
        try {
            searching = IndexReader.open(directory);
            IndexSearcher searcher = new IndexSearcher(searching);
            TermQuery query = new TermQuery(new Term("email", "[email protected]"));
            TopDocs tds = searcher.search(query, 10);
            for (ScoreDoc sd : tds.scoreDocs) {
                Document doc = searcher.doc(sd.doc);
                System.out.println("(" + sd.doc + "-" + doc.getBoost() + "-" + sd.score + ")"
                        + doc.get("name") + "[" + doc.get("email") + "]-->" + doc.get("id") + ","
                        + doc.get("attach") + "," + doc.get("date") + "," + secondEmail(doc));
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeReader(searching);
        }
    }

    /** Searches {@code content} for the term "like" through the shared reader and prints the hits. */
    public void search02() {
        IndexSearcher searcher = getSearcher();
        if (searcher == null) {
            // getSearcher() has already reported why no reader is available.
            return;
        }
        try {
            TermQuery query = new TermQuery(new Term("content", "like"));
            TopDocs tds = searcher.search(query, 10);
            for (ScoreDoc sd : tds.scoreDocs) {
                Document doc = searcher.doc(sd.doc);
                System.out.println(doc.get("id") + "---->"
                        + doc.get("name") + "[" + doc.get("email") + "]-->" + doc.get("id") + ","
                        + doc.get("attach") + "," + doc.get("date") + "," + secondEmail(doc));
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                searcher.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /** Opens a writer on {@code directory} using the Lucene 3.5 StandardAnalyzer. */
    private IndexWriter openWriter() throws IOException {
        return new IndexWriter(directory,
                new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
    }

    /** Closes {@code writer} if it was opened, reporting any failure. */
    private static void closeWriter(IndexWriter writer) {
        if (writer == null) {
            return;
        }
        try {
            writer.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Closes {@code toClose} if it was opened, reporting any failure. */
    private static void closeReader(IndexReader toClose) {
        if (toClose == null) {
            return;
        }
        try {
            toClose.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns the second stored {@code email} value, or {@code null} when the
     * document has fewer than two (as the document written by update() does).
     */
    private static String secondEmail(Document doc) {
        String[] all = doc.getValues("email");
        return all.length > 1 ? all[1] : null;
    }
}
4.IndexReader的单例模式
/**
 * Returns a searcher backed by the shared reader. The reader is opened on
 * first use; on later calls it is replaced by a fresh one only when
 * openIfChanged reports that the index has changed.
 *
 * @return a new searcher, or null if opening or refreshing the reader failed
 */
public IndexSearcher getSearcher() {
    try {
        if (reader != null) {
            // A null result means the current reader is still up to date.
            IndexReader refreshed = IndexReader.openIfChanged(reader);
            if (refreshed != null) {
                reader.close();
                reader = refreshed;
            }
        } else {
            reader = IndexReader.open(directory, false);
        }
        return new IndexSearcher(reader);
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return null;
}
5.域为数字和日期时的处理
//存储数字 doc.add(new NumericField("attach",Field.Store.YES,true).setIntValue(attachs[i])); //存储日期 doc.add(new NumericField("date",Field.Store.YES,true).setLongValue(dates[i].getTime()));
6.IndexReader的生命周期
对于IndexReader而言,之所以使用单例模式,是因为反复使用IndexReader.open打开会有很大的开销,所以一般在整个程序的生命周期中只会打开一个IndexReader,通过这个IndexReader来创建不同的IndexSearcher。
但如果使用单例模式,可能出现的问题有:
1、如果IndexWriter创建之后一直没有关闭,那么必须显式调用commit操作,所做的修改才会被提交并更新到索引文件中;
2、当使用IndexWriter修改了索引之后,IndexReader不会自动更新它加载过的索引信息,所以需要使用IndexReader.openIfChanged方法重新获取。