package com.wzh.ml.two;

import java.io.File;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Small Lucene 3.6 demo that builds, searches, updates, deletes from and
 * un-deletes a file-system index of six sample "mail" documents.
 *
 * <p>Every public method handles its own failures by printing the stack trace
 * and returning; none of them propagate checked exceptions.
 *
 * <p>The shared {@link IndexReader} is kept in a static field and is lazily
 * (re)opened without any synchronization, so this class is not safe for
 * concurrent use.
 */
public class HelloIndex {

    /** Location of the on-disk index. */
    private static final String INDEX_PATH = "F:\\MyLucene\\index2";

    /** Pattern used both to parse the sample dates and to print stored dates. */
    private static final String DATE_PATTERN = "yyyyMMdd";

    /** Sample dates, one per document, in {@link #DATE_PATTERN} form. */
    private static final String[] SAMPLE_DATES = {
        "20120601", "20120603", "20120605", "20120607", "20120609", "20120611"
    };

    // Mail ids.
    private final String[] ids = { "1", "2", "3", "4", "5", "6" };

    // Mail names (indexed as the "name" field).
    private final String[] names = { "Michael", "Scofield", "Tbag", "Jack", "Jade", "Jadyer" };

    // Mail addresses.
    // NOTE(review): these values look like scrape-obfuscated placeholders rather
    // than real addresses; none of them ends with "jadyer.cn" or "jadyer.me", so
    // createIndex() currently gives every document the 0.5 boost. Restore the
    // intended addresses once they are known.
    private final String[] emails = { "[email protected]", "[email protected]", "[email protected]",
        "[email protected]", "[email protected]", "[email protected]" };

    // Mail bodies.
    private final String[] contents = { "my blog", "my website", "my name",
        "I am JavaDeveloper", "I am from Haerbin", "I lik-e Lucene" };

    // Attachment counts (numbers and dates are indexed differently from strings).
    private final int[] attachs = { 9, 3, 5, 4, 1, 2 };

    // Mail dates, filled in by the constructor.
    private final Date[] dates = new Date[ids.length];

    // Opening a reader is expensive, so a single one is kept for the lifetime of
    // the program and only reopened when the index has changed.
    private static IndexReader reader = null;

    private Directory directory = null;

    /**
     * Parses the sample dates and opens the index directory at
     * {@link #INDEX_PATH}.
     *
     * <p>On any failure the stack trace is printed and the instance is left
     * partially initialised (remaining dates and/or the directory stay
     * {@code null}); later operations will then fail and report their own
     * errors.
     */
    public HelloIndex() {
        SimpleDateFormat sdf = new SimpleDateFormat(DATE_PATTERN);
        try {
            for (int i = 0; i < SAMPLE_DATES.length; i++) {
                dates[i] = sdf.parse(SAMPLE_DATES[i]);
            }
            directory = FSDirectory.open(new File(INDEX_PATH));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns the shared reader, opening it on first use and replacing it with
     * a fresh one when the index has changed since it was opened.
     *
     * @return the current reader, or {@code null} if it could not be opened or
     *         refreshed (the stack trace is printed in that case)
     */
    private IndexReader getIndexReader() {
        try {
            if (reader == null) {
                reader = IndexReader.open(directory);
            } else {
                // openIfChanged returns a new reader when the index changed
                // after 'reader' was opened, and null otherwise.
                IndexReader refreshed = IndexReader.openIfChanged(reader);
                if (refreshed != null) {
                    reader.close();
                    reader = refreshed;
                }
            }
            return reader;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Opens a writer on the index directory using a Lucene 3.6
     * {@link StandardAnalyzer}.
     *
     * @return a newly opened writer; the caller must close it
     * @throws IOException if the writer cannot be opened
     */
    private IndexWriter openWriter() throws IOException {
        return new IndexWriter(directory, new IndexWriterConfig(
                Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36)));
    }

    /**
     * Closes {@code writer} if it is non-null, printing (not propagating) any
     * {@link IOException}.
     *
     * @param writer the writer to close; may be {@code null}
     */
    private void closeWriter(IndexWriter writer) {
        if (writer == null) {
            return;
        }
        try {
            writer.close();
        } catch (IOException ce) {
            ce.printStackTrace();
        }
    }

    /**
     * Builds the document for sample entry {@code i}, using {@code id} as the
     * value of the "id" field. No boost is applied here.
     *
     * @param id value stored in the "id" field
     * @param i  index into the sample arrays
     * @return the populated document
     */
    private Document newMailDocument(String id, int i) {
        Document doc = new Document();
        doc.add(new Field("id", id, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
        doc.add(new Field("name", names[i], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
        doc.add(new Field("email", emails[i], Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("content", contents[i], Field.Store.NO, Field.Index.ANALYZED));
        // Numeric fields: the third constructor argument says whether to index.
        doc.add(new NumericField("attach", Field.Store.YES, true).setIntValue(attachs[i]));
        doc.add(new NumericField("date", Field.Store.YES, true).setLongValue(dates[i].getTime()));
        return doc;
    }

    /**
     * Prints the reader's {@code maxDoc}, {@code numDocs} and
     * {@code numDeletedDocs} to standard output.
     *
     * <p>All three values are read from one reader instance so that they
     * describe the same state of the index. If no reader is available a
     * message is written to standard error instead.
     */
    public void getDocCount() {
        IndexReader current = this.getIndexReader();
        if (current == null) {
            System.err.println("Index reader is unavailable; cannot report document counts.");
            return;
        }
        System.out.println("maxDoc:" + current.maxDoc());
        System.out.println("numDoc:" + current.numDocs());
        System.out.println("deleteDoc:" + current.numDeletedDocs());
    }

    /**
     * Rebuilds the index from the sample data: removes all existing documents,
     * then adds one document per sample entry with a boost chosen from the
     * e-mail suffix.
     */
    public void createIndex() {
        IndexWriter writer = null;
        try {
            writer = openWriter();
            writer.deleteAll(); // start from an empty index
            for (int i = 0; i < ids.length; i++) {
                Document doc = newMailDocument(ids[i], i);
                // Index-time boost (default 1.0): a higher boost ranks the
                // document higher in search results.
                if (emails[i].endsWith("jadyer.cn")) {
                    doc.setBoost(2.0f);
                } else if (emails[i].endsWith("jadyer.me")) {
                    doc.setBoost(1.5f);
                } else {
                    doc.setBoost(0.5f);
                }
                writer.addDocument(doc);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeWriter(writer);
        }
    }

    /**
     * Runs a term query for {@code content:my}, and prints up to ten hits with
     * their stored fields to standard output.
     *
     * <p>If no reader is available a message is written to standard error and
     * nothing is searched.
     */
    public void searchFile() {
        IndexReader current = getIndexReader();
        if (current == null) {
            System.err.println("Index reader is unavailable; cannot search.");
            return;
        }
        IndexSearcher searcher = new IndexSearcher(current);
        Query query = new TermQuery(new Term("content", "my"));
        SimpleDateFormat dateFormat = new SimpleDateFormat(DATE_PATTERN);
        try {
            TopDocs docs = searcher.search(query, 10);
            for (ScoreDoc sd : docs.scoreDocs) {
                Document doc = searcher.doc(sd.doc);
                // NOTE(review): getBoost() on a document loaded from the index
                // is not expected to reflect the index-time boost; confirm
                // against the Lucene 3.6 Document.getBoost() documentation.
                System.out.print("(doc的序列号:" + sd.doc + "|权重:" + doc.getBoost()
                        + "|分数:" + sd.score + ")" + doc.get("name") + "["
                        + doc.get("email") + "]-->");
                System.out.println("id:" + doc.get("id") + "," + doc.get("attach") + ","
                        + dateFormat.format(new Date(Long.parseLong(doc.get("date")))));
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                searcher.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Replaces the document whose id is "1" with a new document whose id is
     * "1111" (other fields taken from sample entry 0).
     *
     * <p>{@code updateDocument} deletes every document matching the term and
     * then adds the given one, so the term's value and the new document's id
     * need not agree. According to the original author's note, after running
     * this on a freshly created index the counts are maxDoc=7, numDocs=6,
     * deletedDocs=1.
     */
    public void updateIndex() {
        IndexWriter writer = null;
        try {
            writer = openWriter();
            Document doc = newMailDocument("1111", 0);
            writer.updateDocument(new Term("id", "1"), doc);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeWriter(writer);
        }
    }

    /**
     * Deletes the document whose id is "1".
     *
     * <p>Notes carried over from the original author:
     * <ul>
     * <li>After this call on a freshly created index the counts are
     * numDocs=5, maxDoc=6, deletedDocs=1: the document is only marked as
     * deleted and can still be restored.</li>
     * <li>Before Lucene 3.5 deleted documents were purged with
     * {@code IndexWriter.optimize()}, which was dropped in favour of
     * {@code forceMerge} because rewriting the whole index is expensive.</li>
     * <li>{@code IndexWriter.forceMergeDeletes()} forces deleted documents to
     * be purged; {@code IndexWriter.forceMerge(3)} merges the index down to
     * three segments and purges deleted documents from them.</li>
     * <li>Neither is recommended from Lucene 3.5 on because of the cost;
     * Lucene merges on its own as needed.</li>
     * </ul>
     */
    public void deleteIndex() {
        IndexWriter writer = null;
        try {
            writer = openWriter();
            // deleteDocuments accepts a Query (delete everything it matches) or
            // a Term (exact match).
            writer.deleteDocuments(new Term("id", "1"));
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeWriter(writer);
        }
    }

    /**
     * Restores all documents that are marked as deleted but not yet purged.
     *
     * @deprecated relies on write support in {@link IndexReader}, which Lucene
     *             documents as being removed in 4.0 with no replacement.
     */
    @Deprecated
    public void unDeleteIndex() {
        // Named differently from the static 'reader' field so it is not shadowed.
        IndexReader writableReader = null;
        try {
            // IndexReader.open(directory) yields a read-only reader; undelete
            // needs readOnly=false.
            writableReader = IndexReader.open(directory, false);
            writableReader.undeleteAll();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (null != writableReader) {
                try {
                    writableReader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}