1.前言
之前的博客《Lucene全文检索之HelloWorld》已经简单介绍了Lucene的索引生成和检索。本文着重介绍Lucene的索引删除。
2.应用场景:
索引建立完成后,由于某些原因,被索引的文件可能已经被删除。此时索引仍然存在,为了不产生“虚假检索结果”,需要将失效的索引删除。
3.HelloLucene类(重点关注deleteIndexByQuery方法)
package com.njupt.zhb;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/*
 *@author: ZhengHaibo
 *web:     http://blog.csdn.net/nuptboyzhb
 *mail:    [email protected]
 *2013-08-27  Nanjing,njupt,China
 */

/**
 * Minimal Lucene 4.4 example that builds an on-disk index from a directory of
 * text files, searches it, and deletes indexed documents by query.
 *
 * <p>Every method opens the index directory itself and releases it (together
 * with any writer or reader) before returning, including when an error occurs,
 * so a failed call does not leave the index write lock held.
 */
public class HelloLucene {

    /** Field holding the analyzed file contents; used for search and delete-by-query. */
    private static final String FIELD_CONTENTS = "contents";
    /** Field holding the file path; also the key used to replace a re-indexed file. */
    private static final String FIELD_PATH = "path";
    /** Field holding the bare file name. */
    private static final String FIELD_FILENAME = "filename";
    /** Field holding the file's last-modified timestamp in milliseconds. */
    private static final String FIELD_MODIFIED = "modified";
    /** Maximum number of hits printed by {@link #searcher(String, String)}. */
    private static final int MAX_HITS = 10;

    /**
     * Indexes all readable files under a directory.
     *
     * <p>The index is opened in {@code CREATE_OR_APPEND} mode, so an existing
     * index is extended rather than overwritten. I/O errors are printed to
     * standard error and not propagated.
     *
     * @param indexPath directory in which the index is stored (e.g. {@code "index"})
     * @param docsPath  file or directory containing the documents to index
     */
    public void index(String indexPath, String docsPath) {
        try {
            // 1. Open the on-disk Directory.
            Directory dir = FSDirectory.open(new File(indexPath));
            try {
                // 2. Create the IndexWriter.
                Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
                IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_44, analyzer);
                iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
                IndexWriter writer = new IndexWriter(dir, iwc);
                try {
                    indexDocs(writer, new File(docsPath));
                } finally {
                    // Always close the writer: an unclosed writer keeps the
                    // index write lock and blocks every later writer.
                    writer.close();
                }
            } finally {
                dir.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Adds one file to the index, or recurses into a directory and adds every
     * readable file below it. Unreadable files, and files that cannot be
     * opened, are skipped silently.
     *
     * @param writer writer of the index the documents are added to
     * @param file   file to index, or directory to walk
     * @throws IOException if reading a file or writing to the index fails
     */
    public void indexDocs(IndexWriter writer, File file) throws IOException {
        if (!file.canRead()) {
            return;
        }

        if (file.isDirectory()) {
            // list() returns null when an I/O error occurs.
            String[] files = file.list();
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
            return;
        }

        FileInputStream fis;
        try {
            fis = new FileInputStream(file);
        } catch (FileNotFoundException fnfe) {
            // The file could not be opened; skip it.
            return;
        }

        try {
            // 3. Create the Document.
            Document doc = new Document();

            // 4. Add the fields.
            // Path and file name are indexed as single, untokenized terms and
            // stored so they can be printed with search results.
            Field pathField = new StringField(FIELD_PATH, file.getPath(), Field.Store.YES);
            doc.add(pathField);
            doc.add(new StringField(FIELD_FILENAME, file.getName(), Field.Store.YES));

            // Last-modified time with millisecond resolution. A coarser value
            // (e.g. 2011021714 for February 17, 2011, 2-3 PM) could be used
            // instead if that resolution is too fine.
            doc.add(new LongField(FIELD_MODIFIED, file.lastModified(), Field.Store.YES));

            // File contents: tokenized and indexed from a Reader, but not
            // stored. The bytes are decoded as UTF-8; files in another
            // encoding will not be searchable for non-ASCII text.
            doc.add(new TextField(FIELD_CONTENTS,
                    new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

            if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                // New index: no older copy of this document can exist.
                System.out.println("adding " + file);
                writer.addDocument(doc);
            } else {
                // Existing index: replace any document previously indexed
                // under the same path instead of adding a duplicate.
                System.out.println("updating " + file);
                writer.updateDocument(new Term(FIELD_PATH, file.getPath()), doc);
            }
        } finally {
            fis.close();
        }
    }

    /**
     * Searches the {@code contents} field and prints score, file name, path
     * and modification time of the top 10 hits to standard output.
     *
     * <p>I/O and query-syntax errors are printed to standard error and not
     * propagated.
     *
     * @param indexPath     directory in which the index is stored
     * @param searchKeyword query text, in Lucene classic query-parser syntax
     */
    public void searcher(String indexPath, String searchKeyword) {
        try {
            Directory dir = FSDirectory.open(new File(indexPath));
            try {
                IndexReader reader = DirectoryReader.open(dir);
                try {
                    IndexSearcher searcher = new IndexSearcher(reader);
                    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
                    QueryParser parser = new QueryParser(Version.LUCENE_44, FIELD_CONTENTS, analyzer);
                    Query query = parser.parse(searchKeyword);

                    TopDocs tds = searcher.search(query, MAX_HITS);
                    ScoreDoc[] sds = tds.scoreDocs;
                    // Print every hit for the given keyword.
                    for (ScoreDoc sd : sds) {
                        Document document = searcher.doc(sd.doc);
                        System.out.println("score:" + sd.score
                                + "--filename:" + document.get(FIELD_FILENAME)
                                + "--path:" + document.get(FIELD_PATH)
                                + "--time" + document.get(FIELD_MODIFIED));
                    }
                } finally {
                    // Close the reader even if parsing or searching failed.
                    reader.close();
                }
            } finally {
                dir.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }

    /**
     * Deletes every indexed document whose {@code contents} field matches the
     * given query, then commits the deletion.
     *
     * <p>I/O and query-syntax errors are printed to standard error and not
     * propagated.
     *
     * <p>NOTE(review): with {@code StandardAnalyzer}, a CJK keyword appears to
     * be split into single-character terms that are combined with OR, so this
     * may delete more documents than an exact-phrase match would. Confirm
     * before using it on real data.
     *
     * @param indexPath     directory in which the index is stored
     * @param deleteKeyword query text; documents matching it are deleted
     */
    public void deleteIndexByQuery(String indexPath, String deleteKeyword) {
        try {
            // 1. Build the Query first, so that a syntax error is reported
            //    before the index write lock is acquired.
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
            QueryParser parser = new QueryParser(Version.LUCENE_44, FIELD_CONTENTS, analyzer);
            Query query = parser.parse(deleteKeyword);

            // 2. Open the IndexWriter.
            Directory dir = FSDirectory.open(new File(indexPath));
            try {
                IndexWriter writer = new IndexWriter(dir,
                        new IndexWriterConfig(Version.LUCENE_44, analyzer));
                try {
                    // 3. Delete all documents matching the query and commit.
                    writer.deleteDocuments(query);
                    writer.commit();
                } finally {
                    // Always release the write lock.
                    writer.close();
                }
            } finally {
                dir.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }
}
package com.njupt.zhb;

import org.junit.Test;

/*
 *@author: ZhengHaibo
 *web:     http://blog.csdn.net/nuptboyzhb
 *mail:    [email protected]
 *2013-08-25  Nanjing,njupt,China
 */

/**
 * Manual JUnit drivers for {@link HelloLucene}. Each test only prints to the
 * console; nothing is asserted.
 */
public class TestJunit {

    /** Directory in which the index is stored. */
    private static final String INDEX_DIR = "index";
    /** Directory holding the text files to be indexed. */
    private static final String DOCS_DIR = "D:\\lucene";

    /** JUnit creates a new test instance per test method, so this is never shared. */
    private final HelloLucene lucene = new HelloLucene();

    /** Builds (or updates) the index from the documents directory. */
    @Test
    public void TestIndex() {
        lucene.index(INDEX_DIR, DOCS_DIR);
    }

    /** Searches the index for one keyword and prints the hits. */
    @Test
    public void TestSearcher() {
        lucene.searcher(INDEX_DIR, "南京");
    }

    /** Prints the hits for a keyword, deletes the matching documents, then searches again. */
    @Test
    public void TestDeleteIndexByQuery() {
        final String keyword = "北京";

        System.out.println("未删除前,查询关键字:北京 --结果:");
        lucene.searcher(INDEX_DIR, keyword);

        lucene.deleteIndexByQuery(INDEX_DIR, keyword);

        System.out.println("删除后,查询关键字:北京 --结果:");
        lucene.searcher(INDEX_DIR, keyword);
    }
}
5.1运行TestIndex方法
>控制台打印的信息
updating D:\lucene\lucene1.txt updating D:\lucene\lucene2.txt updating D:\lucene\lucene3.txt updating D:\lucene\北京.txt updating D:\lucene\南京.txt
5.2运行TestSearcher方法
>搜索含有关键字“南京”的文档
score:0.53033006--filename:lucene3.txt--path:D:\lucene\lucene3.txt--time1376828819375 score:0.48666292--filename:lucene2.txt--path:D:\lucene\lucene2.txt--time1376828783791 score:0.2155931--filename:北京.txt--path:D:\lucene\北京.txt--time1377784223795 score:0.1530931--filename:南京.txt--path:D:\lucene\南京.txt--time1377784261486
5.3运行TestDeleteIndexByQuery方法
>
未删除前,查询关键字:北京 --结果:
score:0.4847152--filename:lucene2.txt--path:D:\lucene\lucene2.txt--time1376828783791
score:0.39226472--filename:北京.txt--path:D:\lucene\北京.txt--time1377784223795
score:0.10348864--filename:lucene3.txt--path:D:\lucene\lucene3.txt--time1376828819375
score:0.029874597--filename:南京.txt--path:D:\lucene\南京.txt--time1377784261486
删除后,查询关键字:北京 --结果:
(删除后,再次查询该关键字时,无查询结果。)
此时,index目录下的文件结构为:
多出了一个_0_1.del文件。该文件记录了被标记为删除的文档;这些文档的数据要等到段合并时才会从索引文件中真正清除。
项目源代码:http://download.csdn.net/detail/nuptboyzhb/6041239
未经允许,不得用于商业目的