Lucene3.5 之 索引删除和更新


  1. package com.ethan.index;  
  2.   
  3. import java.io.File;  
  4. import java.io.IOException;  
  5.   
  6. import org.apache.commons.io.FileUtils;  
  7. import org.apache.lucene.analysis.standard.StandardAnalyzer;  
  8. import org.apache.lucene.document.Document;  
  9. import org.apache.lucene.document.Field;  
  10. import org.apache.lucene.index.CorruptIndexException;  
  11. import org.apache.lucene.index.IndexReader;  
  12. import org.apache.lucene.index.IndexReader.FieldOption;  
  13. import org.apache.lucene.index.IndexWriter;  
  14. import org.apache.lucene.index.IndexWriterConfig;  
  15. import org.apache.lucene.index.Term;  
  16. import org.apache.lucene.store.Directory;  
  17. import org.apache.lucene.store.FSDirectory;  
  18. import org.apache.lucene.store.LockObtainFailedException;  
  19. import org.apache.lucene.util.Version;  
  20.   
  21. public class IndexUtil {  
  22.     private String[] ids = {"1","2","3","4","5","6"};  
  23.     private String[] emails = {"[email protected]","[email protected]","[email protected]","[email protected]","[email protected]","[email protected]"};  
  24.     private String[] contents = {  
  25.             "welcome to nba hot",  
  26.             "my name is ethan",  
  27.             "someone like you ",  
  28.             "rolling in the deep, you like",  
  29.             "i like fast........",  
  30.             "l like sports"  
  31.     };  
  32.       
  33.     private int[] attachs = {2,3,1,5,4,6};  
  34.     private String[] names = {"ethan","sara","michael","wade","lin","paul"};  
  35.       
  36.     private Directory directory = null;  
  37.       
  38.     public IndexUtil() {  
  39.          try {  
  40.             directory = FSDirectory.open(new File("C:\\Users\\ETHAN\\workspace\\hellolucene\\index02"));  
  41.         } catch (IOException e) {  
  42.             e.printStackTrace();  
  43.         }  
  44.     }  
  45.     public void index() {  
  46.         IndexWriter writer = null;  
  47.           
  48.         try {  
  49.             writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35,new StandardAnalyzer(Version.LUCENE_35)) );  
  50.             Document doc = null;  
  51.             for(int i=0;i
  52.                 doc = new Document();  
  53.                 doc.add(new Field("id",ids[i],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));  
  54.                 doc.add(new Field("email",emails[i],Field.Store.YES,Field.Index.NOT_ANALYZED));  
  55.               
  56.                 doc.add(new Field("content",contents[i],Field.Store.NO,Field.Index.ANALYZED));  
  57.                 doc.add(new Field("name",names[i],Field.Store.YES,Field.Index.ANALYZED_NO_NORMS));  
  58.                   
  59.                 writer.addDocument(doc);  
  60.             }  
  61.         } catch (CorruptIndexException e) {  
  62.             e.printStackTrace();  
  63.         } catch (LockObtainFailedException e) {  
  64.             e.printStackTrace();  
  65.         } catch (IOException e) {  
  66.             e.printStackTrace();  
  67.         } finally {  
  68.             if(writer!=null) {  
  69.                 try {  
  70.                     writer.close();  
  71.                 } catch (CorruptIndexException e) {  
  72.                     e.printStackTrace();  
  73.                 } catch (IOException e) {  
  74.                     e.printStackTrace();  
  75.                 }  
  76.             }  
  77.         }  
  78.           
  79.     }  
  80.   
  81.     public void query() {  
  82.           
  83.         try {  
  84.             IndexReader reader = IndexReader.open(directory);  
  85.             //被存储的  
  86.             System.out.println("numDocs: "+reader.numDocs());  
  87.               
  88.             //文档总量  
  89.             System.out.println("maxDocs: "+reader.maxDoc());  
  90.             //删除的文档  
  91.             System.out.println("deleteDocs: "+reader.numDeletedDocs());;  
  92.         } catch (CorruptIndexException e) {  
  93.             e.printStackTrace();  
  94.         } catch (IOException e) {  
  95.             e.printStackTrace();  
  96.         }  
  97.     }  
  98.   
  99.     public void delete() {  
  100.         IndexWriter writer = null;  
  101.           
  102.         try {  
  103.             writer = new IndexWriter(directory,new IndexWriterConfig(Version.LUCENE_35,new StandardAnalyzer(Version.LUCENE_35)));  
  104.               
  105.             //参数是一个选项,可以是一个Query,也可以是一个term,term是一个精确查找的值  
  106.             //这里删除id=1的文档,还会留在”回收站“。xxx.del  
  107.             writer.deleteDocuments(new Term("id","1"));  
  108.         } catch (CorruptIndexException e) {  
  109.             e.printStackTrace();  
  110.         } catch (LockObtainFailedException e) {  
  111.             e.printStackTrace();  
  112.         } catch (IOException e) {  
  113.             e.printStackTrace();  
  114.         } finally {  
  115.             if(writer!=null) {  
  116.                 try {  
  117.                     writer.close();  
  118.                 } catch (CorruptIndexException e) {  
  119.                     e.printStackTrace();  
  120.                 } catch (IOException e) {  
  121.                     e.printStackTrace();  
  122.                 }  
  123.             }  
  124.         }  
  125.           
  126.     }  
  127.   
  128.     public void undelete() {  
  129.         //使用IndexReader进行恢复  
  130.         IndexReader reader = null;  
  131.         try {  
  132.             //set readOnly=false  
  133.             reader = IndexReader.open(directory,false);  
  134.             reader.undeleteAll();  
  135.         } catch (CorruptIndexException e) {  
  136.             e.printStackTrace();  
  137.         } catch (IOException e) {  
  138.             e.printStackTrace();  
  139.         } finally {  
  140.              if(reader!=null) {  
  141.                  try {  
  142.                     reader.close();  
  143.                 } catch (IOException e) {  
  144.                     e.printStackTrace();  
  145.                 }  
  146.              }  
  147.         }  
  148.           
  149.     }  
  150.   
  151.     public void forceDelete() {  
  152.         IndexWriter writer = null;  
  153.           
  154.         try {  
  155.             writer = new IndexWriter(directory,new IndexWriterConfig(Version.LUCENE_35,new StandardAnalyzer(Version.LUCENE_35)));  
  156.               
  157.             //强制优化,del文件就没了,回收站清空  
  158.             writer.forceMergeDeletes();  
  159.         } catch (CorruptIndexException e) {  
  160.             e.printStackTrace();  
  161.         } catch (LockObtainFailedException e) {  
  162.             e.printStackTrace();  
  163.         } catch (IOException e) {  
  164.             e.printStackTrace();  
  165.         } finally {  
  166.             if(writer!=null) {  
  167.                 try {  
  168.                     writer.close();  
  169.                 } catch (CorruptIndexException e) {  
  170.                     e.printStackTrace();  
  171.                 } catch (IOException e) {  
  172.                     e.printStackTrace();  
  173.                 }  
  174.             }  
  175.         }  
  176.     }  
  177.   
  178.     /* 
  179.      * 自己手动merge 
  180.      * 多次创建索引,文件会增多, 
  181.      * 比如 5次的话,5个id=1的 
  182.      *  
  183.      * merge后合并为n段 
  184.      */  
  185.       
  186.     public void merge() {  
  187.         IndexWriter writer = null;  
  188.         try {  
  189.             writer = new IndexWriter(directory,new IndexWriterConfig(Version.LUCENE_35,new StandardAnalyzer(Version.LUCENE_35)));  
  190.               
  191.             //将索引合并为2段,这两段中的del文件会被清空  
  192.             //3.5后不建议使用,开销大,lucene会根据情况自动处理  
  193.             writer.forceMerge(2);  
  194.         } catch (CorruptIndexException e) {  
  195.             e.printStackTrace();  
  196.         } catch (LockObtainFailedException e) {  
  197.             e.printStackTrace();  
  198.         } catch (IOException e) {  
  199.             e.printStackTrace();  
  200.         } finally {  
  201.             if(writer!=null) {  
  202.                 try {  
  203.                     writer.close();  
  204.                 } catch (CorruptIndexException e) {  
  205.                     e.printStackTrace();  
  206.                 } catch (IOException e) {  
  207.                     e.printStackTrace();  
  208.                 }  
  209.             }  
  210.         }  
  211.     }  
  212.   
  213.     /* 
  214.      * 更新操作 
  215.      */  
  216.     public void update() {  
  217.         IndexWriter writer = null;  
  218.         try {  
  219.             writer = new IndexWriter(directory,new IndexWriterConfig(Version.LUCENE_35,new StandardAnalyzer(Version.LUCENE_35)));  
  220.               
  221.             //lucene没有提供更新方法,这里操作分为两步  
  222.             //匹配后删除 和 添加新的  
  223.               
  224.             Document doc = new Document();  
  225.             doc.add(new Field("id","11",Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));  
  226.             doc.add(new Field("email",emails[0],Field.Store.YES,Field.Index.NOT_ANALYZED));  
  227.           
  228.             doc.add(new Field("content",contents[0],Field.Store.NO,Field.Index.ANALYZED));  
  229.             doc.add(new Field("name",names[0],Field.Store.YES,Field.Index.ANALYZED_NO_NORMS));  
  230.               
  231.             writer.updateDocument(new Term("id","1"),doc);  
  232.         } catch (CorruptIndexException e) {  
  233.             e.printStackTrace();  
  234.         } catch (LockObtainFailedException e) {  
  235.             e.printStackTrace();  
  236.         } catch (IOException e) {  
  237.             e.printStackTrace();  
  238.         } finally {  
  239.             if(writer!=null) {  
  240.                 try {  
  241.                     writer.close();  
  242.                 } catch (CorruptIndexException e) {  
  243.                     e.printStackTrace();  
  244.                 } catch (IOException e) {  
  245.                     e.printStackTrace();  
  246.                 }  
  247.             }  
  248.         }  
  249.     }  
  250. }  
测试类(JUnit):
  1. package com.ethan.test;  
  2.   
  3. import org.junit.Test;  
  4.   
  5. import com.ethan.index.IndexUtil;  
  6.   
  7. public class IndexTest {  
  8.       
  9.     @Test  
  10.     public void testIndex() {  
  11.         IndexUtil iu = new IndexUtil();  
  12.         iu.index();  
  13.     }  
  14.     /* 
  15.      * numDocs: 24 
  16.         maxDocs: 24 
  17.         deleteDocs: 0 
  18.      */  
  19.     @Test  
  20.     public void testQuery() {  
  21.         IndexUtil iu = new IndexUtil();  
  22.         iu.query();  
  23.     }  
  24.     /* 
  25.      * numDocs: 20 
  26.         maxDocs: 24 
  27.         deleteDocs: 4 (id=1 4条) 
  28.      */  
  29.     @Test  
  30.     public void testDelete() {  
  31.         IndexUtil iu = new IndexUtil();  
  32.         iu.delete();  
  33.     }  
  34.       
  35.     /* 
  36.      * numDocs: 7 
  37.         maxDocs: 7 
  38.         deleteDocs: 0 
  39.      */  
  40.     @Test  
  41.     public void testUnDelete() {  
  42.         IndexUtil iu = new IndexUtil();  
  43.         iu.undelete();  
  44.     }  
  45.       
  46.     /* 
  47.      * numDocs: 6 
  48.         maxDocs: 6(7) 
  49.         deleteDocs: 0(1) 
  50.      */  
  51.     @Test  
  52.     public void testForceDelete() {  
  53.         IndexUtil iu = new IndexUtil();  
  54.         iu.forceDelete();  
  55.     }  
  56.       
  57.     /* 
  58.      * merge后: 
  59.      * numDocs: 20 
  60.         maxDocs: 21 
  61.         deleteDocs: 1(因为强制合并为2段,所以_0_1.del没删) 
  62.         _0为第一段,不动,把后边的合并为一段 
  63.      */  
  64.     @Test  
  65.     public void testMerge() {  
  66.         IndexUtil iu = new IndexUtil();  
  67.         iu.merge();  
  68.     }  
  69.       
  70.     /* 
  71.      * numDocs: 6 
  72.         maxDocs: 7 
  73.         deleteDocs: 1 
  74.      
  75.         删除后 add 
  76.      */  
  77.     @Test  
  78.     public void testUpdate() {  
  79.         IndexUtil iu = new IndexUtil();  
  80.         iu.update();  
  81.     }  
  82. }  

索引目录中各文件的含义:


0.fnm: 保存的field的信息,有哪几个字段


0.fdt,0.fdx:  Store.YES的对应字段的值


0.frq:单词出现的频率


0.nrm: 存储评分信息,权重


0.prx: 单词在文档中出现的位置信息(偏移量)


0.tii,0.tis: 存储词项(term)词典及其索引信息


文档和域的概念:


文档相当于表中的一条记录,域相当于表中每一个字段

optimize() 已被弃用(3.5 起由 forceMerge() 取代),开销比较大
forceMergeDeletes() 强制把回收站的内容给删掉


当segment比较多时,lucene会自动优化处理

你可能感兴趣的:(Java技术)