Lucene学习笔记

Lucene是一个全文检索工具包,它可以对关键词和部分数据建立索引,从而提高查询效率

这里需要引入 Lucene 3.5 的 jar 包和 JUnit 4.0 的 jar 包

学习Lucene最重要的是以下几个部分

  1. 索引部分

  2. 分词器

  3. 搜索部分

v1.下面先了解索引部分

 

v1.建立索引

首先建立一个IndexUtil类

 //创建索引
 public void index() {

  
  IndexWriter writer=null;
  try {
   // 1.创建Directory 将索引建立在什么地方(是内存中还是硬盘),这里保存到硬盘上
   //Directory directory=new RAMDirectory();//建立在内存中
   Directory directroy=FSDirectory.open(new File("F:/lunece"));
   // 2.创建IndexWriter 写入索引
   IndexWriterConfig iwc=new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35));
   writer=new IndexWriter(directroy,iwc);
   // 3.创建Document对象 相当于数据库中的表,或者硬盘的某个文件
   Document doc=null;
   File f=new File("F:/lunecetest1");
   for(File file:f.listFiles()){
    doc=new Document();
    // 4.为Document添加Filed 相当于表中的字段,为那些字段建立索引
    doc.add(new Field("content", new FileReader(file)));
    /**
     *  Field.Store.YES或者NO(存储域选项)
       设置为YES表示或把这个域中的内容完全存储到文件中,方便进行文本的还原
       设置为NO表示把这个域的内容不存储到文件中,但是可以被索引,此时内容无法完全还原(doc.get)

     */
    doc.add(new Field("filename", file.getName(),Field.Store.YES,Field.Index.NOT_ANALYZED));
    /***
     *  Field.Index(索引选项)
        Index.ANALYZED:进行分词和索引,适用于标题、内容等
        Index.NOT_ANALYZED:进行索引,但是不进行分词,如果身份证号,姓名,ID等,适用于精确搜索
        Index.ANALYZED_NOT_NORMS:进行分词但是不存储norms信息,这个norms中包括了创建索引的时间和权值等信息
        Index.NOT_ANALYZED_NOT_NORMS:即不进行分词也不存储norms信息

     */
    doc.add(new Field("path", file.getAbsolutePath(),Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
    // 5.通过IndexWriter添加文档到索引中
    writer.addDocument(doc);
   }
  } catch (CorruptIndexException e) {
   // TODO Auto-generated catch block
   e.printStackTrace();
  } catch (LockObtainFailedException e) {
   // TODO Auto-generated catch block
   e.printStackTrace();
  } catch (IOException e) {
   // TODO Auto-generated catch block
   e.printStackTrace();
  }finally {
   try {       
    if(writer!=null)writer.close();//6.这里要关闭writer,不然写入索引可能会有问题
   } catch (CorruptIndexException e) {
    e.printStackTrace();
   } catch (IOException e) {
    e.printStackTrace();
   }
  
  }
  

 }
 

v3.搜索索引

//搜索索引
 public void search(){
  try {
   //1.创建打开索引目录
   Directory directory = FSDirectory.open(new File("f:/lunece"));
   //2.读取索引
   IndexReader reader = IndexReader.open(directory);
   //3.根据IndexReader 创建IndexSearcher 对reader在进行解析搜索索引
   IndexSearcher searcher = new IndexSearcher(reader);
   //4.创建Query对象,对那个解析索引域进行搜索如内容  StandardAnalyzer 分词器 
   //创建parse来确定搜索的内容,第二个参数代表搜索的域
   QueryParser parse=new QueryParser(Version.LUCENE_35,"content",new StandardAnalyzer(Version.LUCENE_35));
   //创建Query 表示搜索的域中包含java的文档
   Query query=parse.parse("java");
   //5.根据searcher搜索并且返回TopDocs,执行搜索
   TopDocs tds=searcher.search(query, 10);
   //6.根据TopDocs获取ScoreDoc对象
   ScoreDoc[] sds=tds.scoreDocs;
   for(ScoreDoc sd:sds){
    
    //7.根据searcher和ScoreDoc对象获取具体Document对象
    Document d=searcher.doc(sd.doc);
    //8根据Document对象获取需要的值
    System.out.println(d.get("filename")+"["+d.get("path")+"]");
    
   }
   reader.close(); //关闭reader
  } catch (Exception e) {
   // TODO: handle exception
  }
 }

  //查询索引
 public void query() {
  try {
   IndexReader reader = IndexReader.open(directory);
   //通过reader可以有效的获取到文档的数量
   System.out.println("numDocs:"+reader.numDocs());
   System.out.println("maxDocs:"+reader.maxDoc());
   System.out.println("deleteDocs:"+reader.numDeletedDocs());
   reader.close();
  } catch (CorruptIndexException e) {
   e.printStackTrace();
  } catch (IOException e) {
   e.printStackTrace();
  }
 }

//删除索引

 

例如我有

private String[] ids = {"1","2","3","4","5","6"};

private String[] emails = {"[email protected]","[email protected]","[email protected]","[email protected]","[email protected]","[email protected]"};
 private String[] contents = {
   "welcome to visited the space,I like book",
   "hello boy, I like pingpeng ball",
   "my name is cc I like game",
   "I like football",
   "I like football and I like basketball too",
   "I like movie and swim"
 };
 private Date[] dates = null;
 private int[] attachs = {2,3,1,4,5,5};
 private String[] names = {"zhangsan","lisi","john","jetty","mike","jake"};

private Directory directory =directory = new RAMDirectory();


 public void index() {
  IndexWriter writer = null;
  try {
   writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
   writer.deleteAll();
   Document doc = null;
   for(int i=0;i<ids.length;i++) {
    doc = new Document();
    doc.add(new Field("id",ids[i],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
    doc.add(new Field("email",emails[i],Field.Store.YES,Field.Index.NOT_ANALYZED));
    doc.add(new Field("email","test"+i+"@test.com",Field.Store.YES,Field.Index.NOT_ANALYZED));
    doc.add(new Field("content",contents[i],Field.Store.NO,Field.Index.ANALYZED));
    doc.add(new Field("name",names[i],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
    //存储数字
    doc.add(new NumericField("attach",Field.Store.YES,true).setIntValue(attachs[i]));
    //存储日期
    doc.add(new NumericField("date",Field.Store.YES,true).setLongValue(dates[i].getTime()));
    String et = emails[i].substring(emails[i].lastIndexOf("@")+1);
    System.out.println(et);
    if(scores.containsKey(et)) {
     doc.setBoost(scores.get(et));
    } else {
     doc.setBoost(0.5f);
    }
    writer.addDocument(doc);
   }
  } catch (CorruptIndexException e) {
   e.printStackTrace();
  } catch (LockObtainFailedException e) {
   e.printStackTrace();
  } catch (IOException e) {
   e.printStackTrace();
  } finally {
   try {
    if(writer!=null)writer.close();
   } catch (CorruptIndexException e) {
    e.printStackTrace();
   } catch (IOException e) {
    e.printStackTrace();
   }
  }
 }

 

 public void delete() {
  IndexWriter writer = null;
  
  try {
   writer = new IndexWriter(directory,
     new IndexWriterConfig(Version.LUCENE_35,new StandardAnalyzer(Version.LUCENE_35)));
   //参数是一个选项,可以是一个Query,也可以是一个term,term是一个精确查找的值
   //此时删除的文档并不会被完全删除,而是存储在一个回收站中的,可以恢复
   writer.deleteDocuments(new Term("id","1"));
   writer.commit();
  } catch (CorruptIndexException e) {
   e.printStackTrace();
  } catch (LockObtainFailedException e) {
   e.printStackTrace();
  } catch (IOException e) {
   e.printStackTrace();
  } finally {
   try {
    if(writer!=null) writer.close();
   } catch (CorruptIndexException e) {
    e.printStackTrace();
   } catch (IOException e) {
    e.printStackTrace();
   }
  }
 }

然后建立一个TestLunece测试类

 @Test
 public void testIndex(){
  
  IndexUtil indexUtil=new IndexUtil();
  indexUtil.index();
 }

 

@Test
 public void testDelete() {
  IndexUtil iu = new IndexUtil();
  iu.delete();
 }

 

  //恢复索引
 public void undelete() {
  //使用IndexReader进行恢复
  try {
   IndexReader reader = IndexReader.open(directory,false);
   //恢复时,必须把IndexReader的只读(readOnly)设置为false
   reader.undeleteAll();
   reader.close();
  } catch (CorruptIndexException e) {
   e.printStackTrace();
  } catch (StaleReaderException e) {
   e.printStackTrace();
  } catch (LockObtainFailedException e) {
   e.printStackTrace();
  } catch (IOException e) {
   e.printStackTrace();
  }
 }
 

v4.搜索部分

建立SearcherUtil

 

你可能感兴趣的:(lunece)