《Lucene In Action》 02 Hello Lucene World

Indexer:

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.store.Directory;

import org.apache.lucene.util.Version;



import java.io.File;

import java.io.FileFilter;

import java.io.IOException;

import java.io.FileReader;



// From chapter 1



/**
 * Command-line tool that indexes the {@code .txt} files located directly
 * inside a data directory into a Lucene index (sub-directories are skipped).
 *
 * <p>This code was originally written for
 * Erik's Lucene intro java.net article.
 */
public class Indexer {

  /**
   * Entry point: {@code java Indexer <index dir> <data dir>}.
   *
   * @param args exactly two elements: the index directory, then the data directory
   * @throws IllegalArgumentException if {@code args} does not hold exactly two elements
   * @throws Exception if creating the index or indexing a file fails
   */
  public static void main(String[] args) throws Exception {
    if (args.length != 2) {
      throw new IllegalArgumentException("Usage: java " + Indexer.class.getName()
        + " <index dir> <data dir>");
    }
    String indexDir = args[0];         // where the index is created
    String dataDir = args[1];          // directory whose *.txt files get indexed

    long start = System.currentTimeMillis();
    Indexer indexer = new Indexer(indexDir);
    int numIndexed;
    try {
      numIndexed = indexer.index(dataDir, new TextFilesFilter());
    } finally {
      indexer.close();                 // always release the writer, even on failure
    }
    long end = System.currentTimeMillis();

    System.out.println("Indexing " + numIndexed + " files took "
      + (end - start) + " milliseconds");
  }

  /** Writer used for every document added by this indexer; assigned once in the constructor. */
  private final IndexWriter writer;

  /**
   * Opens an {@link IndexWriter} on the given directory.
   *
   * @param indexDir file-system path of the index directory
   * @throws IOException if the directory or the writer cannot be opened
   */
  public Indexer(String indexDir) throws IOException {
    Directory dir = FSDirectory.open(new File(indexDir));
    writer = new IndexWriter(dir,
                 new StandardAnalyzer(Version.LUCENE_30),
                 true,   // "create" flag of the Lucene 3.0 constructor: build a new index, replacing an existing one
                 IndexWriter.MaxFieldLength.UNLIMITED);
  }

  /**
   * Closes the underlying {@link IndexWriter}.
   *
   * @throws IOException if closing the writer fails
   */
  public void close() throws IOException {
    writer.close();
  }

  /**
   * Indexes every regular, visible, readable file directly inside
   * {@code dataDir} that the filter accepts.
   *
   * @param dataDir path of the directory to scan (not recursive)
   * @param filter  decides which files are indexed; {@code null} accepts every file
   * @return the document count reported by the writer after indexing
   * @throws IOException if {@code dataDir} cannot be listed (for example it does
   *         not exist or is not a directory)
   * @throws Exception if building or adding a document fails
   */
  public int index(String dataDir, FileFilter filter)
    throws Exception {

    File[] files = new File(dataDir).listFiles();
    if (files == null) {
      // File.listFiles() returns null instead of throwing when the path is not
      // a directory or an I/O error occurs; fail with a message that names the
      // directory rather than with a NullPointerException in the loop below.
      throw new IOException("Cannot list files in data dir: " + dataDir);
    }

    for (File f: files) {
      if (!f.isDirectory() &&
          !f.isHidden() &&
          f.exists() &&
          f.canRead() &&
          (filter == null || filter.accept(f))) {
        indexFile(f);
      }
    }

    return writer.numDocs();
  }

  /** Accepts files whose name ends with {@code .txt}, ignoring case. */
  private static class TextFilesFilter implements FileFilter {
    public boolean accept(File path) {
      return path.getName().toLowerCase()
             .endsWith(".txt");
    }
  }

  /**
   * Builds the Lucene document for one file. The document has three fields:
   * {@code contents} (fed from a reader over the file), {@code filename} and
   * {@code fullpath} (both stored and not analyzed).
   *
   * @param f the file to describe
   * @return a new document for {@code f}
   * @throws Exception if the file cannot be opened or its canonical path resolved
   */
  protected Document getDocument(File f) throws Exception {
    Document doc = new Document();
    // FileReader decodes the file with the default charset.
    doc.add(new Field("contents", new FileReader(f)));
    doc.add(new Field("filename", f.getName(),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(new Field("fullpath", f.getCanonicalPath(),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
    return doc;
  }

  /**
   * Prints the file's canonical path, then adds its document to the index.
   *
   * @param f the file to index
   * @throws Exception if the document cannot be built or added
   */
  private void indexFile(File f) throws Exception {
    System.out.println("Indexing " + f.getCanonicalPath());
    Document doc = getDocument(f);
    writer.addDocument(doc);
  }
}

索引过程核心类:

IndexWriter

  负责新建或打开已有索引,以及向索引中添加、删除或更新被索引文档信息,一般要通过构造器传入Directory和Analyzer

Directory

  抽象类,描述了索引的存放位置

Analyzer

  负责从被索引文本中提取语汇单元,只能处理纯文本文件,如果不是纯文本,需要先转换(如使用Tika)

Document

  Document对象代表一些Field的集合

Field

  Lucene只处理从二进制文档中提取的、以Field形式出现的文本,文档的元数据作为文档的不同域单独存储并索引

题外话:Lucene内核本身只处理java.lang.String、java.io.Reader和本地数字类型(int、float等)

Searcher:

import org.apache.lucene.document.Document;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.ScoreDoc;

import org.apache.lucene.search.TopDocs;

import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.store.Directory;

import org.apache.lucene.queryParser.QueryParser;

import org.apache.lucene.queryParser.ParseException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.util.Version;



import java.io.File;

import java.io.IOException;



// From chapter 1



/**
 * Command-line tool that runs a query against the {@code contents} field of
 * an existing Lucene index and prints the {@code fullpath} of each hit.
 *
 * <p>This code was originally written for
 * Erik's Lucene intro java.net article.
 */
public class Searcher {

  /**
   * Entry point: {@code java Searcher <index dir> <query>}.
   *
   * @param args exactly two elements: the index directory, then the query string
   * @throws IllegalArgumentException if {@code args} does not hold exactly two elements
   * @throws IOException if the index cannot be opened or read
   * @throws ParseException if the query string cannot be parsed
   */
  public static void main(String[] args) throws IllegalArgumentException,
        IOException, ParseException {
    if (args.length != 2) {
      throw new IllegalArgumentException("Usage: java " + Searcher.class.getName()
        + " <index dir> <query>");
    }

    String indexDir = args[0];               // directory holding the index
    String q = args[1];                      // query string to parse

    search(indexDir, q);
  }

  /**
   * Parses {@code q} against the {@code contents} field, retrieves at most the
   * top 10 hits, writes a summary line (hit count and elapsed time) to
   * {@code System.err}, and writes the {@code fullpath} value of each
   * retrieved hit to {@code System.out}.
   *
   * @param indexDir file-system path of the index directory
   * @param q        query string in QueryParser syntax
   * @throws IOException if the index cannot be opened or read
   * @throws ParseException if {@code q} cannot be parsed
   */
  public static void search(String indexDir, String q)
    throws IOException, ParseException {

    Directory dir = FSDirectory.open(new File(indexDir));
    IndexSearcher is = new IndexSearcher(dir);
    try {
      QueryParser parser = new QueryParser(Version.LUCENE_30,
                                           "contents",
                                           new StandardAnalyzer(Version.LUCENE_30));
      Query query = parser.parse(q);

      // Only the search call itself is timed, not index opening or parsing.
      long start = System.currentTimeMillis();
      TopDocs hits = is.search(query, 10);
      long end = System.currentTimeMillis();

      System.err.println("Found " + hits.totalHits +
        " document(s) (in " + (end - start) +
        " milliseconds) that matched query '" +
        q + "':");

      for(ScoreDoc scoreDoc : hits.scoreDocs) {
        Document doc = is.doc(scoreDoc.doc);
        System.out.println(doc.get("fullpath"));
      }
    } finally {
      // Close the searcher even when parsing or searching throws, so a bad
      // query does not leave it open.
      is.close();
    }
  }
}

搜索过程核心类:

IndexSearcher

  用于搜索由IndexWriter创建的索引。构造器需要传入Directory以打开已创建的索引,之后即可调用其提供的搜索方法

Term

  Term对象是搜索的基本单元(与Field类似,由域名和对应的文本值组成,见下例)

Query q = new TermQuery(new Term("contents","lucene"));

TopDocs hits = searcher.search(q,10);

Query

  Query是所有查询类的基类,如TermQuery、BooleanQuery

TermQuery

  TermQuery是最基本最简单的查询类型之一,用于匹配指定域中包含指定项的文档

TopDocs

  TopDocs是一个简单的指针容器,其中的指针指向排名靠前的查询结果(如上例中的前10条)

 

汤能养身整理,转载注明

 

你可能感兴趣的:(Lucene)