Indexer:
import org.apache.lucene.index.IndexWriter; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.Directory; import org.apache.lucene.util.Version; import java.io.File; import java.io.FileFilter; import java.io.IOException; import java.io.FileReader; // From chapter 1 /** * This code was originally written for * Erik's Lucene intro java.net article */ public class Indexer { public static void main(String[] args) throws Exception { if (args.length != 2) { throw new IllegalArgumentException("Usage: java " + Indexer.class.getName() + " <index dir> <data dir>"); } String indexDir = args[0]; //1 String dataDir = args[1]; //2 long start = System.currentTimeMillis(); Indexer indexer = new Indexer(indexDir); int numIndexed; try { numIndexed = indexer.index(dataDir, new TextFilesFilter()); } finally { indexer.close(); } long end = System.currentTimeMillis(); System.out.println("Indexing " + numIndexed + " files took " + (end - start) + " milliseconds"); } private IndexWriter writer; public Indexer(String indexDir) throws IOException { Directory dir = FSDirectory.open(new File(indexDir)); writer = new IndexWriter(dir, //3 new StandardAnalyzer( //3 Version.LUCENE_30),//3 true, //3 IndexWriter.MaxFieldLength.UNLIMITED); //3 } public void close() throws IOException { writer.close(); //4 } public int index(String dataDir, FileFilter filter) throws Exception { File[] files = new File(dataDir).listFiles(); for (File f: files) { if (!f.isDirectory() && !f.isHidden() && f.exists() && f.canRead() && (filter == null || filter.accept(f))) { indexFile(f); } } return writer.numDocs(); //5 } private static class TextFilesFilter implements FileFilter { public boolean accept(File path) { return path.getName().toLowerCase() //6 .endsWith(".txt"); //6 } } protected Document getDocument(File f) throws Exception { Document doc = 
new Document(); doc.add(new Field("contents", new FileReader(f))); //7 doc.add(new Field("filename", f.getName(), //8 Field.Store.YES, Field.Index.NOT_ANALYZED));//8 doc.add(new Field("fullpath", f.getCanonicalPath(), //9 Field.Store.YES, Field.Index.NOT_ANALYZED));//9 return doc; } private void indexFile(File f) throws Exception { System.out.println("Indexing " + f.getCanonicalPath()); Document doc = getDocument(f); writer.addDocument(doc); //10 } }
索引过程核心类:
IndexWriter
负责新建或打开已有索引,以及向索引中添加、删除或更新被索引文档信息,一般要通过构造器传入Directory和Analyzer
Directory
抽象类,描述了索引的存放位置
Analyzer
负责从被索引文本中提取语汇单元,只能处理纯文本文件,如果不是纯文本,需要先转换(如使用Tika)
Document
Document对象代表一些Field的集合
Field
Lucene只处理从二进制文档中提取的、以Field形式出现的文本;文档的元数据作为文档的不同域单独存储并索引
题外话:Lucene内核本身只处理java.lang.String、java.io.Reader和本地数字类型(int、float等)
Searcher:
import org.apache.lucene.document.Document; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.Directory; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.util.Version; import java.io.File; import java.io.IOException; // From chapter 1 /** * This code was originally written for * Erik's Lucene intro java.net article */ public class Searcher { public static void main(String[] args) throws IllegalArgumentException, IOException, ParseException { if (args.length != 2) { throw new IllegalArgumentException("Usage: java " + Searcher.class.getName() + " <index dir> <query>"); } String indexDir = args[0]; //1 String q = args[1]; //2 search(indexDir, q); } public static void search(String indexDir, String q) throws IOException, ParseException { Directory dir = FSDirectory.open(new File(indexDir)); //3 IndexSearcher is = new IndexSearcher(dir); //3 QueryParser parser = new QueryParser(Version.LUCENE_30, // 4 "contents", //4 new StandardAnalyzer( //4 Version.LUCENE_30)); //4 Query query = parser.parse(q); //4 long start = System.currentTimeMillis(); TopDocs hits = is.search(query, 10); //5 long end = System.currentTimeMillis(); System.err.println("Found " + hits.totalHits + //6 " document(s) (in " + (end - start) + // 6 " milliseconds) that matched query '" + // 6 q + "':"); // 6 for(ScoreDoc scoreDoc : hits.scoreDocs) { Document doc = is.doc(scoreDoc.doc); //7 System.out.println(doc.get("fullpath")); //8 } is.close(); //9 } }
搜索过程核心类:
IndexSearcher
用于搜索由IndexWriter创建的索引。构造器需要传入Directory以打开已创建的索引,之后即可调用它提供的search方法进行搜索
Term
Term对象是搜索的基本单元(与Field类似)
// Build a query for the term "lucene" in the "contents" field,
// then ask the searcher for at most 10 hits.
Term term = new Term("contents", "lucene");
Query q = new TermQuery(term);
TopDocs hits = searcher.search(q, 10);
Query
Query是所有查询类的基类,如TermQuery、BooleanQuery
TermQuery
TermQuery是最基本最简单的查询类型之一,用于匹配指定域中包含指定项的文档
TopDocs
是一个简单的指针容器,容纳查询结果
汤能养身整理,转载注明