看Lucene in Action的时候,练习的一个入门例子。
在使用Lucene进行文本内容搜索前,需要先对指定的目录下的文件进行建立索引,代码如下:
import java.io.File; import java.io.FileFilter; import java.io.FileReader; import java.io.IOException; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; public class Indexer { public static void main(String[] args) { if(args.length != 2) { throw new IllegalArgumentException("Usage : java " + Indexer.class.getName() + " <index dir><data dir>"); } String indexDir = args[0]; String dataDir = args[1]; long start = System.currentTimeMillis(); Indexer indexer = null; try { indexer = new Indexer(indexDir); } catch (IOException e) { e.printStackTrace(); } int numIndexed = 0; try { numIndexed = indexer.index(dataDir, new TextFilesFilter()); } catch (Exception e) { e.printStackTrace(); }finally { try { indexer.close(); } catch (IOException e) { e.printStackTrace(); } } long end = System.currentTimeMillis(); System.out.println("Indexing " + numIndexed + " files took " + (end - start) + "mi"); } private IndexWriter writer; public Indexer(String indexDir) throws IOException { Directory dir = FSDirectory.open(new File(indexDir)); writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_30, new StandardAnalyzer(Version.LUCENE_30))); } public void close() throws IOException { writer.close(); } public int index(String dataDir, FileFilter filter) throws Exception{ File[] files = new File(dataDir).listFiles(); for(File file : files) { if(!file.isDirectory() && !file.isHidden() && file.exists() && file.canRead() && (filter == null || filter.accept(file))) { indexFile(file); } } return writer.numDocs(); } private static class TextFilesFilter implements FileFilter { @Override public boolean accept(File path) { return path.getName().toLowerCase().endsWith(".txt"); } } /** * 声明三个索引的查询域,一个contents,一个filename,一个fullpath * @param f * @return * @throws Exception */ protected Document getDocument(File f) throws Exception { Document doc = new Document(); doc.add(new Field("contents", new FileReader(f))); doc.add(new Field("filename", f.getName(), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("fullpath", f.getCanonicalPath(), Field.Store.YES, Field.Index.NOT_ANALYZED)); return doc; } private void indexFile(File f) throws Exception { System.out.println("Indexing " + f.getCanonicalPath()); Document doc = getDocument(f); writer.addDocument(doc); } }
在使用命令方式执行后在传入的目录下会生成如图的文件:
成功生成索引后,便可以进行基本的搜索了,检索代码如下:
import java.io.File; import java.io.IOException; import java.util.Arrays; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.Term; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; public class Searcher { public static void main(String[] args) throws IOException, ParseException { if(args.length != 2) { throw new IllegalArgumentException("Usage: java " + Searcher.class.getName() + " <index dir><query>"); } String indexDir = args[0]; String q = args[1]; search(indexDir, q); } private static void search(String indexDir, String q) throws IOException, ParseException { Directory dir = FSDirectory.open(new File(indexDir)); DirectoryReader reader = DirectoryReader.open(dir); IndexSearcher is = new IndexSearcher(reader); // 每个Term都对应一个Field域 Query query = new TermQuery(new Term("contents", q)); long start = System.currentTimeMillis(); TopDocs hits = is.search(query, 10); long end = System.currentTimeMillis(); System.err.println("Found " + hits.totalHits + " document(s) (in " + (end - start) + " mi) that matched query '" + q + "' :"); for(ScoreDoc scoreDoc : hits.scoreDocs) { Document doc = is.doc(scoreDoc.doc); System.out.println(doc.get("fullpath")); System.out.println(Arrays.toString(doc.getValues("filename"))); System.out.println(doc.get("contents")); } Query qu = new TermQuery(new Term("filename", "1.txt")); TopDocs hits1 = is.search(qu, 10); for(ScoreDoc score : hits1.scoreDocs) { System.out.println(is.doc(score.doc).get("fullpath")); } reader.close(); dir.close(); } }
命令中执行如搜索目录下是否有“Hi”的单词,返回结果为: