First, the class that builds the index:
package com.jereh.lucene;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Date;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 * Builds the index. Note: this uses the Lucene 2.x API
 * (FSDirectory.getDirectory and Hits were removed in 3.0).
 *
 * @author Administrator
 */
public class Indexer {

    /**
     * @param dateDir  directory containing the files to index
     * @param indexDir directory the index is written to
     * @throws IOException
     */
    public static void index(String dateDir, String indexDir) throws IOException {
        // Create the Directory object
        Directory dir = FSDirectory.getDirectory(new File(indexDir));
        // Create the IndexWriter. The two-argument constructor creates the index if it
        // does not already exist. The four-argument form additionally takes a boolean
        // "create" flag (false means append to an existing index) and a MaxFieldLength
        // that caps how many tokens per field get indexed; for example
        // new MaxFieldLength(2) indexes only the first two tokens of each field.
        // IndexWriter.MaxFieldLength.LIMITED is the usual choice.
        IndexWriter indexWriter = new IndexWriter(dir, new PaodingAnalyzer());
        File[] files = new File(dateDir).listFiles();
        for (int i = 0; i < files.length; i++) {
            Document doc = new Document();
            // Create the Field objects and add them to the document.
            // Note: "contents" is indexed UN_TOKENIZED here, so the whole text becomes
            // one term; switch to Field.Index.TOKENIZED if you want to search it.
            doc.add(new Field("contents", readContents(files[i], "UTF-8"),
                    Field.Store.YES, Field.Index.UN_TOKENIZED));
            doc.add(new Field("filename", files[i].getName(),
                    Field.Store.YES, Field.Index.TOKENIZED));
            doc.add(new Field("indexDate",
                    DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
                    Field.Store.YES, Field.Index.TOKENIZED));
            // Hand the document to the IndexWriter
            indexWriter.addDocument(doc);
        }
        // numRamDocs() reports the documents still buffered in RAM,
        // not the total number of documents in the index
        System.out.println("numDocs:" + indexWriter.numRamDocs());
        indexWriter.optimize();
        indexWriter.close();
    }

    public static String readContents(File file, String charset) throws IOException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(file), charset));
        StringBuilder temp = new StringBuilder();
        String line;
        while ((line = reader.readLine()) != null) {
            temp.append(line);
        }
        reader.close();
        return temp.toString();
    }
}
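For reference, the four-argument IndexWriter constructor described in the comment above looks like this. This is a minimal sketch against the Lucene 2.4+ API; the index path is a placeholder:

// Sketch of the four-argument IndexWriter constructor (Lucene 2.4+).
// "/path/to/index" is a placeholder.
Directory dir = FSDirectory.getDirectory(new File("/path/to/index"));
// create = true wipes any existing index at that location; false appends to it.
// MaxFieldLength.LIMITED caps indexing at 10,000 tokens per field;
// MaxFieldLength.UNLIMITED indexes every token.
IndexWriter writer = new IndexWriter(dir, new PaodingAnalyzer(),
        true, IndexWriter.MaxFieldLength.LIMITED);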
Next, the class that performs the search:
package com.jereh.lucene;

import java.io.File;
import java.io.IOException;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 * Searches the index. Note: this uses the Lucene 2.x Hits API,
 * which was removed in 3.0 in favor of TopDocs.
 *
 * @author Administrator
 */
public class Searcher {

    public static void search(String indexDir) throws IOException, ParseException {
        Directory dir = FSDirectory.getDirectory(new File(indexDir));
        // Create the IndexSearcher; unlike IndexWriter, it only needs the index directory
        IndexSearcher indexSearch = new IndexSearcher(dir);
        // Create the QueryParser: the first argument is the default field to search,
        // the second is the analyzer used to tokenize the query string
        QueryParser queryParser = new QueryParser("filename", new PaodingAnalyzer());
        // Build the Query object
        Query query = queryParser.parse("滑移装载机");
        // Run the search. In Lucene 3.0+ you would use TopDocs instead:
        // TopDocs hits = indexSearch.search(query, 10);
        Hits hits = indexSearch.search(query);
        // hits.length() is the total number of matches
        System.out.println("Found " + hits.length() + " result(s)");
        // Walk the hits and read the stored field values from each Document
        for (int i = 0; i < hits.length(); i++) {
            Document doc = hits.doc(i);
            System.out.print(doc.get("filename"));
        }
        indexSearch.close();
    }
}
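Since the class comments mention Lucene 3.0+ while the code above relies on the 2.x Hits API, here is a minimal sketch of the same search written against the 3.0 API. The class name Searcher30 is hypothetical, and whether the paoding build you have is compiled against Lucene 3.0 is a separate question:

import java.io.File;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

// Hypothetical Lucene 3.0 version of the Searcher class above
public class Searcher30 {

    public static void search(String indexDir) throws Exception {
        Directory dir = FSDirectory.open(new File(indexDir));
        IndexSearcher searcher = new IndexSearcher(dir, true); // read-only searcher
        // QueryParser now takes the Lucene version as its first argument
        QueryParser parser = new QueryParser(Version.LUCENE_30, "filename",
                new PaodingAnalyzer());
        Query query = parser.parse("滑移装载机");
        // search(Query, int) returns the top N hits as a TopDocs
        TopDocs topDocs = searcher.search(query, 10);
        System.out.println("Found " + topDocs.totalHits + " result(s)");
        for (ScoreDoc sd : topDocs.scoreDocs) {
            Document doc = searcher.doc(sd.doc);
            System.out.println(doc.get("filename"));
        }
        searcher.close();
    }
}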
Finally, the class that runs it:
package com.jereh.lucene;

import java.io.File;
import java.io.IOException;

import net.paoding.analysis.analyzer.PaodingAnalyzer;
import net.paoding.analysis.examples.gettingstarted.BoldFormatter;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher; // shadows com.jereh.lucene.Searcher in this file
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class Test {

    public static void main(String[] args) throws IOException, ParseException {
        // To index and search on disk instead, use the two classes above:
        // Indexer.index("E:/code/jrcms_liugong/website/products/",
        //         "F:/workspace/spring-mvc/WebRoot/WEB-INF/index/");
        // Searcher.search("F:/workspace/spring-mvc/WebRoot/WEB-INF/index/");

        // Paoding wrapped as a Lucene-conformant Analyzer
        String dateDir = "E:/code/jrcms_liugong/website/about/";
        Analyzer analyzer = new PaodingAnalyzer();
        File[] files = new File(dateDir).listFiles();
        for (File f : files) {
            // Read each file under dateDir
            String content = Indexer.readContents(f, "UTF-8");
            // Standard Lucene indexing: one in-memory index per file
            Directory ramDir = new RAMDirectory();
            IndexWriter writer = new IndexWriter(ramDir, analyzer);
            Document doc = new Document();
            Field fname = new Field("filename", f.getName(),
                    Field.Store.YES, Field.Index.UN_TOKENIZED);
            // Term vectors with positions and offsets are required for highlighting
            Field fd = new Field("contents", content, Field.Store.YES,
                    Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
            doc.add(fname);
            doc.add(fd);
            writer.addDocument(doc);
            writer.optimize();
            writer.close();

            // Standard Lucene search
            IndexReader reader = IndexReader.open(ramDir);
            String queryString = "国家级企业技术中心";
            QueryParser parser = new QueryParser("contents", analyzer);
            Query query = parser.parse(queryString);
            Searcher searcher = new IndexSearcher(ramDir);
            query = query.rewrite(reader);
            // System.out.println("Searching for: " + query.toString("contents"));
            Hits hits = searcher.search(query);

            // Highlight the matched terms in bold
            BoldFormatter formatter = new BoldFormatter();
            Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
            highlighter.setTextFragmenter(new SimpleFragmenter(50));
            for (int i = 0; i < hits.length(); i++) {
                // Highlight the "contents" field the query was run against
                String text = hits.doc(i).get("contents");
                int maxNumFragmentsRequired = 5;
                String fragmentSeparator = "...";
                TermPositionVector tpv = (TermPositionVector) reader
                        .getTermFreqVector(hits.id(i), "contents");
                TokenStream tokenStream = TokenSources.getTokenStream(tpv);
                String result = highlighter.getBestFragments(tokenStream, text,
                        maxNumFragmentsRequired, fragmentSeparator);
                System.out.println(hits.doc(i).get("filename"));
                System.out.println(result);
            }
            reader.close();
        }
    }
}
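One practical note before running any of these classes: paoding must be able to locate its dictionary files, or constructing PaodingAnalyzer will fail. A common setup, assuming the dictionary directory shipped with paoding-analysis (the path below is a placeholder), is a paoding-dic-home.properties on the classpath:

# paoding-dic-home.properties (on the classpath)
# Point paoding at its dictionary directory; setting the PAODING_DIC_HOME
# environment variable is an alternative way to configure this.
paoding.dic.home=/path/to/paoding/dic

Beyond that, the classpath needs paoding-analysis.jar, lucene-core, and lucene-highlighter (plus the paoding examples jar for BoldFormatter).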