建立索引:
package paoding; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStreamReader; import net.paoding.analysis.analyzer.PaodingAnalyzer; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; public class IndexFiles { public static void main(String[] args) { long start = System.currentTimeMillis(); try { // 获取Paoding中文分词器 Analyzer analyzer = new PaodingAnalyzer(); // Analyzer analyzer = new StandardAnalyzer(); // indexWriter建立索引 IndexWriter writer = new IndexWriter("f:\\indexpaoding", analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED); indexDocs(writer, new File("F:\\徐剛:28tel(繁firfox)")); writer.optimize(); writer.close(); System.out.println("用时:" + (System.currentTimeMillis() - start) + " 毫秒"); } catch (IOException e) { e.printStackTrace(); } } // 遍历文件夹文件,对需要的文件建立索引 static void indexDocs(IndexWriter writer, File file) throws IOException { if (file.canRead()) { if (file.isDirectory()) { String[] files = file.list(); if (files != null) { for (int i = 0; i < files.length; i++) { indexDocs(writer, new File(file, files[i])); } } } else { if (file.getName().endsWith(".htm") || file.getName().endsWith(".html") || file.getName().endsWith(".jsp") || file.getName().endsWith(".php") || file.getName().endsWith(".txt")) { System.out.println("添加 " + file); try { // 针对参数文件建立索引文档 ,一个Document就相当于一跳记录 Document doc = new Document(); // Field.Index.ANALYZED 文件名称 建立索引,分词 doc.add(new Field("filename", file.getCanonicalPath(), Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); doc.add(new Field("contents", ReadFile(file), Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); // new InputStreamReader(new // 
FileInputStream(file.getCanonicalPath()), "utf-8"))); writer.addDocument(doc); } catch (FileNotFoundException fnfe) { ; } } } } } // 用字符串形式,读取一个File的内容 public static String ReadFile(File f) { String line = null; StringBuffer temp = new StringBuffer(); try { BufferedReader br = new BufferedReader(new InputStreamReader( new FileInputStream(f), "utf-8")); while ((line = br.readLine()) != null) { temp.append(line); } } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } return temp.toString(); } }
用于搜索(带简单分页效果):
package paoding; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import net.paoding.analysis.analyzer.PaodingAnalyzer; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.TermPositionVector; import org.apache.lucene.queryParser.MultiFieldQueryParser; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.Searcher; import org.apache.lucene.search.TopDocCollector; import org.apache.lucene.search.highlight.Formatter; import org.apache.lucene.search.highlight.Highlighter; import org.apache.lucene.search.highlight.QueryScorer; import org.apache.lucene.search.highlight.TokenGroup; import org.apache.lucene.search.highlight.TokenSources; public class SearchFiles { /** * * @param key * 搜索的关键字 * @param perPage * 每页显示多少条记录 * @param begin * 从第几页开始显示 * @throws CorruptIndexException * @throws IOException * @throws ParseException */ int CACHE_PAGE = 3; // 缓存的页面数 public void search(String key, int perPage, int begin) throws CorruptIndexException, IOException, ParseException { String IDNEX_PATH = "f:\\indexpaoding"; //索引所在目录 int total_Page = 0; // 总页数 // 获取Paoding中文分词器 Analyzer analyzer = new PaodingAnalyzer(); // Analyzer analyzer = new StandardAnalyzer(); // 检索 IndexReader reader = IndexReader.open(IDNEX_PATH); Searcher searcher = new IndexSearcher(reader); /* 下面这个表示要同时搜索这两个域,而且只要一个域里面有满足我们搜索的内容就行 */ BooleanClause.Occur[] clauses = { BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD }; Query query = 
MultiFieldQueryParser.parse(key, new String[] { "filename", "contents" }, clauses, analyzer); // QueryParser parser = new QueryParser("contents", analyzer); // Query query = parser.parse(key); TopDocCollector collector = new TopDocCollector(perPage * CACHE_PAGE); // perPage searcher.search(query, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; int numTotalHits = collector.getTotalHits(); System.out.println("符合查询词的文件数:" + numTotalHits); // 获得总页数 if (numTotalHits % perPage != 0) { total_Page = numTotalHits / perPage + 1; } else { total_Page = numTotalHits / perPage; } if (begin > total_Page) { System.err.println("超出范围"); } else { // 如果起始页大于缓存页,这就代表我们需要重新搜索更多的资源 if (begin > CACHE_PAGE) { // 这时,我把搜索的资源都搜索出来,缓存页数=总页数 CACHE_PAGE = total_Page; // 返回调用 search(key, perPage, begin); // collector = new TopDocCollector( numTotalHits ); //缓存不够,重新搜索 // searcher.search(query, collector); // hits = collector.topDocs().scoreDocs; } else { int temp = (begin - 1) * perPage + perPage; if ((begin - 1) * perPage + perPage > numTotalHits) { temp = numTotalHits; } // 根据参数,从指定的位置开始获取数据(用于分页) for (int i = (begin - 1) * perPage; i < temp; i++) { System.out.println(i); int docId = hits[i].doc; Document doc3 = searcher.doc(docId); String filename = doc3.get("filename"); System.out.println("filename=" + filename); // 高亮处理 String text = doc3.get("contents"); TermPositionVector tpv = (TermPositionVector) reader .getTermFreqVector(hits[i].doc, "contents"); TokenStream ts = TokenSources.getTokenStream(tpv); Formatter formatter = new Formatter() { public String highlightTerm(String srcText, TokenGroup g) { if (g.getTotalScore() <= 0) { return srcText; } return "<b>" + srcText + "</b>"; } }; Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query)); String result = highlighter.getBestFragments(ts, text, 5, "…"); System.out.println("result:\n\t" + result); } System.out.println("循环结束"); } } reader.close(); System.out.println("关闭reader"); } public static void 
main(String[] args) throws Exception { SearchFiles sf = new SearchFiles(); sf.search("vvczvxcxz", 5, 1); } }