1. Creating the index:
package com.prl.utils.lucene;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.htmlparser.Parser;
import org.htmlparser.beans.StringBean;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class IndexUtils {

    private final static String charset = "utf-8";

    public void createIndex(String docPath, String indexPath) throws Exception {
        // Create the Directory object that points at the index location on disk
        Directory dir = new SimpleFSDirectory(new File(indexPath));
        Analyzer analyzer = new IKAnalyzer();
        // Create the IndexWriter. The first argument is the Directory, the second the analyzer,
        // the third indicates whether to create a new index (false means modify the existing one),
        // and the fourth limits how many terms are indexed per field; UNLIMITED is used here.
        IndexWriter indexWriter = new IndexWriter(dir, analyzer, true,
                IndexWriter.MaxFieldLength.UNLIMITED);

        File[] files = new File(docPath).listFiles();
        for (int i = 0; i < files.length; i++) {
            if (files[i].isFile()) {
                String fileContent = readTextFile(files[i].getAbsolutePath(), charset);
                Map<String, String> parserResult = getPlainText(fileContent);
                fileContent = parserResult.get("plainText");
                String title = parserResult.get("title");

                // Create the Field objects and add them to the Document
                Document doc = new Document();
                doc.add(new Field("contents", fileContent, Field.Store.YES,
                        Field.Index.ANALYZED));
                doc.add(new Field("filename", files[i].getName(), Field.Store.YES,
                        Field.Index.NOT_ANALYZED));
                doc.add(new Field("pagetitle", title, Field.Store.YES,
                        Field.Index.NOT_ANALYZED));
                doc.add(new Field("indexDate",
                        DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
                        Field.Store.YES, Field.Index.NOT_ANALYZED));

                // Write the document into the IndexWriter
                indexWriter.addDocument(doc);
            }
        }
        System.out.println("indexWriter.numDocs():" + indexWriter.numDocs());
        indexWriter.optimize();
        indexWriter.close();
    }

    public void showAnalyzerResult(Analyzer analyzer, String s) throws Exception {
        System.out.println("Tokenization result:");
        StringReader reader = new StringReader(s);
        // The first argument is a field name, not the text itself
        TokenStream ts = analyzer.tokenStream("contents", reader);
        TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
            System.out.print(termAtt.term());
            System.out.print(' ');
        }
        System.out.println("\nAnalysis finished.......................");
    }

    public static String readTextFile(String sFileName, String sEncode) {
        StringBuffer sbStr = new StringBuffer();
        try {
            File ff = new File(sFileName);
            InputStreamReader read = new InputStreamReader(new FileInputStream(ff), sEncode);
            BufferedReader ins = new BufferedReader(read);
            String dataLine;
            while (null != (dataLine = ins.readLine())) {
                sbStr.append(dataLine);
            }
            ins.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return sbStr.toString();
    }

    /**
     * Extract the plain text and title from an HTML string.
     *
     * @param str the raw HTML
     * @return a map with keys "plainText" and "title"
     */
    public static Map<String, String> getPlainText(String str) {
        Map<String, String> result = new HashMap<String, String>();
        try {
            Parser parser = new Parser();
            parser.setInputHTML(str);
            StringBean sb = new StringBean();
            // Do not collect the links contained in the page
            sb.setLinks(false);
            // Replace non-breaking spaces with normal spaces
            sb.setReplaceNonBreakingSpaces(true);
            // Collapse runs of whitespace into a single space
            sb.setCollapse(true);
            parser.visitAllNodesWith(sb);
            String plainText = sb.getStrings();

            Parser myParser = Parser.createParser(str, charset);
            HtmlPage visitor = new HtmlPage(myParser);
            myParser.visitAllNodesWith(visitor);
            String title = visitor.getTitle();

            result.put("title", title);
            result.put("plainText", plainText);
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return result;
    }
}
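A minimal sketch of how createIndex might be called, assuming a directory of UTF-8 HTML files; both paths below are placeholders, not values from the original code:

public class IndexMain {
    public static void main(String[] args) throws Exception {
        // Placeholder paths: the directory of HTML files to index and the index output directory
        String docPath = "/data/html-docs";
        String indexPath = "/data/lucene-index";
        new IndexUtils().createIndex(docPath, indexPath);
    }
}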
2. Searching, with hit highlighting
package com.prl.utils.lucene;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class SerachUtils {

    public SearchResult search(String indexPath, String keyWords, Integer maxSerach)
            throws IOException, ParseException, InvalidTokenOffsetsException {
        SearchResult searchResult = new SearchResult();
        Directory dir = new SimpleFSDirectory(new File(indexPath));
        // Create the IndexSearcher; unlike IndexWriter, it only needs the index directory
        IndexSearcher indexSearch = new IndexSearcher(dir);
        // Create the QueryParser: the first argument is the Lucene version, the second
        // the field to search, and the third the analyzer used to parse the query
        Analyzer analyzer = new IKAnalyzer();
        QueryParser queryParser = new QueryParser(Version.LUCENE_30, "contents", analyzer);
        // Build the Query object from the keywords
        Query query = queryParser.parse(keyWords);
        // Run the search; TopDocs holds a scoreDocs[] array with the matching document ids
        TopDocs hits = indexSearch.search(query, maxSerach);
        // hits.totalHits is the total number of matches
        searchResult.setMatchCount(hits.totalHits);
        searchResult.setKeyWords(keyWords);
        // Loop over hits.scoreDocs, rebuild each Document with indexSearch.doc,
        // then read the stored field values
        for (int i = 0; i < hits.scoreDocs.length; i++) {
            ScoreDoc sdoc = hits.scoreDocs[i];
            Document doc = indexSearch.doc(sdoc.doc);
            String fileName = doc.get("filename");
            String contents = doc.get("contents");
            String filetitle = doc.get("pagetitle");

            SimpleHTMLFormatter sHtmlF = new SimpleHTMLFormatter("<b><font color='red'>", "</font></b>");
            Highlighter highlighter = new Highlighter(sHtmlF, new QueryScorer(query));
            highlighter.setTextFragmenter(new SimpleFragmenter(50));
            if (contents != null) {
                TokenStream tokenStream = analyzer.tokenStream("contents", new StringReader(contents));
                String matchText = highlighter.getBestFragment(tokenStream, contents);
                searchResult.addMatchItem(filetitle, fileName, matchText);
            }
        }
        indexSearch.close();
        return searchResult;
    }
}
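The SearchResult class used above is not included in the post. A minimal sketch of what it might look like, inferred only from the setMatchCount, setKeyWords and addMatchItem calls, could be:

package com.prl.utils.lucene;

import java.util.ArrayList;
import java.util.List;

// Hypothetical result holder; only the members implied by SerachUtils are sketched here
public class SearchResult {
    private int matchCount;
    private String keyWords;
    private final List<String[]> matchItems = new ArrayList<String[]>();

    public void setMatchCount(int matchCount) { this.matchCount = matchCount; }
    public void setKeyWords(String keyWords) { this.keyWords = keyWords; }

    // Each item stores the page title, the file name and the highlighted fragment
    public void addMatchItem(String title, String fileName, String matchText) {
        matchItems.add(new String[] { title, fileName, matchText });
    }

    public int getMatchCount() { return matchCount; }
    public String getKeyWords() { return keyWords; }
    public List<String[]> getMatchItems() { return matchItems; }
}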
The code above is commented throughout, so it should be largely self-explanatory; a short usage sketch follows below.
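A minimal sketch of driving the searcher, again with placeholder values for the index path, the keywords and the result accessors (which follow the hypothetical SearchResult above):

public class SearchMain {
    public static void main(String[] args) throws Exception {
        // Placeholder values: the index directory created earlier and a sample query
        SearchResult result = new SerachUtils().search("/data/lucene-index", "lucene", 10);
        System.out.println("total matches: " + result.getMatchCount());
        for (String[] item : result.getMatchItems()) {
            // item[0] = page title, item[1] = file name, item[2] = highlighted fragment
            System.out.println(item[0] + " | " + item[1] + " | " + item[2]);
        }
    }
}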