Lucene notes: creating an index and highlighting search results

1. Creating the index (this uses the Lucene 3.0-era API, with IKAnalyzer for Chinese word segmentation and HTMLParser for extracting plain text from HTML):

package com.prl.utils.lucene;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.htmlparser.Parser;
import org.htmlparser.beans.StringBean;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class IndexUtils {

	private final static String charset = "utf-8";

	public void createIndex(String docPath, String indexPath) throws Exception {

		IndexWriter indexWriter = null;
		// Create the Directory object pointing at the index location on disk
		Directory dir = new SimpleFSDirectory(new File(indexPath));

		Analyzer analyzer = new IKAnalyzer();
		// Create the IndexWriter. The first argument is the Directory, the second the analyzer,
		// the third whether to create a new index (false means modify the existing one), and the
		// fourth caps how many tokens get indexed per field: new MaxFieldLength(2) would index
		// only the first two tokens per field, so IndexWriter.MaxFieldLength.UNLIMITED is used here.
		indexWriter = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
		File[] files = new File(docPath).listFiles();

		for (int i = 0; i < files.length; i++) {
			if (files[i].isFile()) {

				// Read the raw HTML, then strip the markup and pull out the page title
				String fileContent = readTextFile(files[i].getAbsolutePath(), charset);
				
				Map<String, String> parserResult = getPlainText(fileContent);
				// Fall back to the raw content if HTML parsing failed
				if (parserResult.get("plainText") != null) {
					fileContent = parserResult.get("plainText");
				}
				String title = parserResult.get("title");

				Document doc = new Document();
				// Create Field objects and add them to the Document
				doc.add(new Field("contents", fileContent, Field.Store.YES,
						Field.Index.ANALYZED));
				doc.add(new Field("filename", files[i].getName(),
						Field.Store.YES, Field.Index.NOT_ANALYZED));
				doc.add(new Field("pagetitle", title, Field.Store.YES,
						Field.Index.NOT_ANALYZED));
				doc.add(new Field("indexDate", DateTools.dateToString(
						new Date(), DateTools.Resolution.DAY), Field.Store.YES,
						Field.Index.NOT_ANALYZED));
				// Add the document to the index
				indexWriter.addDocument(doc);

				// showAnalyzerResult(analyzer, readTextFile(files[i].getCanonicalPath(), charset));
			}
		}

		System.out.println("indexWriter.numDocs():" + indexWriter.numDocs());

		// Merge index segments (standard practice in Lucene 3.0; deprecated in later versions)
		indexWriter.optimize();
		indexWriter.close();

	}

	public void showAnalyzerResult(Analyzer analyzer, String s)
			throws Exception {

		System.out.println("分词结果:");
		StringReader reader = new StringReader(s);
		TokenStream ts = analyzer.tokenStream(s, reader);

		TermAttribute termAtt = ts.addAttribute(TermAttribute.class);

		while (ts.incrementToken()) {
			System.out.print(termAtt.term());
			System.out.print(' ');
		}
		System.out.println("\nAnalysis complete.");
	}

	public static String readTextFile(String sFileName, String sEncode) {
		StringBuilder sbStr = new StringBuilder();

		BufferedReader ins = null;
		try {
			ins = new BufferedReader(new InputStreamReader(
					new FileInputStream(sFileName), sEncode));

			String dataLine;
			while (null != (dataLine = ins.readLine())) {
				// Line breaks are intentionally dropped here
				sbStr.append(dataLine);
			}
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			if (ins != null) {
				try {
					ins.close();
				} catch (Exception ignored) {
				}
			}
		}

		return sbStr.toString();
	}

	/**
	 * Extract the plain text and page title from an HTML string.
	 * 
	 * @param str the raw HTML
	 * @return a map with keys "plainText" and "title"
	 */
	public static Map<String,String> getPlainText(String str) {
		Map<String,String> result = new HashMap<String,String>();
		try {
			Parser parser = new Parser();
			parser.setInputHTML(str);

			StringBean sb = new StringBean();
			// Don't include link URLs in the extracted text
			sb.setLinks(false);
			// Replace non-breaking spaces with ordinary spaces
			sb.setReplaceNonBreakingSpaces(true);
			// Collapse runs of whitespace into a single space
			sb.setCollapse(true);
			parser.visitAllNodesWith(sb);
			String plainText = sb.getStrings();
			
			Parser myParser = Parser.createParser(str, charset);
			HtmlPage visitor = new HtmlPage(myParser);
			myParser.visitAllNodesWith(visitor);
			String title = visitor.getTitle();
			
			//System.out.println("title="+title);
			
			result.put("title",title);
			result.put("plainText",plainText);			
			
		} catch (ParserException e) {
			e.printStackTrace();
		}
		return result;
	}
}
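
For reference, a minimal sketch of how createIndex might be driven (the two paths below are hypothetical placeholders, not from the original post):

public class IndexDemo {
	public static void main(String[] args) throws Exception {
		IndexUtils indexUtils = new IndexUtils();
		// Hypothetical paths: a directory of HTML files and a target index directory
		indexUtils.createIndex("/data/html-docs", "/data/lucene-index");
	}
}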


2. Searching, with highlighted results:

package com.prl.utils.lucene;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class SearchUtils {
	public SearchResult search(String indexPath, String keyWords,
			Integer maxSearch) throws IOException, ParseException, InvalidTokenOffsetsException {

		SearchResult searchResult = new SearchResult();

		Directory dir = new SimpleFSDirectory(new File(indexPath));
		// Create the IndexSearcher; unlike IndexWriter, it only needs the index directory
		IndexSearcher indexSearch = new IndexSearcher(dir);
		// Create the QueryParser: the first argument is the Lucene version, the second the
		// default field to search, the third the analyzer to use on the query text
		Analyzer analyzer = new IKAnalyzer();
		QueryParser queryParser = new QueryParser(Version.LUCENE_30, "contents", analyzer);
		// Build the Query object
		Query query = queryParser.parse(keyWords);
		// Run the search; TopDocs holds a scoreDocs[] array with the matching document ids
		TopDocs hits = indexSearch.search(query, maxSearch);
		// hits.totalHits is the total number of matches. Loop over hits.scoreDocs, load each
		// Document back with indexSearch.doc(), then read out the stored field values.
		searchResult.setMatchCount(hits.totalHits);
		searchResult.setKeyWords(keyWords);

		// The formatter wraps each matched term in the given tags; the highlighter depends
		// only on the query, so it can be built once, outside the loop
		SimpleHTMLFormatter sHtmlF = new SimpleHTMLFormatter("<b><font color='red'>", "</font></b>");
		Highlighter highlighter = new Highlighter(sHtmlF, new QueryScorer(query));
		// Cut the highlighted output into fragments of roughly 50 characters
		highlighter.setTextFragmenter(new SimpleFragmenter(50));

		for (int i = 0; i < hits.scoreDocs.length; i++) {
			ScoreDoc sdoc = hits.scoreDocs[i];
			Document doc = indexSearch.doc(sdoc.doc);
			String fileName = doc.get("filename");
			String contents = doc.get("contents");
			String filetitle = doc.get("pagetitle");

			if (contents != null) {
				// Re-tokenize the stored content so the highlighter can locate the matched terms
				TokenStream tokenStream = analyzer.tokenStream("contents", new StringReader(contents));
				String matchText = highlighter.getBestFragment(tokenStream, contents);
				searchResult.addMatchItem(filetitle, fileName, matchText);
			}
		}
		indexSearch.close();

		return searchResult;
	}
}
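
The SearchResult class used above is not shown in the original post. A minimal sketch that satisfies the three calls made here (setMatchCount, setKeyWords, addMatchItem) might look like this; the field names and getters are assumptions:

import java.util.ArrayList;
import java.util.List;

public class SearchResult {
	private int matchCount;
	private String keyWords;
	// Each item holds { page title, file name, highlighted fragment }
	private List<String[]> items = new ArrayList<String[]>();

	public void setMatchCount(int matchCount) { this.matchCount = matchCount; }
	public void setKeyWords(String keyWords) { this.keyWords = keyWords; }
	public void addMatchItem(String title, String fileName, String matchText) {
		items.add(new String[] { title, fileName, matchText });
	}
	public int getMatchCount() { return matchCount; }
	public String getKeyWords() { return keyWords; }
	public List<String[]> getItems() { return items; }
}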


The code above is commented throughout, so it should be self-explanatory~~~
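
As a quick end-to-end check, searching the index and printing the highlighted fragments could look like this (the path, keyword, and SearchResult getters are illustrative, per the sketch above):

public class SearchDemo {
	public static void main(String[] args) throws Exception {
		SearchUtils searchUtils = new SearchUtils();
		// Hypothetical index path: the directory created by IndexUtils above
		SearchResult result = searchUtils.search("/data/lucene-index", "lucene", 10);
		System.out.println("Total hits: " + result.getMatchCount());
		for (String[] item : result.getItems()) {
			// item = { page title, file name, highlighted fragment }
			System.out.println(item[0] + " | " + item[1] + " | " + item[2]);
		}
	}
}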
