lucene整合中文分词器mmseg4j和高亮highlighter

最近在研究lucene,其实很简单,可是在整合中文分词器mmseg4j时,总是会报一些异常,这主要是版本兼容问题,在此做一个记录

环境:

lucene:4.3.1

mmseg4j:1.9.1

主要jar包,如下图:

lucene整合中文分词器mmseg4j和高亮highlighter_第1张图片

因为我只需要mmseg4j的分词器,所以不需要引入它的solr相关jar包

直接上代码:

package com.chenlb.mmseg4j.example;

import java.io.File;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import com.chenlb.mmseg4j.analysis.ComplexAnalyzer;

/**
 * Minimal end-to-end demo: index one Chinese sentence with the mmseg4j
 * {@link ComplexAnalyzer}, search it back, and print the hit with the matched
 * terms wrapped in HTML highlight tags.
 */
public class Test {

	/** Filesystem location of the Lucene index. */
	private static final String INDEXPATH = "D:\\index";
	/** Name of the single field that is indexed, searched and highlighted. */
	private static final String TITLE_FIELD = "title";
	/** Shared analyzer; the same one must be used for indexing, query parsing and highlighting. */
	private static final Analyzer analyzer = new ComplexAnalyzer();

	/**
	 * Builds the index and then runs the sample search. Any failure is printed
	 * to standard error.
	 */
	public static void main(String[] args) {
		try {
			indexCreate();
			search();
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Adds one sample document to the index at {@link #INDEXPATH}.
	 * <p>
	 * The index is opened with {@code CREATE_OR_APPEND}, so every invocation
	 * adds another copy of the sample document to an existing index.
	 *
	 * @throws Exception if the index cannot be opened or written
	 */
	public static void indexCreate() throws Exception {
		Directory directory = FSDirectory.open(new File(INDEXPATH));
		try {
			IndexWriterConfig iwConfig = new IndexWriterConfig(
					Version.LUCENE_43, analyzer);
			iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
			IndexWriter writer = new IndexWriter(directory, iwConfig);
			try {
				String content = "京华时报2008年1月23日报道 昨天,受一股来自中西伯利亚的强冷空气影响,本市出现大风降温天气,白天最高气温只有零下7摄氏度,同时伴有6到7级的偏北风。";
				Document doc = new Document();
				// Store.YES keeps the original text so search() can read it back for highlighting.
				doc.add(new TextField(TITLE_FIELD, content, Field.Store.YES));
				writer.addDocument(doc);
			} finally {
				// Always close the writer, even when addDocument fails.
				writer.close();
			}
		} finally {
			directory.close();
		}
	}

	/**
	 * Searches the index for a fixed keyword and prints each hit twice: once as
	 * the stored text and once with the matching terms highlighted.
	 *
	 * @throws Exception if the index cannot be read or the query cannot be parsed
	 */
	public static void search() throws Exception {
		Directory dir = FSDirectory.open(new File(INDEXPATH));
		try {
			IndexReader reader = DirectoryReader.open(dir);
			try {
				IndexSearcher searcher = new IndexSearcher(reader);

				QueryParser parser = new MultiFieldQueryParser(
						Version.LUCENE_43, new String[] { TITLE_FIELD },
						analyzer);
				parser.setDefaultOperator(QueryParser.AND_OPERATOR);
				String word = "中西伯利亚 ";
				Query query = parser.parse(word);
				// toString() prints every term with its field prefix; the argument of
				// toString(String) is a field name to omit, not the search text.
				System.out.println("搜索关键词: " + query.toString());

				// Collect at most 50 hits, ordered by relevance score.
				TopScoreDocCollector collector = TopScoreDocCollector.create(
						5 * 10, true);
				searcher.search(query, collector);
				ScoreDoc[] hits = collector.topDocs().scoreDocs;
				int numTotalHits = collector.getTotalHits();
				System.out.println("一共匹配" + numTotalHits + "个网页");

				Highlighter highlighter = createHighlighter(query);
				for (int i = 0; i < hits.length; i++) {
					Document doc = searcher.doc(hits[i].doc);
					printHit(i + 1, doc.get(TITLE_FIELD), highlighter);
				}
			} finally {
				reader.close();
			}
		} finally {
			dir.close();
		}
	}

	/** Creates a highlighter that wraps matches in red bold HTML and uses 100-character fragments. */
	private static Highlighter createHighlighter(Query query) {
		SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(
				"<font color='red'><strong>", "</strong></font>");
		Highlighter highlighter = new Highlighter(formatter, new QueryScorer(
				query));
		highlighter.setTextFragmenter(new SimpleFragmenter(100));
		return highlighter;
	}

	/** Prints one numbered hit followed by its highlighted form. */
	private static void printHit(int position, String title,
			Highlighter highlighter) throws Exception {
		System.out.println(position + "." + title);
		if (title == null) {
			// No stored value to highlight (e.g. the index holds documents written
			// by other code without a stored "title" field).
			return;
		}
		// The first argument of tokenStream is the field name, not the text.
		TokenStream titleTokenStream = analyzer.tokenStream(TITLE_FIELD,
				new StringReader(title));
		String highLightTitle = highlighter.getBestFragment(titleTokenStream,
				title);
		// getBestFragment returns null when no fragment scores; fall back to the
		// plain text rather than printing "null".
		System.out.println(highLightTitle != null ? highLightTitle : title);
	}
}

运行结果:

lucene整合中文分词器mmseg4j和高亮highlighter_第2张图片

这就完成了.....

你可能感兴趣的:(lucene整合中文分词器mmseg4j和高亮highlighter)