Lucene基础(三)-- 中文分词及高亮显示

Lucene分词器及高亮

分词器

在lucene中我们按照分词方式把文档进行索引,不同的分词器索引的效果不太一样。之前的例子使用的都是标准分词器,对于英文的效果很好,但是中文分词效果就不怎么样,它会按照汉字的字直接分词,没有词语的概念。

使用分词的地方只需要把Analyzer实例化成我们第三方的分词器即可

中文分词有很多,这里使用IKAnalyzer 为例,
下载地址 https://git.oschina.net/wltea/IK-Analyzer-2012FF 下载下来后里面有一篇教程。

高亮

导入lucene-highlighter-xxx.jar 在对查询出来的结果实现高亮显示

 // 关键字高亮显示的html标签,需要导入lucene-highlighter-xxx.jar
            SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>");
            Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));

            for (int i = 0; i < hits.length; i++) {
                Document doc = isearcher.doc(hits[i].doc);
                // 内容增加高亮显示
                TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(doc.get("content")));
                String content = highlighter.getBestFragment(tokenStream, doc.get("content"));
                System.out.println(content);
            }

Lucene中文分词器

实例:

package lucene_demo04;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryWrapperFilter;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 *中文分词,IKAnalayzer,对索引结果实现高亮显示
 * @author YipFun
 */
public class LuceneDemo04 {
    private static final Version version = Version.LUCENE_4_9;
    private Directory directory = null;
    private DirectoryReader ireader = null;
    private IndexWriter iwriter = null;
    private IKAnalyzer analyzer;

    //测试数据
    private String[] content = {
        "你好,我是中共人",
        "中华人民共和国",
        "中国人民从此站起来了",
        "Lucene是一个不错的全文检索的工具",
        "全文检索中文分词"
    };

    /**
     * 构造方法
     */
    public LuceneDemo04() {
        directory = new RAMDirectory();
    }

    private IKAnalyzer getAnalyzer(){
        if(analyzer == null){
            return new IKAnalyzer();
        }else{
            return analyzer;
        }
    }


    /**
     * 创建索引
     */
    public void createIndex(){
        Document doc = null;
        try {
            IndexWriterConfig iwConfig =  new IndexWriterConfig(version, getAnalyzer());
            iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
            iwriter = new IndexWriter(directory,iwConfig);
            for(String text : content){
                doc = new Document();
                doc.add(new TextField("content", text,Field.Store.YES));
                iwriter.addDocument(doc);
            }

        } catch (IOException e) {
            e.printStackTrace();
        }finally{
            try {
                if(iwriter != null)
                iwriter.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

    }

    public IndexSearcher getSearcher(){
        try {
            if(ireader==null) {
                ireader = DirectoryReader.open(directory);
            } else {
                DirectoryReader tr = DirectoryReader.openIfChanged(ireader) ;
                if(tr!=null) {
                    ireader.close();
                    ireader = tr;
                }
            }
            return new IndexSearcher(ireader);
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    public void searchByTerm(String field,String keyword,int num) throws InvalidTokenOffsetsException{
         IndexSearcher isearcher = getSearcher();
         Analyzer analyzer =  getAnalyzer();
        //使用QueryParser查询分析器构造Query对象
        QueryParser qp = new QueryParser(version,
                field,analyzer);
        //这句所起效果?
        qp.setDefaultOperator(QueryParser.OR_OPERATOR);
        try {
            Query query = qp.parse(keyword);
            ScoreDoc[] hits;

            //注意searcher的几个方法
            hits = isearcher.search(query, null, num).scoreDocs;

            // 关键字高亮显示的html标签,需要导入lucene-highlighter-xxx.jar
            SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("", "");
            Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));

            for (int i = 0; i < hits.length; i++) {
                Document doc = isearcher.doc(hits[i].doc);
                // 内容增加高亮显示
                TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(doc.get("content")));
                String content = highlighter.getBestFragment(tokenStream, doc.get("content"));
                System.out.println(content);
            }

        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }

    /**
     * 使用过滤器查询
     * @param field
     * @param keyword
     * @param num
     * @throws InvalidTokenOffsetsException
     */
    public void searchByTermFilter(String field,String keyword,int num) throws InvalidTokenOffsetsException{
         IndexSearcher isearcher = getSearcher();
         Analyzer analyzer =  getAnalyzer();
        //使用QueryParser查询分析器构造Query对象
        QueryParser qp = new QueryParser(version,
                field,analyzer);
        //这句所起效果?
        qp.setDefaultOperator(QueryParser.OR_OPERATOR);
        try {
            Query query = qp.parse(keyword);
            Query q2 = qp.parse("全文检索");
            ScoreDoc[] hits;

            QueryWrapperFilter  filter=new QueryWrapperFilter(q2);  
            //注意searcher的几个方法
            hits = isearcher.search(query, filter, num).scoreDocs;

            // 关键字高亮显示的html标签,需要导入lucene-highlighter-xxx.jar
            SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("", "");
            Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));

            for (int i = 0; i < hits.length; i++) {
                Document doc = isearcher.doc(hits[i].doc);
                // 内容增加高亮显示
                TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(doc.get("content")));
                String content = highlighter.getBestFragment(tokenStream, doc.get("content"));
                System.out.println(content);
            }

        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) throws InvalidTokenOffsetsException {
        System.out.println("start");
        LuceneDemo04 ld = new LuceneDemo04();
        ld.createIndex();
        long start = System.currentTimeMillis();
        ld.searchByTerm("content","人民",500);
        System.out.println("end search use "+(System.currentTimeMillis()-start)+"ms");
    }


}

运行结果:

start
加载扩展词典:ext.dic
加载扩展停止词典:stopword.dic
中华<span style='color:red'>人民</span>共和国
中国<span style='color:red'>人民</span>从此站起来了
end search use 129ms

你可能感兴趣的:(lucene)