Lucene Study Notes

1. Lucene overview

Lucene is primarily a full-text search library. It can index and search unstructured data, such as Word documents.

2. Using Lucene

1) Add the dependencies in the pom file


    <properties>
        <lucene.version>4.7.2</lucene.version>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-core</artifactId>
            <version>${lucene.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-queryparser</artifactId>
            <version>${lucene.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers-common</artifactId>
            <version>${lucene.version}</version>
        </dependency>
        <!-- junit -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>RELEASE</version>
            <scope>compile</scope>
        </dependency>
        <!-- highlighter -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-highlighter</artifactId>
            <version>4.7.2</version>
        </dependency>
    </dependencies>

2) Create a Lucene utility class

package com.show.lucence.lucence.utils;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;

import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class AnalyzerUtil {
    public static final Class<?> INTEGER_TYPE = Integer.class;
    public static final Class<?> DOUBLE_TYPE = Double.class;
    public static final Class<?> FLOAT_TYPE = Float.class;
    public static final Class<?> LONG_TYPE = Long.class;
    public static final Class<?> BIGDECIMAL_TYPE = BigDecimal.class;
    public static final Class<?> STRING_TYPE = String.class;

    // maps each supported field type to the code used in transToReal()
    public static final Map<Class<?>, Integer> CLASS_MAP = new HashMap<Class<?>, Integer>() {
        {
            put(INTEGER_TYPE, 0);
            put(DOUBLE_TYPE, 1);
            put(FLOAT_TYPE, 2);
            put(LONG_TYPE, 3);
            put(BIGDECIMAL_TYPE, 4);
            put(STRING_TYPE, 5);
        }
    };
    // write: add a document built from the entity into the index
    public static void writerDoc(Directory directory, Analyzer analyzer, Object entity) throws Exception {
        Document doc = newDocument(entity);
        // create the index writer
        IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_47, analyzer));
        indexWriter.addDocument(doc);
        indexWriter.close();
    }
    // update: replace the document matching Term(index0, value0) with one built from the entity
    public static void updateDoc(Directory directory, Analyzer analyzer, Object entity, String index0, String value0) throws Exception {
        IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_47, analyzer));
        Document doc = newDocument(entity);
        Term term = new Term(index0, value0);
        indexWriter.updateDocument(term, doc);
        indexWriter.close();
    }
    // delete: remove documents matching Term(index0, value0); if both arguments are null, clear the whole index
    public static void deleteDoc(Directory directory, Analyzer analyzer, String index0, String value0) throws Exception {
        IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_47, analyzer));
        if (index0 == null && value0 == null) {
            indexWriter.deleteAll();
        } else {
            // build the term only when both arguments are present
            indexWriter.deleteDocuments(new Term(index0, value0));
        }
        indexWriter.close();
    }
    // search: query every field of clazz for searchInfo and map the hits back to instances of clazz
    public static <T> List<T> searchDoc(Directory directory, Analyzer analyzer, Class<T> clazz, String searchInfo) throws Exception {

        Map<String, Class<?>> fieldType = new HashMap<String, Class<?>>();
        String[] fields = getFields(clazz, fieldType);

        DirectoryReader reader = DirectoryReader.open(directory);
        IndexSearcher searcher = new IndexSearcher(reader);
        MultiFieldQueryParser parser = new MultiFieldQueryParser(Version.LUCENE_47, fields, analyzer);

        List<T> objList = new ArrayList<T>();

        searchDoc(searchInfo, clazz, fieldType, fields, searcher, parser, objList);

        reader.close();
        directory.close();
        return objList;
    }

    private static <T> void searchDoc(String searchInfo, Class<T> clazz, Map<String, Class<?>> fieldType, String[] fields, IndexSearcher searcher, MultiFieldQueryParser parser, List<T> objList) throws Exception {
        Query query = parser.parse(searchInfo);
        TopDocs search = searcher.search(query, null, 10);
        System.out.println(search.totalHits); // total number of matching documents
        ScoreDoc[] scoreDocs = search.scoreDocs;
        // read each hit back into an object of type T
        for (ScoreDoc scoreDoc : scoreDocs) {
            Document document = searcher.doc(scoreDoc.doc);
            T obj = clazz.newInstance();
            for (String field : fields) {
                Class<?> aClass = fieldType.get(field);
                String fieldValue = document.get(field);
                Method declaredMethod = clazz.getDeclaredMethod("set" + toUpperCaseFirst(field), aClass);
                // only String fields are highlighted
                if (CLASS_MAP.get(aClass) == 5) {
                    Highlighter highlighter = highlighter(query);
                    String bestFragment = highlighter.getBestFragment(parser.getAnalyzer(), field, fieldValue);
                    if (bestFragment != null){
                        fieldValue = bestFragment;
                    }
                }
                transToReal(declaredMethod,obj,aClass, fieldValue);
            }
            objList.add(obj);
        }
    }
    // highlighting: wrap matched terms in HTML tags (<b></b> is an assumed choice here; any tag pair works)
    private static Highlighter highlighter(Query query) {
        Formatter formatter = new SimpleHTMLFormatter("<b>", "</b>");
        Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
        Fragmenter fragmenter = new SimpleFragmenter(100); // fragment size of 100 characters
        highlighter.setTextFragmenter(fragmenter);
        return highlighter;
    }

    // build a Lucene Document from the entity via reflection
    private static Document newDocument(Object entity) throws IllegalAccessException, InvocationTargetException, NoSuchMethodException {
        // create the document object and add every field's value
        Document doc = new Document();
        Class<?> clazz = entity.getClass();
        java.lang.reflect.Field[] declaredFields = clazz.getDeclaredFields();
        for (java.lang.reflect.Field declaredField : declaredFields) {
            declaredField.setAccessible(true);
            String fieldName = declaredField.getName();
            Object invoke = clazz.getDeclaredMethod("get" + toUpperCaseFirst(fieldName)).invoke(entity);
            doc.add(new Field(fieldName,invoke.toString(),Field.Store.YES,Field.Index.ANALYZED));
        }
        return doc;
    }

    private static String[] getFields(Class<?> clazz, Map<String, Class<?>> fieldType) {
        java.lang.reflect.Field[] declaredFields = clazz.getDeclaredFields();
        String[] fields = new String[declaredFields.length];
        for (int i = 0; i < fields.length; i++) {
            java.lang.reflect.Field declaredField = declaredFields[i];
            declaredField.setAccessible(true);
            String fieldName = declaredField.getName();
            fields[i] = fieldName;
            fieldType.put(fieldName,declaredField.getType());
        }
        return fields;
    }

    // convert the stored string back to the setter's parameter type and invoke the setter
    private static void transToReal(Method declaredMethod, Object obj, Class<?> aClass, String fieldValue) throws Exception {
        System.out.println(CLASS_MAP.get(aClass)); // debug: the resolved type code
        switch (CLASS_MAP.get(aClass)) {
            case 0:
                declaredMethod.invoke(obj, Integer.valueOf(fieldValue));
                break;
            case 1:
                declaredMethod.invoke(obj,Double.valueOf(fieldValue));
                break;
            case 2:
                declaredMethod.invoke(obj,Float.valueOf(fieldValue));
                break;
            case 3:
                declaredMethod.invoke(obj,Long.valueOf(fieldValue));
                break;
            case 4:
                declaredMethod.invoke(obj,new BigDecimal(fieldValue));
                break;
            default:
                declaredMethod.invoke(obj,fieldValue);
        }
    }

    private static String toUpperCaseFirst(String fieldName) {
        return fieldName.substring(0, 1).toUpperCase() + fieldName.substring(1);
    }

}

3) Define a custom test object

package com.show.lucence.lucence.domain;

import java.io.Serializable;

public class Goods implements Serializable {

    private Integer goodsId;
    private String goodsName;
    private Double goodsPrice;
    private String goodsRemark;
    public Goods() {
        super();
    }
    public Goods(Integer goodsId, String goodsName,Double goodsPrice,String goodsRemark) {
        super();
        this.goodsId = goodsId;         // goods ID
        this.goodsName = goodsName;     // goods name
        this.goodsPrice = goodsPrice;   // goods price
        this.goodsRemark = goodsRemark; // goods remark / description
    }

    public Integer getGoodsId() {
        return goodsId;
    }

    public void setGoodsId(Integer goodsId) {
        this.goodsId = goodsId;
    }

    public String getGoodsName() {
        return goodsName;
    }

    public void setGoodsName(String goodsName) {
        this.goodsName = goodsName;
    }

    public Double getGoodsPrice() {
        return goodsPrice;
    }

    public void setGoodsPrice(Double goodsPrice) {
        this.goodsPrice = goodsPrice;
    }

    public String getGoodsRemark() {
        return goodsRemark;
    }

    public void setGoodsRemark(String goodsRemark) {
        this.goodsRemark = goodsRemark;
    }

    @Override
    public String toString() {
        return "Goods{" +
                "goodsId=" + goodsId +
                ", goodsName='" + goodsName + '\'' +
                ", goodsPrice=" + goodsPrice +
                ", goodsRemark='" + goodsRemark + '\'' +
                '}';
    }
}

4) Create a test class (the IK analyzer is used here)

package com.show.lucence.lucence.analyzers;

import com.show.lucence.lucence.domain.Goods;
import com.show.lucence.lucence.utils.AnalyzerUtil;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.junit.BeforeClass;
import org.junit.Test;
import org.wltea.analyzer.lucene.IKAnalyzer;

import java.io.File;
import java.io.IOException;
import java.lang.reflect.Field;
import java.util.List;


public class AnalyzerTest {

    private static Directory directory = null;
    private static Analyzer analyzer = null;
    private static Goods goods = null;

    @BeforeClass
    public static void init() throws IOException {
        // index directory
        directory = getDirectory();
        // analyzer
        analyzer = getAnalyzer();
        // the custom object under test
        goods = getGoods();
    }

    @Test
    public void writerDoc() throws Exception {
        AnalyzerUtil.writerDoc(directory,analyzer,goods);
    }

    @Test
    public void updateDoc() throws Exception {
        AnalyzerUtil.updateDoc(directory,analyzer,goods,"goodsId",String.valueOf(goods.getGoodsId()));
    }

    @Test
    public void deleteDoc() throws Exception {
        AnalyzerUtil.deleteDoc(directory,analyzer,"goodsId",String.valueOf(goods.getGoodsId()));
    }

    @Test
    public void searchDoc() throws Exception {
        List goodsList = AnalyzerUtil.searchDoc(directory, analyzer,Goods.class,"力实现自");
        System.out.println(goodsList);
    }

    private static Goods getGoods() {
        String goodsRemark = "个性独特  努力实现自我";
        //goodsRemark = "我是超级王牌";
        return new Goods(1,"Pin",11.0,goodsRemark);
    }

    private static Analyzer getAnalyzer() {
        // choose the analyzer; the Lucene version is usually pinned to the highest available
        // StandardAnalyzer is the single-character alternative
        Analyzer analyzer =
                // new StandardAnalyzer(Version.LUCENE_47);
                new IKAnalyzer(false);
        return analyzer;
    }

    // the index directory (fixed for now)
    private static Directory getDirectory() throws IOException {
        // open (or create) the index directory on disk
        return FSDirectory.open(new File("D:\\pragramFIle\\lucence\\src\\main\\resources\\com\\show\\lucence"));
    }

    @Test
    public void testTemplate1() throws Exception {
        Class aClass = goods.getClass();
        Field goodsName = aClass.getDeclaredField("goodsName");
        goodsName.setAccessible(true);
        System.out.println(goodsName.getName());

    }


}

5) While working with Lucene, you can use Luke (a Lucene index viewer) to inspect the generated index. The Luke version must match the Lucene version: Lucene 4.7.2 is used here, and lukeall-4.7.1 can read its indexes. After downloading the jar, create a .bat file next to it with the content `start javaw -jar lukeall-4.7.1.jar`. Once the index has been built, run the .bat file to launch the viewer, which displays the index contents.


6) Using IK extension dictionaries:

Create an IKAnalyzer.cfg.xml file on the project's resources path with the following content:

    <?xml version="1.0" encoding="UTF-8"?>
    <!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
    <properties>
        <comment>IK Analyzer extension configuration</comment>
        <!-- users can configure their own extension dictionaries here -->
        <entry key="ext_dict">/dicdata/use.dic.dic;/dicdata/googlepy.dic</entry>
        <!-- users can configure their own extension stopword dictionaries here -->
        <entry key="ext_stopwords">/dicdata/ext_stopword.dic</entry>
    </properties>

Appendix:

#### Lucene build steps
## Creating an index:
##### 1. Build the index directory
Purpose: specifies where the generated index files will be stored.
##### 2. Choose the analyzer
Common analyzers (a token-printing sketch follows this list):
###### 1) StandardAnalyzer: single-character tokenization. Chinese text is split one character at a time; "我爱中国" becomes: 我, 爱, 中, 国
###### 2) CJKAnalyzer: bigram tokenization. The text is split two characters at a time; "我是中国人" becomes: 我是, 是中, 中国, 国人
###### 3) IK analyzer: custom dictionaries
Extension dictionary (adds new words): some words IK does not recognize by default, e.g. "白富美", "高富帅".
Stopword dictionary (suppresses words): some words should not be indexed, e.g. "哦", "啊", "的".
IK's built-in dictionary is limited; new terms can be added to it through the configuration file, and unused terms can be removed the same way.
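To compare the analyzers concretely, here is a minimal sketch that prints the tokens an analyzer emits, using the Lucene 4.7 TokenStream API; the TokenDemo class name and the "content" field name are illustrative only:

```java
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

import java.io.IOException;
import java.io.StringReader;

public class TokenDemo {
    // print every token the analyzer emits for the given text
    public static void printTokens(Analyzer analyzer, String text) throws IOException {
        TokenStream stream = analyzer.tokenStream("content", new StringReader(text));
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.print(term.toString() + " | ");
        }
        stream.end();
        stream.close();
        System.out.println();
    }

    public static void main(String[] args) throws IOException {
        // StandardAnalyzer splits Chinese one character at a time: 我 | 爱 | 中 | 国 |
        printTokens(new StandardAnalyzer(Version.LUCENE_47), "我爱中国");
        // a CJKAnalyzer or IKAnalyzer instance can be passed in the same way
    }
}
```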
##### 3. Create the document object
The document object holds the content to be indexed.
##### 4. Create the writer
The writer tokenizes the document with the specified analyzer and writes it into the index library. A minimal end-to-end indexing sketch follows.
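The sketch below ties the four indexing steps together; the API calls mirror AnalyzerUtil.writerDoc above, while the directory path and field values are illustrative assumptions:

```java
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import java.io.File;

public class IndexSteps {
    public static void main(String[] args) throws Exception {
        // 1. the index directory: where the index files are stored (hypothetical path)
        Directory directory = FSDirectory.open(new File("/tmp/lucene-demo-index"));
        // 2. the analyzer that will tokenize the field values
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
        // 3. the document object carrying the content to index
        Document doc = new Document();
        doc.add(new Field("goodsName", "Pin", Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field("goodsRemark", "个性独特 努力实现自我", Field.Store.YES, Field.Index.ANALYZED));
        // 4. the writer tokenizes the document and writes it into the index
        IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_47, analyzer));
        writer.addDocument(doc);
        writer.close();
        directory.close();
    }
}
```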

## Querying an index:
##### 1. Set the search directory
##### 2. Create a searcher over that directory
##### 3. Choose the matching query-string parser
##### 4. Execute the query with the searcher (this returns the matching documents and the total hit count); a sketch of these four steps follows
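A minimal sketch of the four query steps, matching what AnalyzerUtil.searchDoc does internally; the index path, field name, and query string are assumptions:

```java
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import java.io.File;

public class SearchSteps {
    public static void main(String[] args) throws Exception {
        // 1. the search directory (hypothetical path; must already contain an index)
        Directory directory = FSDirectory.open(new File("/tmp/lucene-demo-index"));
        // 2. a searcher over that directory
        DirectoryReader reader = DirectoryReader.open(directory);
        IndexSearcher searcher = new IndexSearcher(reader);
        // 3. a query parser for a single field, using the same analyzer as at index time
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
        QueryParser parser = new QueryParser(Version.LUCENE_47, "goodsRemark", analyzer);
        Query query = parser.parse("自我");
        // 4. execute the query: totalHits is the number of matching documents
        TopDocs topDocs = searcher.search(query, 10);
        System.out.println("total hits: " + topDocs.totalHits);
        for (ScoreDoc sd : topDocs.scoreDocs) {
            Document doc = searcher.doc(sd.doc);
            System.out.println(doc.get("goodsRemark"));
        }
        reader.close();
        directory.close();
    }
}
```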
