elasticsearch实现中文分词和拼音分词混合查询+CompletionSuggestion

引言

之前已经介绍了如何搭建elasticsearch服务端、创建简单的索引,以及如何支持中文分词。今天我们来说一说如何让elasticsearch同时支持中文分词和拼音(pinyin)分词,并实现类似百度搜索栏的搜索建议功能。

混合查询

实现混合查询有很多方式,这里介绍我认为是一个偷懒的方法,就是为你要拼音搜索的字段提供两个额外的字段,一个是全拼字段,一个是首字母缩写字段。我这里用的是官网的Employee的例子:

/**
 * Document model used throughout the article. Besides the searchable
 * {@code firstName}, it carries two helper fields derived from it
 * ({@code pinyin} and {@code header}) so the same record can be matched
 * by full pinyin or by pinyin initials.
 */
public class Employee implements Serializable {

    private String firstName;
    private String lastName;
    private String pinyin;// full pinyin spelling of firstName
    private String header;// pinyin initials (abbreviation) of firstName
    private int age;
    private String about;
    private List interests;

    ....省略getter setter

接下来为index添加setting和mapping

 // Index-level settings: register a custom analyzer named "ik_analyzer" backed by the ik_smart tokenizer.
 XContentBuilder indexSettings = XContentFactory.jsonBuilder()
                    .startObject()
                        .startObject("analysis")
                            .startObject("analyzer")
                                .startObject("ik_analyzer")
                                    .field("tokenizer","ik_smart")
                                .endObject()
                            .endObject()
                        .endObject()
                    .endObject();

            // Create the index with those settings and block until the cluster answers.
            CreateIndexRequest request = new CreateIndexRequest(index).settings(indexSettings);
            CreateIndexResponse response = esClient.admin().indices().create(request).get();
            logger.info("Index:{} created,response:{}", index, JSON.toJSON(response));
            // Type mapping: Chinese text fields use the IK analyzer, while the two
            // helper fields (pinyin / header) use the pinyin analyzer.
            // Every startObject(...) must be paired with an endObject(); otherwise
            // later fields end up nested inside "pinyin" and the builder is left
            // with unclosed objects.
            // NOTE(review): completion-only options such as preserve_separators /
            // preserve_position_increments do not belong on a "string" field.
            XContentBuilder builder = XContentFactory.jsonBuilder();
            builder.startObject()
                    .startObject(type)
                    .startObject("properties")
                    .startObject("firstName").field("type", "string").field("analyzer","ik_smart")
                    .endObject()
                    .startObject("lastName").field("type", "string").field("analyzer","ik_smart")
                    .endObject()
                    .startObject("pinyin").field("type","string").field("analyzer","pinyin")
                    .endObject()
                    .startObject("header").field("type","string").field("analyzer","pinyin")
                    .endObject()
                    .startObject("about").field("type", "string").field("analyzer","ik_smart")
                    .endObject()
                    .startObject("interests").field("type", "string").field("analyzer","ik_smart")
                    .endObject()
                    .endObject()  // closes "properties"
                    .endObject()  // closes the type object
                    .endObject(); // closes the root object

            // Attach the mapping built above to the given type of the index.
            PutMappingRequest putMappingRequest = new PutMappingRequest(index);
            putMappingRequest.type(type);
            putMappingRequest.source(builder);
            // get() blocks until the put-mapping call has completed.
            PutMappingResponse putMappingResponse = esClient.admin().indices().putMapping(putMappingRequest).get();
            logger.info("Mapping for `{}.{}` putted, response:{}", index, type, JSON.toJSON(putMappingResponse));

            // Reaching this point means no exception was thrown by the steps above.
            return true;
        } catch (Exception e) {
            // Any failure is logged and reported to the caller as false instead of being rethrown.
            logger.error("doCreateIndex", e);
            return false;
        }

添加几个测试用例,我这里直接用了批量插入索引的方法:

    /**
     * Indexes a batch of JSON documents with a single bulk request.
     *
     * @param jsonList JSON source strings, one per document; may be null or empty
     * @return {@code true} when there was nothing to index or every item was
     *         indexed successfully, {@code false} when the bulk response
     *         reports at least one failed item
     */
    public Boolean bulkIndex(List<String> jsonList){

        // Make sure the mapping exists once per index and remember the result.
        if (esIndexTypes.get(index) == null) {
            if (getMapping(index, indexType)) {
                esIndexTypes.put(index, true);
            }
        }

        // A bulk request without any action is rejected by request validation,
        // so treat "nothing to index" as a trivially successful call.
        if (jsonList == null || jsonList.isEmpty()) {
            return true;
        }

        BulkRequestBuilder bulkBuilder = esClient.prepareBulk();
        for (String json : jsonList) {
            bulkBuilder.add(esClient.prepareIndex(index, indexType).setSource(json));
        }

        BulkResponse bulkResponse = bulkBuilder.execute().actionGet();
        logger.info("index:{} bulk request,:response:{}",index,JSON.toJSON(bulkResponse));

        // Individual items can fail even though the bulk call itself returned.
        if (bulkResponse.hasFailures()) {
            logger.error("index:{} bulk request had failures:{}", index, bulkResponse.buildFailureMessage());
            return false;
        }
        return true;
    }

    /**
     * Builds 10000 sample employees (same song title with a running index
     * appended), serializes each one to JSON and bulk-indexes them, then
     * checks that the bulk call reports success.
     */
    @org.junit.Test
    public void test(){
        final int total = 10000;

        // Loop-invariant sample data, built once instead of on every iteration.
        final String about = "呜啦啦啦火车笛\n" +
                "\n" +
                "随着奔腾的马蹄\n" +
                "\n" +
                "小妹妹吹着口琴\n" +
                "\n" +
                "夕阳下美了剪影\n" +
                "\n" +
                "我用子弹写日记,我泡妞看电影";
        final List<String> interests = new ArrayList<>();
        interests.add("喜欢打篮球");
        interests.add("在大晴天晒太阳");
        interests.add("泡妞看电影");

        List<String> documents = new ArrayList<>(total);
        for (int i = 0; i < total; i++) {
            Employee employee = new Employee();
            employee.setFirstName("告白气球"+i);
            employee.setPinyin("gaobaiqiqiu"+i);
            employee.setHeader("gbqq");
            employee.setLastName("周杰伦,日记");
            employee.setAbout(about);
            employee.setAge(18);
            employee.setInterests(interests);
            // Serialized immediately, so sharing the interests list between employees is safe.
            documents.add(JSON.toJSONString(employee));
        }

        org.junit.Assert.assertTrue("bulk index should succeed", esProxy.bulkIndex(documents));
    }

最后直接搜 gaobaiqiqiu 或 gbqq,搜出来的数据像这样:

[{"firstName":"告白气球","lastName":"周杰伦,日记","pinyin":"gaobaiqiqiu","about":"呜啦啦啦火车笛\n\n随着奔腾的马蹄\n\n小妹妹吹着口琴\n\n夕阳下美了剪影\n\n我用子弹写日记,我泡妞看电影","header":"gbqq","interests":["喜欢打篮球","在大晴天晒太阳","泡妞看电影"],"age":18}]

如果直接搜“告白”,搜出来的数据像这样:

[{"firstName":"告白气球","lastName":"周杰伦,日记","pinyin":"gaobaiqiqiu","about":"呜啦啦啦火车笛\n\n随着奔腾的马蹄\n\n小妹妹吹着口琴\n\n夕阳下美了剪影\n\n我用子弹写日记,我泡妞看电影","header":"gbqq","interests":["喜欢打篮球","在大晴天晒太阳","泡妞看电影"],"age":18}]

CompletionSuggestion查询建议

使用CompletionSuggestion时mapping需要改一下,实时推荐的字段type需要使用completion。

 // Mapping for the suggestion use case: firstName becomes a "completion" field,
 // and every startObject(...) is closed by a matching endObject().
 XContentBuilder builder = XContentFactory.jsonBuilder();
            builder.startObject()
                    .startObject(type)
                    .startObject("properties")
                    .startObject("firstName").field("type", "completion").field("analyzer","ik_smart")
                    .field("search_analyzer","ik_smart").field("preserve_separators",false)
                    .field("preserve_position_increments",false)
                    .endObject()
                    .startObject("lastName").field("type", "string").field("analyzer","ik_smart")
                    .endObject()
                    .startObject("pinyin").field("type","string").field("analyzer","pinyin")
                    .endObject()
                    .startObject("header").field("type","string").field("analyzer","pinyin")
                    .endObject()
                    .startObject("about").field("type", "string").field("analyzer","ik_smart")
                    .endObject()
                    .startObject("interests").field("type", "string").field("analyzer","ik_smart")
                    .endObject()
                    .endObject()  // closes "properties"
                    .endObject()  // closes the type object
                    .endObject(); // closes the root object

查询的时候需要使用CompletionSuggestionBuilder.

/**
 * Runs a completion suggestion against the {@code firstName} field and
 * prints every returned option as {@code score--text} to standard output.
 *
 * @param str the text typed so far, for which completions are requested
 */
public void searchSuggest(String str){

        CompletionSuggestionBuilder suggestionBuilder = new CompletionSuggestionBuilder("firstName");
        suggestionBuilder.analyzer("ik_smart");
        suggestionBuilder.text(str);
        SearchResponse response = esClient.prepareSearch(index).setTypes(indexType).setQuery(QueryBuilders.matchAllQuery())
                .suggest(new SuggestBuilder().addSuggestion("my-suggest-1",suggestionBuilder)).get();

        // Guard against a response that carries no suggest section at all.
        Suggest suggest = response.getSuggest();
        if (suggest == null) {
            return;
        }
        CompletionSuggestion suggestion = suggest.getSuggestion("my-suggest-1");
        if (suggestion == null) {
            return;
        }

        // Entries and options are already typed, so no instanceof check or cast is needed.
        for (CompletionSuggestion.Entry entry : suggestion.getEntries()) {
            for (CompletionSuggestion.Entry.Option option : entry.getOptions()) {
                System.out.println(option.getScore()+"--"+option.getText());
            }
        }
    }

你也可以使用REST API:http://192.168.10.xxx:9200/megacorp/_search?pretty ,这里megacorp是indexName,请求体如下:

{ "size": 0,
  "suggest": {
    "my-suggest-1": {
      "prefix": "someone li",
      "completion": {
        "field": "firstName"
      }
    }
  }
}

查询出来的结果:

{
    "took": 12,
    "timed_out": false,
    "_shards": {
        "total": 5,
        "successful": 5,
        "failed": 0
    },
    "hits": {
        "total": 0,
        "max_score": 0,
        "hits": []
    },
    "suggest": {
        "my-suggest-1": [
            {
                "text": "someone li",
                "offset": 0,
                "length": 10,
                "options": [
                    {
                        "text": "someone like you",
                        "_index": "megacorp",
                        "_type": "employee",
                        "_id": "AV_doqcXKY206Vs3lcCO",
                        "_score": 1,
                        "_source": {
                            "about": "呜啦啦啦火车笛\n\n随着奔腾的马蹄\n\n小妹妹吹着口琴\n\n夕阳下美了剪影\n\n我用子弹写日记,我泡妞看电影",
                            "age": 18,
                            "firstName": "someone like you",
                            "interests": [ "喜欢打篮球", "在大晴天晒太阳", "泡妞看电影" ],
                            "lastName": "周杰伦,日记" }
                    }
                ]
            }
        ]
    }
}

你可能感兴趣的:(elasticsearch,elasticsearch,搜索)