Elasticsearch如何实现筛选功能(设置字段不分词和聚合操作)

0 起因

中文分词中比较常用的分词器是es-ik,建立索引的方式如下:
这里我们为index person_list新建了两个字段:name和district,注意索引名称必须是小写
(以下格式都是在kibana上做的)

PUT /person_list
{
  "mappings": {
    "info": {
      "properties": {
        "name": {
          "type": "text",
          "analyzer": "ik_max_word",
          "search_analyzer": "ik_max_word"
        },
        "district": {
            "type": "text",
            "analyzer": "ik_max_word",
            "search_analyzer": "ik_max_word"
        }
      }
    }
  }
}

查看索引详情和索引某一些属性

GET person_list
GET /person_list/_settings
GET /person_list/_mapping

新增一些数据用于我们的测试
你可以批量新增(推荐):

POST /person_list/info/_bulk
{"index":{"_id":"1"}}
{"name":"李明","district":"上海市"}
{"index":{"_id":"2"}}
{"name":"李明","district":"上海市"}
{"index":{"_id":"3"}}
{"name":"李明","district":"北京市"}
{"index":{"_id":"4"}}
{"name":"张伟","district":"上海市"}
{"index":{"_id":"5"}}
{"name":"张伟","district":"北京市"}
{"index":{"_id":"6"}}
{"name":"张伟","district":"北京市"}

也可以逐条新增

POST /person_list/info
 {
   "name": "李明",
   "district":"上海"
 }

下面看看需求

0.1 需求一:实现对name的搜索功能

这个很简单,模糊搜索和精确搜索都能实现,同时设置一下offset和size

GET person_list/info/_search
{
  "query": {
    "match_phrase_prefix": {"name": "张伟"}
  },
  "size": 10,
  "from": 0
}

搜索结果:

{
  "took": 0,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "failed": 0
  },
  "hits": {
    "total": 1,
    "max_score": 0.35667494,
    "hits": [
      {
        "_index": "person_list",
        "_type": "info",
        "_id": "4",
        "_score": 0.35667494,
        "_source": {
          "name": "张伟",
          "district": "上海市"
        }
      }
    ]
  }
}

0.2 需求二:实现对name的聚合,当搜索某个人名时,显示同一人名在不同地区的数量

聚合语句如下,我们需要得到张伟在不同地区的人数

 GET person_list/info/_search
{
  "query":{
     "match_phrase_prefix":{"name":"张伟"}
  },
  "aggs":{
    "result":{
      "terms":{"field":"district"}
    }
  }
}

此时返回的结果是

{
  "error": {
    "root_cause": [
      {
        "type": "illegal_argument_exception",
        "reason": "Fielddata is disabled on text fields by default. Set fielddata=true on [district] in order to load fielddata in memory by uninverting the inverted index. Note that this can however use significant memory. Alternatively use a keyword field instead."
      }
    ],
    "type": "search_phase_execution_exception",
    "reason": "all shards failed",
    "phase": "query",
    "grouped": true,
    "failed_shards": [
      {
        "shard": 0,
        "index": "person_list",
        "node": "SOK5mAntQ8SYv6BuOGYuMg",
        "reason": {
          "type": "illegal_argument_exception",
          "reason": "Fielddata is disabled on text fields by default. Set fielddata=true on [district] in order to load fielddata in memory by uninverting the inverted index. Note that this can however use significant memory. Alternatively use a keyword field instead."
        }
      }
    ]
  },
  "status": 400
}

报错了,说要我们设置 fielddata=true,怎么改呢?

1 配置可被筛选的Index

我们可以通过语句直接修改

 POST /person_list/_mapping/info
{
  "properties": {
        "district": {
            "type": "text",
            "analyzer": "ik_max_word",
            "search_analyzer": "ik_max_word",
            "fielddata": true
            }
        }
}

成功了

{
  "acknowledged": true
}

现在再次运行聚合操作

 GET person_list/info/_search
{
  "query":{
     "match_phrase_prefix":{"name":"张伟"}
  },

  "aggs":{
    "result":{
      "terms":{"field":"district"}
    }
  }
}

看一下结果

{
  "took": 2,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "failed": 0
  },
  "hits": {
    "total": 3,
    "max_score": 0.47000363,
    "hits": [
      {
        "_index": "person_list",
        "_type": "info",
        "_id": "4",
        "_score": 0.47000363,
        "_source": {
          "name": "张伟",
          "district": "上海市"
        }
      },
      {
        "_index": "person_list",
        "_type": "info",
        "_id": "6",
        "_score": 0.47000363,
        "_source": {
          "name": "张伟",
          "district": "北京市"
        }
      },
      {
        "_index": "person_list",
        "_type": "info",
        "_id": "5",
        "_score": 0.2876821,
        "_source": {
          "name": "张伟",
          "district": "北京市"
        }
      }
    ]
  },
  "aggregations": {
    "result": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      "buckets": [
        {
          "key": "北京",
          "doc_count": 2
        },
        {
          "key": "北京市",
          "doc_count": 2
        },
        {
          "key": "市",
          "doc_count": 2
        },
        {
          "key": "上海",
          "doc_count": 1
        },
        {
          "key": "上海市",
          "doc_count": 1
        },
        {
          "key": "海市",
          "doc_count": 1
        }
      ]
    }
  }
}

出问题了,district字段被拆了!
大概想一想,由于我们用的ik分词,所以在聚合的过程中,是先把district分词然后聚合并统计数量的。
现在思路清晰了,对于district的analyzer设置,我们不应该分词。
通过搜索网上的方案,我们再次修改映射

 POST /person_list/_mapping/info
{
  "properties": {
        "district": {
            "type": "text",
            "fielddata": true,
            "fields": {"raw": {"type": "keyword"}}
            }
        }
}

不行,报错analyzer冲突,怎么办?
现在的问题就是我们取消对district的ik分词,应该就可以了
捋一捋,analyzer有两种方案:

  1. 官方自带的:standard,simple,whitespace,language,具体用法可以查看官方文档
  2. 自定义/第三方analyzer

考虑了一下,district字段的所有数据都没有空格,使用whitespace正好能够避免被分词
于是乎我们重新建立了一遍索引:

# 删除数据
POST person_list/_delete_by_query
{
  "query": { 
    "match_all": {
    }
  }
}

# 删除索引
DELETE /person_list

# 新建索引
PUT /person_list
{
  "mappings": {
    "info": {
      "properties": {
        "name": {
          "type": "text",
          "analyzer": "ik_max_word",
          "search_analyzer": "ik_max_word"
        },
        "district": {
            "type": "text",
            "analyzer": "whitespace",
            "search_analyzer": "whitespace",
            "fielddata": true
        }
      }
    }
  }
}


# 导入数据
POST /person_list/info/_bulk
{"index":{"_id":"1"}}
{"name":"李明","district":"上海市"}
{"index":{"_id":"2"}}
{"name":"李明","district":"上海市"}
{"index":{"_id":"3"}}
{"name":"李明","district":"北京市"}
{"index":{"_id":"4"}}
{"name":"张伟","district":"上海市"}
{"index":{"_id":"5"}}
{"name":"张伟","district":"北京市"}
{"index":{"_id":"6"}}
{"name":"张伟","district":"北京市"}

查询聚合结果

 GET person_list/info/_search
{
  "query":{
     "match_phrase_prefix":{"name":"张伟"}
  },
  "aggs":{
    "result":{
      "terms":{"field":"district"}
    }
  }
}

结果正是我们想要的:

{
  "took": 3,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "failed": 0
  },
  "hits": {
    "total": 3,
    "max_score": 0.47000363,
    "hits": [
      {
        "_index": "person_list",
        "_type": "info",
        "_id": "4",
        "_score": 0.47000363,
        "_source": {
          "name": "张伟",
          "district": "上海市"
        }
      },
      {
        "_index": "person_list",
        "_type": "info",
        "_id": "6",
        "_score": 0.47000363,
        "_source": {
          "name": "张伟",
          "district": "北京市"
        }
      },
      {
        "_index": "person_list",
        "_type": "info",
        "_id": "5",
        "_score": 0.2876821,
        "_source": {
          "name": "张伟",
          "district": "北京市"
        }
      }
    ]
  },
  "aggregations": {
    "result": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      "buckets": [
        {
          "key": "北京市",
          "doc_count": 2
        },
        {
          "key": "上海市",
          "doc_count": 1
        }
      ]
    }
  }
}

至此,我们已经学会了如何通过ES实现基本的筛选功能了

你可能感兴趣的:(数据库)