平时我们使用Elasticsearch时,大多只按单个字段去重,按多个字段的组合进行去重的场景相对少见。
ElasticSearch单字段去重详见博文:ElasticSearch单字段查询去重详解_IT之一小佬的博客-CSDN博客
本文将详细介绍如何按多个字段的组合进行去重,示例数据沿用上述单字段去重博文中person_info索引的数据。
# 聚合获取多字段去重数量
GET person_info/_search
{
"query": {
"match": {
"provience.keyword": "北京"
}
},
"size": 0,
"aggs": {
"age_aggs": {
"cardinality": {
"script": {
"lang": "painless",
"source": "doc['age'].value + doc['gender'].value"
}
}
}
}
}
运行结果:
{
"took" : 10,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"age_aggs" : {
"value" : 3
}
}
}
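注意:这里的误差来自cardinality聚合本身,而不是script。cardinality基于HyperLogLog++算法,返回的是近似的去重数量,数据量较大时可能会有少量误差;可以通过precision_threshold参数(默认3000,最大40000)用更多内存换取更高的精度。另外,脚本中直接拼接多个字段值而不加分隔符时,不同的字段组合可能拼出相同的字符串(例如"1"+"23"与"12"+"3"),建议像下文多字段terms聚合那样,在字段之间加入'#'等分隔符。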
# 查询并按age字段进行terms聚合
GET person_info/_search
{
"query": {
"match": {
"provience.keyword": "北京"
}
},
"size": 0,
"aggs": {
"age_aggs": {
"terms": {
"field": "age",
"size": 10
}
}
}
}
运行结果:
{
"took" : 80,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"age_aggs" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 25,
"doc_count" : 2
},
{
"key" : 26,
"doc_count" : 1
},
{
"key" : 27,
"doc_count" : 1
}
]
}
}
}
top_hits是一种指标聚合,用于在每个分桶内保留最相关(或按指定排序规则排在最前)的若干条文档。它通常作为terms等桶聚合的子聚合使用,这样按某些字段分组(去重)之后,每个分桶都能返回对应的代表文档。
直接使用top_hits返回全部字段(size为1表示每个分桶只取一条文档;由于分桶本身就是按age划分的,桶内再按age排序并没有实际的区分作用,实际使用时可以换成时间等其他字段来决定保留哪一条):
GET person_info/_search
{
"query": {
"match": {
"provience.keyword": "北京"
}
},
"size": 0,
"aggs": {
"age_aggs": {
"terms": {
"field": "age",
"size": 10
},
"aggs": {
"age_top": {
"top_hits": {
"sort": [{
"age": {
"order": "desc"
}
}],
"size": 1
}
}
}
}
}
}
运行结果:
{
"took" : 647,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"age_aggs" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 25,
"doc_count" : 2,
"age_top" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "person_info",
"_type" : "_doc",
"_id" : "hFHKl4YBPv2uoOpTcHMg",
"_score" : null,
"_source" : {
"id" : 1,
"name" : "刘一",
"age" : 25,
"gender" : "男",
"email" : "[email protected]",
"provience" : "北京",
"address" : "北京市朝阳区",
"status" : "正常"
},
"sort" : [
25
]
}
]
}
}
},
{
"key" : 26,
"doc_count" : 1,
"age_top" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "person_info",
"_type" : "_doc",
"_id" : "ilHKl4YBPv2uoOpTcHMi",
"_score" : null,
"_source" : {
"id" : 1,
"name" : "陈二",
"age" : 26,
"gender" : "女",
"email" : "[email protected]",
"provience" : "北京",
"address" : "北京市朝阳区",
"status" : "正常"
},
"sort" : [
26
]
}
]
}
}
},
{
"key" : 27,
"doc_count" : 1,
"age_top" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "person_info",
"_type" : "_doc",
"_id" : "hlHKl4YBPv2uoOpTcHMi",
"_score" : null,
"_source" : {
"id" : 1,
"name" : "张三",
"age" : 27,
"gender" : "男",
"email" : "[email protected]",
"provience" : "北京",
"address" : "北京市朝阳区",
"status" : "正常"
},
"sort" : [
27
]
}
]
}
}
}
]
}
}
}
使用_source includes返回需要的字段:
GET person_info/_search
{
"query": {
"match": {
"provience.keyword": "北京"
}
},
"size": 0,
"aggs": {
"age_aggs": {
"terms": {
"field": "age",
"size": 10
},
"aggs": {
"age_top": {
"top_hits": {
"sort": [{
"age": {
"order": "desc"
}
}],
"_source": {
"includes": [
"name",
"age",
"gender",
"provience",
"address"
]
},
"size": 1
}
}
}
}
}
}
运行结果:
{
"took" : 115,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"age_aggs" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 25,
"doc_count" : 2,
"age_top" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "person_info",
"_type" : "_doc",
"_id" : "hFHKl4YBPv2uoOpTcHMg",
"_score" : null,
"_source" : {
"address" : "北京市朝阳区",
"gender" : "男",
"provience" : "北京",
"name" : "刘一",
"age" : 25
},
"sort" : [
25
]
}
]
}
}
},
{
"key" : 26,
"doc_count" : 1,
"age_top" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "person_info",
"_type" : "_doc",
"_id" : "ilHKl4YBPv2uoOpTcHMi",
"_score" : null,
"_source" : {
"address" : "北京市朝阳区",
"gender" : "女",
"provience" : "北京",
"name" : "陈二",
"age" : 26
},
"sort" : [
26
]
}
]
}
}
},
{
"key" : 27,
"doc_count" : 1,
"age_top" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "person_info",
"_type" : "_doc",
"_id" : "hlHKl4YBPv2uoOpTcHMi",
"_score" : null,
"_source" : {
"address" : "北京市朝阳区",
"gender" : "男",
"provience" : "北京",
"name" : "张三",
"age" : 27
},
"sort" : [
27
]
}
]
}
}
}
]
}
}
}
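普通的terms聚合只能按单个字段分桶,无法直接按多个字段的组合去重,所以需要借助脚本:将terms中的内容修改为如下形式,把age、gender、name三个字段的值用'#'拼接起来作为分桶的key。注意:示例脚本中的doc['name.keyword']没有加.value,拼接进去的是该字段取值的列表形式,所以下面运行结果的key中姓名带有方括号(如25#男#[刘一]);如果希望得到25#男#刘一这样的key,应写成doc['name.keyword'].value。另外,这里的terms没有设置size,默认最多只返回10个分桶,去重后的组合较多时需要显式调大size。Elasticsearch 7.12及以上版本也可以考虑使用multi_terms聚合按多个字段分桶,从而避免使用脚本。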
GET person_info/_search
{
"query": {
"match": {
"provience.keyword": "北京"
}
},
"size": 0,
"aggs": {
"age_aggs": {
"terms": {
"script": {
"lang": "painless",
"source": "doc['age'].value + '#' + doc['gender'].value + '#' + doc['name.keyword']"
}
},
"aggs": {
"age_top": {
"top_hits": {
"sort": [{
"age": {
"order": "desc"
}
}],
"_source": {
"includes": [
"name",
"age",
"gender",
"provience",
"address"
]
},
"size": 1
}
}
}
}
}
}
运行结果:
{
"took" : 52,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"age_aggs" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "25#男#[刘一]",
"doc_count" : 1,
"age_top" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "person_info",
"_type" : "_doc",
"_id" : "hFHKl4YBPv2uoOpTcHMg",
"_score" : null,
"_source" : {
"address" : "北京市朝阳区",
"gender" : "男",
"provience" : "北京",
"name" : "刘一",
"age" : 25
},
"sort" : [
25
]
}
]
}
}
},
{
"key" : "25#男#[王五]",
"doc_count" : 1,
"age_top" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "person_info",
"_type" : "_doc",
"_id" : "iFHKl4YBPv2uoOpTcHMi",
"_score" : null,
"_source" : {
"address" : "北京市朝阳区",
"gender" : "男",
"provience" : "北京",
"name" : "王五",
"age" : 25
},
"sort" : [
25
]
}
]
}
}
},
{
"key" : "26#女#[陈二]",
"doc_count" : 1,
"age_top" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "person_info",
"_type" : "_doc",
"_id" : "ilHKl4YBPv2uoOpTcHMi",
"_score" : null,
"_source" : {
"address" : "北京市朝阳区",
"gender" : "女",
"provience" : "北京",
"name" : "陈二",
"age" : 26
},
"sort" : [
26
]
}
]
}
}
},
{
"key" : "27#男#[张三]",
"doc_count" : 1,
"age_top" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "person_info",
"_type" : "_doc",
"_id" : "hlHKl4YBPv2uoOpTcHMi",
"_score" : null,
"_source" : {
"address" : "北京市朝阳区",
"gender" : "男",
"provience" : "北京",
"name" : "张三",
"age" : 27
},
"sort" : [
27
]
}
]
}
}
}
]
}
}
}
参考博文:
Elasticsearch Painless Script入门教程 - CodeAntenna
es[elasticsearch]多字段去重查询 - 百度文库
Elasticsearch去重查询/过滤重复数据(聚合) - 码农教程