ElasticSearch复杂查询-指标聚合、桶聚合

结绳记事,思考、记录,方有成长~

文章目录

    • 1.前言
    • 2.指标聚合
    • 3.桶聚合
      • 3.1Terms Aggregation(聚合后计算总数)
      • 3.2桶聚合后再指标聚合(单个group字段):
      • 3.3桶聚合后指标聚合(多个group字段)

1.前言

ElasticSearch的聚合(Aggregations)功能非常强大,允许在数据上做复杂的统计。聚合分析功能主要有指标聚合、桶聚合、管道聚合、矩阵聚合四大类,其中管道聚合和矩阵聚合官方说明是在试验阶段,后期可能会完全更改或者移除。以下demo会从RESTful接口、Java API接口及SQL含义3个层面来阐述ES的聚合概念。

size如何理解:下文请求中的size为0,表示只返回聚合结果,不返回命中的文档(hits)。

2.指标聚合

即计算最大、最小、平均值、求和、计数等功能。比如统计最大值,统计books索引中的最高价格

# SQL描述
select max(price) as max_price from books;

GET books/_search
{
	"size": 0,
	"aggs": {
	    # 自定义的别名
		"max_price": {
			# 计算price字段的最大值,此处的max可以替换为min、avg、sum、cardinality(去重后计算个数)、stats(一次性返回所有的聚合指标)
			"max": {"field": "price"}
		}
	}
}

# 结果
{
	"aggregations": {
		"max_price": {
			"value": 81.4
		}
	}
}

# java API方式
/**
 * Runs a max aggregation named "max_price" over the "price" field of the
 * "books" index and prints the resulting value.
 *
 * NOTE(review): client() is a helper that is not shown in this snippet; it is
 * presumably the accessor for the Elasticsearch client — confirm.
 */
public static void main(String[] args) {
	// Request side: the aggregation builder, named so the result can be looked up later.
	MaxAggregationBuilder maxAgg = AggregationBuilders.max("max_price").field("price");
	SearchResponse sr = client().prepareSearch("books").addAggregation(maxAgg).get();
	// Response side: fetch the aggregation result by the same name used in the request.
	Max maxPrice = sr.getAggregations().get("max_price");
	double value = maxPrice.getValue();
	System.out.println(value);
}

3.桶聚合

Bucket可以理解为一个桶,它会遍历文档中的内容,凡是符合某一要求的就放入一个桶中,分桶相当于SQL中的group by。以books索引中的图书为例,一本书会被划分到科技类、经济类或者其他分类中,那么科技类图书就是一个桶,经济类图书也是一个桶,桶就是符合某一划分标准的文档集合。

3.1Terms Aggregation(聚合后计算总数)

Terms Aggregation用于分组聚合。例如,根据language字段对books索引中的文档进行分组,统计属于各种编程语言的书的数量,或者业务中常见的统计各状态数量的场景。

# 对应的SQL逻辑
select language as key, count(1) as per_count from books group by language 

POST books/_search?size=0
{
	"aggs": {
		# 自定义的分组别名
		"per_count": {
			"terms": {
				# 根据language分组
				"field": "language"
			}
		}
	}
}

# 结果
{
	"aggregations": {
		"per_count": {
			"doc_count_error_upper_bound": 0,
			"sum_other_doc_count": 0,
			"buckets": [
				{
					"key": "java",
					"doc_count": 2
				},
				{
					"key": "python",
					"doc_count": 2
				},
				{
					"key": "javascript",
					"doc_count": 1
				}
			]
		}
	}
}

# java API实现方式
// Group the documents of the "books" index by the "language" field.
TermsAggregationBuilder termAgg = AggregationBuilders.terms("per_count").field("language");
SearchResponse sr = client().prepareSearch("books").addAggregation(termAgg).execute().actionGet();
// Look the result up by the aggregation name used in the request.
Terms perCount = sr.getAggregations().get("per_count");
// Print each bucket's key together with its document count.
for (Terms.Bucket entry : perCount.getBuckets()) {
	System.out.println(entry.getKey() + "-----------" + entry.getDocCount());
}

3.2桶聚合后再指标聚合(单个group字段):

我们最常见的场景,其实是分组后再进行指标聚合。例如某公司员工体检,员工可以选择不同的套餐,需要统计各套餐对应人数及总费用。可以先按照套餐packageId字段进行Terms Aggregation,再进行Sum Aggregation,查询语句如下:

# 先看下SQL实现的逻辑
select package_id as '套餐ID', count(1) as '套餐数量', sum(price) as '套餐费用' from appoint_info 
group by package_id

# 查询体检预约单表
POST appoint_info/_search?size=0
{
	"aggs": {
		# 各套餐人数
		"per_count": {
			"terms": {"field": "packageId"},
			"aggs": {
				# 各套餐下的总费用
				"sum_price": {
					"sum": {"field": "price"}
				}
			}
		}
	}
}

# 结果如下
{
	"aggregations": {
		"per_count": {
			"doc_count_error_upper_bound": 0,
			"sum_other_doc_count": 0,
			"buckets": [
				{
					# 套餐id为123
					"key": 123,
					"doc_count": 10,
					"sum_price": {
						"value": 5000
					}
				},
				{
					# 套餐id为456
					"key": 456,
					"doc_count": 8,
					"sum_price": {
						"value": 4000
					}
				},
				{
					# 套餐id为789
					"key": 789,
					"doc_count": 6,
					"sum_price": {
						"value": 3000
					}
				}
			]
		}
	}
}

# Java API实现
// Metric computed inside every bucket: the sum of the price field.
AggregationBuilder priceSumAgg = AggregationBuilders.sum("sum_price").field("price");
// Bucket by the package id field and attach the sum as a sub-aggregation.
AggregationBuilder aggregation = AggregationBuilders
        .terms("term_by_package")
        .field("packageId")
        .subAggregation(priceSumAgg);

// Build the search request and attach the aggregation to it.
SearchRequestBuilder searchRequestBuilder = client().prepareSearch(indexName);
searchRequestBuilder.setTypes(typeName);
searchRequestBuilder.setSearchType(SearchType.QUERY_THEN_FETCH);
searchRequestBuilder.setFetchSource(searchBean.getFields(), null);
searchRequestBuilder.setQuery(toESQuery(searchBean));
searchRequestBuilder.addAggregation(aggregation);

SearchResponse response = searchRequestBuilder.execute().actionGet();
Terms packageAgg = response.getAggregations().get("term_by_package");
for (Terms.Bucket bucket : packageAgg.getBuckets()) {
    // The bucket key is the value of the grouping field.
    Long packageId = (Long) bucket.getKey();
    // Number of documents that fell into this bucket.
    Long docCount = bucket.getDocCount();
    // Summed price of this bucket, read from the sub-aggregation.
    Sum bucketSum = bucket.getAggregations().get("sum_price");
    Double totalPrice = bucketSum.getValue();
    log.info("套餐id:{}, 套餐数量:{}, 套餐总费用:{}", packageId, docCount, totalPrice);
}

3.3桶聚合后指标聚合(多个group字段)

考虑这样的场景:公司组织体检,员工可以选择不同套餐,现在需要统计各部门各套餐的体检人数及总费用,便于HR了解公司体检进度。

# SQL逻辑
select department_id '部门ID', package_id '套餐ID', count(1) '体检人数', sum(price) '费用小计' 
from appoint_info
group by department_id, package_id

# 查询各部门各套餐的体检人数及费用
POST appoint_info/_search?size=0
{
  "aggs": {
    "department_count": {
      "terms": {
        "field": "departmentId"
      },
      "aggs": {
        "package_count": {
          "terms": {
            "field": "packageId"
          },
          "aggs": {
            "sum_price": {
              "sum": {
                "field": "price"
              }
            }
          }
        }
      }
    }
  }
}
# Result (lines starting with # are annotations, not part of the actual response)
{
  "aggregations": {
    # First-level buckets: department dimension
    "department_count": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      # Department buckets
      "buckets": [
        {
          # Department ID
          "key": "4754227381249",
          # Total number of people in the department
          "doc_count": 12,
          # Second-level buckets: package dimension
          "package_count": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            # Package buckets
            "buckets": [
              {
                # Package ID
                "key": "7797345913354",
                # Total number of people on this package
                "doc_count": 11,
                # Total cost of this package within the department (demo data, the amounts are not meant to be realistic)
                "sum_price": {
                  "value": 172
                }
              },
              {
                "doc_count": 1,
                "key": "7522468013066",
                "sum_price": {
                  "value": 32
                }
              }
            ]
          }
        },
        {
          "key": "3757616728069",
          "doc_count": 3,
          "package_count": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
              {
                "doc_count": 2,
                "key": "8777113347594",
                "sum_price": {
                  "value": 84
                }
              },
              {
                "doc_count": 1,
                "key": "7522468013066",
                "sum_price": {
                  "value": 42
                }
              }
            ]
          }
        }
      ]
    }
  }
}

# Java API实现逻辑
// Bucket by department first, then by package inside each department,
// and sum the price field inside every package bucket.
AggregationBuilder aggregation = AggregationBuilders
        .terms("term_by_department")
        .field("departmentId")
        // Note: every additional group-by field needs one more nested subAggregation();
        // with more than 3 fields, consider building the nested buckets in a loop.
        .subAggregation(AggregationBuilders.terms("term_by_package").field("packageId")
                .subAggregation(AggregationBuilders.sum("price_sum").field("price")));

SearchRequestBuilder searchRequestBuilder = client().prepareSearch(indexName).setTypes(typeName)
        .setSearchType(SearchType.QUERY_THEN_FETCH).setFetchSource(searchBean.getFields(), null)
        .setQuery(toESQuery(searchBean))
        // Attach the aggregation to the search request.
        .addAggregation(aggregation);

SearchResponse response = searchRequestBuilder.execute().actionGet();
Terms departmentAgg = response.getAggregations().get("term_by_department");
// Walk the first level of buckets: one bucket per department.
for (Terms.Bucket departmentBucket : departmentAgg.getBuckets()) {
    // getKeyAsNumber() returns a Number, so convert with longValue() instead of casting to Long.
    long departmentId = departmentBucket.getKeyAsNumber().longValue();
    long departmentCount = departmentBucket.getDocCount();
    // Walk the second level of buckets: one bucket per package within the department.
    // Note: with many group-by fields this nesting gets deep; a recursive parser may be clearer.
    Terms packageAgg = departmentBucket.getAggregations().get("term_by_package");
    for (Terms.Bucket packageBucket : packageAgg.getBuckets()) {
        long packageId = packageBucket.getKeyAsNumber().longValue();
        long packageCount = packageBucket.getDocCount();
        Sum sum = packageBucket.getAggregations().get("price_sum");
        double priceSum = sum.getValue();
        log.info("departmentId:{}, departmentCount :{}, packageId :{}, packageCount :{}, priceSum:{}", departmentId, departmentCount, packageId, packageCount, priceSum);
    }
}

你可能感兴趣的:(Elasticsearch)