结绳记事,思考、记录,方有成长~
ElasticSearch的聚合(Aggregations)功能非常强大,允许在数据上做复杂的统计。聚合分析功能主要有指标聚合、桶聚合、管道聚合和矩阵聚合四大类,管道聚合和矩阵聚合官方说明是在试验阶段,后期会完全更改或者移除。以下demo会从restful接口、java的API接口及SQL含义3个层面来阐述ES的聚合概念。
size如何理解
指标聚合(Metrics Aggregation)即对文档中的字段计算最大值、最小值、平均值、求和、计数等统计指标。比如统计最大值:统计books索引中的最高价格
# SQL描述
select max(price) as max_price from books;
GET books/_search
{
"size": 0,
"aggs": {
# 自定义的别名
"max_price": {
# 计算price字段的最大值,此处的max可以替换为min、avg、sum、cardinality(去重后计算个数)、stats(一次性返回所有的聚合指标)
"max": {"field": "price"}
}
}
}
# 结果
{
"aggregations": {
"max_price": {
"value": 81.4
}
}
}
# java API方式
/**
 * Runs a max metric aggregation on the {@code price} field of the {@code books} index
 * and prints the aggregated value to standard output.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    // "max_price" is the alias under which the result is returned in the response.
    MaxAggregationBuilder maxPriceAgg = AggregationBuilders.max("max_price").field("price");
    SearchResponse sr = client().prepareSearch("books").addAggregation(maxPriceAgg).get();
    // Look the result up by the same alias that was used when building the aggregation.
    Max maxPrice = sr.getAggregations().get("max_price");
    double value = maxPrice.getValue();
    System.out.println(value);
}
Bucket可以理解为一个桶,它会遍历文档中的内容,凡是符合某一要求的就放入一个桶中,分桶相当于SQL中的group by。以books索引中的图书为例,一本书会被划分到科技类、经济类或者其他分类中,那么科技类图书就是一个桶,经济类图书也是一个桶,桶就是符合某一划分标准的文档集合。
Terms Aggregation用于分组聚合。例如,根据language字段对books索引中的文档进行分组,统计属于各种编程语言的书的数量,或者业务中常见的统计各状态数量的场景。
# 对应的SQL逻辑
select language as key, count(1) as per_count from books group by language
POST books/_search?size=0
{
"aggs": {
# 自定义的分组别名
"per_count": {
"terms": {
# 根据language分组
"field": "language"
}
}
}
}
# 结果
{
"aggregations": {
"per_count": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "java",
"doc_count": 2
},
{
"key": "python",
"doc_count": 2
},
{
"key": "javascript",
"doc_count": 1
}
]
}
}
}
# java API实现方式
// Group the documents of the "books" index by the "language" field; "per_count" is the aggregation alias.
TermsAggregationBuilder termAgg = AggregationBuilders.terms("per_count").field("language");
SearchResponse sr = client().prepareSearch("books").addAggregation(termAgg).execute().actionGet();
// Fetch the terms aggregation result by its alias.
Terms perCount = sr.getAggregations().get("per_count");
// Print each bucket's key followed by the number of documents in that bucket.
for (Terms.Bucket entry : perCount.getBuckets()) {
    System.out.println(entry.getKey() + "-----------" + entry.getDocCount());
}
我们最常见的场景,其实是分组后再进行指标聚合。例如某公司员工体检,员工可以选择不同的套餐,需要统计各套餐对应人数及总费用。可以先按照套餐packageId字段进行Terms Aggregation,再进行Sum Aggregation,查询语句如下:
# 先看下SQL实现的逻辑
select package_id as '套餐ID', count(1) as '套餐数量', sum(price) as '套餐费用' from appoint_info
group by package_id
# 查询体检预约单表
POST appoint_info/_search?size=0
{
"aggs": {
# 各套餐人数
"per_count": {
"terms": {"field": "packageId"},
"aggs": {
# 各套餐下的总费用
"sum_price": {
"sum": {"field": "price"}
}
}
}
}
}
# 结果如下
{
"aggregations": {
"per_count": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
# 套餐id为123
"key": 123,
"doc_count": 10,
"sum_price": {
"value": 5000
}
},
{
# 套餐id为456
"key": 456,
"doc_count": 8,
"sum_price": {
"value": 4000
}
},
{
# 套餐id为789
"key": 789,
"doc_count": 6,
"sum_price": {
"value": 3000
}
}
]
}
}
}
# Java API实现
// Bucket the documents by packageId and, inside every bucket, sum the price field.
AggregationBuilder aggregation = AggregationBuilders
        .terms("term_by_package")
        // package ID field used for grouping
        .field("packageId")
        // sub-aggregation: total of the price field within each bucket
        .subAggregation(AggregationBuilders.sum("sum_price").field("price"));
SearchRequestBuilder searchRequestBuilder = client().prepareSearch(indexName).setTypes(typeName)
        .setSearchType(SearchType.QUERY_THEN_FETCH).setFetchSource(searchBean.getFields(), null)
        .setQuery(toESQuery(searchBean))
        // attach the aggregation to the search request
        .addAggregation(aggregation);
SearchResponse response = searchRequestBuilder.execute().actionGet();
Terms packageAgg = response.getAggregations().get("term_by_package");
for (Terms.Bucket entry : packageAgg.getBuckets()) {
    // The bucket key is the value of the grouping field; read it through the typed
    // accessor instead of casting the Object returned by getKey().
    long key = entry.getKeyAsNumber().longValue();
    // Number of documents in this bucket.
    long count = entry.getDocCount();
    Sum sum = entry.getAggregations().get("sum_price");
    // Total price of this bucket.
    double sumMoney = sum.getValue();
    log.info("套餐id:{}, 套餐数量:{}, 套餐总费用:{}", key, count, sumMoney);
}
考虑这样的场景:公司组织体检,员工可以选择不同套餐,现在需要统计各部门各套餐的体检人数及总费用,便于HR了解公司体检进度。
# SQL逻辑
select department_id '部门ID', package_id '套餐ID', count(1) '体检人数', sum(price) '费用小计'
from appoint_info
group by department_id, package_id
# 查询各部门各套餐的体检人数及费用
POST appoint_info/_search?size=0
{
"aggs": {
"department_count": {
"terms": {
"field": "departmentId"
},
"aggs": {
"package_count": {
"terms": {
"field": "packageId"
},
"aggs": {
"sum_price": {
"sum": {
"field": "price"
}
}
}
}
}
}
}
}
# 结果如下
"aggregations": {
# 第一层桶,部门维度
"department_count": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
# 部门桶数据
"buckets": [
{
# 部门ID
"key": "4754227381249",
# 部门总人数
"doc_count": 12,
# 第二层桶, 套餐维度
"package_count": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
# 套餐桶数据
"buckets": [
{
# 套餐ID
"key": "7797345913354",
# 套餐总人数
"doc_count": 11,
# 部门内此套餐的总费用,demo数据,不用校验价格的真实性
"sum_price": {
"value": 172
}
},
{
"doc_count": 1,
"key": "7522468013066",
"sum_price": {
"value": 32
}
}
]
}
},
{
"doc_count": 3,
"key": "3757616728069",
"package_count": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"doc_count": 2,
"key": "8777113347594",
"sum_price": {
"value": 84
}
},
{
"doc_count": 1,
"key": "7522468013066",
"sum_price": {
"value": 42
}
}
]
}
}
]
}
}
# Java API实现逻辑
// Two-level bucketing: first by departmentId, then by packageId, with a sum of price per package bucket.
AggregationBuilder aggregation = AggregationBuilders
        .terms("term_by_department")
        .field("departmentId")
        // Note: every additional grouping field needs another nested subAggregation();
        // with more than 3 fields the buckets can be built in a loop instead.
        .subAggregation(AggregationBuilders.terms("term_by_package").field("packageId")
                .subAggregation(AggregationBuilders.sum("price_sum").field("price")));
SearchRequestBuilder searchRequestBuilder = client().prepareSearch(indexName).setTypes(typeName)
        .setSearchType(SearchType.QUERY_THEN_FETCH).setFetchSource(searchBean.getFields(), null)
        .setQuery(toESQuery(searchBean))
        // attach the aggregation to the search request
        .addAggregation(aggregation);
SearchResponse response = searchRequestBuilder.execute().actionGet();
Terms departmentAgg = response.getAggregations().get("term_by_department");
// Parse the first-level buckets (one per department).
for (Terms.Bucket departmentBucket : departmentAgg.getBuckets()) {
    // Read the key through the typed accessor rather than casting the Object from getKey().
    long departmentId = departmentBucket.getKeyAsNumber().longValue();
    long departmentCount = departmentBucket.getDocCount();
    // Parse the second-level buckets (one per package inside the department).
    // Note: with many grouping fields this nesting becomes very deep; a recursive parser can be used instead.
    Terms packageAgg = departmentBucket.getAggregations().get("term_by_package");
    // "package" is a reserved word in Java, so the loop variable is named packageBucket.
    for (Terms.Bucket packageBucket : packageAgg.getBuckets()) {
        long packageId = packageBucket.getKeyAsNumber().longValue();
        long packageCount = packageBucket.getDocCount();
        Sum sum = packageBucket.getAggregations().get("price_sum");
        double priceSum = sum.getValue();
        log.info("departmentId:{}, departmentCount :{}, packageId :{}, packageCount :{}, priceSum:{}", departmentId, departmentCount , packageId , packageCount , priceSum);
    }
}