由于项目中最近用到了elasticsearch,并且用到elasticsearch的聚合(Aggregation)功能,就深入研究了一下,elasticsearch中的聚合主要有四种:Bucketing Aggregation、Metric Aggregation、Matrix Aggregation和Pipeline Aggregation。
聚合的基本结构
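"aggregations" : {
"<aggregation_name>" : { --用户自己起的名字
"<aggregation_type>" : { --聚合类型,如avg, sum
<aggregation_body> -- 针对的字段
}
[,"meta" : { [<meta_data_body>] } ]?
[,"aggregations" : { [<sub_aggregation>]+ } ]? --聚合里面可以嵌套聚合
}
[,"<aggregation_name_2>" : { ... } ]*
}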
Metric Aggregation
Avg Aggregation--计算平均值
请求示例:
GET /endpoint_avg/_search
{
"size": 0,
"aggs": {
"avg_value": {
"avg": {"field": "value"}
}
}
}
返回结果:
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 2,
"successful" : 2,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 315,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"avg_value" : {
"value" : 342.84761904761905
}
}
}
之前看到其他博客上有说search_type=count可以只返回aggregation部分的结果,但我在7.x版本中试了下,已经不行了(search_type=count在2.0版本中被废弃,并在5.0版本中被移除),这边只能通过将size设为0来隐藏掉除了统计数据以外的数据。
Cardinality Aggregation--去重计数(相当于mysql中的count(distinct),注意返回的是近似值)
请求示例:
GET /endpoint_avg/_search
{
"size": 0,
"aggs": {
"avg_value": {
"cardinality": {"field": "service_id"}
}
}
}
返回结果:
{
"took" : 11,
"timed_out" : false,
"_shards" : {
"total" : 2,
"successful" : 2,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 317,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"avg_value" : {
"value" : 2
}
}
}
Extended Stats Aggregation--获取某个字段的所有统计信息(包括平均值,最大/小值....)
请求示例:
GET /endpoint_avg/_search
{
"size": 0,
"aggs": {
"avg_status": {
"extended_stats": {
"field": "value"
}
}
}
}
返回结果:
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 2,
"successful" : 2,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 326,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"avg_status" : {
"count" : 326, // 数量
"min" : 2.0, // 最小值
"max" : 2481.0, // 最大值
"avg" : 347.63803680981596, // 均值
"sum" : 113330.0, // 和
"sum_of_squares" : 1.02303634E8,
"variance" : 192962.62358387595,
"std_deviation" : 439.275111500613,
"std_deviation_bounds" : {
"upper" : 1226.188259811042,
"lower" : -530.91218619141
}
}
}
}
Max Aggregation--求最大值
请求示例:
GET /endpoint_avg/_search
{
"size": 0,
"aggs": {
"max_value": {
"max": {
"field": "value"
}
}
}
}
返回结果:
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 2,
"successful" : 2,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 352,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"max_value" : {
"value" : 2481.0
}
}
}
Min Aggregation--计算最小值
请求示例:
GET /endpoint_avg/_search
{
"size": 0,
"aggs": {
"min_value": {
"min": {
"field": "value"
}
}
}
}
返回结果:
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 2,
"successful" : 2,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 352,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"min_value" : {
"value" : 2.0
}
}
}
Percentiles Aggregation -- 百分位统计,默认按照[ 1, 5, 25, 50, 75, 95, 99 ]这几个百分位来统计
请求示例:
GET /endpoint_avg/_search
{
"size": 0,
"aggs": {
"value_outlier": {
"percentiles": {
"field": "value"
}
}
}
}
返回结果:
{
"took" : 44,
"timed_out" : false,
"_shards" : {
"total" : 2,
"successful" : 2,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 334,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"value_outlier" : {
"values" : {
"1.0" : 4.0,
"5.0" : 67.2,
"25.0" : 91.33333333333333,
"50.0" : 151.0,
"75.0" : 420.0,
"95.0" : 1412.4000000000005,
"99.0" : 1906.32
}
}
}
}
从返回结果可以看出来,75%的数据在420ms以内加载完毕。
当然我们也可以指定自己需要统计的百分比:
GET /endpoint_avg/_search
{
"size": 0,
"aggs": {
"value_outlier": {
"percentiles": {
"field": "value",
"percents": [95, 96, 99, 99.5]
}
}
}
}
返回结果:
{
"took" : 20,
"timed_out" : false,
"_shards" : {
"total" : 2,
"successful" : 2,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 330,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"value_outlier" : {
"values" : {
"95.0" : 1366.0,
"96.0" : 1449.8000000000002,
"99.0" : 1906.3999999999999,
"99.5" : 2064.400000000004
}
}
}
}
Percentile Ranks Aggregation -- 统计范围内数据的百分比(即小于等于指定值的数据所占的百分比)
GET /endpoint_avg/_search
{
"size": 0,
"aggs": {
"value_range": {
"percentile_ranks": {
"field": "value",
"values": [100, 200]
}
}
}
}
返回结果:
{
"took" : 10,
"timed_out" : false,
"_shards" : {
"total" : 2,
"successful" : 2,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 346,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"value_range" : {
"values" : {
"100.0" : 32.51445086705203,
"200.0" : 65.19405450041288
}
}
}
}
从返回结果可以看出,在100ms以内加载完毕的约占32.5%, 在200ms以内加载完毕的约占65.2%
Stats Aggregation -- 基本统计信息(数量、最小值、最大值、平均值、总和)
请求示例:
GET /endpoint_avg/_search
{
"size": 0,
"aggs": {
"value_status": {
"stats": {
"field": "value"
}
}
}
}
返回结果:
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 2,
"successful" : 2,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 355,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"value_status" : {
"count" : 355,
"min" : 2.0,
"max" : 2753.0,
"avg" : 339.8112676056338,
"sum" : 120633.0
}
}
}
可以发现跟之前的extended stats aggregation返回数据类似,只是少了一些较复杂的标准差之类的数据。
Sum Aggregation -- 求和函数
请求示例:
GET /endpoint_avg/_search
{
"size": 0,
"query": {"term": {
"service_id": {
"value": 5
}
}},
"aggs": {
"sum_value": {
"sum": {
"field": "value"
}
}
}
}
返回结果:
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 2,
"successful" : 2,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 194,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"sum_value" : {
"value" : 91322.0
}
}
}
Top Hits Aggregation -- 获取前n条数据, 可以嵌套使用
请求示例:
GET /endpoint_avg/_search
{
"size": 0,
"aggs": {
"top_tags": {
"terms": {
"field": "service_id",
"size": 2
},
"aggs": {
"top_value": {
"top_hits": {
"size": 3,
"sort": [{
"time_bucket": {"order": "desc"}
}]
}
}
}
}
}
}
返回结果:
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 2,
"successful" : 2,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 372,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"top_tags" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 5,
"doc_count" : 198,
"top_value" : {
"hits" : {
"total" : 198,
"max_score" : null,
"hits" : [
{
"_index" : "endpoint_avg",
"_type" : "type",
"_id" : "201906191621_25",
"_score" : null,
"_source" : {
"service_id" : 5,
"count" : 2,
"time_bucket" : 201906191621,
"service_instance_id" : 250,
"entity_id" : "25",
"value" : 149,
"summation" : 299
},
"sort" : [
201906191621
]
},
{
"_index" : "endpoint_avg",
"_type" : "type",
"_id" : "201906191620_24",
"_score" : null,
"_source" : {
"service_id" : 5,
"count" : 1,
"time_bucket" : 201906191620,
"service_instance_id" : 250,
"entity_id" : "24",
"value" : 93,
"summation" : 93
},
"sort" : [
201906191620
]
},
{
"_index" : "endpoint_avg",
"_type" : "type",
"_id" : "201906191620_37",
"_score" : null,
"_source" : {
"service_id" : 5,
"count" : 1,
"time_bucket" : 201906191620,
"service_instance_id" : 250,
"entity_id" : "37",
"value" : 122,
"summation" : 122
},
"sort" : [
201906191620
]
}
]
}
}
},
{
"key" : 3,
"doc_count" : 174,
"top_value" : {
"hits" : {
"total" : 174,
"max_score" : null,
"hits" : [
{
"_index" : "endpoint_avg",
"_type" : "type",
"_id" : "201906191621_144",
"_score" : null,
"_source" : {
"service_id" : 3,
"count" : 1,
"time_bucket" : 201906191621,
"service_instance_id" : 238,
"entity_id" : "144",
"value" : 93,
"summation" : 93
},
"sort" : [
201906191621
]
},
{
"_index" : "endpoint_avg",
"_type" : "type",
"_id" : "201906191620_70",
"_score" : null,
"_source" : {
"service_id" : 3,
"count" : 1,
"time_bucket" : 201906191620,
"service_instance_id" : 238,
"entity_id" : "70",
"value" : 192,
"summation" : 192
},
"sort" : [
201906191620
]
},
{
"_index" : "endpoint_avg",
"_type" : "type",
"_id" : "201906191620_18",
"_score" : null,
"_source" : {
"service_id" : 3,
"count" : 2,
"time_bucket" : 201906191620,
"service_instance_id" : 238,
"entity_id" : "18",
"value" : 81,
"summation" : 162
},
"sort" : [
201906191620
]
}
]
}
}
}
]
}
}
}
Value Count Aggregation--统计字段值的个数(不去重,去重计数请使用Cardinality Aggregation)
请求示例:
GET /endpoint_avg/_search
{
"size": 2,
"aggs": {
"value_count": {
"value_count": {
"field": "value"
}
}
}
}
返回结果:
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 2,
"successful" : 2,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 357,
"max_score" : 1.0,
"hits" : [
{
"_index" : "endpoint_avg",
"_type" : "type",
"_id" : "201906191457_16",
"_score" : 1.0,
"_source" : {
"service_id" : 3,
"count" : 1,
"time_bucket" : 201906191457,
"service_instance_id" : 238,
"entity_id" : "16",
"value" : 129,
"summation" : 129
}
},
{
"_index" : "endpoint_avg",
"_type" : "type",
"_id" : "201906191503_691",
"_score" : 1.0,
"_source" : {
"service_id" : 5,
"count" : 2,
"time_bucket" : 201906191503,
"service_instance_id" : 250,
"entity_id" : "691",
"value" : 178,
"summation" : 357
}
}
]
},
"aggregations" : {
"value_count" : {
"value" : 357
}
}
}