测试数据
index 结构
PUT /employees/
{
"mappings" : {
"properties" : {
"age" : {
"type" : "integer"
},
"gender" : {
"type" : "keyword"
},
"job" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 50
}
}
},
"name" : {
"type" : "keyword"
},
"salary" : {
"type" : "integer"
}
}
}
}
插入20条数据
PUT /employees/_bulk
{ "index" : { "_id" : "1" } }
{ "name" : "Emma","age":32,"job":"Product Manager","gender":"female","salary":35000 }
{ "index" : { "_id" : "2" } }
{ "name" : "Underwood","age":41,"job":"Dev Manager","gender":"male","salary": 50000}
{ "index" : { "_id" : "3" } }
{ "name" : "Tran","age":25,"job":"Web Designer","gender":"male","salary":18000 }
{ "index" : { "_id" : "4" } }
{ "name" : "Rivera","age":26,"job":"Web Designer","gender":"female","salary": 22000}
{ "index" : { "_id" : "5" } }
{ "name" : "Rose","age":25,"job":"QA","gender":"female","salary":18000 }
{ "index" : { "_id" : "6" } }
{ "name" : "Lucy","age":31,"job":"QA","gender":"female","salary": 25000}
{ "index" : { "_id" : "7" } }
{ "name" : "Byrd","age":27,"job":"QA","gender":"male","salary":20000 }
{ "index" : { "_id" : "8" } }
{ "name" : "Foster","age":27,"job":"Java Programmer","gender":"male","salary": 20000}
{ "index" : { "_id" : "9" } }
{ "name" : "Gregory","age":32,"job":"Java Programmer","gender":"male","salary":22000 }
{ "index" : { "_id" : "10" } }
{ "name" : "Bryant","age":20,"job":"Java Programmer","gender":"male","salary": 9000}
{ "index" : { "_id" : "11" } }
{ "name" : "Jenny","age":36,"job":"Java Programmer","gender":"female","salary":38000 }
{ "index" : { "_id" : "12" } }
{ "name" : "Mcdonald","age":31,"job":"Java Programmer","gender":"male","salary": 32000}
{ "index" : { "_id" : "13" } }
{ "name" : "Jonthna","age":30,"job":"Java Programmer","gender":"female","salary":30000 }
{ "index" : { "_id" : "14" } }
{ "name" : "Marshall","age":32,"job":"Javascript Programmer","gender":"male","salary": 25000}
{ "index" : { "_id" : "15" } }
{ "name" : "King","age":33,"job":"Java Programmer","gender":"male","salary":28000 }
{ "index" : { "_id" : "16" } }
{ "name" : "Mccarthy","age":21,"job":"Javascript Programmer","gender":"male","salary": 16000}
{ "index" : { "_id" : "17" } }
{ "name" : "Goodwin","age":25,"job":"Javascript Programmer","gender":"male","salary": 16000}
{ "index" : { "_id" : "18" } }
{ "name" : "Catherine","age":29,"job":"Javascript Programmer","gender":"female","salary": 20000}
{ "index" : { "_id" : "19" } }
{ "name" : "Boone","age":30,"job":"DBA","gender":"male","salary": 30000}
{ "index" : { "_id" : "20" } }
{ "name" : "Kathy","age":29,"job":"DBA","gender":"female","salary": 20000}
query
ES聚合分析的默认作用范围是query的查询结果集,也就是说,聚合是在query筛选之后的结果上进行的。
# 查询年龄大于等于30岁的员工,并且对符合条件的员工按照职位类型进行分组
POST /employees/_search
{
"size": 3,
"query": {
"range": {
"age": {
"gte": 30
}
}
},
"aggs": {
"jobs": {
"terms": {
"field": "job.keyword"
}
}
}
}
返回结果
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 10,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "employees",
"_type" : "_doc",
"_id" : "1",
"_score" : 1.0,
"_source" : {
"name" : "Emma",
"age" : 32,
"job" : "Product Manager",
"gender" : "female",
"salary" : 35000
}
},
{
"_index" : "employees",
"_type" : "_doc",
"_id" : "2",
"_score" : 1.0,
"_source" : {
"name" : "Underwood",
"age" : 41,
"job" : "Dev Manager",
"gender" : "male",
"salary" : 50000
}
},
{
"_index" : "employees",
"_type" : "_doc",
"_id" : "6",
"_score" : 1.0,
"_source" : {
"name" : "Lucy",
"age" : 31,
"job" : "QA",
"gender" : "female",
"salary" : 25000
}
}
]
},
"aggregations" : {
"jobs" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Java Programmer",
"doc_count" : 5
},
{
"key" : "DBA",
"doc_count" : 1
},
{
"key" : "Dev Manager",
"doc_count" : 1
},
{
"key" : "Javascript Programmer",
"doc_count" : 1
},
{
"key" : "Product Manager",
"doc_count" : 1
},
{
"key" : "QA",
"doc_count" : 1
}
]
}
}
}
filter
如果我们只想对聚合的数据进行筛选而不影响query的结果,或者只想在某一个聚合中进行筛选而不影响其他聚合的结果,此时可以使用filter聚合。
POST employees/_search
{
"size": 3,
"query": {
"range": {
"age": {
"gte": 30
}
}
},
"aggs": {
"older_person": {
"filter": {
"range": {
"age": {
"from": 35
}
}
},
"aggs": {
"jobs": {
"terms": {
"field": "job.keyword"
}
}
}
},
"all_jobs": {
"terms": {
"field": "job.keyword"
}
}
}
}
结果如下:
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 10,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "employees",
"_type" : "_doc",
"_id" : "1",
"_score" : 1.0,
"_source" : {
"name" : "Emma",
"age" : 32,
"job" : "Product Manager",
"gender" : "female",
"salary" : 35000
}
},
{
"_index" : "employees",
"_type" : "_doc",
"_id" : "2",
"_score" : 1.0,
"_source" : {
"name" : "Underwood",
"age" : 41,
"job" : "Dev Manager",
"gender" : "male",
"salary" : 50000
}
},
{
"_index" : "employees",
"_type" : "_doc",
"_id" : "6",
"_score" : 1.0,
"_source" : {
"name" : "Lucy",
"age" : 31,
"job" : "QA",
"gender" : "female",
"salary" : 25000
}
}
]
},
"aggregations" : {
"older_person" : {
"doc_count" : 2,
"jobs" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Dev Manager",
"doc_count" : 1
},
{
"key" : "Java Programmer",
"doc_count" : 1
}
]
}
},
"all_jobs" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Java Programmer",
"doc_count" : 5
},
{
"key" : "DBA",
"doc_count" : 1
},
{
"key" : "Dev Manager",
"doc_count" : 1
},
{
"key" : "Javascript Programmer",
"doc_count" : 1
},
{
"key" : "Product Manager",
"doc_count" : 1
},
{
"key" : "QA",
"doc_count" : 1
}
]
}
}
}
我们可以看到hits部分和上面query示例的结果是一样的,只有older_person里面的聚合使用到了filter里面的条件,all_jobs的结果则和上面query示例中的jobs聚合一样。另外,filter中range的"from": 35在默认包含下边界的情况下等价于"gte": 35,它是官方文档中未列出的旧式写法,建议与其他示例保持一致,统一使用gte。
post_filter
如果你想让筛选条件只作用于返回的查询结果(hits),而不作用于聚合,那该怎么办?使用post_filter。
比如,我想筛选出年龄小于等于23岁的员工,并且按照职位类别进行分组,同时查询出年龄小于等于35岁的员工,取前五条(请求中没有指定sort,因此按默认顺序返回,而不是按id排序)。
POST employees/_search
{
"size": 5,
"aggs": {
"young_person": {
"filter": {
"range": {
"age": {
"lte": 23
}
}
},
"aggs": {
"jobs": {
"terms": {
"field": "job.keyword"
}
}
}
}
},
"post_filter": {
"range": {
"age": {
"lte": 35
}
}
}
}
结果如下:
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 18,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "employees",
"_type" : "_doc",
"_id" : "1",
"_score" : 1.0,
"_source" : {
"name" : "Emma",
"age" : 32,
"job" : "Product Manager",
"gender" : "female",
"salary" : 35000
}
},
{
"_index" : "employees",
"_type" : "_doc",
"_id" : "3",
"_score" : 1.0,
"_source" : {
"name" : "Tran",
"age" : 25,
"job" : "Web Designer",
"gender" : "male",
"salary" : 18000
}
},
{
"_index" : "employees",
"_type" : "_doc",
"_id" : "4",
"_score" : 1.0,
"_source" : {
"name" : "Rivera",
"age" : 26,
"job" : "Web Designer",
"gender" : "female",
"salary" : 22000
}
},
{
"_index" : "employees",
"_type" : "_doc",
"_id" : "5",
"_score" : 1.0,
"_source" : {
"name" : "Rose",
"age" : 25,
"job" : "QA",
"gender" : "female",
"salary" : 18000
}
},
{
"_index" : "employees",
"_type" : "_doc",
"_id" : "6",
"_score" : 1.0,
"_source" : {
"name" : "Lucy",
"age" : 31,
"job" : "QA",
"gender" : "female",
"salary" : 25000
}
}
]
},
"aggregations" : {
"young_person" : {
"doc_count" : 2,
"jobs" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Java Programmer",
"doc_count" : 1
},
{
"key" : "Javascript Programmer",
"doc_count" : 1
}
]
}
}
}
}
这个例子中,post_filter的筛选条件并没有作用到聚合上,所以你可以理解为post_filter和query是一对相反的操作:query先对数据进行筛选,再在筛选结果上进行聚合;post_filter则是在聚合计算完成之后才对hits进行过滤,它和聚合相互独立、互不影响。
global
最后一个是global,可以无视query的影响。比如我们想筛选出大于等于35岁的员工,并且按照职业类型分组,然后我们还想获得所有员工的平均工资。
POST /employees/_search
{
"size": 0,
"query": {
"range": {
"age": {
"gte": 35
}
}
},
"aggs": {
"jobs": {
"terms": {
"field": "job.keyword"
}
},
"all": {
"global": {},
"aggs": {
"salary_avg": {
"avg": {
"field": "salary"
}
}
}
}
}
}
结果如下:
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"all" : {
"doc_count" : 20,
"salary_avg" : {
"value" : 24700.0
}
},
"jobs" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Dev Manager",
"doc_count" : 1
},
{
"key" : "Java Programmer",
"doc_count" : 1
}
]
}
}
}
上面我们可以看到,24700.0是所有员工的平均工资(all的doc_count为20,不受query影响),然后jobs里面是大于等于35岁的员工的职位类别,分别是Dev Manager和Java Programmer。
当然,要想实现上面的需求,我们也可以有其他写法,这里仅仅是为了展示global的用法。
比如下面的代码:
POST /employees/_search
{
"size": 0,
"aggs": {
"old_persons": {
"filter": {
"range": {
"age": {
"gte": 35
}
}
},
"aggs": {
"jobs": {
"terms": {
"field": "job.keyword"
}
}
}
},
"avg_salary": {
"avg": {
"field": "salary"
}
}
}
}
结果如下:
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 20,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"old_persons" : {
"doc_count" : 2,
"jobs" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Dev Manager",
"doc_count" : 1
},
{
"key" : "Java Programmer",
"doc_count" : 1
}
]
}
},
"avg_salary" : {
"value" : 24700.0
}
}
}