mapping
olap_patient
{
"_routing": {
"required": true
},
"_source": {
"enabled": true
},
"properties": {
"visits": {
"ignore_above": 256,
"type": "keyword"
},
"patient_id": {
"ignore_above": 256,
"type": "keyword"
}
}
}
olap_visit
{
"_parent": {
"type": "olap_patient"
},
"_routing": {
"required": true
},
"_source": {
"enabled": true
},
"properties": {
"patient_id": {
"doc_values": true,
"ignore_above": 256,
"type": "keyword"
},
"visit_id": {
"doc_values": true,
"ignore_above": 256,
"type": "keyword"
}
}
}
index data
curl -XPUT "localhost:9200/test/olap_patient/2?routing=2" -d '{"visits":["2+001", "2+002", "2+003"], "patient_id":"2"}'
curl -XPUT "localhost:9200/test/olap_visit/2+001?routing=2&parent=2" -d '{"visit_id":"2+001", "patient_id":"2"}'
curl -XPUT "localhost:9200/test/olap_visit/2+002?routing=2&parent=2" -d '{"visit_id":"2+002", "patient_id":"2"}'
curl -XPUT "localhost:9200/test/olap_visit/2+003?routing=2&parent=2" -d '{"visit_id":"2+003", "patient_id":"2"}'
索引过程
olap_patient#1
[stored,indexed,omitNorms,indexOptions=DOCS<_routing:1>,
stored<_source:[7b 22 76 69 73 69 74 73 22 3a 5b 22 31 2b 30 30 31 22 2c 20 22 31 2b 30 30 32 22 2c 20 22 31 2b 30 30 33 22 5d 2c 20 22 70 61 74 69 65 6e 74 5f 69 64 22 3a 22 31 22 7d]>, indexed,omitNorms,indexOptions=DOCS<_type:olap_patient>,
docValuesType=SORTED_SET<_type:[6f 6c 61 70 5f 70 61 74 69 65 6e 74]>,
stored,indexed,omitNorms,indexOptions=DOCS<_uid:olap_patient#1>,
docValuesType=NUMERIC<_version:-1>,
indexed,omitNorms,indexOptions=DOCS,
docValuesType=SORTED_SET,
indexed,omitNorms,indexOptions=DOCS,
docValuesType=SORTED_SET,
indexed,omitNorms,indexOptions=DOCS,
docValuesType=SORTED_SET,
indexed,omitNorms,indexOptions=DOCS,
docValuesType=SORTED_SET,
indexed,omitNorms,indexOptions=DOCS<_field_names:_routing>,
indexed,omitNorms,indexOptions=DOCS<_field_names:_source>,
indexed,omitNorms,indexOptions=DOCS<_field_names:_type>,
indexed,omitNorms,indexOptions=DOCS<_field_names:_type>,
indexed,omitNorms,indexOptions=DOCS<_field_names:_uid>,
indexed,omitNorms,indexOptions=DOCS<_field_names:_version>,
indexed,omitNorms,indexOptions=DOCS<_field_names:visits>,
indexed,omitNorms,indexOptions=DOCS<_field_names:visits>,
indexed,omitNorms,indexOptions=DOCS<_field_names:visits>,
indexed,omitNorms,indexOptions=DOCS<_field_names:visits>,
indexed,omitNorms,indexOptions=DOCS<_field_names:visits>,
indexed,omitNorms,indexOptions=DOCS<_field_names:visits>,
indexed,omitNorms,indexOptions=DOCS<_field_names:patient_id>,
indexed,omitNorms,indexOptions=DOCS<_field_names:patient_id>,
docValuesType=SORTED<_parent#olap_patient:[31]>]
olap_visit#2+001
[stored,indexed,omitNorms,indexOptions=DOCS<_routing:2>,
stored<_source:[7b 22 76 69 73 69 74 5f 69 64 22 3a 22 32 2b 30 30 31 22 2c 20 22 70 61 74 69 65 6e 74 5f 69 64 22 3a 22 32 22 7d]>,
indexed,omitNorms,indexOptions=DOCS<_type:olap_visit>,
docValuesType=SORTED_SET<_type:[6f 6c 61 70 5f 76 69 73 69 74]>,
stored,indexed,omitNorms,indexOptions=DOCS<_uid:olap_visit#2 001>,
docValuesType=NUMERIC<_version:-1>,
indexed,omitNorms,indexOptions=DOCS,
docValuesType=SORTED_SET,
indexed,omitNorms,indexOptions=DOCS,
docValuesType=SORTED_SET,
indexed,omitNorms,indexOptions=DOCS<_field_names:_routing>,
indexed,omitNorms,indexOptions=DOCS<_field_names:_source>,
indexed,omitNorms,indexOptions=DOCS<_field_names:_type>,
indexed,omitNorms,indexOptions=DOCS<_field_names:_type>,
indexed,omitNorms,indexOptions=DOCS<_field_names:_uid>,
indexed,omitNorms,indexOptions=DOCS<_field_names:_version>,
indexed,omitNorms,indexOptions=DOCS<_field_names:visit_id>,
indexed,omitNorms,indexOptions=DOCS<_field_names:visit_id>,
indexed,omitNorms,indexOptions=DOCS<_field_names:patient_id>,
indexed,omitNorms,indexOptions=DOCS<_field_names:patient_id>,
docValuesType=SORTED<_parent#olap_patient:[32]>]
注意:父文档(olap_patient)和子文档(olap_visit)都写入了DocValues字段_parent#olap_patient,其值都是父文档的id,例如上面子文档中的_parent#olap_patient:[32](0x32即字符"2")
search
curl -XGET "localhost:9200/test/olap_visit/_search?pretty" -d '{"query":{"has_parent":{"type":"olap_patient", "query":{"terms":{"visits":["3+002"]}}}}}'
- 第一阶段:通过TermQuery(visits:["3+002"]) + TermQuery(_type:olap_patient)得到olap_patient结果集;对结果集中的每一个olap_patient文档,通过DocValues字段_parent#olap_patient取得该文档的值(即它自身的id)在doc_values里对应的序号;把所有这些序号收集起来构建一个BitSets,这个过程由org.apache.lucene.search.join.GlobalOrdinalsCollector.OrdinalMapCollector完成;注意这里的BitSets和通常的倒排链归并不一样,这里仅作为random access filter使用
- 通过TermQuery(_type:olap_visit)得到olap_visit结果集;对每一个olap_visit,通过DocValue(_parent#olap_patient)拿到olap_visit的_parent#olap_patient字段存在doc_values里的序号;判断该序号是否在第一个阶段生成的BitSets里;如果存在,则表示该olap_visit的parent满足has_parent查询条件,该olap_visit即命中
总的来说,ES构建了一个DocValues字段,不管是父文档还是子文档,都将父文档的id存在了该DocValues字段里,这样做可以在查询得到parent结果集后将parent的doc_values取出来,作为child查询结果集的一个过滤条件
为什么要基于doc_values去实现呢?因为doc_values内部维护了两个对象,一个是ordinals,一个是values:ordinals用于查找doc_id对应的序数(ordinal),values用于查找序数对应的字段值。也就是说,一个文档集对应的值集可以用序数上的bitset来表示,方便过滤及聚合
tips
增加一个doc_type时,不能把已存在且尚未作为parent的doc_type指定为它的_parent。因为ES会为父文档和子文档都写入joinField的DocValues索引,而该类型下已经索引的文档并没有这个字段。异常信息:{"error":{"root_cause":[{"type":"illegal_argument_exception","reason":"can't add a _parent field that points to an already existing type, that isn't already a parent"}],"type":"illegal_argument_exception","reason":"can't add a _parent field that points to an already existing type, that isn't already a parent"},"status":400}