term information: term frequency in the field, term positions, start and end offsets, term payloads
term statistics: 设置term_statistics=true; total term frequency, 一个term在所有document中出现的频率; document frequency,有多少document包含这个term
field statistics: document count,有多少document包含这个field; sum of document frequency,一个field中所有term的df之和; sum of total term frequency,一个field中的所有term的tf之和
GET /twitter/tweet/1/_termvectors
GET /twitter/tweet/1/_termvectors?fields=text
term statistics和field statistics并不精准,因为统计时不会考虑有的doc可能已经被删除了
term vector,涉及了很多的term和field相关的统计信息,有两种方式可以采集到这个统计信息
(1)index-time,你在mapping里给field配置好term_vector,然后建立索引的时候,就直接给你生成这些term和field的统计信息了
(2)query-time,你之前没有生成过任何的Term vector信息,然后在查看term vector的时候,直接就可以看到了,会on the fly,现场计算出各种统计信息,然后返回给你
这一讲,不会手敲任何命令,直接copy我做好的命令,因为这一讲的重点,不是掌握什么搜索或者聚合的语法,而是说,掌握,如何采集term vector信息,然后如何看懂term vector信息,你能掌握利用term vector进行数据探查
PUT /my_index { "mappings": { "my_type": { "properties": { "text": { "type": "text", "term_vector": "with_positions_offsets_payloads", "store" : true, "analyzer" : "fulltext_analyzer"//使用自定义的分词器 }, "fullname": { "type": "text", "analyzer" : "fulltext_analyzer" } } } }, "settings" : { "index" : { "number_of_shards" : 1, "number_of_replicas" : 0 }, "analysis": { "analyzer": { "fulltext_analyzer": {//自定义的分词器 "type": "custom", "tokenizer": "whitespace", "filter": [ "lowercase", "type_as_payload" ] } } } } }
结果: { "acknowledged": true, "shards_acknowledged": true } |
PUT /my_index/my_type/1 { "fullname" : "Leo Li", "text" : "hello test test test " }
PUT /my_index/my_type/2 { "fullname" : "Leo Li", "text" : "other hello test ..." }
结果: { "_index": "my_index", "_type": "my_type", "_id": "2", "_version": 1, "result": "created", "_shards": { "total": 1, "successful": 1, "failed": 0 }, "created": true } |
GET /my_index/my_type/1/_termvectors { "fields" : ["text"], "offsets" : true, "payloads" : true, "positions" : true, "term_statistics" : true, "field_statistics" : true } 结果: { "_index": "my_index", "_type": "my_type", "_id": "1", "_version": 1, "found": true, "took": 10, "term_vectors": { "text": { "field_statistics": { "sum_doc_freq": 6, //这个field中所有term的doc_freq加起来 "doc_count": 2,//意思是:text 这个field 存在多少doc中 "sum_ttf": 8//这个field中所有term的ttf加起来 }, "terms": { "hello": { "doc_freq": 2,//有两个doc 包含这个term "ttf": 2,//hello 这个词在所有doc中出现了几次 "term_freq": 1,//这个hello在当前这个doc中出现了几次 "tokens": [//一个hello 在一个field 中可能出现多次,每出现一次就是一个token { "position": 0,//位于第0个词 "start_offset": 0,//开始位置 "end_offset": 5,//结束位置 "payload": "d29yZA=="//base64编码的payload,解码后是word } ] }, "test": { "doc_freq": 2, //有两个doc 包含这个term "ttf": 4, //test 这个词在所有doc中出现了4次 "term_freq": 3, //这个test在当前这个doc中出现了3次 "tokens": [//一个test 在一个field 中可能出现多次,每出现一次就是一个token { "position": 1, "start_offset": 6, "end_offset": 10, "payload": "d29yZA==" }, { "position": 2, "start_offset": 11, "end_offset": 15, "payload": "d29yZA==" }, { "position": 3, "start_offset": 16, "end_offset": 20, "payload": "d29yZA==" } ] } } } } }
GET /my_index/my_type/1/_termvectors { "fields" : ["fullname"], "offsets" : true, "positions" : true, "term_statistics" : true, "field_statistics" : true }
结果: { "_index": "my_index", "_type": "my_type", "_id": "1", "_version": 1, "found": true, "took": 13, "term_vectors": { "fullname": { "field_statistics": { "sum_doc_freq": 4, "doc_count": 2, "sum_ttf": 4 }, "terms": { "leo": { "doc_freq": 2, "ttf": 2, "term_freq": 1, "tokens": [ { "position": 0, "start_offset": 0, "end_offset": 3 } ] }, "li": { "doc_freq": 2, "ttf": 2, "term_freq": 1, "tokens": [ { "position": 1, "start_offset": 4, "end_offset": 6 } ] } } } } } |
一般来说,如果条件允许,你就用query time的term vector就可以了,你要探查什么数据,现场去探查一下就好了
GET /my_index/my_type/_termvectors { "doc" : { "fullname" : "Leo Li", "text" : "hello test test test" }, "fields" : ["text"], "offsets" : true, "payloads" : true, "positions" : true, "term_statistics" : true, "field_statistics" : true } 结果: { "_index": "my_index", "_type": "my_type", "_version": 0, "found": true, "took": 1, "term_vectors": { "text": { "field_statistics": { "sum_doc_freq": 6, "doc_count": 2, "sum_ttf": 8 }, "terms": { "hello": { "doc_freq": 2, "ttf": 2, "term_freq": 1, "tokens": [ { "position": 0, "start_offset": 0, "end_offset": 5 } ] }, "test": { "doc_freq": 2, "ttf": 4, "term_freq": 3, "tokens": [ { "position": 1, "start_offset": 6, "end_offset": 10 }, { "position": 2, "start_offset": 11, "end_offset": 15 }, { "position": 3, "start_offset": 16, "end_offset": 20 } ] } } } } } |
手动指定一个doc,实际上不是要指定doc,而是要指定你想要安插的词条,hello test,那么就可以放在一个field中
GET /my_index/my_type/_termvectors { "doc" : { "fullname" : "Leo Li", "text" : "hello test test test" }, "fields" : ["text"], "offsets" : true, "payloads" : true, "positions" : true, "term_statistics" : true, "field_statistics" : true, "per_field_analyzer" : { "text": "standard" } } 结果: { "_index": "my_index", "_type": "my_type", "_version": 0, "found": true, "took": 0, "term_vectors": { "text": { "field_statistics": { "sum_doc_freq": 6, "doc_count": 2, "sum_ttf": 8 }, "terms": { "hello": { "doc_freq": 2, "ttf": 2, "term_freq": 1, "tokens": [ { "position": 0, "start_offset": 0, "end_offset": 5 } ] }, "test": { "doc_freq": 2, "ttf": 4, "term_freq": 3, "tokens": [ { "position": 1, "start_offset": 6, "end_offset": 10 }, { "position": 2, "start_offset": 11, "end_offset": 15 }, { "position": 3, "start_offset": 16, "end_offset": 20 } ] } } } } } |
GET /my_index/my_type/_termvectors { "doc" : { "fullname" : "Leo Li", "text" : "hello test test test" }, "fields" : ["text"], "offsets" : true, "payloads" : true, "positions" : true, "term_statistics" : true, "field_statistics" : true, "filter" : { "max_num_terms" : 3, "min_term_freq" : 1, "min_doc_freq" : 1 } } 结果: { "_index": "my_index", "_type": "my_type", "_version": 0, "found": true, "took": 1, "term_vectors": { "text": { "field_statistics": { "sum_doc_freq": 6, "doc_count": 2, "sum_ttf": 8 }, "terms": { "hello": { "doc_freq": 2, "ttf": 2, "term_freq": 1, "tokens": [ { "position": 0, "start_offset": 0, "end_offset": 5 } ], "score": 1 }, "test": { "doc_freq": 2, "ttf": 4, "term_freq": 3, "tokens": [ { "position": 1, "start_offset": 6, "end_offset": 10 }, { "position": 2, "start_offset": 11, "end_offset": 15 }, { "position": 3, "start_offset": 16, "end_offset": 20 } ], "score": 3 } } } } } |
这个就是说,根据term统计信息,过滤出你想要看到的term vector统计结果
GET _mtermvectors { "docs": [ { "_index": "my_index", "_type": "my_type", "_id": "2", "term_statistics": true }, { "_index": "my_index", "_type": "my_type", "_id": "1", "fields": [ "text" ] } ] }
GET /my_index/_mtermvectors { "docs": [ { "_type": "my_type", "_id": "2", "fields": [ "text" ], "term_statistics": true }, { "_type": "my_type", "_id": "1" } ] }
GET /my_index/my_type/_mtermvectors { "docs": [ { "_id": "2", "fields": [ "text" ], "term_statistics": true }, { "_id": "1" } ] }
GET /_mtermvectors { "docs": [ { "_index": "my_index", "_type": "my_type", "doc" : { "fullname" : "Leo Li", "text" : "hello test test test" } }, { "_index": "my_index", "_type": "my_type", "doc" : { "fullname" : "Leo Li", "text" : "other hello test ..." } } ] }