相关性算分描述了一个文档和查询语句匹配的程度,es会对每个匹配查询条件的结果进行算分
打分的本质是排序,将相关性高的文档排在最前面。ES 5.0 之后默认采用 BM25 算分机制(此前默认为 TF-IDF)
TF(Term Frequency,词频):检索词在一篇文档中出现的频率
算法:检索词出现的次数除以文档的总字数
度量一条查询和结果文档相关性的简单方法:简单地将搜索中每一个词的TF相加
TF(区块链)+TF(的)+TF(应用)
stop word(停用词)
过滤掉没有必要参与算分的词
比如TF(的)是没有必要算分的
DF(Document Frequency,文档频率):检索词在所有文档中出现的频率
IDF(Inverse Document Frequency,逆文档频率):简单地说就是 log(全部文档数 / 包含检索词的文档总数)
TF-IDF 本质上就是将TF求和变成加权求和
TF(区块链)*IDF(区块链)+TF(的)*IDF(的)+TF(应用)*IDF(应用)
POST testscore/_bulk {"index":{"_id":1}} {"content":"we use Elasticsearch to power the search"} {"index":{"_id":2}} {"content":"we like elasticsearch"} {"index":{"_id":3}} {"content":"the scoring of documents is caculated by the scoring formula"} {"index":{"_id":4}} {"content":"you know for search"} GET testscore/_search { "query": { "match": { "content": "elasticsearch" } } } 结果: { "took" : 2, "timed_out" : false, "_shards" : { "total" : 1, "successful" : 1, "skipped" : 0, "failed" : 0 }, "hits" : { "total" : { "value" : 2, "relation" : "eq" }, "max_score" : 0.8713851, "hits" : [ { "_index" : "testscore", "_type" : "_doc", "_id" : "2", "_score" : 0.8713851,//由于文档比较短,所以打分比较高 "_source" : { "content" : "we like elasticsearch" } }, { "_index" : "testscore", "_type" : "_doc", "_id" : "1", "_score" : 0.6489038, "_source" : { "content" : "we use Elasticsearch to power the search" } } ] } } GET testscore/_search { "query": { "match": { "content": "you" } } } 结果: { "took" : 3, "timed_out" : false, "_shards" : { "total" : 1, "successful" : 1, "skipped" : 0, "failed" : 0 }, "hits" : { "total" : { "value" : 1, "relation" : "eq" }, "max_score" : 1.3940738, "hits" : [ { "_index" : "testscore", "_type" : "_doc", "_id" : "4", "_score" : 1.3940738, "_source" : { "content" : "you know for search" } } ] } } GET testscore/_search { "query": { "match": { "content": "the" } } } 结果: { "took" : 3, "timed_out" : false, "_shards" : { "total" : 1, "successful" : 1, "skipped" : 0, "failed" : 0 }, "hits" : { "total" : { "value" : 2, "relation" : "eq" }, "max_score" : 0.8025915, "hits" : [ { "_index" : "testscore", "_type" : "_doc", "_id" : "3", "_score" : 0.8025915, "_source" : { "content" : "the scoring of documents is caculated by the scoring formula" } }, { "_index" : "testscore", "_type" : "_doc", "_id" : "1", "_score" : 0.6489038, "_source" : { "content" : "we use Elasticsearch to power the search" } } ] } } GET testscore/_search { "query": { "match": { "content": "the elasticsearch" } } 
} 结果: { "took" : 8, "timed_out" : false, "_shards" : { "total" : 1, "successful" : 1, "skipped" : 0, "failed" : 0 }, "hits" : { "total" : { "value" : 3, "relation" : "eq" }, "max_score" : 1.2978076, "hits" : [ { "_index" : "testscore", "_type" : "_doc", "_id" : "1", "_score" : 1.2978076, "_source" : { "content" : "we use Elasticsearch to power the search" } }, { "_index" : "testscore", "_type" : "_doc", "_id" : "2", "_score" : 0.8713851, "_source" : { "content" : "we like elasticsearch" } }, { "_index" : "testscore", "_type" : "_doc", "_id" : "3", "_score" : 0.8025915, "_source" : { "content" : "the scoring of documents is caculated by the scoring formula" } } ] } }
Boosting是控制相关度的一种手段
boost的定义
当 boost > 1 时,打分的相关度相对提升
当 0 < boost < 1 时,打分的相关度相对降低
当 boost < 0 时,贡献负分(注意:ES 7.0 起不再允许负的 boost 值,如需降权请使用 0 到 1 之间的 boost)
例:
POST /blogs/_bulk
{"index":{"_id":1}}
{"title":"Apple iPad","content":"Apple iPad,Apple iPad"}
{"index":{"_id":2}}
{"title":"Apple iPad,Apple iPad","content":"Apple iPad"}
给字段进行权重处理
GET blogs/_search
{
"query": {
"bool": {
"should": [
{
"match": {
"title": {
"query": "Apple iPad",
"boost": 1.1
}
}
},
{
"match": {
"content": {
"query": "Apple iPad",
"boost": 1
}
}
}
]
}
}
}
结果:
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : 0.5353899,
"hits" : [
{
"_index" : "blogs",
"_type" : "_doc",
"_id" : "2",
"_score" : 0.5353899,
"_source" : {
"title" : "Apple iPad,Apple iPad",
"content" : "Apple iPad"
}
},
{
"_index" : "blogs",
"_type" : "_doc",
"_id" : "1",
"_score" : 0.5332985,
"_source" : {
"title" : "Apple iPad",
"content" : "Apple iPad,Apple iPad"
}
}
]
}
}
GET blogs/_search
{
"query": {
"bool": {
"should": [
{
"match": {
"title": {
"query": "Apple iPad",
"boost": 1.1
}
}
},
{
"match": {
"content": {
"query": "Apple iPad",
"boost": 2
}
}
}
]
}
}
}
结果:
{
"took" : 4,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : 0.798205,
"hits" : [
{
"_index" : "blogs",
"_type" : "_doc",
"_id" : "1",
"_score" : 0.798205,
"_source" : {
"title" : "Apple iPad",
"content" : "Apple iPad,Apple iPad"
}
},
{
"_index" : "blogs",
"_type" : "_doc",
"_id" : "2",
"_score" : 0.77938265,
"_source" : {
"title" : "Apple iPad,Apple iPad",
"content" : "Apple iPad"
}
}
]
}
}
例2:
POST /news/_bulk
{"index":{"_id":1}}
{"content":"Apple Mac"}
{"index":{"_id":2}}
{"content":"Apple iPad"}
{"index":{"_id":3}}
{"content":"Apple employee like Apple Pie and Apple Juice"}
GET news/_search
{
"query": {
"match": {
"content": "Apple"
}
}
}
结果:
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 3,
"relation" : "eq"
},
"max_score" : 0.17280532,
"hits" : [
{
"_index" : "news",
"_type" : "_doc",
"_id" : "3",
"_score" : 0.17280532,
"_source" : {
"content" : "Apple employee like Apple Pie and Apple Juice"//由于Apple出现的频率比较高,所以分数比较高
}
},
{
"_index" : "news",
"_type" : "_doc",
"_id" : "1",
"_score" : 0.16786805,
"_source" : {
"content" : "Apple Mac"
}
},
{
"_index" : "news",
"_type" : "_doc",
"_id" : "2",
"_score" : 0.16786805,
"_source" : {
"content" : "Apple iPad"
}
}
]
}
}
如果我们加上权重,让苹果的产品排在最上面如何做呢?
GET news/_search
{
"query": {
"boosting": {
"positive": {
"match": {
"content": "Apple"
}
},
"negative": {
"match": {
"content": "Pie"
}
},
"negative_boost": 0.2
}
}
}
结果:苹果的产品排在最上面;匹配 negative 条件(Pie)的文档并没有被排除,只是其得分被乘以 negative_boost(0.17280532 × 0.2 = 0.034561064),因此排到了最后
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 3,
"relation" : "eq"
},
"max_score" : 0.16786805,
"hits" : [
{
"_index" : "news",
"_type" : "_doc",
"_id" : "1",
"_score" : 0.16786805,
"_source" : {
"content" : "Apple Mac"
}
},
{
"_index" : "news",
"_type" : "_doc",
"_id" : "2",
"_score" : 0.16786805,
"_source" : {
"content" : "Apple iPad"
}
},
{
"_index" : "news",
"_type" : "_doc",
"_id" : "3",
"_score" : 0.034561064,
"_source" : {
"content" : "Apple employee like Apple Pie and Apple Juice"
}
}
]
}
}