
  • TF:term frequency(词频),搜索词经分词后得到的词项(term)在某个 document 的待搜索字段中出现的次数

  • IDF:inverse document frequency(逆文档频率),包含该词项的 document 越少,IDF 越大,即词项越稀有权重越高;ES 中按 log(1 + (N - n + 0.5) / (n + 0.5)) 计算,其中 N 为含该字段的文档总数,n 为包含该词项的文档数

  • TFNORM:term frequency normalized,词频归一化,即结合字段长度 dl 与平均字段长度 avgdl 对词频做归一化
  • BM25:ES 默认的打分算法,其中词频部分为 tf = freq / (freq + k1 * (1 - b + b * dl / avgdl)),最终得分 score = boost * idf * tf


        "_index" : "movies",
        "_type" : "_doc",
        "_id" : "321697",
        "_score" : 6.6273837,
        "_source" : {
          "title" : "Steve Jobs"
        "_index" : "movies",
        "_type" : "_doc",
        "_id" : "23706",
        "_score" : 6.0948296,
        "_source" : {
          "title" : "All About Steve"


GET /movies/_search
  "query": {
    "match": {
      "title": "steve"




两个文档中 steve 的出现次数(freq)和 IDF 都相同,TFNORM 方面则不一样了:第一个文档("Steve Jobs")中该词占比为 1/2,第二个文档("All About Steve")中该词占比为 1/3,故而第一个文档在该搜索下的打分比第二个文档高。ES 打分时使用的 TFNORM 计算方式为 freq / (freq + k1 * (1 - b + b * dl / avgdl))



GET /movies/_search
  // 和MySQL的执行计划类似
  "explain": true, 
  "query": {
    "match": {
      "title": "steve"


    "_shard": "[movies][1]",
    "_node": "pqNhgutvQfqcLqLEzIDnbQ",
    "_index": "movies",
    "_type": "_doc",
    "_id": "321697",
    "_score": 6.6273837,
    "_source": {
        "overview": "Set backstage at three iconic product launches and ending in 1998 with the unveiling of the iMac, Steve Jobs takes us behind the scenes of the digital revolution to paint an intimate portrait of the brilliant man at its epicenter.",
        "voteAverage": 6.8,
        "keywords": [
                "id": 5565,
                "name": "biography"
                "id": 6104,
                "name": "computer"
                "id": 15300,
                "name": "father daughter relationship"
                "id": 157935,
                "name": "apple computer"
                "id": 161160,
                "name": "steve jobs"
                "id": 185722,
                "name": "based on true events"
        "releaseDate": "2015-01-01T00:00:00.000Z",
        "runtime": 122,
        "originalLanguage": "en",
        "title": "Steve Jobs",
        "productionCountries": [
                "iso_3166_1": "US",
                "name": "United States of America"
        "revenue": 34441873,
        "genres": [
                "id": 18,
                "name": "Drama"
                "id": 36,
                "name": "History"
        "originalTitle": "Steve Jobs",
        "popularity": 53.670525,
        "tagline": "Can a great man be a good man?",
        "spokenLanguages": [
                "iso_639_1": "en",
                "name": "English"
        "id": 321697,
        "voteCount": 1573,
        "productionCompanies": [
                "name": "Universal Pictures",
                "id": 33
                "name": "Scott Rudin Productions",
                "id": 258
                "name": "Legendary Pictures",
                "id": 923
                "name": "The Mark Gordon Company",
                "id": 1557
                "name": "Management 360",
                "id": 4220
                "name": "Cloud Eight Films",
                "id": 6708
        "budget": 30000000,
        "homepage": "http://www.stevejobsthefilm.com",
        "status": "Released"
    }


    "_explanation": {
        "value": 6.6273837,
        // title 字段的词项 steve 在该文档中的权重;"in 1526" 中的 1526 是该文档在 Lucene 中的内部文档号(docID),不是匹配到的文档数量
        "description": "weight(title:steve in 1526) [PerFieldSimilarity], result of:",
        "details": [
                // value = idf.value * tf.value * 2.2
                // 6.6273837 = 6.4412656 * 0.46767938 * 2.2
                "value": 6.6273837,
                "description": "score(freq=1.0), product of:",
                "details": [
                        "value": 2.2,
                        // boost:该值 = 查询时的 boost(默认 1.0)× (k1 + 1);k1 默认 1.2,可在创建索引时通过 similarity 设置调整,因此默认显示为 2.2
                        "description": "boost",
                        "details": []
                        "value": 6.4412656,
                        "description": "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
                        "details": [
                                "value": 2,
                                "description": "n, number of documents containing term",
                                "details": []
                                "value": 1567,
                                "description": "N, total number of documents with field",
                                "details": []
                        "value": 0.46767938,
                        "description": "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:",
                        "details": [
                                "value": 1,
                                "description": "freq, occurrences of term within document",
                                "details": []
                            // 下面的 k1、b 体现了 BM25 算法:它们是 tf 分母 freq + k1 * (1 - b + b * dl / avgdl) 中的可调参数
                                "value": 1.2,
                                "description": "k1, term saturation parameter",
                                "details": []
                                "value": 0.75,
                                "description": "b, length normalization parameter",
                                "details": []
                            // 下面的 dl、avgdl 体现了归一化操作:用字段长度与平均字段长度之比 dl / avgdl 对词频做长度归一化
                                "value": 2,
                                "description": "dl, length of field",
                                "details": []
                                "value": 2.1474154,
                                "description": "avgdl, average length of field",
                                "details": []