Elasticsearch Source Code Analysis, Part 10: Invoking the Lucene Query Analyzer

  • Overview
  • Source Code Analysis

Overview

Some fields in an Elasticsearch query need to be further analyzed by a Lucene Analyzer, for example in the fuzzy_like_this, fuzzy_like_this_field, more_like_this, more_like_this_field, and multi_match queries. In these queries the query field carries multiple values; in the example below we want to find documents containing "crime punishment". We cannot look up "crime punishment" in the index as-is; it must first be run through an Analyzer.

{
    "fuzzy_like_this" : 
    {
        "fields": "title",
        "like_text": "crime punishment",
        "analyzer": "simple"   #定义分析器,用于分析like_text字段
    }
}
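
To see what this analysis step produces, here is a small standalone sketch (not from the Elasticsearch source) that runs Lucene's SimpleAnalyzer, which backs the "simple" analyzer, over the like_text value. Note that on older Lucene versions the SimpleAnalyzer constructor additionally takes a Version argument:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AnalyzeLikeTextDemo {
    public static void main(String[] args) throws IOException {
        // SimpleAnalyzer lower-cases and splits on non-letter characters
        Analyzer analyzer = new SimpleAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("title", "crime punishment")) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString()); // prints "crime", then "punishment"
            }
            ts.end();
        }
    }
}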

Source Code Analysis

(1) Elasticsearch: obtain the analyzer while parsing the query
public class FuzzyLikeThisQueryParser implements QueryParser {

    public static final String NAME = "flt";

    @Override
    public Query parse(QueryParseContext parseContext) throws IOException, QueryParsingException {
        XContentParser parser = parseContext.parser();

        int maxNumTerms = 25;
        float boost = 1.0f;
        List<String> fields = null;
        String likeText = null;
        float minSimilarity = 0.5f;
        int prefixLength = 0;
        boolean ignoreTF = false;
        // the analyzer to apply to like_text
        Analyzer analyzer = null;
        boolean failOnUnsupportedField = true;

        XContentParser.Token token;
        String currentFieldName = null;
        while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
            if (token == XContentParser.Token.FIELD_NAME) {
                currentFieldName = parser.currentName();
            } else if (token.isValue()) {
                if ("like_text".equals(currentFieldName) || "likeText".equals(currentFieldName)) {
                    likeText = parser.text();
                } else if ("max_query_terms".equals(currentFieldName) || "maxQueryTerms".equals(currentFieldName)) {
                    maxNumTerms = parser.intValue();
                } else if ("boost".equals(currentFieldName)) {
                    boost = parser.floatValue();
                } else if ("ignore_tf".equals(currentFieldName) || "ignoreTF".equals(currentFieldName)) {
                    ignoreTF = parser.booleanValue();
                } else if ("min_similarity".equals(currentFieldName) || "minSimilarity".equals(currentFieldName)) {
                    minSimilarity = parser.floatValue();
                } else if ("prefix_length".equals(currentFieldName) || "prefixLength".equals(currentFieldName)) {
                    prefixLength = parser.intValue();
                } else if ("analyzer".equals(currentFieldName)) {
                    // look up the analyzer instance by the name given in the query's
                    // "analyzer" field (name-to-instance registration is covered in
                    // another article in this series)
                    analyzer = parseContext.analysisService().analyzer(parser.text());
                } else if ("fail_on_unsupported_field".equals(currentFieldName) || "failOnUnsupportedField".equals(currentFieldName)) {
                    failOnUnsupportedField = parser.booleanValue();
                } else {
                    throw new QueryParsingException(parseContext.index(), "[flt] query does not support [" + currentFieldName + "]");
                }
            } else if (token == XContentParser.Token.START_ARRAY) {
                if ("fields".equals(currentFieldName)) {
                    fields = Lists.newLinkedList();
                    while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) {
                        fields.add(parseContext.indexName(parser.text()));
                    }
                } else {
                    throw new QueryParsingException(parseContext.index(), "[flt] query does not support [" + currentFieldName + "]");
                }
            }
        }

        if (likeText == null) {
            throw new QueryParsingException(parseContext.index(), "fuzzy_like_this requires 'like_text' to be specified");
        }

        // fall back to the default search analyzer when the query does not specify one
        if (analyzer == null) {
            analyzer = parseContext.mapperService().searchAnalyzer();
        }

        // hand the analyzer to Lucene's FuzzyLikeThisQuery
        FuzzyLikeThisQuery query = new FuzzyLikeThisQuery(maxNumTerms, analyzer);
        if (fields == null) {
            fields = Lists.newArrayList(parseContext.defaultField());
        } else if (fields.isEmpty()) {
            throw new QueryParsingException(parseContext.index(), "fuzzy_like_this requires 'fields' to be non-empty");
        }
        for (Iterator<String> it = fields.iterator(); it.hasNext(); ) {
            final String fieldName = it.next();
            if (!Analysis.generatesCharacterTokenStream(analyzer, fieldName)) {
                if (failOnUnsupportedField) {
                    throw new ElasticSearchIllegalArgumentException("more_like_this doesn't support binary/numeric fields: [" + fieldName + "]");
                } else {
                    it.remove();
                }
            }
        }
        if (fields.isEmpty()) {
            return null;
        }
        for (String field : fields) {
            query.addTerms(likeText, field, minSimilarity, prefixLength);
        }
        query.setBoost(boost);
        query.setIgnoreTF(ignoreTF);

        return query;
    }
}
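
With the defaults visible in the parser above, the query built for the overview's JSON example is equivalent to constructing FuzzyLikeThisQuery by hand. A minimal illustrative sketch (not from the source; the "simple" analyzer maps to Lucene's SimpleAnalyzer, and the numeric arguments are the parser defaults):

// what parse() effectively builds for the overview's JSON example
Analyzer analyzer = new SimpleAnalyzer();                        // "analyzer": "simple"
FuzzyLikeThisQuery query = new FuzzyLikeThisQuery(25, analyzer); // maxNumTerms defaults to 25
query.addTerms("crime punishment", "title", 0.5f, 0);            // like_text, field, minSimilarity, prefixLength
query.setBoost(1.0f);                                            // boost defaults to 1.0
query.setIgnoreTF(false);                                        // ignore_tf defaults to false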

(2) Lucene: FuzzyLikeThisQuery.rewrite() invokes the analyzer on the like_text value
public class FuzzyLikeThisQuery extends Query{
    public Query rewrite(IndexReader reader) throws IOException
    {
        if(rewrittenQuery!=null)
        {
            return rewrittenQuery;
        }
        //load up the list of possible terms
        for (Iterator<FieldVals> iter = fieldVals.iterator(); iter.hasNext(); ) {
            FieldVals f = iter.next();
            // the analyzer is invoked inside this method
            addTerms(reader, f);
        }
        // clear the list of fields
        fieldVals.clear();

        // ... remainder omitted ...
    }

  private void addTerms(IndexReader reader, FieldVals f) throws IOException {
    if (f.queryString == null) return;
    final Terms terms = MultiFields.getTerms(reader, f.fieldName);
    if (terms == null) {
      return;
    }
    // analyzer is the one Elasticsearch resolved from the query;
    // it will tokenize "crime punishment" into "crime" and "punishment"
    try (TokenStream ts = analyzer.tokenStream(f.fieldName, f.queryString)) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

      int corpusNumDocs = reader.numDocs();
      HashSet<String> processedTerms = new HashSet<>();
      ts.reset();
      while (ts.incrementToken()) {
        String term = termAtt.toString();
        if (!processedTerms.contains(term)) {
          processedTerms.add(term);
          ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
          float minScore = 0;
          Term startTerm = new Term(f.fieldName, term);
          AttributeSource atts = new AttributeSource();
          MaxNonCompetitiveBoostAttribute maxBoostAtt =
            atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
          SlowFuzzyTermsEnum fe = new SlowFuzzyTermsEnum(terms, atts, startTerm, f.minSimilarity, f.prefixLength);
          //store the df so all variants use same idf
          int df = reader.docFreq(startTerm);
          int numVariants = 0;
          int totalVariantDocFreqs = 0;
          BytesRef possibleMatch;
          BoostAttribute boostAtt =
            fe.attributes().addAttribute(BoostAttribute.class);
          while ((possibleMatch = fe.next()) != null) {
            numVariants++;
            totalVariantDocFreqs += fe.docFreq();
            float score = boostAtt.getBoost();
            if (variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore) {
              ScoreTerm st = new ScoreTerm(new Term(startTerm.field(), BytesRef.deepCopyOf(possibleMatch)), score, startTerm);
              variantsQ.insertWithOverflow(st);
              minScore = variantsQ.top().score; // maintain minScore
            }
            maxBoostAtt.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY);
          }

          if (numVariants > 0) {
            int avgDf = totalVariantDocFreqs / numVariants;
            if (df == 0)//no direct match we can use as df for all variants
            {
              df = avgDf; //use avg df of all variants
            }

            // take the top variants (scored by edit distance) and reset the score
            // to include an IDF factor then add to the global queue for ranking
            // overall top query terms
            int size = variantsQ.size();
            for (int i = 0; i < size; i++) {
              ScoreTerm st = variantsQ.pop();
              st.score = (st.score * st.score) * sim.idf(df, corpusNumDocs);
              q.insertWithOverflow(st);
            }
          }
        }
      }
      ts.end();
    }
  }
}
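
rewrite() is not called by user code; Lucene invokes it when the query is executed, at which point FuzzyLikeThisQuery is rewritten into a BooleanQuery over the top-scoring variant terms. A minimal sketch, assuming a Directory named directory that already contains the indexed documents and the query object built above:

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TopDocs;

// executing the search triggers query.rewrite(reader) internally
try (DirectoryReader reader = DirectoryReader.open(directory)) {
    IndexSearcher searcher = new IndexSearcher(reader);
    TopDocs hits = searcher.search(query, 10);
    System.out.println("matching docs: " + hits.totalHits);
}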

