Some fields in an Elasticsearch query must themselves be run through an Analyzer, for example in the fuzzy_like_this, fuzzy_like_this_field, more_like_this, more_like_this_field, and multi_match queries. In these queries the query text contains multiple terms; in the example below we want to find documents matching "crime punishment". We cannot look "crime punishment" up in the index as-is; it first has to be analyzed by an Analyzer.
{
"fuzzy_like_this" :
{
"fields": "title",
"like_text": "crime punishment",
"analyzer": "simple" #定义分析器,用于分析like_text字段
}
}
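Before moving into the source, it helps to see what the "simple" analyzer actually does to the like_text value. The sketch below is a minimal standalone demo (assuming Lucene 4.x, the version this source comes from; the class name AnalyzeLikeText is made up for illustration): it feeds "crime punishment" through Lucene's SimpleAnalyzer, which is what Elasticsearch's "simple" analyzer wraps, and prints the resulting terms.
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class AnalyzeLikeText {
    public static void main(String[] args) throws Exception {
        // SimpleAnalyzer splits on non-letters and lower-cases each token
        Analyzer analyzer = new SimpleAnalyzer(Version.LUCENE_46);
        try (TokenStream ts = analyzer.tokenStream("title", "crime punishment")) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString()); // prints "crime", then "punishment"
            }
            ts.end();
        }
    }
}
These two terms, not the raw string, are what ends up being looked up in the index.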
'''(1) Elasticsearch: obtaining the analyzer while parsing the query'''
public class FuzzyLikeThisQueryParser implements QueryParser {
public static final String NAME = "flt";
@Override
public Query parse(QueryParseContext parseContext) throws IOException, QueryParsingException {
XContentParser parser = parseContext.parser();
int maxNumTerms = 25;
float boost = 1.0f;
List<String> fields = null;
String likeText = null;
float minSimilarity = 0.5f;
int prefixLength = 0;
boolean ignoreTF = false;
'''The analyzer'''
Analyzer analyzer = null;
boolean failOnUnsupportedField = true;
XContentParser.Token token;
String currentFieldName = null;
while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
if (token == XContentParser.Token.FIELD_NAME) {
currentFieldName = parser.currentName();
} else if (token.isValue()) {
if ("like_text".equals(currentFieldName) || "likeText".equals(currentFieldName)) {
likeText = parser.text();
} else if ("max_query_terms".equals(currentFieldName) || "maxQueryTerms".equals(currentFieldName)) {
maxNumTerms = parser.intValue();
} else if ("boost".equals(currentFieldName)) {
boost = parser.floatValue();
} else if ("ignore_tf".equals(currentFieldName) || "ignoreTF".equals(currentFieldName)) {
ignoreTF = parser.booleanValue();
} else if ("min_similarity".equals(currentFieldName) || "minSimilarity".equals(currentFieldName)) {
minSimilarity = parser.floatValue();
} else if ("prefix_length".equals(currentFieldName) || "prefixLength".equals(currentFieldName)) {
prefixLength = parser.intValue();
} else if ("analyzer".equals(currentFieldName)) {
'''Look up the analyzer instance by the value of the analyzer field in the query
(how analyzer names are registered against analyzer instances is covered in another article)'''
analyzer = parseContext.analysisService().analyzer(parser.text());
} else if ("fail_on_unsupported_field".equals(currentFieldName) || "failOnUnsupportedField".equals(currentFieldName)) {
failOnUnsupportedField = parser.booleanValue();
} else {
throw new QueryParsingException(parseContext.index(), "[flt] query does not support [" + currentFieldName + "]");
}
} else if (token == XContentParser.Token.START_ARRAY) {
if ("fields".equals(currentFieldName)) {
fields = Lists.newLinkedList();
while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) {
fields.add(parseContext.indexName(parser.text()));
}
} else {
throw new QueryParsingException(parseContext.index(), "[flt] query does not support [" + currentFieldName + "]");
}
}
}
if (likeText == null) {
throw new QueryParsingException(parseContext.index(), "fuzzy_like_this requires 'like_text' to be specified");
}
'''If the query does not specify an analyzer, fall back to the default search analyzer'''
if (analyzer == null) {
analyzer = parseContext.mapperService().searchAnalyzer();
}
'''Hand the analyzer to Lucene's FuzzyLikeThisQuery object'''
FuzzyLikeThisQuery query = new FuzzyLikeThisQuery(maxNumTerms, analyzer);
if (fields == null) {
fields = Lists.newArrayList(parseContext.defaultField());
} else if (fields.isEmpty()) {
throw new QueryParsingException(parseContext.index(), "fuzzy_like_this requires 'fields' to be non-empty");
}
for (Iterator<String> it = fields.iterator(); it.hasNext(); ) {
final String fieldName = it.next();
if (!Analysis.generatesCharacterTokenStream(analyzer, fieldName)) {
if (failOnUnsupportedField) {
throw new ElasticSearchIllegalArgumentException("fuzzy_like_this doesn't support binary/numeric fields: [" + fieldName + "]");
} else {
it.remove();
}
}
}
if (fields.isEmpty()) {
return null;
}
for (String field : fields) {
query.addTerms(likeText, field, minSimilarity, prefixLength);
}
query.setBoost(boost);
query.setIgnoreTF(ignoreTF);
return query;
}
}
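One detail worth calling out: the Analysis.generatesCharacterTokenStream() check above rejects fields whose analyzer does not emit character terms, because fuzzy expansion only makes sense over textual tokens. The real helper lives in Elasticsearch's org.elasticsearch.index.analysis.Analysis class; the fragment below is only a plausible reduction of the idea (an assumption, not the verbatim source): numeric and binary fields produce token streams that lack a CharTermAttribute, so probing for that attribute is enough.
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

final class AnalysisSketch {
    // A field qualifies for [flt] only if analyzing it yields character terms,
    // i.e. the token stream carries a CharTermAttribute (numeric streams do not)
    static boolean producesCharacterTokens(Analyzer analyzer, String fieldName) throws IOException {
        try (TokenStream ts = analyzer.tokenStream(fieldName, "")) {
            return ts.hasAttribute(CharTermAttribute.class);
        }
    }
}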
'''(2) Lucene: FuzzyLikeThisQuery.rewrite() calls the analyzer to analyze the like_text value'''
public class FuzzyLikeThisQuery extends Query{
public Query rewrite(IndexReader reader) throws IOException
{
if(rewrittenQuery!=null)
{
return rewrittenQuery;
}
//load up the list of possible terms
for (Iterator<FieldVals> iter = fieldVals.iterator(); iter.hasNext(); ) {
FieldVals f = iter.next();
'''The analyzer is invoked inside this method'''
addTerms(reader, f);
}
//clear the list of fields
fieldVals.clear();
'''... remainder omitted ...'''
}
private void addTerms(IndexReader reader, FieldVals f) throws IOException {
if (f.queryString == null) return;
final Terms terms = MultiFields.getTerms(reader, f.fieldName);
if (terms == null) {
return;
}
'''analyzer is the analyzer Elasticsearch resolved from the query;
it splits "crime punishment" into "crime" and "punishment"'''
try (TokenStream ts = analyzer.tokenStream(f.fieldName, f.queryString)) {
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
int corpusNumDocs = reader.numDocs();
HashSet<String> processedTerms = new HashSet<>();
ts.reset();
while (ts.incrementToken()) {
String term = termAtt.toString();
if (!processedTerms.contains(term)) {
processedTerms.add(term);
ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
float minScore = 0;
Term startTerm = new Term(f.fieldName, term);
AttributeSource atts = new AttributeSource();
MaxNonCompetitiveBoostAttribute maxBoostAtt =
atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
SlowFuzzyTermsEnum fe = new SlowFuzzyTermsEnum(terms, atts, startTerm, f.minSimilarity, f.prefixLength);
//store the df so all variants use same idf
int df = reader.docFreq(startTerm);
int numVariants = 0;
int totalVariantDocFreqs = 0;
BytesRef possibleMatch;
BoostAttribute boostAtt =
fe.attributes().addAttribute(BoostAttribute.class);
while ((possibleMatch = fe.next()) != null) {
numVariants++;
totalVariantDocFreqs += fe.docFreq();
float score = boostAtt.getBoost();
if (variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore) {
ScoreTerm st = new ScoreTerm(new Term(startTerm.field(), BytesRef.deepCopyOf(possibleMatch)), score, startTerm);
variantsQ.insertWithOverflow(st);
minScore = variantsQ.top().score; // maintain minScore
}
maxBoostAtt.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY);
}
if (numVariants > 0) {
int avgDf = totalVariantDocFreqs / numVariants;
if (df == 0)//no direct match we can use as df for all variants
{
df = avgDf; //use avg df of all variants
}
// take the top variants (scored by edit distance) and reset the score
// to include an IDF factor then add to the global queue for ranking
// overall top query terms
int size = variantsQ.size();
for (int i = 0; i < size; i++) {
ScoreTerm st = variantsQ.pop();
st.score = (st.score * st.score) * sim.idf(df, corpusNumDocs);
q.insertWithOverflow(st);
}
}
}
}
ts.end();
}
}
}
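To close the loop, here is a self-contained sketch of the whole pipeline (assuming Lucene 4.x with the lucene-sandbox module, where FuzzyLikeThisQuery lives; the indexed text and the misspelling "punishmant" are made up for illustration). One document is indexed, and the query still finds it despite the typo, because rewrite() expands each analyzed term into its fuzzy variants before searching.
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.sandbox.queries.FuzzyLikeThisQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class FuzzyLikeThisDemo {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new SimpleAnalyzer(Version.LUCENE_46);
        Directory dir = new RAMDirectory();
        try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_46, analyzer))) {
            Document doc = new Document();
            doc.add(new TextField("title", "crime and punishment", Field.Store.YES));
            writer.addDocument(doc);
        }
        try (DirectoryReader reader = DirectoryReader.open(dir)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            // maxNumTerms = 25 matches the parser's default above
            FuzzyLikeThisQuery query = new FuzzyLikeThisQuery(25, analyzer);
            // like_text with a typo; minSimilarity = 0.5 and prefixLength = 0 are the parser's defaults
            query.addTerms("crime punishmant", "title", 0.5f, 0);
            TopDocs hits = searcher.search(query, 10);
            System.out.println("hits: " + hits.totalHits); // expect 1: the fuzzy variant matches
        }
    }
}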