Solr获得高亮词(Highlighter/Term)的position及offset信息

package org.scbit.lsbi.solr.highlighting;

import org.apache.lucene.search.Query;
import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter;
import org.apache.lucene.search.vectorhighlight.FieldPhraseList;
import org.apache.lucene.search.vectorhighlight.FieldQuery;
import org.apache.lucene.search.vectorhighlight.FieldTermStack;
import org.apache.solr.common.params.HighlightParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.PluginInfo;
import org.apache.solr.highlight.SolrHighlighter;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocList;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.plugin.PluginInfoInitialized;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Set;

/**
 * A {@link SolrHighlighter} that reports, for every matched term, its token position and
 * its start/end character offsets instead of producing highlighted text snippets.
 *
 * <p>The response has the shape
 * {@code printableUniqueKey -> field -> "terms" -> termText -> {position, offsets:[start, end]}}.
 * A term that matches several times in one field is added several times under the same name.
 *
 * <p>NOTE(review): {@link FieldTermStack} is part of the vector highlighter, so the highlighted
 * fields presumably need term vectors with positions and offsets enabled — confirm against the
 * schema in use.
 */
public class PositionsSolrHighlighter extends SolrHighlighter implements PluginInfoInitialized {

    /**
     * Plugin initialisation hook; this highlighter reads nothing from {@code info}.
     *
     * @param info plugin configuration (ignored)
     */
    @Override
    public void init(PluginInfo info) {
    }

    /**
     * Collects position and offset information for the terms of {@code query} that match in
     * each requested highlight field of each document in {@code docs}.
     *
     * @param docs          documents to inspect
     * @param query         query whose matching terms are reported
     * @param req           current request; supplies the parameters and the searcher
     * @param defaultFields fields used when the request names no highlight fields
     * @return one entry per document, named by its printable unique key (the name is
     *         {@code null} when no key can be printed), or {@code null} when highlighting
     *         is not enabled for this request
     * @throws IOException if reading from the index fails
     */
    @Override
    public NamedList<Object> doHighlighting(DocList docs, Query query, SolrQueryRequest req, String[] defaultFields) throws IOException {
        SolrParams params = req.getParams();

        // if highlighting isn't enabled, then why call doHighlighting?
        if (!isHighlightingEnabled(params)) {
            return null;
        }

        FastVectorHighlighter fvh = new FastVectorHighlighter(
                // FVH cannot process hl.usePhraseHighlighter parameter per-field basis
                params.getBool(HighlightParams.USE_PHRASE_HIGHLIGHTER, true),
                // FVH cannot process hl.requireFieldMatch parameter per-field basis
                params.getBool(HighlightParams.FIELD_MATCH, false));
        fvh.setPhraseLimit(params.getInt(HighlightParams.PHRASE_LIMIT, Integer.MAX_VALUE));

        SolrIndexSearcher searcher = req.getSearcher();
        IndexSchema schema = searcher.getSchema();
        int[] docIDs = toDocIDs(docs);

        // query-time parameters
        String[] fieldNames = getHighlightFields(query, req, defaultFields);

        // Stored fields to load per document: the highlighted fields plus the unique key
        // (if the schema defines one), which is needed to label each result entry.
        Set<String> fset = new HashSet<>();
        for (String f : fieldNames) {
            fset.add(f);
        }
        SchemaField keyField = schema.getUniqueKeyField();
        if (null != keyField) {
            fset.add(keyField.getName());
        }

        // The field query depends only on the query and the reader, so build it once
        // rather than once per (document, field) pair.
        FieldQuery fq = fvh.getFieldQuery(query, searcher.getIndexReader());

        NamedList<Object> list = new SimpleOrderedMap<>();
        for (int docID : docIDs) {
            NamedList<Object> summary = new SimpleOrderedMap<>();
            for (String field : fieldNames) {
                FieldTermStack stack = new FieldTermStack(searcher.getIndexReader(), docID, field, fq);
                FieldPhraseList fpl = new FieldPhraseList(stack, fq);

                NamedList<Object> info = new SimpleOrderedMap<>();
                info.add("terms", collectTerms(fpl));
                summary.add(field, info);
            }

            String printId = schema.printableUniqueKey(searcher.doc(docID, fset));
            list.add(printId, summary);
        }

        return list;
    }

    /**
     * Flattens every term of every matched phrase in {@code fpl} into a named list keyed by
     * the term text, each value holding the term's {@code position} and {@code offsets}.
     */
    private NamedList<NamedList<Object>> collectTerms(FieldPhraseList fpl) {
        NamedList<NamedList<Object>> terms = new SimpleOrderedMap<>();
        for (FieldPhraseList.WeightedPhraseInfo wpi : fpl.getPhraseList()) {
            for (FieldTermStack.TermInfo ti : wpi.getTermsInfos()) {
                NamedList<Object> term = new SimpleOrderedMap<>();
                term.add("position", ti.getPosition());

                // offsets are emitted as a two-element [start, end] list
                ArrayList<Integer> ofst = new ArrayList<>(2);
                ofst.add(ti.getStartOffset());
                ofst.add(ti.getEndOffset());
                term.add("offsets", ofst);

                terms.add(ti.getText(), term);
            }
        }
        return terms;
    }

    /**
     * Copies the document ids of {@code docs} into an array, in iteration order.
     *
     * @param docs the document list to convert
     * @return an array of {@code docs.size()} document ids
     * @throws AssertionError if the iterator yields fewer or more ids than {@code docs.size()}
     */
    protected int[] toDocIDs(DocList docs) {
        int[] docIDs = new int[docs.size()];
        DocIterator iterator = docs.iterator();
        for (int i = 0; i < docIDs.length; i++) {
            if (!iterator.hasNext()) {
                throw new AssertionError("DocList iterator ended after " + i + " of " + docIDs.length + " docs");
            }
            docIDs[i] = iterator.nextDoc();
        }
        if (iterator.hasNext()) {
            throw new AssertionError("DocList iterator has more than " + docIDs.length + " docs");
        }
        return docIDs;
    }

}
  

代码来源参考:https://issues.apache.org/jira/browse/SOLR-4722

查询结果如下:

{
        "pmid":28705234,
        "pmcid":5513360,
        "title":"In silico characterization of cell-cell interactions using a cellular automata model of cell culture.",
        "author":"Takanori Kihara; Kosuke Kashitani; Jun Miyake; ",
        "articleAbstract":"Cell proliferation is a key characteristic of eukaryotic cells. During cell proliferation, cells interact with each other. In this study, we developed a cellular automata model to estimate cell-cell interactions using experimentally obtained images of cultured cells.\nWe used four types of cells; HeLa cells, human osteosarcoma (HOS) cells, rat mesenchymal stem cells (MSCs), and rat smooth muscle A7r5 cells. These cells were cultured and stained daily. The obtained cell images were binarized and clipped into squares containing about 10(4) cells. These cells showed characteristic cell proliferation patterns. The growth curves of these cells were generated from the cell proliferation images and we determined the doubling time of these cells from the growth curves. We developed a simple cellular automata system with an easily accessible graphical user interface. This system has five variable parameters, namely, initial cell number, doubling time, motility, cell-cell adhesion, and cell-cell contact inhibition (of proliferation). Within these parameters, we obtained initial cell numbers and doubling times experimentally. We set the motility at a constant value because the effect of the parameter for our simulation was restricted. Therefore, we simulated cell proliferation behavior with cell-cell adhesion and cell-cell contact inhibition as variables. By comparing growth curves and proliferation cell images, we succeeded in determining the cell-cell interaction properties of each cell. Simulated HeLa and HOS cells exhibited low cell-cell adhesion and weak cell-cell contact inhibition. Simulated MSCs exhibited high cell-cell adhesion and positive cell-cell contact inhibition. Simulated A7r5 cells exhibited low cell-cell adhesion and strong cell-cell contact inhibition. 
These simulated results correlated with the experimental growth curves and proliferation images.\nOur simulation approach is an easy method for evaluating the cell-cell interaction properties of cells.\n",
        "keyword":"Cell assay system; Cell proliferation; Cellular automata; Cell–cell adhesion; Cell–cell contact inhibition; ",
        "publishedYear":2017,
        "publishedMonth":7,
        "publishedDay":14,
        "elecPublishedDate":20170714,
        "year":2017,
        "volume":"10",
        "issue":"1",
        "page":"283",
        "hitNum":0,
        "journalTitle":"BMC Research Notes",
        "journalIsoAbbr":"BMC Res Notes",
        "journalMedlineTA":"BMC Res Notes",
        "journalIssnElec":"1756-0500",
        "publisherName":"BioMed Central",
        "doi":"10.1186/s13104-017-2613-x",
        "indexTime":1500865734000,
        "_version_":1609023004723904512}

得到的高亮部分(注意:同一个词多次命中时,例如下面的 "cell",会以重复键的形式出现在同一个 JSON 对象中;不少 JSON 解析器对重复键只保留最后一个,客户端解析时需要留意):

"highlighting":{
    "28705234":{
      "title":{
        "terms":{
          "cell":{
            "position":4,
            "offsets":[30,
              34]},
          "cell":{
            "position":6,
            "offsets":[35,
              39]},
          "cell":{
            "position":14,
            "offsets":[88,
              92]}}}},
……(其余文档的高亮结果省略)

 

你可能感兴趣的:(Solr,Highlight,offset,position)