SpanQuery(跨度查询)根据词在文档中的位置以及词与词之间的距离进行查询,例如查询几个相邻的词。
SpanQuery包括以下几种:
SpanTermQuery:词距查询的基础,结果和TermQuery相似,只不过是增加了查询结果中单词的距离信息。
SpanFirstQuery:从字段内容的起始位置开始,在指定的距离内查找指定词的查询。
SpanNearQuery:查询的几个词句之间保持着一定的距离。
SpanOrQuery:同时执行几个词距查询,并把它们的结果合并起来。
SpanNotQuery:从一个词距查询结果中,去除一个词距查询。
下面用一个简单的例子来介绍:
package com; //SpanQuery:跨度查询。此类为抽象类。 import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; import java.util.List; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.WhitespaceAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Index; import org.apache.lucene.document.Field.Store; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.search.Hits; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.spans.SpanFirstQuery; import org.apache.lucene.search.spans.SpanNearQuery; import org.apache.lucene.search.spans.SpanNotQuery; import org.apache.lucene.search.spans.SpanOrQuery; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.search.spans.Spans; import org.apache.lucene.store.RAMDirectory; public class SpanQueryTest { private RAMDirectory directory; private IndexSearcher indexSearcher; private IndexReader reader; private SpanTermQuery quick; private SpanTermQuery brown; private SpanTermQuery red; private SpanTermQuery fox; private SpanTermQuery lazy; private SpanTermQuery sleepy; private SpanTermQuery dog; private SpanTermQuery cat; private Analyzer analyzer; // 索引及初使化 public void index() throws IOException { directory = new RAMDirectory(); analyzer = new WhitespaceAnalyzer(); IndexWriter writer = new IndexWriter(directory, analyzer, true); Document doc1 = new Document(); doc1.add(new Field("field", "the quick brown fox jumps over the lazy dog", Store.YES, Index.TOKENIZED)); Document doc2 = new Document(); doc2.add(new Field("field", "the quick red fox jumps over the sleepy cat", Store.YES, Index.TOKENIZED)); writer.addDocument(doc1); 
writer.addDocument(doc2); writer.optimize(); writer.close(); quick = new SpanTermQuery(new Term("field", "quick")); brown = new SpanTermQuery(new Term("field", "brown")); red = new SpanTermQuery(new Term("field", "red")); fox = new SpanTermQuery(new Term("field", "fox")); lazy = new SpanTermQuery(new Term("field", "lazy")); sleepy = new SpanTermQuery(new Term("field", "sleepy")); dog = new SpanTermQuery(new Term("field", "dog")); cat = new SpanTermQuery(new Term("field", "cat")); indexSearcher = new IndexSearcher(directory); reader = IndexReader.open(directory); } private void dumpSpans(SpanQuery query) throws IOException { // 检索效果和TermQuery一样,可以把他当成TermQuery Hits hits = indexSearcher.search(query); for (int i = 0; i < hits.length(); i++) { // System.out.println(hits.doc(i).get("field")); } // 但内部会记录一些位置信息,供SpanQuery的其它API使用,是其它属于SpanQuery的Query的基础。 Spans spans = query.getSpans(reader); int numSpans = 0; float[] scores = new float[2]; for (int i = 0; i < hits.length(); i++) { scores[hits.id(i)] = hits.score(i); } while (spans.next()) { numSpans++; int id = spans.doc(); Document doc = reader.document(id); Token[] tokens = AnalyzerUtils.tokensFromAnalysis(analyzer, doc .get("field")); StringBuffer buffer = new StringBuffer(); for (int i = 0; i < tokens.length; i++) { // the quick brown fox jumps over the lazy dog // spans记录了位置信息,比如搜索brown,brown在这句话中位于第三个位置,所以spans.start()=2,spans.end()=3 // 在第二项的位置后加<,第三项后加> 返回<brown> if (i == spans.start()) { buffer.append("<"); } buffer.append(tokens[i].termText()); if (i + 1 == spans.end()) { buffer.append(">"); } buffer.append(" "); } buffer.append("(" + scores[id] + ") "); System.out.println(buffer); } // indexSearcher.close(); } // SpanTermQuery:检索效果完全同TermQuery,但内部会记录一些位置信息,供SpanQuery的其它API使用,是其它属于SpanQuery的Query的基础。 public void spanTermQueryTest() throws IOException { dumpSpans(brown); //// 搜索结果 // the quick <brown> fox jumps over the lazy dog (0.22097087) } // SpanFirstQuery:查找方式为从Field的内容起始位置开始,在一个固定的宽度内查找所指定的词条。 public 
void spanFirstQueryTest() throws IOException { // the quick brown fox jumps over the lazy dog // 在给定的范围搜索,前两个为the quick // brown 在doc1的第三个位置,用SpanFirstQuery从起点查找的话,他的跨度必须为>=3才能找到 SpanFirstQuery firstQuery = new SpanFirstQuery(brown, 3); dumpSpans(firstQuery); ////搜索结果 // the quick <brown> fox jumps over the lazy dog (0.22097087) } // SpanNearQuery:功能类似PharaseQuery。SpanNearQuery查找所匹配的不一定是短语,还有可能是另一个SpanQuery的查询结果作为整体考虑,进行嵌套查询。 public void spanNearQueryTest() throws IOException { // the quick brown fox jumps over the lazy dog // 第二个参数为两个项的位置之间允许的最大间隔 // 在这里两个较远的项为quick和fox,他们之是的最大间隔为5,所以slop必须>=5才能搜到结果 SpanNearQuery nearQuery = new SpanNearQuery(new SpanQuery[] { quick, brown, fox }, 5, true); dumpSpans(nearQuery); // 与PhraseQuery短语搜索相似 // 这里搜索quick,dog,brown,要想得到结果,就要将brown向后移动5个位置才能到dog的后面,所以slop要>=5才能找到结果 // 第三个参数,如果为true表示保持各项位置不变,顺序搜索 nearQuery = new SpanNearQuery(new SpanQuery[] { quick, dog, brown }, 5, false); dumpSpans(nearQuery); //////搜索结果///// // 第一个dumpSpans的结果 the <quick brown fox> jumps over the lazy dog (0.34204215) // 第二个dumpSpans的结果 the <quick brown fox jumps over the lazy dog> (0.27026406) } // 从第一个SpanQuery查询结果中,去掉第二个SpanQuery查询结果,作为检索结果 public void spanNotQueryTest() throws IOException { // the quick brown fox jumps over the lazy dog SpanNearQuery quick_fox = new SpanNearQuery(new SpanQuery[] { quick, fox }, 1, true); // 结果为quick brown fox 和 quick red fox dumpSpans(quick_fox); // SpanNotQuery quick_fox_dog = new SpanNotQuery(quick_fox, dog); // // dumpSpans(quick_fox_dog); // 在quick_fox结果中,去掉red,结果为quick brown fox SpanNotQuery no_quick_red_fox = new SpanNotQuery(quick_fox, red); dumpSpans(no_quick_red_fox); //////搜索结果///////第一个dumpSpans结果为前两条,第二个dumpSpans结果为第三条 //the <quick brown fox> jumps over the lazy dog (0.18579213) //the <quick red fox> jumps over the sleepy cat (0.18579213) //the <quick brown fox> jumps over the lazy dog (0.18579213) } // SpanOrQuery:把所有SpanQuery查询结果综合起来,作为检索结果。 public void spanOrQueryTest() throws IOException { 
SpanNearQuery quick_fox = new SpanNearQuery(new SpanQuery[] { quick, fox }, 1, true); SpanNearQuery lazy_dog = new SpanNearQuery( new SpanQuery[] { lazy, dog }, 0, true); SpanNearQuery sleepy_cat = new SpanNearQuery(new SpanQuery[] { sleepy, cat }, 0, true); SpanNearQuery qf_near_ld = new SpanNearQuery(new SpanQuery[] { quick_fox, lazy_dog }, 3, true); dumpSpans(qf_near_ld); SpanNearQuery qf_near_sc = new SpanNearQuery(new SpanQuery[] { quick_fox, sleepy_cat }, 3, true); dumpSpans(qf_near_sc); SpanOrQuery or = new SpanOrQuery(new SpanQuery[] { qf_near_ld, qf_near_sc }); dumpSpans(or); /////////搜索结果 第一个dumpSpans结果为第一条,第二个为第二条,第三个为第三,四条 // the <quick brown fox jumps over the lazy dog> (0.3321948) // the <quick red fox jumps over the sleepy cat> (0.3321948) // the <quick brown fox jumps over the lazy dog> (0.5405281) // the <quick red fox jumps over the sleepy cat> (0.5405281) } public static void main(String[] args) throws IOException { SpanQueryTest test = new SpanQueryTest(); test.index(); test.spanOrQueryTest(); } } class AnalyzerUtils { public static Token[] tokensFromAnalysis(Analyzer analyzer, String text) throws IOException { TokenStream stream = analyzer.tokenStream("contents", new StringReader( text)); boolean b = true; List<Token> list = new ArrayList<Token>(); while (b) { Token token = stream.next(); if (token == null) b = false; else list.add(token); } return (Token[]) list.toArray(new Token[0]); } }