Lucene支持term查询(TermQuery)、布尔查询(BooleanQuery)、短语查询(PhraseQuery)、范围查询(RangeQuery)、前缀查询(PrefixQuery)、模糊查询(FuzzyQuery)等。Lucene的布尔查询又包括求交查询、求并查询和求差查询。在此以求交查询为例,说明Lucene的句子查询。
测试程序说明:
使用Lucene的求交的布尔查询。为支持中文分词,使用JE的MMAnalyzer。使用文档1,文档2,query1,query2,query3,query4进行测试:
核心代码及测试结果:
String[] searchWords = {"金融时报欧元区问题","美国华尔街 评论美元" ,"分析欧元"}; IndexSearcher indexSearcher = new IndexSearcher(dir); Analyzer analyzer = new MMAnalyzer(); QueryParser qp = new QueryParser("contents", analyzer); // queryParser默认是求并搜索,此处设置为求交搜索 qp.setDefaultOperator(QueryParser.AND_OPERATOR); for(int i=0; i<searchWords.length; i++){ query = qp.parse(searchWords[i]); //打印parse结果: System.out.println(query.toString()); Hits results = indexSearcher.search(query); }
Ø 文档1:美国华尔街日报评论文章指出,这次六国央行联手放低美元换汇利率,是起源于今年9月份,联储官员与华人街对冲基金高管们闭门会议后的延续动作。http://url.cn/1W7NrE
查询分析 1:
Query1:“美国华尔街评论文章”
Parse结果:contents:"美国 华尔街 评论 文章"
查询结果:无结果
结论:取词“美国华尔街评论文章”的拉链为空
查询分析2:
Query2:“美国华尔街 评论美元”
Parse结果:+contents:"美国 华尔街" +contents:"评论 美元"
查询结果:无结果
结论:取词“美国华尔街”的拉链不为空,取词“评论美元”的拉链为空
Ø 文档2:金融 路透社分析指出,在全球经济都处于脆弱的状态下,如果12月九日的高峰会仍然是因政治博弈而没实质性消息出台的话,欧元存活会受到极大威胁,全球股市会大幅震荡。
查询分析1:
Query3:“分析 欧元”
Parse结果:+contents:分析 +contents:欧元
查询结果:有结果
结论:取词“分析”和“欧元”的拉链都不为空
查询分析2:
Query4:“分析欧元”
Parse结果:contents:"分析 欧元"
查询结果:无结果
结论:取词“分析欧元”的拉链为空
测试结论:
在布尔查询的求交查询中,QueryParser的parse方法将query中以空格分隔的词作为基本的term,而不是一些基本term的组合。例如上例中Parse结果:contents:"美国 华尔街 评论 文章",其表明contents后的内容被当作一个整体(即一个短语查询PhraseQuery)来匹配,而不是4个独立的term。
改进方法:
对原始查询重新构造新的查询,将query=“美国华尔街评论文章”,先构造为“美国 华尔街 评论 文章”,再去查询,以下的代码详细地说明了此构造过程:
package search;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;

import jeasy.analysis.MMAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 * Demonstrates an AND (intersection) boolean search over a Lucene index.
 *
 * <p>Each raw query is first rewritten into a space-separated list of terms,
 * so that the query parser produces one required clause per term instead of
 * a single quoted phrase. The rewritten query is then searched and the score
 * explanation of every hit is printed.
 */
public class Boolean_search {

    /** Name of the indexed field that is searched and used when analyzing query text. */
    private static final String FIELD_NAME = "contents";

    /**
     * Runs the demo against the index stored under {@code D://lucene/index}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            // Locate the index directory and build the searcher on top of it.
            File indexDir = new File("D://lucene/index");
            if (!indexDir.exists()) {
                System.out.println("The Lucene index is not exist");
                return;
            }
            Directory dir = FSDirectory.getDirectory(indexDir, false);
            IndexSearcher indexSearcher = new IndexSearcher(dir);
            try {
                // Build the analyzer (word segmenter) and the query parser.
                Analyzer analyzer = new MMAnalyzer();
                QueryParser qp = new QueryParser(FIELD_NAME, analyzer);
                // QueryParser combines clauses with OR by default; switch it to AND here.
                qp.setDefaultOperator(QueryParser.AND_OPERATOR);

                // Raw queries to run.
                String[] searchWords = {"金融时报欧元区问题", "美国华尔街 评论美元", "分析,欧元"};

                for (int i = 0; i < searchWords.length; i++) {
                    // Rewrite the raw query into space-separated terms.
                    // Approach 1: segment the text with the analyzer.
                    String queryWords = getTermsByAnalyzer(analyzer, searchWords[i]);
                    // Approach 2: let the query parser do the segmentation instead.
                    // String queryWords = getTermsByQueryParser(qp, searchWords[i]);

                    // Run the AND search with the rewritten query.
                    Query query = qp.parse(queryWords);
                    Hits results = indexSearcher.search(query);
                    System.out.println(results.length() + " search results for query " + searchWords[i]);
                    System.out.println("query is parsed as:" + queryWords);

                    // Print the detailed score explanation of every hit.
                    for (int k = 0; k < results.length(); k++) {
                        String explain = indexSearcher.explain(query, results.id(k)).toString();
                        System.out.println(explain);
                    }
                }
            } finally {
                // Release the searcher even when a query fails.
                indexSearcher.close();
            }
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Rewrites a raw query into space-separated terms using the analyzer.
     *
     * <p>Every token produced by the analyzer is also printed on its own line.
     *
     * @param analyzer analyzer used to segment the text
     * @param s raw query text
     * @return the tokens in order, each followed by a single space
     * @throws Exception if reading or closing the token stream fails
     */
    public static String getTermsByAnalyzer(Analyzer analyzer, String s) throws Exception {
        StringBuilder queryWord = new StringBuilder();
        // The first argument of tokenStream is the field name, not the text to analyze.
        TokenStream ts = analyzer.tokenStream(FIELD_NAME, new StringReader(s));
        try {
            // Show the segmentation result.
            System.out.println("分词器分词结果如下:");
            for (Token t = ts.next(); t != null; t = ts.next()) {
                queryWord.append(t.termText()).append(' ');
                System.out.println(t.termText());
            }
        } finally {
            ts.close();
        }
        return queryWord.toString();
    }

    /**
     * Rewrites a raw query into space-separated terms using the query parser.
     *
     * <p>The raw query is split on spaces, each fragment is parsed on its own,
     * and the field prefix and phrase quotes are stripped from the printed
     * form of the parsed query (for example {@code contents:"分析 欧元"}
     * becomes {@code 分析 欧元}).
     *
     * @param qp query parser used to segment each fragment
     * @param s raw query text
     * @return the extracted terms, each fragment's terms followed by a single space
     */
    public static String getTermsByQueryParser(QueryParser qp, String s) {
        StringBuilder queryWord = new StringBuilder();
        // Derive the prefix from the parser instead of assuming "contents:".
        String prefix = qp.getField() + ":";

        // 1. Split the raw query on spaces to get the fragments.
        String[] fragments = s.split(" ");
        for (int j = 0; j < fragments.length; j++) {
            String fragment = fragments[j];
            if (fragment.length() == 0) {
                // Consecutive spaces yield empty fragments; there is nothing to parse.
                continue;
            }

            // 2. Let the parser segment the fragment.
            Query parsed;
            try {
                parsed = qp.parse(fragment);
            } catch (ParseException e) {
                // Report and skip this fragment rather than dereferencing a missing query.
                e.printStackTrace();
                continue;
            }
            if (parsed == null) {
                continue;
            }

            // 3. Turn the printed query into plain space-separated terms.
            String terms = stripFieldSyntax(parsed.toString(), prefix, fragment);
            if (terms.length() > 0) {
                queryWord.append(terms).append(' ');
            }
        }
        return queryWord.toString();
    }

    /**
     * Removes the field prefix and any surrounding phrase quotes from a printed query.
     *
     * @param printed result of {@code Query.toString()} for one fragment
     * @param prefix field prefix including the colon
     * @param fragment original fragment, used as a fallback
     * @return the bare terms; an empty string when {@code printed} is empty; the
     *         original fragment when {@code printed} has an unexpected shape
     */
    private static String stripFieldSyntax(String printed, String prefix, String fragment) {
        if (printed.length() == 0) {
            return "";
        }
        if (!printed.startsWith(prefix)) {
            // Unknown shape: keep what the user typed so no search words are lost.
            return fragment;
        }
        String body = printed.substring(prefix.length());
        boolean quoted = body.length() >= 2
                && body.charAt(0) == '"'
                && body.charAt(body.length() - 1) == '"';
        return quoted ? body.substring(1, body.length() - 1) : body;
    }
}
分词器分词结果如下:
金融
时报
欧元区
问题
1 search results for query 金融时报欧元区问题
query is parsed as:金融 时报 欧元区 问题
0.8425362 = (MATCH) sum of:
0.22584343 = (MATCH) weight(contents:金融 in 5), product of:
0.51773727 = queryWeight(contents:金融), product of:
2.7917595 = idf(docFreq=1, numDocs=12)
0.18545197 = queryNorm
0.43621242 = (MATCH) fieldWeight(contents:金融 in 5), product of:
1.0 = tf(termFreq(contents:金融)=1)
2.7917595 = idf(docFreq=1, numDocs=12)
0.15625 = fieldNorm(field=contents, doc=5)
0.22584343 = (MATCH) weight(contents:时报 in 5), product of:
0.51773727 = queryWeight(contents:时报), product of:
2.7917595 = idf(docFreq=1, numDocs=12)
0.18545197 = queryNorm
0.43621242 = (MATCH) fieldWeight(contents:时报 in 5), product of:
1.0 = tf(termFreq(contents:时报)=1)
2.7917595 = idf(docFreq=1, numDocs=12)
0.15625 = fieldNorm(field=contents, doc=5)
0.16500592 = (MATCH) weight(contents:欧元区 in 5), product of:
0.442543 = queryWeight(contents:欧元区), product of:
2.3862944 = idf(docFreq=2, numDocs=12)
0.18545197 = queryNorm
0.3728585 = (MATCH) fieldWeight(contents:欧元区 in 5), product of:
1.0 = tf(termFreq(contents:欧元区)=1)
2.3862944 = idf(docFreq=2, numDocs=12)
0.15625 = fieldNorm(field=contents, doc=5)
0.22584343 = (MATCH) weight(contents:问题 in 5), product of:
0.51773727 = queryWeight(contents:问题), product of:
2.7917595 = idf(docFreq=1, numDocs=12)
0.18545197 = queryNorm
0.43621242 = (MATCH) fieldWeight(contents:问题 in 5), product of:
1.0 = tf(termFreq(contents:问题)=1)
2.7917595 = idf(docFreq=1, numDocs=12)
0.15625 = fieldNorm(field=contents, doc=5)
分词器分词结果如下:
美国
华尔街
评论
美元
1 search results for query 美国华尔街 评论美元
query is parsed as:美国 华尔街 评论 美元
0.68775237 = (MATCH) sum of:
0.12486125 = (MATCH) weight(contents:美国 in 11), product of:
0.42608652 = queryWeight(contents:美国), product of:
1.8754687 = idf(docFreq=4, numDocs=12)
0.22718935 = queryNorm
0.293042 = (MATCH) fieldWeight(contents:美国 in 11), product of:
1.0 = tf(termFreq(contents:美国)=1)
1.8754687 = idf(docFreq=4, numDocs=12)
0.15625 = fieldNorm(field=contents, doc=11)
0.20214175 = (MATCH) weight(contents:华尔街 in 11), product of:
0.54214066 = queryWeight(contents:华尔街), product of:
2.3862944 = idf(docFreq=2, numDocs=12)
0.22718935 = queryNorm
0.3728585 = (MATCH) fieldWeight(contents:华尔街 in 11), product of:
1.0 = tf(termFreq(contents:华尔街)=1)
2.3862944 = idf(docFreq=2, numDocs=12)
0.15625 = fieldNorm(field=contents, doc=11)
0.27667123 = (MATCH) weight(contents:评论 in 11), product of:
0.63425803 = queryWeight(contents:评论), product of:
2.7917595 = idf(docFreq=1, numDocs=12)
0.22718935 = queryNorm
0.43621242 = (MATCH) fieldWeight(contents:评论 in 11), product of:
1.0 = tf(termFreq(contents:评论)=1)
2.7917595 = idf(docFreq=1, numDocs=12)
0.15625 = fieldNorm(field=contents, doc=11)
0.08407816 = (MATCH) weight(contents:美元 in 11), product of:
0.3496436 = queryWeight(contents:美元), product of:
1.5389965 = idf(docFreq=6, numDocs=12)
0.22718935 = queryNorm
0.2404682 = (MATCH) fieldWeight(contents:美元 in 11), product of:
1.0 = tf(termFreq(contents:美元)=1)
1.5389965 = idf(docFreq=6, numDocs=12)
0.15625 = fieldNorm(field=contents, doc=11)
分词器分词结果如下:
分析
欧元
1 search results for query 分析,欧元
query is parsed as:分析 欧元
0.57385075 = (MATCH) sum of:
0.24226412 = (MATCH) weight(contents:分析 in 0), product of:
0.64974815 = queryWeight(contents:分析), product of:
2.3862944 = idf(docFreq=2, numDocs=12)
0.27228332 = queryNorm
0.3728585 = (MATCH) fieldWeight(contents:分析 in 0), product of:
1.0 = tf(termFreq(contents:分析)=1)
2.3862944 = idf(docFreq=2, numDocs=12)
0.15625 = fieldNorm(field=contents, doc=0)
0.33158666 = (MATCH) weight(contents:欧元 in 0), product of:
0.76014954 = queryWeight(contents:欧元), product of:
2.7917595 = idf(docFreq=1, numDocs=12)
0.27228332 = queryNorm
0.43621242 = (MATCH) fieldWeight(contents:欧元 in 0), product of:
1.0 = tf(termFreq(contents:欧元)=1)
2.7917595 = idf(docFreq=1, numDocs=12)
0.15625 = fieldNorm(field=contents, doc=0)