Phrase Queries in Lucene Boolean Queries

Lucene supports term queries (TermQuery), boolean queries (BooleanQuery), phrase queries (PhraseQuery), range queries (RangeQuery), prefix queries (PrefixQuery), fuzzy queries (FuzzyQuery), and more. Lucene boolean queries in turn come in three flavors: conjunction (AND), disjunction (OR), and difference (NOT). This article uses the conjunctive (AND) query as an example to show how phrase queries behave inside Lucene boolean queries.
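
When a boolean query is built by hand rather than through QueryParser, the three flavors correspond to BooleanClause.Occur.MUST, SHOULD, and MUST_NOT. Below is a minimal sketch against the same Lucene 2.x-era API used in the rest of this article; the field name "contents" and the example terms are placeholders taken from the tests below.

// (imports assumed: org.apache.lucene.index.Term, org.apache.lucene.search.*)

// Conjunction (AND): every clause is required
BooleanQuery andQuery = new BooleanQuery();
andQuery.add(new TermQuery(new Term("contents", "分析")), BooleanClause.Occur.MUST);
andQuery.add(new TermQuery(new Term("contents", "欧元")), BooleanClause.Occur.MUST);

// Disjunction (OR): any single clause is enough to match
BooleanQuery orQuery = new BooleanQuery();
orQuery.add(new TermQuery(new Term("contents", "分析")), BooleanClause.Occur.SHOULD);
orQuery.add(new TermQuery(new Term("contents", "欧元")), BooleanClause.Occur.SHOULD);

// Difference (NOT): the first clause must match, the second must not
BooleanQuery notQuery = new BooleanQuery();
notQuery.add(new TermQuery(new Term("contents", "欧元")), BooleanClause.Occur.MUST);
notQuery.add(new TermQuery(new Term("contents", "美元")), BooleanClause.Occur.MUST_NOT);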


Test program overview

The test uses Lucene's conjunctive (AND) boolean query. To handle Chinese word segmentation, the JE MMAnalyzer is used. Documents 1 and 2 and queries Query 1 through Query 4 are used for testing:

Core code and test results:

String[] searchWords = {"金融时报欧元区问题", "美国华尔街 评论美元", "分析欧元"};

IndexSearcher indexSearcher = new IndexSearcher(dir);
Analyzer analyzer = new MMAnalyzer();
QueryParser qp = new QueryParser("contents", analyzer);
// QueryParser defaults to OR; set it to AND (conjunction) here
qp.setDefaultOperator(QueryParser.AND_OPERATOR);

Query query = null;
for (int i = 0; i < searchWords.length; i++) {
    query = qp.parse(searchWords[i]);
    // Print the parsed query:
    System.out.println(query.toString());
    Hits results = indexSearcher.search(query);
}

Document 1: 美国华尔街日报评论文章指出,这次六国央行联手放低美元换汇利率,是起源于今年9月份,联储官员与华人街对冲基金高管们闭门会议后的延续动作。http://url.cn/1W7NrE

Query analysis 1:

Query 1: "美国华尔街评论文章"

Parse result: contents:"美国 华尔街 评论 文章"

Search result: no hits

Conclusion: treated as the phrase "美国 华尔街 评论 文章", the query has an empty posting list; no document contains these four terms adjacently and in order.


Query analysis 2:

Query 2: "美国华尔街 评论美元"

Parse result: +contents:"美国 华尔街" +contents:"评论 美元"

Search result: no hits

Conclusion: the phrase "美国 华尔街" has matching postings, but the phrase "评论 美元" has none, so the conjunction returns nothing.


Document 2: 金融 路透社分析指出,在全球经济都处于脆弱的状态下,如果12月九日的高峰会仍然是因政治博弈而没实质性消息出台的话,欧元存活会受到极大威胁,全球股市会大幅震荡。

Query analysis 1:

Query 3: "分析 欧元"

Parse result: +contents:分析 +contents:欧元

Search result: hits found

Conclusion: the posting lists for the terms 分析 and 欧元 are both non-empty, so the conjunction of the two term queries matches.


Query analysis 2:

Query 4: "分析欧元"

Parse result: contents:"分析 欧元"

Search result: no hits

Conclusion: treated as the phrase "分析 欧元", the query matches no postings (分析 and 欧元 do not appear adjacently in Document 2).


Test conclusion:

In the conjunctive (AND) boolean query, QueryParser.parse treats each whitespace-separated chunk of the query string as one unit rather than as a combination of independent terms. When the analyzer splits such a chunk into several tokens, the chunk becomes a single PhraseQuery over those tokens, which only matches documents where the tokens occur adjacently and in order. In the example above, the parse result contents:"美国 华尔街 评论 文章" is one phrase clause over four terms, not four independent term clauses, which is why it returns nothing even though every individual term occurs in Document 1 (the document also contains 日报 between 华尔街 and 评论, so the adjacency requirement fails).
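
The difference is easy to see by parsing the same words with and without spaces and printing the resulting queries. The sketch below reuses the MMAnalyzer/QueryParser setup from the test program (and assumes the surrounding try/catch, since parse throws ParseException); the strings in the comments are the parse results observed above.

Analyzer analyzer = new MMAnalyzer();
QueryParser qp = new QueryParser("contents", analyzer);
qp.setDefaultOperator(QueryParser.AND_OPERATOR);

// No spaces: the analyzer's tokens end up in one PhraseQuery,
// so all four terms must appear adjacently and in order.
System.out.println(qp.parse("美国华尔街评论文章"));
// -> contents:"美国 华尔街 评论 文章"

// Space-separated: each word becomes its own required term clause.
System.out.println(qp.parse("美国 华尔街 评论 文章"));
// -> +contents:美国 +contents:华尔街 +contents:评论 +contents:文章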


Improvement:

Rebuild the original query before searching: for example, first turn query = "美国华尔街评论文章" into "美国 华尔街 评论 文章", then run the search on the rewritten string. The code below shows this construction in detail:

package search;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;

import jeasy.analysis.MMAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class Boolean_search {

	/**
	 * @param args
	 */
	public static void main(String[] args) {
		try {
			
			// Open the on-disk index and build a searcher over it
			File indexDir = new File("D://lucene/index");
			if (!indexDir.exists()) {
				System.out.println("The Lucene index does not exist");
				return;
			}
			Directory dir = FSDirectory.getDirectory(indexDir, false);
			IndexSearcher indexSearcher = new IndexSearcher(dir);

			// Build the Chinese analyzer and the query parser
			Analyzer analyzer = new MMAnalyzer();
			QueryParser qp = new QueryParser("contents", analyzer);
			// QueryParser defaults to OR; set it to AND (conjunction) here
			qp.setDefaultOperator(QueryParser.AND_OPERATOR);

			// The raw queries under test
			String[] searchWords = {"金融时报欧元区问题","美国华尔街 评论美元" ,"分析,欧元"};//, "分析" ,"欧元区"};
			String queryWords = "";
			Query query = null;
			
			// For each raw query: build a pre-segmented query, run the AND search, and print each hit's score explanation
			for (int i = 0; i < searchWords.length; i++) {	
				
				// Two ways to rewrite the raw query as space-separated terms:
				// Option 1: run it through the analyzer directly
				queryWords = getTermsByAnalyzer(analyzer, searchWords[i]);
				// Option 2: let QueryParser segment it (see getTermsByQueryParser below)
				//queryWords = getTermsByQueryParser(qp, searchWords[i]);
				
				// Run the conjunctive (AND) search with the rewritten query
				query = qp.parse(queryWords); 
				Hits results = indexSearcher.search(query);
				
				System.out.println(results.length() + " search results for query " + searchWords[i]);
				System.out.println("query is parsed as:" + queryWords);
				
				// Print the detailed score explanation for each matching document
				if(results.length() > 0){
					for(int k=0; k<results.length(); k++){
						String explain = indexSearcher.explain(query, results.id(k)).toString();
						System.out.println(explain);
					}
				}
			}

		} catch (CorruptIndexException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} catch (ParseException e) {
			e.printStackTrace();
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
	
	/**
	 * Rewrites the raw query string as space-separated terms by running it
	 * through the analyzer.
	 * @param analyzer the analyzer used for segmentation
	 * @param s the raw query string
	 * @return the segmented query, terms separated by single spaces
	 * @throws Exception if tokenization fails
	 */
	public static String getTermsByAnalyzer(Analyzer analyzer, String s) throws Exception {

		String queryWord = "";
		StringReader reader = new StringReader(s);
		// The first argument of tokenStream is the field name, not the text itself
		TokenStream ts = analyzer.tokenStream("contents", reader);

		// Print the segmentation result
		System.out.println("Tokenizer segmentation result:");
		Token t = ts.next();
		while (t != null) {
			queryWord += t.termText() + " ";
			System.out.println(t.termText());
			t = ts.next();
		}
		return queryWord;
	}
	
	/**
	 * Rewrites the raw query string as space-separated terms by parsing each
	 * whitespace-separated chunk with QueryParser and flattening the result.
	 * @param qp the query parser (field "contents")
	 * @param s the raw query string
	 * @return the segmented query, terms separated by single spaces
	 */
	public static String getTermsByQueryParser(QueryParser qp, String s) {
		String queryWord = "";
		Query tmp_query = null;

		// 1. Split the raw query on spaces to get the word groups
		String tmp_words[] = s.split(" ");

		// Handle each word group
		for (int j = 0; j < tmp_words.length; j++) {
			// 2. Let QueryParser segment the word group
			try {
				tmp_query = qp.parse(tmp_words[j]);
			} catch (ParseException e) {
				e.printStackTrace();
			}
			String tmp_str = tmp_query.toString();
			// 3. Flatten the parsed group back into space-separated terms.
			// tmp_str is either contents:term or contents:"term1 term2 ...";
			// the prefix "contents:" is 9 characters, so index 9 is either the
			// first character of a single term or the opening quote of a phrase.
			if (tmp_str.charAt(9) != '\"') {
				queryWord += tmp_str.substring(9, tmp_str.length()) + " ";
			} else {
				queryWord += tmp_str.substring(10, tmp_str.length() - 1) + " ";
			}
		} // queryWord now holds every term of the query, space-separated

		return queryWord;
	}
}

The output of a test run is as follows:

Tokenizer segmentation result:
金融
时报
欧元区
问题
1 search results for query 金融时报欧元区问题
query is parsed as:金融 时报 欧元区 问题 
0.8425362 = (MATCH) sum of:
  0.22584343 = (MATCH) weight(contents:金融 in 5), product of:
    0.51773727 = queryWeight(contents:金融), product of:
      2.7917595 = idf(docFreq=1, numDocs=12)
      0.18545197 = queryNorm
    0.43621242 = (MATCH) fieldWeight(contents:金融 in 5), product of:
      1.0 = tf(termFreq(contents:金融)=1)
      2.7917595 = idf(docFreq=1, numDocs=12)
      0.15625 = fieldNorm(field=contents, doc=5)
  0.22584343 = (MATCH) weight(contents:时报 in 5), product of:
    0.51773727 = queryWeight(contents:时报), product of:
      2.7917595 = idf(docFreq=1, numDocs=12)
      0.18545197 = queryNorm
    0.43621242 = (MATCH) fieldWeight(contents:时报 in 5), product of:
      1.0 = tf(termFreq(contents:时报)=1)
      2.7917595 = idf(docFreq=1, numDocs=12)
      0.15625 = fieldNorm(field=contents, doc=5)
  0.16500592 = (MATCH) weight(contents:欧元区 in 5), product of:
    0.442543 = queryWeight(contents:欧元区), product of:
      2.3862944 = idf(docFreq=2, numDocs=12)
      0.18545197 = queryNorm
    0.3728585 = (MATCH) fieldWeight(contents:欧元区 in 5), product of:
      1.0 = tf(termFreq(contents:欧元区)=1)
      2.3862944 = idf(docFreq=2, numDocs=12)
      0.15625 = fieldNorm(field=contents, doc=5)
  0.22584343 = (MATCH) weight(contents:问题 in 5), product of:
    0.51773727 = queryWeight(contents:问题), product of:
      2.7917595 = idf(docFreq=1, numDocs=12)
      0.18545197 = queryNorm
    0.43621242 = (MATCH) fieldWeight(contents:问题 in 5), product of:
      1.0 = tf(termFreq(contents:问题)=1)
      2.7917595 = idf(docFreq=1, numDocs=12)
      0.15625 = fieldNorm(field=contents, doc=5)


Tokenizer segmentation result:
美国
华尔街
评论
美元
1 search results for query 美国华尔街 评论美元
query is parsed as:美国 华尔街 评论 美元 
0.68775237 = (MATCH) sum of:
  0.12486125 = (MATCH) weight(contents:美国 in 11), product of:
    0.42608652 = queryWeight(contents:美国), product of:
      1.8754687 = idf(docFreq=4, numDocs=12)
      0.22718935 = queryNorm
    0.293042 = (MATCH) fieldWeight(contents:美国 in 11), product of:
      1.0 = tf(termFreq(contents:美国)=1)
      1.8754687 = idf(docFreq=4, numDocs=12)
      0.15625 = fieldNorm(field=contents, doc=11)
  0.20214175 = (MATCH) weight(contents:华尔街 in 11), product of:
    0.54214066 = queryWeight(contents:华尔街), product of:
      2.3862944 = idf(docFreq=2, numDocs=12)
      0.22718935 = queryNorm
    0.3728585 = (MATCH) fieldWeight(contents:华尔街 in 11), product of:
      1.0 = tf(termFreq(contents:华尔街)=1)
      2.3862944 = idf(docFreq=2, numDocs=12)
      0.15625 = fieldNorm(field=contents, doc=11)
  0.27667123 = (MATCH) weight(contents:评论 in 11), product of:
    0.63425803 = queryWeight(contents:评论), product of:
      2.7917595 = idf(docFreq=1, numDocs=12)
      0.22718935 = queryNorm
    0.43621242 = (MATCH) fieldWeight(contents:评论 in 11), product of:
      1.0 = tf(termFreq(contents:评论)=1)
      2.7917595 = idf(docFreq=1, numDocs=12)
      0.15625 = fieldNorm(field=contents, doc=11)
  0.08407816 = (MATCH) weight(contents:美元 in 11), product of:
    0.3496436 = queryWeight(contents:美元), product of:
      1.5389965 = idf(docFreq=6, numDocs=12)
      0.22718935 = queryNorm
    0.2404682 = (MATCH) fieldWeight(contents:美元 in 11), product of:
      1.0 = tf(termFreq(contents:美元)=1)
      1.5389965 = idf(docFreq=6, numDocs=12)
      0.15625 = fieldNorm(field=contents, doc=11)


Tokenizer segmentation result:
分析
欧元
1 search results for query 分析,欧元
query is parsed as:分析 欧元 
0.57385075 = (MATCH) sum of:
  0.24226412 = (MATCH) weight(contents:分析 in 0), product of:
    0.64974815 = queryWeight(contents:分析), product of:
      2.3862944 = idf(docFreq=2, numDocs=12)
      0.27228332 = queryNorm
    0.3728585 = (MATCH) fieldWeight(contents:分析 in 0), product of:
      1.0 = tf(termFreq(contents:分析)=1)
      2.3862944 = idf(docFreq=2, numDocs=12)
      0.15625 = fieldNorm(field=contents, doc=0)
  0.33158666 = (MATCH) weight(contents:欧元 in 0), product of:
    0.76014954 = queryWeight(contents:欧元), product of:
      2.7917595 = idf(docFreq=1, numDocs=12)
      0.27228332 = queryNorm
    0.43621242 = (MATCH) fieldWeight(contents:欧元 in 0), product of:
      1.0 = tf(termFreq(contents:欧元)=1)
      2.7917595 = idf(docFreq=1, numDocs=12)
      0.15625 = fieldNorm(field=contents, doc=0)



