lucene本身对中文分词有支持,不过支持得不好,其分词方式是机械地将中文按单字切分进行存储,例如:成都信息工程大学,最终分成为:成|都|信|息|工|程|大|学,显然这种分词方式是低效且浪费存储空间的。IK分词是林良益前辈自定义编写的一个专门针对中文分词的分析器,最新版本为2012年的版本,for 4.0之后未做更新,后续版本lucene的接口改变使其不再兼容,所以需要进行修改。
(谷歌不稳定访问不了)国内源码地址:http://git.oschina.net/wltea/IK-Analyzer-2012FF 网盘下载:链接:http://pan.baidu.com/s/1jIt7kGm 密码:hu1g
lucene6.5.0下载地址:https://lucene.apache.org 网盘下载:链接:http://pan.baidu.com/s/1mic8iBe 密码:axca
下载源码之后解压并导入到单独的java project,然后再导入lucene的jar包,如图所示,是我的工程结构
导入后修改四个文件:IKAnalyzer和IKTokenizer以及SWMCQueryBuilder、IKQueryExpressionParser,至于demo中的两个文件可直接删除或进行修改,我进行了修改。修改方式很简单,这里贴出修改的原文,以及修改后工程和源码下载。
修改后的工程地址:链接:http://pan.baidu.com/s/1nuALOql 密码:miyq
编译好的IKAnalyzer的jar包下载地址:http://download.csdn.net/detail/fanpei_moukoy/9796612 ,可直接导入lucene项目进行使用。
IKAnalyzer
/**
* IK 中文分词 版本 6.5.0
* IK Analyzer release 6.5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*/
package org.wltea.analyzer.lucene;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.util.IOUtils;
/**
* IK分词器,Lucene Analyzer接口实现
* 兼容Lucene 6.5.0版本 暴走抹茶 2017.3.28
*/
public final class IKAnalyzer extends Analyzer{

	// When true, the segmenter performs "smart" (coarse-grained) splitting;
	// when false it emits the finest-grained segmentation.
	private boolean useSmart;

	/**
	 * Returns whether smart (coarse-grained) segmentation is enabled.
	 */
	public boolean useSmart() {
		return useSmart;
	}

	/**
	 * Enables or disables smart segmentation for tokenizers created afterwards.
	 */
	public void setUseSmart(boolean useSmart) {
		this.useSmart = useSmart;
	}

	/**
	 * Creates an analyzer using the default fine-grained segmentation
	 * (useSmart = false).
	 */
	public IKAnalyzer(){
		this(false);
	}

	/**
	 * @param useSmart when true, the tokenizer performs smart (coarse) splitting
	 */
	public IKAnalyzer(boolean useSmart){
		super();
		this.useSmart = useSmart;
	}

	/**
	 * Builds the token stream components for a field.
	 *
	 * NOTE: since Lucene 4 the text to analyze is supplied later through
	 * Tokenizer#setReader, so {@code fieldName} is NOT content; the
	 * StringReader built here is only a placeholder and is ignored by the
	 * IKTokenizer constructor (which binds to the Tokenizer's own
	 * {@code input} field), hence it is safe to close it immediately.
	 *
	 * Bug fix: the original called {@code new IKTokenizer(reader)}, which
	 * always used useSmart = false and silently ignored this analyzer's
	 * configured useSmart setting.
	 */
	@Override
	protected TokenStreamComponents createComponents(String fieldName) {
		Reader reader = null;
		try{
			reader = new StringReader(fieldName);
			IKTokenizer it = new IKTokenizer(reader, this.useSmart);
			return new Analyzer.TokenStreamComponents(it);
		}finally {
			IOUtils.closeWhileHandlingException(reader);
		}
	}
}
IKTokenizer
/**
* IK 中文分词 版本 6.5.0
* IK Analyzer release 6.5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*
*/
package org.wltea.analyzer.lucene;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
/**
* IK分词器 Lucene Tokenizer适配器类 兼容Lucene 6.5.0版本 暴走抹茶 2017.3.28
*/
public final class IKTokenizer extends Tokenizer {

	// The actual IK segmentation engine.
	private IKSegmenter _IKImplement;

	// Term text attribute.
	private final CharTermAttribute termAtt;
	// Start/end character offset attribute.
	private final OffsetAttribute offsetAtt;
	// Lexeme type attribute (see the type constants in
	// org.wltea.analyzer.core.Lexeme).
	private final TypeAttribute typeAtt;
	// End offset of the last emitted lexeme; used to compute the final
	// offset in end().
	private int endPosition;

	/**
	 * Creates a tokenizer using fine-grained segmentation (useSmart = false).
	 *
	 * @param in unused placeholder; see {@link #IKTokenizer(Reader, boolean)}
	 */
	public IKTokenizer(Reader in) {
		this(in, false);
	}

	/**
	 * Lucene 6.5.0 Tokenizer adapter constructor.
	 *
	 * NOTE: the {@code in} argument is ignored. Since Lucene 4 the input is
	 * supplied via Tokenizer#setReader and exposed as the protected
	 * {@code input} field, which is what the segmenter is bound to here and
	 * re-bound to in {@link #reset()}. The parameter is kept only for source
	 * compatibility with the IK 2012 API.
	 *
	 * @param in       unused, kept for source compatibility
	 * @param useSmart when true, use smart (coarse-grained) segmentation
	 */
	public IKTokenizer(Reader in, boolean useSmart) {
		offsetAtt = addAttribute(OffsetAttribute.class);
		termAtt = addAttribute(CharTermAttribute.class);
		typeAtt = addAttribute(TypeAttribute.class);
		_IKImplement = new IKSegmenter(input, useSmart);
	}

	/*
	 * (non-Javadoc)
	 *
	 * @see org.apache.lucene.analysis.TokenStream#incrementToken()
	 */
	@Override
	public boolean incrementToken() throws IOException {
		// Clear all attributes left over from the previous token.
		clearAttributes();
		Lexeme nextLexeme = _IKImplement.next();
		if (nextLexeme != null) {
			// Copy the Lexeme into the Lucene attributes.
			termAtt.append(nextLexeme.getLexemeText());
			termAtt.setLength(nextLexeme.getLength());
			offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
			// Remember where the last token ended, for end().
			endPosition = nextLexeme.getEndPosition();
			typeAtt.setType(nextLexeme.getLexemeTypeString());
			// true: more tokens may follow
			return true;
		}
		// false: the token stream is exhausted
		return false;
	}

	/*
	 * (non-Javadoc)
	 *
	 * @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader)
	 */
	@Override
	public void reset() throws IOException {
		super.reset();
		// Re-bind the segmenter to the (possibly new) input reader.
		_IKImplement.reset(input);
	}

	@Override
	public final void end() throws IOException {
		// Bug fix: the TokenStream contract requires super.end() so that
		// attributes are put into the end-of-stream state; the original
		// implementation omitted it.
		super.end();
		// Set the final offset to the end of the last emitted lexeme.
		int finalOffset = correctOffset(this.endPosition);
		offsetAtt.setOffset(finalOffset, finalOffset);
	}
}
IKQueryExpressionParser
/**
* IK 中文分词 版本 6.5.0
* IK Analyzer release 6.5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*/
package org.wltea.analyzer.query;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Stack;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BooleanQuery.Builder;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.util.BytesRef;
/**
* IK简易查询表达式解析
* 结合SWMCQuery算法 暴走抹茶 2017.3.28
*
* 表达式例子 :
* (id='1231231' && title:'monkey') || (content:'你好吗' || ulr='www.ik.com') - name:'helloword'
* @author linliangyi
*
*/
public class IKQueryExpressionParser {

	//public static final String LUCENE_SPECIAL_CHAR = "&&||-()':={}[],";

	// Bug fix (whole class): the pasted source had every generic type
	// parameter stripped (the HTML rendering swallowed the angle brackets),
	// so raw Stack.pop()/List.get() returned Object and the class did not
	// compile. The generics are restored below; the parsing logic itself is
	// unchanged.

	// Token list produced by the lexical pass.
	private List<Element> elements = new ArrayList<Element>();

	// Operand stack (sub-queries) of the shunting-yard style parser.
	private Stack<Query> querys = new Stack<Query>();
	// Operator stack.
	private Stack<Element> operates = new Stack<Element>();

	/**
	 * Parses a query expression into a Lucene Query object.
	 *
	 * @param expression expression such as
	 *        (id='1' && title:'x') || content:'y' - name:'z'
	 * @param quickMode passed through to the SWMC query builder
	 * @return Lucene query, or null for a null/blank expression
	 * @throws IllegalStateException on malformed expressions
	 */
	public Query parseExp(String expression , boolean quickMode){
		Query lucenceQuery = null;
		if(expression != null && !"".equals(expression.trim())){
			try{
				// lexical pass: split the expression into Elements
				this.splitElements(expression);
				// syntax pass: build the Query tree
				this.parseSyntax(quickMode);
				if(this.querys.size() == 1){
					lucenceQuery = this.querys.pop();
				}else{
					throw new IllegalStateException("表达式异常: 缺少逻辑操作符 或 括号缺失");
				}
			}finally{
				// always reset parser state so the instance is reusable
				elements.clear();
				querys.clear();
				operates.clear();
			}
		}
		return lucenceQuery;
	}

	/**
	 * Lexical pass: splits the expression into typed Elements.
	 *
	 * Each case follows the same pattern: inside a quoted string
	 * (curretElement.type == '\'') every character is absorbed verbatim;
	 * otherwise the pending element is flushed and a new single-character
	 * operator element is emitted. Type 'F' (the default case) collects
	 * field names / bare text.
	 *
	 * @param expression the raw expression text
	 */
	private void splitElements(String expression){
		if(expression == null){
			return;
		}
		Element curretElement = null;
		char[] expChars = expression.toCharArray();
		for(int i = 0 ; i < expChars.length ; i++){
			switch(expChars[i]){
			case '&' :
				if(curretElement == null){
					curretElement = new Element();
					curretElement.type = '&';
					curretElement.append(expChars[i]);
				}else if(curretElement.type == '&'){
					// second '&' completes the "&&" operator
					curretElement.append(expChars[i]);
					this.elements.add(curretElement);
					curretElement = null;
				}else if(curretElement.type == '\''){
					curretElement.append(expChars[i]);
				}else {
					this.elements.add(curretElement);
					curretElement = new Element();
					curretElement.type = '&';
					curretElement.append(expChars[i]);
				}
				break;
			case '|' :
				if(curretElement == null){
					curretElement = new Element();
					curretElement.type = '|';
					curretElement.append(expChars[i]);
				}else if(curretElement.type == '|'){
					// second '|' completes the "||" operator
					curretElement.append(expChars[i]);
					this.elements.add(curretElement);
					curretElement = null;
				}else if(curretElement.type == '\''){
					curretElement.append(expChars[i]);
				}else {
					this.elements.add(curretElement);
					curretElement = new Element();
					curretElement.type = '|';
					curretElement.append(expChars[i]);
				}
				break;
			case '-' :
				if(curretElement != null){
					if(curretElement.type == '\''){
						curretElement.append(expChars[i]);
						continue;
					}else{
						this.elements.add(curretElement);
					}
				}
				curretElement = new Element();
				curretElement.type = '-';
				curretElement.append(expChars[i]);
				this.elements.add(curretElement);
				curretElement = null;
				break;
			case '(' :
				if(curretElement != null){
					if(curretElement.type == '\''){
						curretElement.append(expChars[i]);
						continue;
					}else{
						this.elements.add(curretElement);
					}
				}
				curretElement = new Element();
				curretElement.type = '(';
				curretElement.append(expChars[i]);
				this.elements.add(curretElement);
				curretElement = null;
				break;
			case ')' :
				if(curretElement != null){
					if(curretElement.type == '\''){
						curretElement.append(expChars[i]);
						continue;
					}else{
						this.elements.add(curretElement);
					}
				}
				curretElement = new Element();
				curretElement.type = ')';
				curretElement.append(expChars[i]);
				this.elements.add(curretElement);
				curretElement = null;
				break;
			case ':' :
				if(curretElement != null){
					if(curretElement.type == '\''){
						curretElement.append(expChars[i]);
						continue;
					}else{
						this.elements.add(curretElement);
					}
				}
				curretElement = new Element();
				curretElement.type = ':';
				curretElement.append(expChars[i]);
				this.elements.add(curretElement);
				curretElement = null;
				break;
			case '=' :
				if(curretElement != null){
					if(curretElement.type == '\''){
						curretElement.append(expChars[i]);
						continue;
					}else{
						this.elements.add(curretElement);
					}
				}
				curretElement = new Element();
				curretElement.type = '=';
				curretElement.append(expChars[i]);
				this.elements.add(curretElement);
				curretElement = null;
				break;
			case ' ' :
				// whitespace terminates the pending element unless quoted
				if(curretElement != null){
					if(curretElement.type == '\''){
						curretElement.append(expChars[i]);
					}else{
						this.elements.add(curretElement);
						curretElement = null;
					}
				}
				break;
			case '\'' :
				// quote characters toggle string mode; the quotes themselves
				// are not stored in the element text
				if(curretElement == null){
					curretElement = new Element();
					curretElement.type = '\'';
				}else if(curretElement.type == '\''){
					this.elements.add(curretElement);
					curretElement = null;
				}else{
					this.elements.add(curretElement);
					curretElement = new Element();
					curretElement.type = '\'';
				}
				break;
			case '[':
				if(curretElement != null){
					if(curretElement.type == '\''){
						curretElement.append(expChars[i]);
						continue;
					}else{
						this.elements.add(curretElement);
					}
				}
				curretElement = new Element();
				curretElement.type = '[';
				curretElement.append(expChars[i]);
				this.elements.add(curretElement);
				curretElement = null;
				break;
			case ']':
				if(curretElement != null){
					if(curretElement.type == '\''){
						curretElement.append(expChars[i]);
						continue;
					}else{
						this.elements.add(curretElement);
					}
				}
				curretElement = new Element();
				curretElement.type = ']';
				curretElement.append(expChars[i]);
				this.elements.add(curretElement);
				curretElement = null;
				break;
			case '{':
				if(curretElement != null){
					if(curretElement.type == '\''){
						curretElement.append(expChars[i]);
						continue;
					}else{
						this.elements.add(curretElement);
					}
				}
				curretElement = new Element();
				curretElement.type = '{';
				curretElement.append(expChars[i]);
				this.elements.add(curretElement);
				curretElement = null;
				break;
			case '}':
				if(curretElement != null){
					if(curretElement.type == '\''){
						curretElement.append(expChars[i]);
						continue;
					}else{
						this.elements.add(curretElement);
					}
				}
				curretElement = new Element();
				curretElement.type = '}';
				curretElement.append(expChars[i]);
				this.elements.add(curretElement);
				curretElement = null;
				break;
			case ',':
				if(curretElement != null){
					if(curretElement.type == '\''){
						curretElement.append(expChars[i]);
						continue;
					}else{
						this.elements.add(curretElement);
					}
				}
				curretElement = new Element();
				curretElement.type = ',';
				curretElement.append(expChars[i]);
				this.elements.add(curretElement);
				curretElement = null;
				break;
			default :
				// 'F' collects field names and other bare text
				if(curretElement == null){
					curretElement = new Element();
					curretElement.type = 'F';
					curretElement.append(expChars[i]);
				}else if(curretElement.type == 'F'){
					curretElement.append(expChars[i]);
				}else if(curretElement.type == '\''){
					curretElement.append(expChars[i]);
				}else{
					this.elements.add(curretElement);
					curretElement = new Element();
					curretElement.type = 'F';
					curretElement.append(expChars[i]);
				}
			}
		}
		// flush the trailing element, if any
		if(curretElement != null){
			this.elements.add(curretElement);
			curretElement = null;
		}
	}

	/**
	 * Syntax pass: consumes the element list with an operator-precedence
	 * (shunting-yard style) algorithm, pushing sub-queries onto
	 * {@link #querys} and operators onto {@link #operates}.
	 */
	private void parseSyntax(boolean quickMode){
		for(int i = 0 ; i < this.elements.size() ; i++){
			Element e = this.elements.get(i);
			if('F' == e.type){
				Element e2 = this.elements.get(i + 1);
				if('=' != e2.type && ':' != e2.type){
					throw new IllegalStateException("表达式异常: = 或 : 号丢失");
				}
				Element e3 = this.elements.get(i + 2);
				// handle '=' (exact term) and ':' (analyzed SWMC) operators
				if('\'' == e3.type){
					i+=2;
					if('=' == e2.type){
						TermQuery tQuery = new TermQuery(new Term(e.toString() , e3.toString()));
						this.querys.push(tQuery);
					}else if(':' == e2.type){
						String keyword = e3.toString();
						//SWMCQuery Here
						Query _SWMCQuery = SWMCQueryBuilder.create(e.toString(), keyword , quickMode);
						this.querys.push(_SWMCQuery);
					}
				}else if('[' == e3.type || '{' == e3.type){
					i+=2;
					// handle [] and {} range expressions: collect elements up
					// to the closing bracket
					LinkedList<Element> eQueue = new LinkedList<Element>();
					eQueue.add(e3);
					for( i++ ; i < this.elements.size() ; i++){
						Element eN = this.elements.get(i);
						eQueue.add(eN);
						if(']' == eN.type || '}' == eN.type){
							break;
						}
					}
					// translate into a TermRangeQuery
					Query rangeQuery = this.toTermRangeQuery(e , eQueue);
					this.querys.push(rangeQuery);
				}else{
					throw new IllegalStateException("表达式异常:匹配值丢失");
				}
			}else if('(' == e.type){
				this.operates.push(e);
			}else if(')' == e.type){
				// reduce until the matching '(' is popped
				boolean doPop = true;
				while(doPop && !this.operates.empty()){
					Element op = this.operates.pop();
					if('(' == op.type){
						doPop = false;
					}else {
						Query q = toBooleanQuery(op);
						this.querys.push(q);
					}
				}
			}else{
				// logical operator: reduce while the stack top has equal or
				// higher precedence, then push
				if(this.operates.isEmpty()){
					this.operates.push(e);
				}else{
					boolean doPeek = true;
					while(doPeek && !this.operates.isEmpty()){
						Element eleOnTop = this.operates.peek();
						if('(' == eleOnTop.type){
							doPeek = false;
							this.operates.push(e);
						}else if(compare(e , eleOnTop) == 1){
							this.operates.push(e);
							doPeek = false;
						}else if(compare(e , eleOnTop) == 0){
							Query q = toBooleanQuery(eleOnTop);
							this.operates.pop();
							this.querys.push(q);
						}else{
							Query q = toBooleanQuery(eleOnTop);
							this.operates.pop();
							this.querys.push(q);
						}
					}
					if(doPeek && this.operates.empty()){
						this.operates.push(e);
					}
				}
			}
		}
		// reduce any remaining operators
		while(!this.operates.isEmpty()){
			Element eleOnTop = this.operates.pop();
			Query q = toBooleanQuery(eleOnTop);
			this.querys.push(q);
		}
	}

	/**
	 * Combines the top two sub-queries with the given logical operator into
	 * a BooleanQuery. Clauses of a nested BooleanQuery with a matching Occur
	 * are flattened into the new query.
	 *
	 * @param op operator element: '&', '|' or '-'
	 * @return combined query, or null when the operand stack is empty
	 */
	private Query toBooleanQuery(Element op){
		if(this.querys.size() == 0){
			return null;
		}
		//BooleanQuery resultQuery = null;
		Builder builder = new Builder();
		if(this.querys.size() == 1){
			// single operand: nothing to combine (note: get(0) peeks the
			// bottom of the stack without popping)
			return this.querys.get(0);
		}
		Query q2 = this.querys.pop();
		Query q1 = this.querys.pop();
		if('&' == op.type){
			if(q1 != null){
				if(q1 instanceof BooleanQuery){
					List<BooleanClause> clauses = ((BooleanQuery)q1).clauses();
					if(clauses.size() > 0
							&& clauses.get(0).getOccur() == Occur.MUST){
						// flatten nested MUST clauses
						for(BooleanClause c : clauses){
							builder.add(c);
						}
					}else{
						builder.add(q1,Occur.MUST);
					}
				}else{
					//q1 instanceof TermQuery
					//q1 instanceof TermRangeQuery
					//q1 instanceof PhraseQuery
					//others
					builder.add(q1,Occur.MUST);
				}
			}
			if(q2 != null){
				if(q2 instanceof BooleanQuery){
					List<BooleanClause> clauses = ((BooleanQuery)q2).clauses();
					if(clauses.size() > 0
							&& clauses.get(0).getOccur() == Occur.MUST){
						for(BooleanClause c : clauses){
							builder.add(c);
						}
					}else{
						builder.add(q2,Occur.MUST);
					}
				}else{
					//q2 instanceof TermQuery
					//q2 instanceof TermRangeQuery
					//q2 instanceof PhraseQuery
					//others
					builder.add(q2,Occur.MUST);
				}
			}
		}else if('|' == op.type){
			if(q1 != null){
				if(q1 instanceof BooleanQuery){
					List<BooleanClause> clauses = ((BooleanQuery)q1).clauses();
					if(clauses.size() > 0
							&& clauses.get(0).getOccur() == Occur.SHOULD){
						// flatten nested SHOULD clauses
						for(BooleanClause c : clauses){
							builder.add(c);
						}
					}else{
						builder.add(q1,Occur.SHOULD);
					}
				}else{
					//q1 instanceof TermQuery
					//q1 instanceof TermRangeQuery
					//q1 instanceof PhraseQuery
					//others
					builder.add(q1,Occur.SHOULD);
				}
			}
			if(q2 != null){
				if(q2 instanceof BooleanQuery){
					List<BooleanClause> clauses = ((BooleanQuery)q2).clauses();
					if(clauses.size() > 0
							&& clauses.get(0).getOccur() == Occur.SHOULD){
						for(BooleanClause c : clauses){
							builder.add(c);
						}
					}else{
						builder.add(q2,Occur.SHOULD);
					}
				}else{
					//q2 instanceof TermQuery
					//q2 instanceof TermRangeQuery
					//q2 instanceof PhraseQuery
					//others
					builder.add(q2,Occur.SHOULD);
				}
			}
		}else if('-' == op.type){
			// exclusion: q1 MUST, q2 MUST_NOT
			if(q1 == null || q2 == null){
				throw new IllegalStateException("表达式异常:SubQuery 个数不匹配");
			}
			if(q1 instanceof BooleanQuery){
				List<BooleanClause> clauses = ((BooleanQuery)q1).clauses();
				if(clauses.size() > 0){
					for(BooleanClause c : clauses){
						builder.add(c);
					}
				}else{
					builder.add(q1,Occur.MUST);
				}
			}else{
				//q1 instanceof TermQuery
				//q1 instanceof TermRangeQuery
				//q1 instanceof PhraseQuery
				//others
				builder.add(q1,Occur.MUST);
			}
			builder.add(q2,Occur.MUST_NOT);
		}
		return builder.build();
	}

	/**
	 * Assembles a TermRangeQuery from a bracketed element sequence such as
	 * ['a','b'], {'a','b'}, [,'b'] or ['a',].
	 *
	 * @param fieldNameEle field name element
	 * @param elements range elements, from opening to closing bracket
	 * @return the TermRangeQuery; a null bound means an open-ended range
	 * @throws IllegalStateException on malformed range syntax
	 */
	private TermRangeQuery toTermRangeQuery(Element fieldNameEle , LinkedList<Element> elements){

		boolean includeFirst = false;
		boolean includeLast = false;
		String firstValue = null;
		String lastValue = null;

		// the first element must be '[' (inclusive) or '{' (exclusive)
		Element first = elements.getFirst();
		if('[' == first.type){
			includeFirst = true;
		}else if('{' == first.type){
			includeFirst = false;
		}else {
			throw new IllegalStateException("表达式异常");
		}
		// the last element must be ']' (inclusive) or '}' (exclusive)
		Element last = elements.getLast();
		if(']' == last.type){
			includeLast = true;
		}else if('}' == last.type){
			includeLast = false;
		}else {
			throw new IllegalStateException("表达式异常, RangeQuery缺少结束括号");
		}
		if(elements.size() < 4 || elements.size() > 5){
			throw new IllegalStateException("表达式异常, RangeQuery 错误");
		}
		// read the middle part: value , value  (either side may be omitted)
		Element e2 = elements.get(1);
		if('\'' == e2.type){
			firstValue = e2.toString();
			//
			Element e3 = elements.get(2);
			if(',' != e3.type){
				throw new IllegalStateException("表达式异常, RangeQuery缺少逗号分隔");
			}
			//
			Element e4 = elements.get(3);
			if('\'' == e4.type){
				lastValue = e4.toString();
			}else if(e4 != last){
				throw new IllegalStateException("表达式异常,RangeQuery格式错误");
			}
		}else if(',' == e2.type){
			firstValue = null;
			//
			Element e3 = elements.get(2);
			if('\'' == e3.type){
				lastValue = e3.toString();
			}else{
				throw new IllegalStateException("表达式异常,RangeQuery格式错误");
			}
		}else {
			throw new IllegalStateException("表达式异常, RangeQuery格式错误");
		}

		// Bug fix: the original passed new BytesRef(null) for open-ended
		// ranges, which throws NullPointerException; TermRangeQuery accepts
		// a null bound to mean "unbounded".
		return new TermRangeQuery(fieldNameEle.toString() ,
				firstValue == null ? null : new BytesRef(firstValue) ,
				lastValue == null ? null : new BytesRef(lastValue) ,
				includeFirst , includeLast);
	}

	/**
	 * Compares operator precedence: '&' > '|' > '-'.
	 *
	 * @return 1 if e1 binds tighter than e2, 0 if equal, -1 if looser
	 */
	private int compare(Element e1 , Element e2){
		if('&' == e1.type){
			if('&' == e2.type){
				return 0;
			}else {
				return 1;
			}
		}else if('|' == e1.type){
			if('&' == e2.type){
				return -1;
			}else if('|' == e2.type){
				return 0;
			}else{
				return 1;
			}
		}else{
			if('-' == e2.type){
				return 0;
			}else{
				return -1;
			}
		}
	}

	/**
	 * Expression element (operator, field name, or field value).
	 * Made a static nested class: it never touches the enclosing instance,
	 * so keeping it non-static would only retain a hidden outer reference.
	 * @author linliangyi
	 * May 20, 2010
	 */
	private static class Element{
		// element type: an operator char, '\'' for quoted value, 'F' for text
		char type = 0;
		StringBuffer eleTextBuff;

		public Element(){
			eleTextBuff = new StringBuffer();
		}

		public void append(char c){
			this.eleTextBuff.append(c);
		}

		@Override
		public String toString(){
			return this.eleTextBuff.toString();
		}
	}

	/**
	 * Ad-hoc demo entry point.
	 */
	public static void main(String[] args){
		IKQueryExpressionParser parser = new IKQueryExpressionParser();
		//String ikQueryExp = "newsTitle:'的两款《魔兽世界》插件Bigfoot和月光宝盒'";
		String ikQueryExp = "(id='ABcdRf' && date:{'20010101','20110101'} && keyword:'魔兽中国') || (content:'KSHT-KSH-A001-18' || ulr='www.ik.com') - name:'林良益'";
		Query result = parser.parseExp(ikQueryExp , true);
		System.out.println(result);
	}
}
SWMCQueryBuilder
/**
* IK 中文分词 版本 6.5.0
* IK Analyzer release 6.5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*/
package org.wltea.analyzer.query;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
/**
* Single Word Multi Char Query Builder
* IK分词算法专用 暴走抹茶 2017.3.28
* @author linliangyi
*
*/
public class SWMCQueryBuilder {

	/**
	 * Builds a SWMC (Single Word Multi Char) query.
	 *
	 * @param fieldName target field name
	 * @param keywords  raw keyword text, analyzed with IK smart segmentation
	 * @param quickMode when true, prefer the condensed expression built from
	 *                  multi-char lexemes only (if they cover more than half
	 *                  of the input)
	 * @return Lucene Query, or null if nothing could be parsed
	 * @throws IllegalArgumentException if fieldName or keywords is null
	 */
	public static Query create(String fieldName ,String keywords , boolean quickMode){
		if(fieldName == null || keywords == null){
			throw new IllegalArgumentException("参数 fieldName 、 keywords 不能为null.");
		}
		// 1. segment the keywords with IK
		List<Lexeme> lexemes = doAnalyze(keywords);
		// 2. build the SWMC query from the segmentation result
		Query _SWMCQuery = getSWMCQuery(fieldName , lexemes , quickMode);
		return _SWMCQuery;
	}

	/**
	 * Segments the keywords and returns the resulting lexeme list.
	 *
	 * Bug fix: generic type parameters restored — the pasted original used
	 * raw List, so the for-each over lexemes in getSWMCQuery did not compile.
	 *
	 * @param keywords text to segment
	 * @return lexemes in order; possibly partial if an IOException occurred
	 */
	private static List<Lexeme> doAnalyze(String keywords){
		List<Lexeme> lexemes = new ArrayList<Lexeme>();
		IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords) , true);
		try{
			Lexeme l = null;
			while( (l = ikSeg.next()) != null){
				lexemes.add(l);
			}
		}catch(IOException e){
			// best effort: log and return whatever was segmented so far
			e.printStackTrace();
		}
		return lexemes;
	}

	/**
	 * Builds the SWMC query from the segmentation result.
	 *
	 * @param fieldName target field name
	 * @param lexemes   segmentation result
	 * @param quickMode when true and multi-char lexemes cover >50% of the
	 *                  input, parse the condensed expression instead
	 * @return parsed Query, or null if parsing failed or input was empty
	 */
	private static Query getSWMCQuery(String fieldName , List<Lexeme> lexemes , boolean quickMode){
		// full SWMC query expression
		// (StringBuilder: local, single-threaded — no need for StringBuffer)
		StringBuilder keywordBuffer = new StringBuilder();
		// condensed expression: multi-char lexemes only
		StringBuilder keywordBuffer_Short = new StringBuilder();
		// length of the previous lexeme
		int lastLexemeLength = 0;
		// end position of the previous lexeme
		int lastLexemeEnd = -1;

		int shortCount = 0;
		int totalCount = 0;
		for(Lexeme l : lexemes){
			totalCount += l.getLength();
			// the condensed expression takes only lexemes longer than one char
			if(l.getLength() > 1){
				keywordBuffer_Short.append(' ').append(l.getLexemeText());
				shortCount += l.getLength();
			}

			if(lastLexemeLength == 0){
				keywordBuffer.append(l.getLexemeText());
			}else if(lastLexemeLength == 1 && l.getLength() == 1
					&& lastLexemeEnd == l.getBeginPosition()){
				// adjacent single-char lexemes are concatenated with no space
				keywordBuffer.append(l.getLexemeText());
			}else{
				keywordBuffer.append(' ').append(l.getLexemeText());
			}
			lastLexemeLength = l.getLength();
			lastLexemeEnd = l.getEndPosition();
		}

		// use the Lucene QueryParser to turn the expression into a Query
		QueryParser qp = new QueryParser(fieldName, new StandardAnalyzer());
		qp.setDefaultOperator(QueryParser.AND_OPERATOR);
		qp.setAutoGeneratePhraseQueries(true);
		// totalCount > 0 guards the 0/0 NaN comparison on empty input
		// (NaN > 0.5f was false by accident; now it is explicit)
		if(quickMode && totalCount > 0 && (shortCount * 1.0f / totalCount) > 0.5f){
			try {
				//System.out.println(keywordBuffer.toString());
				Query q = qp.parse(keywordBuffer_Short.toString());
				return q;
			} catch (ParseException e) {
				e.printStackTrace();
			}
		}else{
			if(keywordBuffer.length() > 0){
				try {
					//System.out.println(keywordBuffer.toString());
					Query q = qp.parse(keywordBuffer.toString());
					return q;
				} catch (ParseException e) {
					e.printStackTrace();
				}
			}
		}
		return null;
	}
}