代码为QueryParser.jj,语法为JavaCC实现的LL(k)文法(默认为LL(1)):
完整文档:http://lucene.apache.org/java/2_0_0/queryparsersyntax.html
和正则一样:
?表示0个或1个
+表示一个或多个
*表示0个或多个
以下是Token部分:
_NUM_CHAR:: = [ "0" - "9" ] // 数字
_ESCAPED_CHAR:: = " \\ " [ " \\ " , " + " , " - " , " ! " , " ( " , " ) " , " : " , " ^ " , " [ " , " ] " , " \" " , " { " , " } " , " ~ " , " * " , " ? " ] // 转义字符: 反斜杠后接一个特殊字符
_TERM_START_CHAR :: = ( ~ [ " " , " \t " , " \n " , " \r " , " + " , " - " , " ! " , " ( " , " ) " , " : " , " ^ " , " [ " , " ] " , " \" " , " { " , " } " , " ~ " , " * " , " ? " ] ) // TERM的起始字符,除了列出的字符以外,其它字符都可以
_TERM_CHAR:: = ( < _TERM_START_CHAR > | < _ESCAPED_CHAR > | " - " | " + " ) // TERM可使用字符
_WHITESPACE:: = ( " " | " \t " | " \n " | " \r " ) // 空格和回车,
< DEFAULT > TOKEN:
AND:: = ( " AND " | " && " )
OR:: = ( " OR " | " || " )
NOT:: = ( " NOT " | " ! " )
PLUS:: = " + "
MINUS:: = " - "
LPAREN:: = " ( "
RPAREN:: = " ) "
COLON:: = " : "
STAR:: = " * "
CARAT:: = " ^ " // 后接Boost,原文<CARAT: "^" > : Boost;在JavaCC中,Token定义后的": Boost"表示匹配该Token后词法状态切换到Boost(见下文< Boost > TOKEN)
QUOTED:: = " \" " (~[ " \" " ] | " \\\" " )+ " \" " // 表示用"包起来的字符串: 以字符"开始,中间由不是"的符号或者连着的两个符号\"组成,以字符"结束
TERM:: =< _TERM_START_CHAR > ( < _TERM_CHAR > ) *
FUZZY_SLOP:: = " ~ " ( ( < _NUM_CHAR > ) + ( " . " ( < _NUM_CHAR > ) + ) ? ) ? // 字符~开始,而后是数字.Lucene支持模糊查询,例如"roam~"或"roam~0.8",The value is between 0 and 1,算法为the Levenshtein Distance, or Edit Distance algorithm
PREFIXTERM:: = ( < _TERM_START_CHAR > | " * " ) ( < _TERM_CHAR > ) * " * " // 前缀查找,表示以某某开头的查询, 字符表示为"something*",前缀允许模糊符号*,中间可有字符也可没有, 结尾必须是*
WILDTERM:: = ( < _TERM_START_CHAR > | [ " * " , " ? " ]) ( < _TERM_CHAR > | ( [ " * " , " ? " ] )) * // 类似上面,但同时支持?字符,结尾可以是字符也可以是*或?。使用[]表示or关系(字符列表)时,不需要使用|,只要用,号分隔即可
RANGEIN_START:: = " [ " // 在RangeQuery中,[或{表示了是否包含边界条件本身, 用字符表示为"[begin TO end]" 或者"{begin TO end}",后接RangeIn
RANGEEX_START:: = " { " // 同上,后接RangeEx
< Boost > TOKEN:
NUMBER:: = ( < _NUM_CHAR > ) + ( " . " ( < _NUM_CHAR > ) + ) ? // 后接DEFAULT, 整数或小数
< RangeIn > TOKEN:
RANGEIN_TO:: = " TO "
RANGEIN_END:: = " ] " // 后接DEFAULT, RangeIn的结束
RANGEIN_QUOTED:: = " \" " (~[ " \" " ] | " \\\" " )+ " \" " // 同上述QUOTED,表示用"包起来的字符串
RANGEIN_GOOP:: = ( ~ [ " " , " ] " ]) + // 1个或多个不是空格和]的符号,这样就能提取出[]中的内容
< RangeEx > TOKEN :
RANGEEX_TO:: = " TO "
RANGEEX_END:: = " } " // 后接DEFAULT, RangeEx的结束
RANGEEX_QUOTED:: = " \" " (~[ " \" " ] | " \\\" " )+ " \" " // 同上述QUOTED,表示用"包起来的字符串
RANGEEX_GOOP:: = ( ~ [ " " , " } " ]) + // 1个或多个不是空格和}的符号,这样就能提取出{}中的内容
< DEFAULT, RangeIn, RangeEx > SKIP : {
< < _WHITESPACE >>
} // 所有空格和回车被忽略
_ESCAPED_CHAR:: = " \\ " [ " \\ " , " + " , " - " , " ! " , " ( " , " ) " , " : " , " ^ " , " [ " , " ] " , " \" " , " { " , " } " , " ~ " , " * " , " ? " ] // 转义字符: 反斜杠后接一个特殊字符
_TERM_START_CHAR :: = ( ~ [ " " , " \t " , " \n " , " \r " , " + " , " - " , " ! " , " ( " , " ) " , " : " , " ^ " , " [ " , " ] " , " \" " , " { " , " } " , " ~ " , " * " , " ? " ] ) // TERM的起始字符,除了列出的字符以外,其它字符都可以
_TERM_CHAR:: = ( < _TERM_START_CHAR > | < _ESCAPED_CHAR > | " - " | " + " ) // TERM可使用字符
_WHITESPACE:: = ( " " | " \t " | " \n " | " \r " ) // 空格和回车,
< DEFAULT > TOKEN:
AND:: = ( " AND " | " && " )
OR:: = ( " OR " | " || " )
NOT:: = ( " NOT " | " ! " )
PLUS:: = " + "
MINUS:: = " - "
LPAREN:: = " ( "
RPAREN:: = " ) "
COLON:: = " : "
STAR:: = " * "
CARAT:: = " ^ " // 后接Boost,原文<CARAT: "^" > : Boost;在JavaCC中,Token定义后的": Boost"表示匹配该Token后词法状态切换到Boost(见下文< Boost > TOKEN)
QUOTED:: = " \" " (~[ " \" " ] | " \\\" " )+ " \" " // 表示用"包起来的字符串: 以字符"开始,中间由不是"的符号或者连着的两个符号\"组成,以字符"结束
TERM:: =< _TERM_START_CHAR > ( < _TERM_CHAR > ) *
FUZZY_SLOP:: = " ~ " ( ( < _NUM_CHAR > ) + ( " . " ( < _NUM_CHAR > ) + ) ? ) ? // 字符~开始,而后是数字.Lucene支持模糊查询,例如"roam~"或"roam~0.8",The value is between 0 and 1,算法为the Levenshtein Distance, or Edit Distance algorithm
PREFIXTERM:: = ( < _TERM_START_CHAR > | " * " ) ( < _TERM_CHAR > ) * " * " // 前缀查找,表示以某某开头的查询, 字符表示为"something*",前缀允许模糊符号*,中间可有字符也可没有, 结尾必须是*
WILDTERM:: = ( < _TERM_START_CHAR > | [ " * " , " ? " ]) ( < _TERM_CHAR > | ( [ " * " , " ? " ] )) * // 类似上面,但同时支持?字符,结尾可以是字符也可以是*或?。使用[]表示or关系(字符列表)时,不需要使用|,只要用,号分隔即可
RANGEIN_START:: = " [ " // 在RangeQuery中,[或{表示了是否包含边界条件本身, 用字符表示为"[begin TO end]" 或者"{begin TO end}",后接RangeIn
RANGEEX_START:: = " { " // 同上,后接RangeEx
< Boost > TOKEN:
NUMBER:: = ( < _NUM_CHAR > ) + ( " . " ( < _NUM_CHAR > ) + ) ? // 后接DEFAULT, 整数或小数
< RangeIn > TOKEN:
RANGEIN_TO:: = " TO "
RANGEIN_END:: = " ] " // 后接DEFAULT, RangeIn的结束
RANGEIN_QUOTED:: = " \" " (~[ " \" " ] | " \\\" " )+ " \" " // 同上述QUOTED,表示用"包起来的字符串
RANGEIN_GOOP:: = ( ~ [ " " , " ] " ]) + // 1个或多个不是空格和]的符号,这样就能提取出[]中的内容
< RangeEx > TOKEN :
RANGEEX_TO:: = " TO "
RANGEEX_END:: = " } " // 后接DEFAULT, RangeEx的结束
RANGEEX_QUOTED:: = " \" " (~[ " \" " ] | " \\\" " )+ " \" " // 同上述QUOTED,表示用"包起来的字符串
RANGEEX_GOOP:: = ( ~ [ " " , " } " ]) + // 1个或多个不是空格和}的符号,这样就能提取出{}中的内容
< DEFAULT, RangeIn, RangeEx > SKIP : {
< < _WHITESPACE >>
} // 所有空格和回车被忽略
以下为解析部分
Conjunction:: = [ < AND > { ret = CONJ_AND; } | < OR > { ret = CONJ_OR; } ] // 连接
Modifiers:: = [ < PLUS > { ret = MOD_REQ; } | < MINUS > { ret = MOD_NOT; } | < NOT > { ret = MOD_NOT; } ] // + - !符号
Query:: = Modifiers Clause (Conjunction Modifiers Clause) *
Clause:: = [( < TERM > < COLON >|< STAR > < COLON > )] // btw:代码中LOOKAHEAD(2)表示此处向前看2个Token,即使用LL(2)
(Term |< LPAREN > Query < RPAREN > ( < CARAT > < NUMBER > ) ? ) // 子句。这儿的语法有点奇怪: 似乎允许 *:(*:dog) 这样的写法
Term:: = (
( < TERM >|< STAR >|< PREFIXTERM >|< WILDTERM >|< NUMBER > ) [ < FUZZY_SLOP > ] [ < CARAT >< NUMBER > [ < FUZZY_SLOP > ] ]
| ( < RANGEIN_START > ( < RANGEIN_GOOP >|< RANGEIN_QUOTED > ) [ < RANGEIN_TO > ] ( < RANGEIN_GOOP >|< RANGEIN_QUOTED > ) < RANGEIN_END > ) [ < CARAT > boost =< NUMBER > ] // 这儿看出range必须同时有两端,不能只有一端
| ( < RANGEEX_START > ( < RANGEEX_GOOP >|< RANGEEX_QUOTED > ) [ < RANGEEX_TO > ] ( < RANGEEX_GOOP >|< RANGEEX_QUOTED > ) < RANGEEX_END > ) [ < CARAT > boost =< NUMBER > ] // 同上,但{}表示不包含边界。在RangeQuery中,[或{表示了是否包含边界条件本身, 用字符表示为"[begin TO end]" 或者"{begin TO end}"
| < QUOTED > [ < FUZZY_SLOP > ] [ < CARAT > boost =< NUMBER > ] // 被""包含的内容
)
Modifiers:: = [ < PLUS > { ret = MOD_REQ; } | < MINUS > { ret = MOD_NOT; } | < NOT > { ret = MOD_NOT; } ] // + - !符号
Query:: = Modifiers Clause (Conjunction Modifiers Clause) *
Clause:: = [( < TERM > < COLON >|< STAR > < COLON > )] // btw:代码中LOOKAHEAD(2)表示此处向前看2个Token,即使用LL(2)
(Term |< LPAREN > Query < RPAREN > ( < CARAT > < NUMBER > ) ? ) // 子句。这儿的语法有点奇怪: 似乎允许 *:(*:dog) 这样的写法
Term:: = (
( < TERM >|< STAR >|< PREFIXTERM >|< WILDTERM >|< NUMBER > ) [ < FUZZY_SLOP > ] [ < CARAT >< NUMBER > [ < FUZZY_SLOP > ] ]
| ( < RANGEIN_START > ( < RANGEIN_GOOP >|< RANGEIN_QUOTED > ) [ < RANGEIN_TO > ] ( < RANGEIN_GOOP >|< RANGEIN_QUOTED > ) < RANGEIN_END > ) [ < CARAT > boost =< NUMBER > ] // 这儿看出range必须同时有两端,不能只有一端
| ( < RANGEEX_START > ( < RANGEEX_GOOP >|< RANGEEX_QUOTED > ) [ < RANGEEX_TO > ] ( < RANGEEX_GOOP >|< RANGEEX_QUOTED > ) < RANGEEX_END > ) [ < CARAT > boost =< NUMBER > ] // 同上,但{}表示不包含边界。在RangeQuery中,[或{表示了是否包含边界条件本身, 用字符表示为"[begin TO end]" 或者"{begin TO end}"
| < QUOTED > [ < FUZZY_SLOP > ] [ < CARAT > boost =< NUMBER > ] // 被""包含的内容
)
btw: 在JavaCC的语法产生式中,[...]表示可选,即允许出现0次或1次(等价于(...)?);注意这与Token正则定义中表示字符列表的[...]含义不同