词法语法解析

  • 熟练掌握词法、语法的解析流程及原理

openGauss在执行SQL语句时,使用flex,bison对语句进行词法分析,语法分析
词法语法分析的入口函数是raw_parser(parser.cpp),raw_parser调用base_yyparse进行词法语法分析
-> scan.l: 词法文件,由flex编译生成scan.cpp
-> gram.y: 语法文件,由bison编译生成gram.cpp
-> kwlist.h: 列出所有关键字
-> keywords.cpp: 常量定义
-> kwlookup.cpp:二分法确认当前词是否关键字
-> scansup.cpp:词法分析相关函数

  • scan.l
    scan.l识别SQL语句中的关键字,标识符,常量,操作符,终结符等。
// flex由%%分为三个部分
/* 定义段 */
%{
...
%}
...
/* 规则段 */
%%
...
%%
/* 用户子程序段 */

相关数据结构

/* One entry of the keyword table. */
typedef struct ScanKeyword {
    const char *name; /* keyword name, in lower case */
    int16 value;      /* token */
    int16 category;   /* keyword category */
} ScanKeyword;

定义段

%{
... // 定义宏,函数及include的文件
%}

%option reentrant  // 生成可重入的扫描器API
%option bison-bridge // 生成的扫描器API能够被bison调用
%option bison-locations
%option 8bit // 8位扫描器
%option never-interactive // 非交互式
%option nodefault
%option noinput
%option nounput
%option noyywrap // 不调用yywrap()
%option noyyalloc
%option noyyrealloc
%option noyyfree
%option warn
%option prefix="core_yy" // yy开头的函数名替换为core_yy开头

// 定义开始状态,对特定的规则进行匹配
%x xb // 位串
%x xc // 扩展C样式注释
%x xd // 双引号标识符
%x xh // 16进制数字字符串
%x xe // 扩展引号字符串(支持反斜杠转义序列)
%x xq // 标准引用字符串
%x xdolq // $xxx$
%x xui // unicode转义的标识符
%x xus // unicode转义的字符串
%x xeu // 扩展引号字符串中的Unicode代理项对

// 匹配正则表达式
// 空白,换行,注释
space           [ \t\n\r\f]
horiz_space     [ \t\f]
newline         [\n\r]
non_newline     [^\n\r]

comment         ("--"{non_newline}*)

whitespace      ({space}+|{comment})

special_whitespace      ({space}+|{comment}{newline})
horiz_whitespace        ({horiz_space}|{comment})
whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*)

// 引号
quote           '
quotestop       {quote}{whitespace}*
quotecontinue   {quote}{whitespace_with_newline}{quote}
quotefail       {quote}{whitespace}*"-"

// 位串
xbstart         [bB]{quote}
xbinside        [^']*

// 16进制 
xhstart         [xX]{quote}
xhinside        [^']*

// n' 这种
xnstart         [nN]{quote}

/* Quoted string that allows backslash escapes */
xestart         [eE]{quote}  // e' 这种
xeinside        [^\\']+
xeescape        [\\][^0-7]
xeoctesc        [\\][0-7]{1,3}
xehexesc        [\\]x[0-9A-Fa-f]{1,2}
xeunicode       [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
xeunicodefail   [\\](u[0-9A-Fa-f]{0,3}|U[0-9A-Fa-f]{0,7})

/* Extended quote
 * xqdouble implements embedded quote, ''''
 */
xqstart         {quote}
xqdouble        {quote}{quote}
xqinside        [^']+

// $xxx$ 相关
dolq_start      [A-Za-z\200-\377_]
dolq_cont       [A-Za-z\200-\377_0-9]
dolqdelim       \$({dolq_start}{dolq_cont}*)?\$
dolqfailed      \${dolq_start}{dolq_cont}*
dolqinside      [^$]+

// 双引号
dquote          \"
xdstart         {dquote}
xdstop          {dquote}
xddouble        {dquote}{dquote}
xdinside        [^"]+

/* Unicode escapes */
uescape         [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
/* error rule to avoid backup */
uescapefail     ("-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU])

/* Quoted identifier with Unicode escapes */
xuistart        [uU]&{dquote}
xuistop1        {dquote}{whitespace}*{uescapefail}?
xuistop2        {dquote}{whitespace}*{uescape}

/* Quoted string with Unicode escapes */
xusstart        [uU]&{quote}
xusstop1        {quote}{whitespace}*{uescapefail}?
xusstop2        {quote}{whitespace}*{uescape}

/* error rule to avoid backup */
xufailed        [uU]&

// C样式注释
xcstart         \/\*{op_chars}*
xcstop          \*+\/
xcinside        [^*/]+

digit           [0-9]
ident_start     [A-Za-z\200-\377_]
ident_cont      [A-Za-z\200-\377_0-9\$\#]

identifier      {ident_start}{ident_cont}*

typecast        "::"
plus_join       "(+)"
dot_dot         \.\.
colon_equals    ":="
para_equals "=>"

/*
 * "self" is the set of chars that should be returned as single-character
 * tokens.  "op_chars" is the set of chars that can make up "Op" tokens,
 * which can be one or more characters long (but if a single-char token
 * appears in the "self" set, it is not to be returned as an Op).  Note
 * that the sets overlap, but each has some chars that are not in the other.
 *
 * If you change either set, adjust the character lists appearing in the
 * rule for "operator"!
 */
self            [,()\[\].;\:\+\-\*\/\%\^\<\>\=]
op_chars        [\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=]
operator        {op_chars}+

/* we no longer allow unary minus in numbers.
 * instead we pass it separately to parser. there it gets
 * coerced via doNegate() -- Leon aug 20 1999
 *
* {decimalfail} is used because we would like "1..10" to lex as 1, dot_dot, 10.
*
 * {realfail1} and {realfail2} are added to prevent the need for scanner
 * backup when the {real} rule fails to match completely.
 */

integer         {digit}+
decimal         (({digit}*\.{digit}+)|({digit}+\.{digit}*))
decimalfail     {digit}+\.\.
real            ({integer}|{decimal})[Ee][-+]?{digit}+
realfail1       ({integer}|{decimal})[Ee]
realfail2       ({integer}|{decimal})[Ee][-+]

param           \${integer}

newParam        :({identifier}|{integer})

newArray        :({integer}{space}*\])

other           .

规则段

/* 规则 { 执行代码 } */
%%

{whitespace}    { /* 忽略空白,换行,注释 */ }
// 匹配 /* */ 格式的注释
{xcstart}   {
                    SET_YYLLOC(); // 设置当前位置
                    yyextra->xcdepth = 0;
                    BEGIN(xc);
                    /* Put back any characters past slash-star; see above */
                    yyless(2); // 将当前token除前2个字符外的字符返回到输入流
                    if (yyextra->is_hint_str)
                    {
                        startlit();
                        addlit(yytext, yyleng, yyscanner);
                    }
        }

<xc>{xcstart}   {
                    (yyextra->xcdepth)++;
                    /* Put back any characters past slash-star; see above */
                    yyless(2);
                    if (yyextra->is_hint_str)
                    {
                        addlit(yytext, yyleng, yyscanner);
                    }
        }

<xc>{xcstop}    {
                    if (yyextra->xcdepth <= 0)
                        BEGIN(INITIAL);
                    else
                        (yyextra->xcdepth)--;

                    if (yyextra->is_hint_str)
                    {   
                        addlit(yytext, yyleng, yyscanner);
                        yylval->str = litbufdup(yyscanner);
                        yyextra->is_hint_str = false;
                        return COMMENTSTRING;
                    }
        }

<xc>{xcinside}  {
                    if (yyextra->is_hint_str)
                    {
                        addlit(yytext, yyleng, yyscanner);
                    }
        }

<xc>{op_chars}  {
                    if (yyextra->is_hint_str)
                    {
                        addlit(yytext, yyleng, yyscanner);
                    }
        }

<xc>\*+     {
                    if (yyextra->is_hint_str)
                    {
                        addlit(yytext, yyleng, yyscanner);
                    }
        }

<xc><<EOF>>     { yyerror("unterminated /* comment"); }
// 匹配 b''
{xbstart}       {
                    /* Binary bit type.
                     * At some point we should simply pass the string
                     * forward to the parser and label it there.
                     * In the meantime, place a leading "b" on the string
                     * to mark it for the input routine as a binary string.
                     */
                    SET_YYLLOC();
                    BEGIN(xb);
                    startlit();
                    addlitchar('b', yyscanner);
                }
<xb>{quotestop} |
<xb>{quotefail} {
                    yyless(1);
                    BEGIN(INITIAL);
                    yylval->str = litbufdup(yyscanner);
                    yyextra->is_hint_str = false;
                    return BCONST;
        }
<xh>{xhinside}  |
<xb>{xbinside}  {
                    addlit(yytext, yyleng, yyscanner);
                }
<xh>{quotecontinue} |
<xb>{quotecontinue} {
                    /* ignore */
                }
<xb><<EOF>>     { yyerror("unterminated bit string literal"); }
// 匹配16进制  x''
{xhstart}       {
                    /* Hexadecimal bit type.
                     * At some point we should simply pass the string
                     * forward to the parser and label it there.
                     * In the meantime, place a leading "x" on the string
                     * to mark it for the input routine as a hex string.
                     */
                    SET_YYLLOC();
                    BEGIN(xh);
                    startlit();
                    addlitchar('x', yyscanner);
                }
<xh>{quotestop} |
<xh>{quotefail} {
                    yyless(1);
                    BEGIN(INITIAL);
                    yylval->str = litbufdup(yyscanner);
                    yyextra->is_hint_str = false;
                    return XCONST;
        }
<xh><<EOF>>     { yyerror("unterminated hexadecimal string literal"); }
// 匹配n''
{xnstart}       {
                    /* National character.
                     * We will pass this along as a normal character string,
                     * but preceded with an internally-generated "NCHAR".
                     */
                    const ScanKeyword *keyword;

                    SET_YYLLOC();
                    yyless(1);              /* eat only 'n' this time */

                    keyword = ScanKeywordLookup("nchar",
                                                yyextra->keywords,
                                                yyextra->num_keywords);
                    if (keyword != NULL)
                    {
                        yylval->keyword = keyword->name;
                        yyextra->is_hint_str = false;
                        return keyword->value;
                    }
                    else
                    {
                        /* If NCHAR isn't a keyword, just return "n" */
                        yylval->str = pstrdup("n");
                        yyextra->ident_quoted = false;
                        yyextra->is_hint_str = false;
                        return IDENT;
                    }
                }
// 匹配字符串常量:普通字符串 '...',转义字符串 e'...',Unicode转义字符串 U&'...'
{xqstart}       {
                    yyextra->warn_on_first_escape = true;
                    yyextra->saw_non_ascii = false;
                    SET_YYLLOC();
                    if (u_sess->attr.attr_sql.standard_conforming_strings)
                        BEGIN(xq);
                    else
                        BEGIN(xe);
                    startlit();
                }
{xestart}       {
                    yyextra->warn_on_first_escape = false;
                    yyextra->saw_non_ascii = false;
                    SET_YYLLOC();
                    BEGIN(xe);
                    startlit();
                }
{xusstart}      {
                    SET_YYLLOC();
                    if (!u_sess->attr.attr_sql.standard_conforming_strings)
                        ereport(ERROR,
                                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                                 errmsg("unsafe use of string constant with Unicode escapes"),
                                 errdetail("String constants with Unicode escapes cannot be used when standard_conforming_strings is off."),
                                 lexer_errposition()));
                    BEGIN(xus);
                    startlit();
                }
<xq,xe>{quotestop}  |
<xq,xe>{quotefail} {
                    yyless(1);
                    BEGIN(INITIAL);
                    /*
                     * check that the data remains valid if it might have been
                     * made invalid by unescaping any chars.
                     */
                    if (yyextra->saw_non_ascii)
                        pg_verifymbstr(yyextra->literalbuf,
                                       yyextra->literallen,
                                       false);
                    yylval->str = litbufdup(yyscanner);
                    yyextra->is_hint_str = false;
                    return SCONST;
                }
<xus>{xusstop1} {
                    /* throw back all but the quote */
                    yyless(1);
                    BEGIN(INITIAL);
                    yylval->str = litbuf_udeescape('\\', yyscanner);
                    yyextra->is_hint_str = false;
                    return SCONST;
        }
<xus>{xusstop2} {
                    BEGIN(INITIAL);
                    yylval->str = litbuf_udeescape(yytext[yyleng-2], yyscanner);
                    yyextra->is_hint_str = false;
                    return SCONST;
        }
<xq,xe,xus>{xqdouble} {
                    addlitchar('\'', yyscanner);
                }
<xq,xus>{xqinside}  {
                    addlit(yytext, yyleng, yyscanner);
                }
<xe>{xeinside}  {
                    addlit(yytext, yyleng, yyscanner);
                }
<xe>{xeunicode} {
                    pg_wchar c = strtoul(yytext+2, NULL, 16);

                    check_escape_warning(yyscanner);

                    if (is_utf16_surrogate_first(c))
                    {
                        yyextra->utf16_first_part = c;
                        BEGIN(xeu);
                    }
                    else if (is_utf16_surrogate_second(c))
                        yyerror("invalid Unicode surrogate pair");
                    else
                        addunicode(c, yyscanner);
                }
<xeu>{xeunicode} {
                    pg_wchar c = strtoul(yytext+2, NULL, 16);

                    if (!is_utf16_surrogate_second(c))
                        yyerror("invalid Unicode surrogate pair");

                    c = surrogate_pair_to_codepoint(yyextra->utf16_first_part, c);

                    addunicode(c, yyscanner);

                    BEGIN(xe);
                }
<xeu>.          { yyerror("invalid Unicode surrogate pair"); }
<xeu>\n         { yyerror("invalid Unicode surrogate pair"); }
<xeu><<EOF>>    { yyerror("invalid Unicode surrogate pair"); }
<xe,xeu>{xeunicodefail} {
                        ereport(ERROR,
                                (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                                 errmsg("invalid Unicode escape"),
                                 errhint("Unicode escapes must be \\uXXXX or \\UXXXXXXXX."),
                                 lexer_errposition()));
                }
<xe>{xeescape}  {
                    if (yytext[1] == '\'')
                    {
                        if (u_sess->attr.attr_sql.backslash_quote == BACKSLASH_QUOTE_OFF ||
                            (u_sess->attr.attr_sql.backslash_quote == BACKSLASH_QUOTE_SAFE_ENCODING &&
                             PG_ENCODING_IS_CLIENT_ONLY(pg_get_client_encoding())))
                            ereport(ERROR,
                                    (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
                                     errmsg("unsafe use of \\' in a string literal"),
                                     errhint("Use '' to write quotes in strings. \\' is insecure in client-only encodings."),
                                     lexer_errposition()));
                    }
                    check_string_escape_warning(yytext[1], yyscanner);
                    addlitchar(unescape_single_char(yytext[1], yyscanner),
                               yyscanner);
                }
<xe>{xeoctesc}  {
                    unsigned char c = strtoul(yytext+1, NULL, 8);

                    check_escape_warning(yyscanner);
                    addlitchar(c, yyscanner);
                    if (c == '\0' || IS_HIGHBIT_SET(c))
                        yyextra->saw_non_ascii = true;
                }
<xe>{xehexesc}  {
                    unsigned char c = strtoul(yytext+2, NULL, 16);

                    check_escape_warning(yyscanner);
                    addlitchar(c, yyscanner);
                    if (c == '\0' || IS_HIGHBIT_SET(c))
                        yyextra->saw_non_ascii = true;
                }
<xq,xe,xus>{quotecontinue} {
                    /* ignore */
                }
<xe>.           {
                    /* This is only needed for \ just before EOF */
                    addlitchar(yytext[0], yyscanner);
                }
<xq,xe,xus><<EOF>>      { yyerror("unterminated quoted string"); }
// 匹配$xxx$
{dolqdelim}     {
                    SET_YYLLOC();
                    yyextra->dolqstart = pstrdup(yytext);
                    BEGIN(xdolq);
                    startlit();
                }
{dolqfailed}    {
                    SET_YYLLOC();
                    /* throw back all but the initial "$" */
                    yyless(1);
                    /* and treat it as {other} */
                    yyextra->is_hint_str = false;
                    return yytext[0];
                }
<xdolq>{dolqdelim} {
                    if (strcmp(yytext, yyextra->dolqstart) == 0)
                    {
                        FREE_POINTER(yyextra->dolqstart);
                        yyextra->dolqstart = NULL;
                        BEGIN(INITIAL);
                        yylval->str = litbufdup(yyscanner);
                        yyextra->is_hint_str = false;
                        return SCONST;
                    }
                    else
                    {
                        /*
                         * When we fail to match $...$ to dolqstart, transfer
                         * the $... part to the output, but put back the final
                         * $ for rescanning.  Consider $delim$...$junk$delim$
                         */
                        addlit(yytext, yyleng-1, yyscanner);
                        yyless(yyleng-1);
                    }
                }
<xdolq>{dolqinside} {
                    addlit(yytext, yyleng, yyscanner);
                }
<xdolq>{dolqfailed} {
                    addlit(yytext, yyleng, yyscanner);
                }
<xdolq>.        {
                    /* This is only needed for $ inside the quoted text */
                    addlitchar(yytext[0], yyscanner);
                }
<xdolq><<EOF>>  { yyerror("unterminated dollar-quoted string"); }

{xdstart}       {
                    SET_YYLLOC();
                    BEGIN(xd);
                    startlit();
                }
{xuistart}      {
                    SET_YYLLOC();
                    BEGIN(xui);
                    startlit();
                }
<xd>{xdstop}    {
                    char           *ident;

                    BEGIN(INITIAL);
                    if (yyextra->literallen == 0)
                        yyerror("zero-length delimited identifier");
                    ident = litbufdup(yyscanner);
                    if (yyextra->literallen >= NAMEDATALEN)
                        truncate_identifier(ident, yyextra->literallen, yyextra->warnOnTruncateIdent);
                    yylval->str = ident;
                    yyextra->ident_quoted = true;
                    yyextra->is_hint_str = false;
                    return IDENT;
                }
<xui>{xuistop1} {
                    char           *ident;
                    int             identlen;

                    BEGIN(INITIAL);
                    if (yyextra->literallen == 0)
                        yyerror("zero-length delimited identifier");
                    ident = litbuf_udeescape('\\', yyscanner);
                    identlen = strlen(ident);
                    if (identlen >= NAMEDATALEN)
                        truncate_identifier(ident, identlen, yyextra->warnOnTruncateIdent);
                    yylval->str = ident;
                    /* throw back all but the quote */
                    yyless(1);
                    yyextra->ident_quoted = false;
                    yyextra->is_hint_str = false;
                    return IDENT;
                }
<xui>{xuistop2} {
                    char           *ident;
                    int             identlen;

                    BEGIN(INITIAL);
                    if (yyextra->literallen == 0)
                        yyerror("zero-length delimited identifier");
                    ident = litbuf_udeescape(yytext[yyleng - 2], yyscanner);
                    identlen = strlen(ident);
                    if (identlen >= NAMEDATALEN)
                        truncate_identifier(ident, identlen, yyextra->warnOnTruncateIdent);
                    yylval->str = ident;
                    yyextra->ident_quoted = false;
                    yyextra->is_hint_str = false;
                    return IDENT;
                }
<xd,xui>{xddouble}  {
                    addlitchar('"', yyscanner);
                }
<xd,xui>{xdinside}  {
                    addlit(yytext, yyleng, yyscanner);
                }
<xd,xui><<EOF>>     { yyerror("unterminated quoted identifier"); }

{xufailed}  {
                    char           *ident;

                    SET_YYLLOC();
                    /* throw back all but the initial u/U */
                    yyless(1);
                    /* and treat it as {identifier} */
                    ident = downcase_truncate_identifier(yytext, yyleng, yyextra->warnOnTruncateIdent);
                    yylval->str = ident;
                    yyextra->ident_quoted = false;
                    yyextra->is_hint_str = false;
                    return IDENT;
                }
// 匹配 ::
{typecast}      {
                    SET_YYLLOC();
                    yyextra->is_hint_str = false;
                    return TYPECAST;
                }
// 匹配(+)
{plus_join} {
                    SET_YYLLOC();
                    yyextra->is_hint_str = false;
                    return ORA_JOINOP;
                }
// 匹配 ..
{dot_dot}       {
                    SET_YYLLOC();
                    yyextra->is_hint_str = false;
                    return DOT_DOT;
                }
// 匹配 :=
{colon_equals}  {
                    SET_YYLLOC();
                    yyextra->is_hint_str = false;
                    return COLON_EQUALS;
                }
// 匹配 =>
{para_equals}   {
                    SET_YYLLOC();
                    yyextra->is_hint_str = false;
                    return PARA_EQUALS;
                }
// 匹配单字符
{self}          {
                    SET_YYLLOC();
                    /*
                     * Get the semicolon which is not in proc body nor in the '( )', treat it
                     * as end flag of a single query and store it in locationlist.
                     */
                    if (yyextra->dolqstart == NULL)
                    {
                        if (yytext[0] == '(')
                            yyextra->paren_depth++;
                        else if (yytext[0] == ')' && yyextra->paren_depth > 0)
                            yyextra->paren_depth--;
                        else if (yytext[0] == ';' && yyextra->paren_depth == 0 && !yyextra->in_slash_proc_body)
                            yyextra->query_string_locationlist = lappend_int(yyextra->query_string_locationlist, *yylloc);
                    }
                    yyextra->is_hint_str = false;
                    return yytext[0];
                }
// 匹配操作符
{operator}      {
                    /*
                     * Check for embedded slash-star or dash-dash; those
                     * are comment starts, so operator must stop there.
                     * Note that slash-star or dash-dash at the first
                     * character will match a prior rule, not this one.
                     */
                    int     nchars = yyleng;
                    char   *slashstar = strstr(yytext, "/*");
                    char   *dashdash = strstr(yytext, "--");

                    if (slashstar && dashdash)
                    {
                        /* if both appear, take the first one */
                        if (slashstar > dashdash)
                            slashstar = dashdash;
                    }
                    else if (!slashstar)
                        slashstar = dashdash;
                    if (slashstar)
                        nchars = slashstar - yytext;

                    /*
                     * For SQL compatibility, '+' and '-' cannot be the
                     * last char of a multi-char operator unless the operator
                     * contains chars that are not in SQL operators.
                     * The idea is to lex '=-' as two operators, but not
                     * to forbid operator names like '?-' that could not be
                     * sequences of SQL operators.
                     */
                    while (nchars > 1 &&
                           (yytext[nchars-1] == '+' ||
                            yytext[nchars-1] == '-'))
                    {
                        int     ic;

                        for (ic = nchars-2; ic >= 0; ic--)
                        {
                            if (strchr("~!@#^&|`?%", yytext[ic]))
                                break;
                        }
                        if (ic >= 0)
                            break; /* found a char that makes it OK */
                        nchars--; /* else remove the +/-, and check again */
                    }

                    SET_YYLLOC();

                    if (nchars < (int)yyleng)
                    {
                        /* Strip the unwanted chars from the token */
                        yyless(nchars);
                        /*
                         * If what we have left is only one char, and it's
                         * one of the characters matching "self", then
                         * return it as a character token the same way
                         * that the "self" rule would have.
                         */
                        if (nchars == 1 &&
                            strchr(",()[].;:+-*/%^<>=", yytext[0]))
                        {
                            yyextra->is_hint_str = false;
                            return yytext[0];
                        }
                    }

                    /*
                     * Complain if operator is too long.  Unlike the case
                     * for identifiers, we make this an error not a notice-
                     * and-truncate, because the odds are we are looking at
                     * a syntactic mistake anyway.
                     */
                    if (nchars >= NAMEDATALEN)
                        yyerror("operator too long");

                    /* Convert "!=" operator to "<>" for compatibility */
                    if (strcmp(yytext, "!=") == 0 || strcmp(yytext, "^=") == 0)
                    {
                        yylval->str = pstrdup("<>");
                        yyextra->is_hint_str = false;
                        return CmpOp;
                    }
                    else if (strcmp(yytext, ">=") == 0 || strcmp(yytext, "<=") == 0 || strcmp(yytext, "<>") == 0)
                    {
                        yylval->str = pstrdup(yytext);
                        yyextra->is_hint_str = false;
                        return CmpOp;
                    }
                    else
                        yylval->str = pstrdup(yytext);
                    yyextra->is_hint_str = false;
                    return Op;
                }
{newArray}      {
                    yyless(1);
                    yyextra->is_hint_str = false;
                    return yytext[0];
                }
// 匹配 $n
{param}         {
                    SET_YYLLOC();
                    yylval->ival = getDynaParamSeq(yytext + 1, false, false, yyscanner);
                    yyextra->is_hint_str = false;
                    return PARAM;
                }
{newParam}      {
                    SET_YYLLOC();
                    yylval->ival = getDynaParamSeq(yytext + 1, false, true, yyscanner);
                    yyextra->is_hint_str = false;
                    return PARAM;
                }
// 匹配整数
{integer}       {
                    SET_YYLLOC();
                    yyextra->is_hint_str = false;
                    return process_integer_literal(yytext, yylval);
                }
// 匹配浮点数
{decimal}       {
                    SET_YYLLOC();
                    yylval->str = pstrdup(yytext);
                    yyextra->is_hint_str = false;
                    return FCONST;
                }
{decimalfail}   {
                    /* throw back the .., and treat as integer */
                    yyless(yyleng-2);
                    SET_YYLLOC();
                    yyextra->is_hint_str = false;
                    return process_integer_literal(yytext, yylval);
                }
{real}          {
                    SET_YYLLOC();
                    yylval->str = pstrdup(yytext);
                    yyextra->is_hint_str = false;
                    return FCONST;
                }
{realfail1}     {
                    /*
                     * throw back the [Ee], and treat as {decimal}.  Note
                     * that it is possible the input is actually {integer},
                     * but since this case will almost certainly lead to a
                     * syntax error anyway, we don't bother to distinguish.
                     */
                    yyless(yyleng-1);
                    SET_YYLLOC();
                    yylval->str = pstrdup(yytext);
                    yyextra->is_hint_str = false;
                    return FCONST;
                }
{realfail2}     {
                    /* throw back the [Ee][+-], and proceed as above */
                    yyless(yyleng-2);
                    SET_YYLLOC();
                    yylval->str = pstrdup(yytext);
                    yyextra->is_hint_str = false;
                    return FCONST;
                }

// 匹配标识符和关键字
{identifier}    {
                    const ScanKeyword *keyword;
                    char           *ident;

                    SET_YYLLOC();

                    /* 二分法确认是否关键字 */
                    keyword = ScanKeywordLookup(yytext,
                                                yyextra->keywords,
                                                yyextra->num_keywords);

                    yyextra->is_hint_str = false;

                    if (keyword != NULL)
                    {
                        yylval->keyword = keyword->name;

                        /* Find the CREATE PROCEDURE syntax and set dolqstart. */
                        if (keyword->value == CREATE)
                        {
                            yyextra->is_createstmt = true;
                        }
                        else if (keyword->value == TRIGGER && yyextra->is_createstmt)
                        {
                            /* Create trigger don't need set dolqstart */
                            yyextra->is_createstmt = false;
                        }
                        else if ((keyword->value == PROCEDURE || keyword->value == FUNCTION)
                                 && yyextra->is_createstmt)
                        {
                            /* Make yyextra->dolqstart not NULL means its in a proc with $$. */
                            yyextra->dolqstart = "";
                        }
                        else if (keyword->value == BEGIN_P)
                        {
                            /* cases that have to be a trans stmt and fall quickly */
                            if (yyg->yy_hold_char == ';' || /* found ';' after 'begin' */
                                yyg->yy_hold_char == '\0')  /* found '\0' after 'begin' */
                                return BEGIN_NON_ANOYBLOCK;
                            /* look for other transaction stmt */
                            if (is_trans_stmt(yyextra->scanbuf, yyextra->scanbuflen))
                                return BEGIN_NON_ANOYBLOCK;
                        }
                        else if (keyword->value == SELECT ||
                                 keyword->value == UPDATE||
                                 keyword->value == INSERT ||
                                 keyword->value == DELETE_P ||
                                 keyword->value == MERGE)
                        {
                            yyextra->is_hint_str = true;
                        }

                        return keyword->value;
                    }

                     /* 不是关键字,转换为小写,如果长度超过64进行截断 */ 
                    ident = downcase_truncate_identifier(yytext, yyleng, yyextra->warnOnTruncateIdent);
                    yylval->str = ident;
                    yyextra->ident_quoted = false;
                    return IDENT;
                }

{other}         {
                    SET_YYLLOC();
                    yyextra->is_hint_str = false;
                    return yytext[0];
                }

<<EOF>>         {
                    SET_YYLLOC();
                    yyterminate();
                }

%%

程序段


/*
 * Arrange access to yyextra for subroutines of the main yylex() function.
 * We expect each subroutine to have a yyscanner parameter.  Rather than
 * use the yyget_xxx functions, which might or might not get inlined by the
 * compiler, we cheat just a bit and cast yyscanner to the right type.
 *
 * Consequence: every function below that uses yyextra, yylloc or yyleng
 * must have a variable named exactly "yyscanner" in scope.
 */
#undef yyextra
#define yyextra  (((struct yyguts_t *) yyscanner)->yyextra_r)

/* Likewise for a couple of other things we need. */
/* yylloc expands to the yylloc_r member of the scanner state */
#undef yylloc
#define yylloc  (((struct yyguts_t *) yyscanner)->yylloc_r)
/* yyleng expands to the yyleng_r member of the scanner state */
#undef yyleng
#define yyleng  (((struct yyguts_t *) yyscanner)->yyleng_r)


/*
 * scanner_errposition
 *      Report the position of a lexical or grammatical error, if known.
 *
 * location is a byte offset into the scan buffer.  A negative value means
 * the position is unknown; nothing is reported and 0 is returned.
 * Otherwise the offset is converted to a 1-based character number and
 * passed to errposition(), whose result is returned.
 */
int
scanner_errposition(int location, core_yyscan_t yyscanner)
{
    if (location >= 0)
    {
        /* count characters up to the byte offset, then make it 1-based */
        int     charpos = pg_mbstrlen_with_len(yyextra->scanbuf, location) + 1;

        /* hand the character number to the ereport mechanism */
        return errposition(charpos);
    }

    /* unknown location: no-op */
    return 0;
}

/*
 * scanner_yyerror
 *      Raise a syntax error on behalf of the lexer or the grammar.
 *
 * message is the (untranslated) error text.  The text reported as
 * "at or near" is whatever follows the current token location (*yylloc)
 * in the scan buffer; if that location is the end-of-buffer marker the
 * error is reported as occurring at end of input instead.
 */
void
scanner_yyerror(const char *message, core_yyscan_t yyscanner)
{
    const char *near_text = yyextra->scanbuf + *yylloc;

    if (*near_text != YY_END_OF_BUFFER_CHAR)
    {
        ereport(ERROR,
                (errcode(ERRCODE_SYNTAX_ERROR),
                 /* translator: first %s is typically the translation of "syntax error" */
                 errmsg("%s at or near \"%s\"", _(message), near_text),
                 lexer_errposition()));
    }
    else
    {
        ereport(ERROR,
                (errcode(ERRCODE_SYNTAX_ERROR),
                 /* translator: %s is typically the translation of "syntax error" */
                 errmsg("%s at end of input", _(message)),
                 lexer_errposition()));
    }
}

/*
 * scanner_init
 *      Create a flex scanner that will tokenize the string str.
 *
 * str           - NUL-terminated text to scan
 * yyext         - caller-provided extra-data struct; attached to the
 *                 scanner and initialized here
 * keywords      - keyword table to use
 * num_keywords  - number of entries in keywords
 *
 * Returns the new scanner handle.  Raises an error if flex cannot set up
 * its scanner state.
 */
core_yyscan_t
scanner_init(const char *str,
             core_yy_extra_type *yyext,
             const ScanKeyword *keywords,
             int num_keywords)
{
    yyscan_t    lexer;
    Size        input_len = strlen(str);

    /* create the flex scanner state */
    if (yylex_init(&lexer) != 0)
        ereport(ERROR,
                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                    errmsg("yylex_init() failed: %m")));

    /* attach yyext so that yyextra refers to it from now on */
    core_yyset_extra(yyext, lexer);

    /* keyword table */
    yyext->keywords = keywords;
    yyext->num_keywords = num_keywords;

    /* per-scan state starts out cleared */
    yyext->in_slash_proc_body = false;
    yyext->paren_depth = 0;
    yyext->query_string_locationlist = NIL;
    yyext->is_createstmt = false;
    yyext->dolqstart = NULL;
    yyext->is_hint_str = false;
    yyext->parameter_list = NIL;

    /*
     * Copy the input into a private scan buffer that ends with the two
     * end-of-buffer characters flex needs.
     */
    yyext->scanbuf = (char *) palloc(input_len + 2);
    yyext->scanbuflen = input_len;
    memcpy(yyext->scanbuf, str, input_len);
    yyext->scanbuf[input_len] = YY_END_OF_BUFFER_CHAR;
    yyext->scanbuf[input_len + 1] = YY_END_OF_BUFFER_CHAR;
    yy_scan_buffer(yyext->scanbuf, input_len + 2, lexer);

    /* the literal buffer starts at 1kB and is enlarged on demand */
    yyext->literalalloc = 1024;
    yyext->literalbuf = (char *) palloc(yyext->literalalloc);
    yyext->literallen = 0;
    yyext->warnOnTruncateIdent = true;

    /* reset the dynamic-SQL parameter bookkeeping (init mode ignores the other arguments) */
    getDynaParamSeq("init", true, true, NULL);

    return lexer;
}


/*
 * scanner_finish
 *      Release the scanner's working memory once parsing is done.
 *
 * If the thread's clear_key_memory flag is set, the scan buffer and the
 * used part of the literal buffer are first overwritten with 0x7F.
 */
void
scanner_finish(core_yyscan_t yyscanner)
{
    if (t_thrd.postgres_cxt.clear_key_memory)
    {
        errno_t wipe_rc = EOK;

        /* overwrite the copy of the input text */
        memset(yyextra->scanbuf, 0x7F, yyextra->scanbuflen);
        /* NOTE(review): volatile self-assignment presumably keeps the memset from being optimized away -- confirm */
        *(volatile char*)(yyextra->scanbuf) = *(volatile char*)(yyextra->scanbuf);

        /* overwrite the literal text collected so far */
        wipe_rc = memset_s(yyextra->literalbuf, yyextra->literallen, 0x7F, yyextra->literallen);
        securec_check(wipe_rc, "\0", "\0");
    }

    /*
     * yylex_destroy() is deliberately not called: it would only pfree a
     * small amount of control storage, and leaking that until the parsing
     * context goes away is cheaper.  It is usually negligible next to the
     * parse tree anyway.
     *
     * The scan buffer and the literal buffer are released, but only when
     * they are big enough to matter.  The 8K threshold is arbitrary.
     */
    if (yyextra->scanbuflen >= 8192)
    {
        FREE_POINTER(yyextra->scanbuf);
    }
    if (yyextra->literalalloc >= 8192)
    {
        FREE_POINTER(yyextra->literalbuf);
    }

    /* drop the placeholder names collected for dynamic SQL, if any */
    if (yyextra->parameter_list != NIL)
    {
        list_free_deep(yyextra->parameter_list);
        yyextra->parameter_list = NIL;
    }
}


/*
 * addlit
 *      Append yleng bytes starting at ytext to the literal buffer,
 *      enlarging the buffer first when the data would not leave at least
 *      one spare byte.
 */
static void
addlit(char *ytext, int yleng, core_yyscan_t yyscanner)
{
    if ((yyextra->literallen + yleng) >= yyextra->literalalloc)
    {
        /* the test above just succeeded, so this loop doubles at least once */
        while ((yyextra->literallen + yleng) >= yyextra->literalalloc)
            yyextra->literalalloc *= 2;

        /*
         * Once the text exceeds 512M the doubled size is above 1G, which
         * is why repalloc_huge is used here.
         */
        yyextra->literalbuf = (char *) repalloc_huge(yyextra->literalbuf,
                                                     yyextra->literalalloc);
    }

    /* copy the new bytes behind the existing content */
    memcpy(yyextra->literalbuf + yyextra->literallen, ytext, yleng);
    yyextra->literallen += yleng;
}


/*
 * addlitchar
 *      Append the single byte ychar to the literal buffer, doubling the
 *      buffer first when no spare byte would remain.
 *
 * The buffer is the same one addlit() grows, so it is enlarged with
 * repalloc_huge here as well: a literal that addlit() has already pushed
 * past 512M would otherwise hit the ordinary allocation size limit the
 * next time a single character triggers the doubling.
 */
static void
addlitchar(unsigned char ychar, core_yyscan_t yyscanner)
{
    /* enlarge buffer if needed */
    if ((yyextra->literallen + 1) >= yyextra->literalalloc)
    {
        yyextra->literalalloc *= 2;
        yyextra->literalbuf = (char *) repalloc_huge(yyextra->literalbuf,
                                                     yyextra->literalalloc);
    }
    /* append new data */
    yyextra->literalbuf[yyextra->literallen] = ychar;
    yyextra->literallen += 1;
}


/*
 * litbufdup
 *      Return a freshly palloc'd, NUL-terminated copy of the current
 *      contents of the literal buffer.
 */
static char *
litbufdup(core_yyscan_t yyscanner)
{
    int         nbytes = yyextra->literallen;
    char       *copy = (char *)palloc(nbytes + 1);

    memcpy(copy, yyextra->literalbuf, nbytes);
    copy[nbytes] = '\0';

    return copy;
}

/*
 * process_integer_literal
 *      Classify a string of decimal digits.
 *
 * If the whole token converts to a value that fits in an int32, the value
 * is stored in lval->ival and ICONST is returned.  Otherwise a copy of the
 * token text is stored in lval->str and FCONST is returned.
 */
static int
process_integer_literal(const char *token, YYSTYPE *lval)
{
    char       *stop;
    long        parsed;

    errno = 0;
    parsed = strtol(token, &stop, 10);

    if (*stop == '\0' && errno != ERANGE
#ifdef HAVE_LONG_INT_64
        /* long is wider than 32 bits: the value must also survive an int4 round trip */
        && parsed == (long) ((int32) parsed)
#endif
        )
    {
        lval->ival = parsed;
        return ICONST;
    }

    /* too large for an integer: keep the text and treat it as a float */
    lval->str = pstrdup(token);
    return FCONST;
}

/*
 * hexval
 *      Return the numeric value (0..15) of the hexadecimal digit c.
 *      Raises an error if c is not a hexadecimal digit.
 */
static unsigned int
hexval(unsigned char c)
{
    if (c >= 'A' && c <= 'F')
    {
        return c - 'A' + 0xA;
    }
    if (c >= 'a' && c <= 'f')
    {
        return c - 'a' + 0xA;
    }
    if (c >= '0' && c <= '9')
    {
        return c - '0';
    }

    ereport(ERROR,
        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
        errmsg("invalid hexadecimal digit")));
    return 0; /* not reached */
}

/*
 * check_unicode_value
 *      Reject a Unicode escape above 007F unless the database encoding is
 *      UTF8.
 *
 * c is the code point; loc points at the escape inside the literal buffer
 * and is used only to advance the error location before reporting.
 */
static void
check_unicode_value(pg_wchar c, const char *loc, core_yyscan_t yyscanner)
{
    /* with a UTF8 database every value is acceptable at this stage */
    if (GetDatabaseEncoding() != PG_UTF8 && c > 0x7F)
    {
        ADVANCE_YYLLOC(loc - yyextra->literalbuf + 3);   /* 3 for U&" */
        yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
    }
}

/* True if c lies in D800..DBFF, i.e. is the first half of a UTF-16 surrogate pair. */
static bool
is_utf16_surrogate_first(pg_wchar c)
{
    if (c < 0xD800)
        return false;
    return c <= 0xDBFF;
}

/* True if c lies in DC00..DFFF, i.e. is the second half of a UTF-16 surrogate pair. */
static bool
is_utf16_surrogate_second(pg_wchar c)
{
    if (c < 0xDC00)
        return false;
    return c <= 0xDFFF;
}

/*
 * Combine the two halves of a UTF-16 surrogate pair into one code point:
 * the low 10 bits of first become the upper part, the low 10 bits of
 * second the lower part, offset by 0x10000.
 */
static pg_wchar
surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
{
    pg_wchar    upper_bits = (first & 0x3FF) << 10;
    pg_wchar    lower_bits = second & 0x3FF;

    return upper_bits + 0x10000 + lower_bits;
}

/*
 * addunicode
 *      Append code point c to the literal buffer in UTF-8 form.
 *
 * Zero and values above 10FFFF are rejected.  Values above 007F are only
 * accepted when the database encoding is UTF8, and they set the
 * saw_non_ascii flag.
 */
static void
addunicode(pg_wchar c, core_yyscan_t yyscanner)
{
    char    utf8[8];
    int     nbytes;

    if (c == 0 || c > 0x10FFFF)
    {
        yyerror("invalid Unicode escape value");
    }

    if (c > 0x7F)
    {
        if (GetDatabaseEncoding() != PG_UTF8)
        {
            yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
        }
        yyextra->saw_non_ascii = true;
    }

    /* encode, then append exactly as many bytes as the encoding produced */
    unicode_to_utf8(c, (unsigned char *) utf8);
    nbytes = pg_mblen(utf8);
    addlit(utf8, nbytes, yyscanner);
}

/*
 * litbuf_udeescape
 *      Return a new palloc'd, NUL-terminated string built from the literal
 *      buffer with its Unicode escape sequences expanded to UTF-8.
 *
 * escape is the escape character.  Three forms are recognized:
 *      <escape><escape>    one literal escape character
 *      <escape>XXXX        code point given by four hex digits
 *      <escape>+XXXXXX     code point given by six hex digits
 * A first (high) UTF-16 surrogate must be followed directly by an escaped
 * second (low) surrogate; the two are combined into one code point.
 * Violations are reported through yyerror(), in most cases after the error
 * location has been advanced to the offending position.
 */
static char *
litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner)
{
    char *newm;
    char *litbuf, *in, *out;
    pg_wchar pair_first = 0;    /* pending first surrogate, or 0 if none */

    /* The escape character may not be a hex digit, '+', a quote or whitespace */
    if (isxdigit(escape)
        || escape == '+'
        || escape == '\''
        || escape == '"'
        || scanner_isspace(escape))
    {
        ADVANCE_YYLLOC(yyextra->literallen + yyleng + 1);
        yyerror("invalid Unicode escape character");
    }

    /* Make literalbuf null-terminated to simplify the scanning loop */
    litbuf = yyextra->literalbuf;
    litbuf[yyextra->literallen] = '\0';

    /*
     * This relies on the subtle assumption that a UTF-8 expansion
     * cannot be longer than its escaped representation.
     */
    newm = (char *)palloc(yyextra->literallen + 1);

    /* in walks the escaped input, out the decoded output */
    in = litbuf;
    out = newm;
    while (*in)
    {
        if (in[0] == escape)
        {
            if (in[1] == escape)
            {
                /* doubled escape character: emit it once */
                if (pair_first)
                {
                    ADVANCE_YYLLOC(in - litbuf + 3);   /* 3 for U&" */
                    yyerror("invalid Unicode surrogate pair");
                }
                *out++ = escape;
                in += 2;
            }
            else if (isxdigit((unsigned char) in[1]) &&
                     isxdigit((unsigned char) in[2]) &&
                     isxdigit((unsigned char) in[3]) &&
                     isxdigit((unsigned char) in[4]))
            {
                /* four-hex-digit form */
                pg_wchar unicode;

                unicode = (hexval(in[1]) << 12) +
                    (hexval(in[2]) << 8) +
                    (hexval(in[3]) << 4) +
                    hexval(in[4]);
                check_unicode_value(unicode, in, yyscanner);
                if (pair_first)
                {
                    /* a first surrogate is pending: this must be its second half */
                    if (is_utf16_surrogate_second(unicode))
                    {
                        unicode = surrogate_pair_to_codepoint(pair_first, unicode);
                        pair_first = 0;
                    }
                    else
                    {
                        ADVANCE_YYLLOC(in - litbuf + 3);   /* 3 for U&" */
                        yyerror("invalid Unicode surrogate pair");
                    }
                }
                else if (is_utf16_surrogate_second(unicode))
                    yyerror("invalid Unicode surrogate pair");

                /* a first surrogate is held back until its partner arrives */
                if (is_utf16_surrogate_first(unicode))
                    pair_first = unicode;
                else
                {
                    unicode_to_utf8(unicode, (unsigned char *) out);
                    out += pg_mblen(out);
                }
                in += 5;    /* escape character plus four digits */
            }
            else if (in[1] == '+' &&
                     isxdigit((unsigned char) in[2]) &&
                     isxdigit((unsigned char) in[3]) &&
                     isxdigit((unsigned char) in[4]) &&
                     isxdigit((unsigned char) in[5]) &&
                     isxdigit((unsigned char) in[6]) &&
                     isxdigit((unsigned char) in[7]))
            {
                /* '+' followed by six hex digits */
                pg_wchar unicode;

                unicode = (hexval(in[2]) << 20) +
                    (hexval(in[3]) << 16) +
                    (hexval(in[4]) << 12) +
                    (hexval(in[5]) << 8) +
                    (hexval(in[6]) << 4) +
                    hexval(in[7]);
                check_unicode_value(unicode, in, yyscanner);
                if (pair_first)
                {
                    /* a first surrogate is pending: this must be its second half */
                    if (is_utf16_surrogate_second(unicode))
                    {
                        unicode = surrogate_pair_to_codepoint(pair_first, unicode);
                        pair_first = 0;
                    }
                    else
                    {
                        ADVANCE_YYLLOC(in - litbuf + 3);   /* 3 for U&" */
                        yyerror("invalid Unicode surrogate pair");
                    }
                }
                else if (is_utf16_surrogate_second(unicode))
                    yyerror("invalid Unicode surrogate pair");

                /* a first surrogate is held back until its partner arrives */
                if (is_utf16_surrogate_first(unicode))
                    pair_first = unicode;
                else
                {
                    unicode_to_utf8(unicode, (unsigned char *) out);
                    out += pg_mblen(out);
                }
                in += 8;    /* escape character, '+', and six digits */
            }
            else
            {
                /* escape character followed by none of the accepted forms */
                ADVANCE_YYLLOC(in - litbuf + 3);   /* 3 for U&" */
                yyerror("invalid Unicode escape value");
            }
        }
        else
        {
            /* ordinary byte: not allowed while a first surrogate is pending */
            if (pair_first)
            {
                ADVANCE_YYLLOC(in - litbuf + 3);   /* 3 for U&" */
                yyerror("invalid Unicode surrogate pair");
            }
            *out++ = *in++;
        }
    }

    /* unfinished surrogate pair? */
    if (pair_first)
    {
        ADVANCE_YYLLOC(in - litbuf + 3);            /* 3 for U&" */
        yyerror("invalid Unicode surrogate pair");
    }

    *out = '\0';
    /*
     * We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII
     * codes; but it's probably not worth the trouble, since this isn't
     * likely to be a performance-critical path.
     */
    pg_verifymbstr(newm, out - newm, false);
    return newm;
}

/*
 * unescape_single_char
 *      Translate the character that follows a backslash.
 *
 * b, f, n, r and t map to the corresponding control characters; any other
 * character is returned unchanged.  In that fallback case a NUL or a byte
 * with the high bit set also sets the saw_non_ascii flag.
 */
static unsigned char
unescape_single_char(unsigned char c, core_yyscan_t yyscanner)
{
    unsigned char   translated = c;

    switch (c)
    {
        case 'b':
            translated = '\b';
            break;
        case 'f':
            translated = '\f';
            break;
        case 'n':
            translated = '\n';
            break;
        case 'r':
            translated = '\r';
            break;
        case 't':
            translated = '\t';
            break;
        default:
            /* backslash followed by something that is not 7-bit ASCII */
            if (c == '\0' || IS_HIGHBIT_SET(c))
                yyextra->saw_non_ascii = true;
            break;
    }

    return translated;
}

/*
 * check_string_escape_warning
 *      Possibly warn about a backslash escape found in a string literal.
 *
 * ychar is the escaped character.  \' and \\ have dedicated messages; all
 * other characters are delegated to check_escape_warning().  A warning is
 * emitted only if warn_on_first_escape is still set and the
 * escape_string_warning setting is on; the flag is then cleared so that
 * each string warns at most once.
 */
static void
check_string_escape_warning(unsigned char ychar, core_yyscan_t yyscanner)
{
    switch (ychar)
    {
        case '\'':
            if (yyextra->warn_on_first_escape && u_sess->attr.attr_sql.escape_string_warning)
                ereport(WARNING,
                        (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
                         errmsg("nonstandard use of \\' in a string literal"),
                         errhint("Use '' to write quotes in strings, or use the escape string syntax (E'...')."),
                         lexer_errposition()));
            yyextra->warn_on_first_escape = false;  /* warn only once per string */
            break;

        case '\\':
            if (yyextra->warn_on_first_escape && u_sess->attr.attr_sql.escape_string_warning)
                ereport(WARNING,
                        (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
                         errmsg("nonstandard use of \\\\ in a string literal"),
                         errhint("Use the escape string syntax for backslashes, e.g., E'\\\\'."),
                         lexer_errposition()));
            yyextra->warn_on_first_escape = false;  /* warn only once per string */
            break;

        default:
            check_escape_warning(yyscanner);
            break;
    }
}

/*
 * check_escape_warning
 *      Emit the generic "nonstandard use of escape" warning if
 *      warn_on_first_escape is still set and the escape_string_warning
 *      setting is on.  The flag is cleared in every case, so a string
 *      warns at most once.
 */
static void
check_escape_warning(core_yyscan_t yyscanner)
{
    if (yyextra->warn_on_first_escape)
    {
        if (u_sess->attr.attr_sql.escape_string_warning)
        {
            ereport(WARNING,
                    (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
                     errmsg("nonstandard use of escape in a string literal"),
                     errhint("Use the escape string syntax for escapes, e.g., E'\\r\\n'."),
                     lexer_errposition()));
        }
    }

    /* warn only once per string */
    yyextra->warn_on_first_escape = false;
}

/*
 * Interface functions to make flex use palloc() instead of malloc().
 * It'd be better to make these static, but flex insists otherwise.
 */

/* Allocation hook for flex: obtain bytes bytes from palloc(). yyscanner is unused. */
void *
core_yyalloc(yy_size_t bytes, core_yyscan_t yyscanner)
{
    void   *chunk = palloc(bytes);

    return chunk;
}

/*
 * Reallocation hook for flex.  A NULL ptr is treated as a plain
 * allocation; otherwise the chunk is resized with repalloc().
 * yyscanner is unused.
 */
void *
core_yyrealloc(void *ptr, yy_size_t bytes, core_yyscan_t yyscanner)
{
    if (ptr == NULL)
        return palloc(bytes);

    return repalloc(ptr, bytes);
}

/* Release hook for flex: free ptr unless it is NULL. yyscanner is unused. */
void
core_yyfree(void *ptr, core_yyscan_t yyscanner)
{
    if (ptr == NULL)
        return;

    FREE_POINTER(ptr);
}


/*
 * @Description: work out the sequence number of a dynamic-SQL parameter
 * @in string: parameter name, or the digits of a dollar parameter
 * @in initflag: when true, only reset the session's has_dollar and
 *               has_placeholder flags and return 0; the other arguments
 *               are not looked at
 * @in placeholder: true for a named placeholder, false for a dollar parameter
 * @in yyscanner: gives access to yyextra (only used for placeholders)
 * @return - for a dollar parameter, the number parsed from string; for a
 *           placeholder, the 1-based position of its name in
 *           yyextra->parameter_list (the name is appended if not yet there)
 *
 * Mixing the two parameter styles in one statement raises a syntax error.
 */
long 
getDynaParamSeq(const char *string, bool initflag, bool placeholder, core_yyscan_t yyscanner)
{
    int position = 1;
    const ListCell *cell;

    /* init mode: reset the bookkeeping and leave */
    if (initflag)
    { 
        u_sess->parser_cxt.has_dollar = false;
        u_sess->parser_cxt.has_placeholder = false;
        return 0;
    }

    /* dollar parameter: its number is written in the text itself */
    if (!placeholder)
    {
        if (u_sess->parser_cxt.has_placeholder)
        {
            ereport(ERROR, 
                    (errcode(ERRCODE_SYNTAX_ERROR), 
                    errmsg("It is forbidden to use placeholder and dollar quoting together.")));
        }
        u_sess->parser_cxt.has_dollar = true;
        return atol(string);
    }

    /* named placeholder */
    u_sess->parser_cxt.has_placeholder = true;
    if (u_sess->parser_cxt.has_dollar)
    {
        ereport(ERROR, 
                (errcode(ERRCODE_SYNTAX_ERROR), 
                errmsg("It is forbidden to use placeholder and dollar quoting together.")));
    }

    /* a name seen before keeps the position it was given then */
    foreach(cell, yyextra->parameter_list)
    {
        if (strcmp((char*)(lfirst(cell)),string) == 0)
            return position;
        position++;
    }

    /* new name: remember a copy; position is now list length + 1 */
    yyextra->parameter_list = lappend(yyextra->parameter_list, (void*)pstrdup(string));

    return position;
}

/*
 * @Description: after BEGIN has been found, decide whether the statement is a transaction statement
 * @param[IN] haystack: the given source string
 * @param[IN] haystack_len: the length of haystack. Note that haystack may have been separated into words by '\0',
 *                          so haystack_len is needed.
 * @return: true if it is a transaction stmt, false if not.
 *
 * The buffer is treated as a sequence of '\0'-separated segments ("lines"). Only the second segment is examined:
 * if its first non-blank text starts with one of the tokens that can follow BEGIN in a transaction statement,
 * the answer is true.
 *
 * There is a tricky case in which we receive a sql like "begin   " which is not terminated with ';' and is
 * followed by several blank chars. The variable 'found_non_blank_char' handles it: if no non-blank char is
 * found after begin, the statement is considered a transaction stmt.
 */
static bool
is_trans_stmt(const char *haystack, int haystack_len)
{
    char *tempstr = (char *)palloc0(haystack_len + 1);
    char *temp = tempstr;
    int line = 1; /* lineno of haystack which split by \0 */
    bool found_non_blank_char = false; /* mark if we find a non blank char after begin */
    errno_t rc = EOK;

    /* work on a private, zero-terminated copy of the buffer */
    rc = memcpy_s(tempstr, haystack_len + 1, haystack, haystack_len);
    /* memcpy_s returns an errno_t, so check it the same way scanner_finish checks memset_s */
    securec_check(rc, "\0", "\0");

    /* find if the 2nd line is prefixed by a valid transaction token */
    while (temp < tempstr + haystack_len)
    {
        /* there may be '\0' in the string, and should be skipped */
        if (*temp == '\0')
        {
            temp++;
            line++;
            /* we only search the 2nd line */
            if (line > 2)
                break;
        }
        /*
         * skip the blank char; the cast keeps bytes with the high bit set
         * (e.g. multibyte characters) from reaching isspace() as negative values
         */
        else if (isspace((unsigned char) *temp))
        {
            temp++;
        }
        else
        {
            /* we found a non blank char after begin, do further checking */
            if (line == 2)
                found_non_blank_char = true;
            /* For a transaction statement, all possible tokens after BEGIN are here */
            if (line == 2 &&(pg_strncasecmp(temp, "transaction", strlen("transaction")) == 0 ||
                              pg_strncasecmp(temp, "work", strlen("work")) == 0 ||
                              pg_strncasecmp(temp, "isolation", strlen("isolation")) == 0 ||
                              pg_strncasecmp(temp, "read", strlen("read")) == 0 ||
                              pg_strncasecmp(temp, "deferrable", strlen("deferrable")) == 0 ||
                              pg_strncasecmp(temp, "not", strlen("not")) == 0 ||
                              pg_strncasecmp(temp, ";", strlen(";")) == 0))
            {
                FREE_POINTER(tempstr);
                return true;
            }

            /* jump to the '\0' that ends the current segment */
            temp += strlen(temp);
        }
    }

    FREE_POINTER(tempstr);

    /*
     * if all the chars after begin are blank
     *    it is a trans stmt
     * else
     *    it is an anonymous block stmt
     */
    return !found_non_blank_char;
}

  • gram.y
    gram.y使用词法分析出的词(token)去匹配相应的语法规则,如果匹配成功,则生成抽象语法树。
    由于语法较多,这里以select语句解析为例
    相关数据结构
/* Parse-tree node for a SELECT statement (also used for VALUES lists and set operations). */
typedef struct SelectStmt {
    NodeTag type;

    /*
     * These fields are used only in "leaf" SelectStmts.
     */
    List *distinctClause;   /* DISTINCT clause */
    IntoClause *intoClause; /* target of SELECT INTO */
    List *targetList;       /* the columns/expressions to select */
    List *fromClause;       /* FROM clause */
    Node *whereClause;      /* WHERE clause */
    List *groupClause;      /* GROUP BY clause */
    Node *havingClause;     /* HAVING condition */
    List *windowClause;     /* WINDOW clause */
    WithClause *withClause; /* WITH clause */

    /*
     * In a "leaf" node representing a VALUES list, the above fields are all
     * null, and instead this field is set.  Note that the elements of the
     * sublists are just expressions, without ResTarget decoration. Also note
     * that a list element can be DEFAULT (represented as a SetToDefault
     * node), regardless of the context of the VALUES list. It's up to parse
     * analysis to reject that where not valid.
     */
    List *valuesLists; /* untransformed list of expression lists */

    /*
     * These fields are used in both "leaf" SelectStmts and upper-level
     * SelectStmts.
     */
    List *sortClause;    /* sort clause */
    Node *limitOffset;   /* LIMIT offset */
    Node *limitCount;    /* LIMIT: number of rows to return */
    List *lockingClause; /* locking clause */
    HintState *hintState;

    /*
     * These fields are used only in upper-level SelectStmts.
     */
    SetOperation op;         /* set operation */
    bool all;                /* ALL specified? */
    struct SelectStmt *larg; /* left child */
    struct SelectStmt *rarg; /* right child */

    /*
     * These fields are used by operator "(+)"
     */
    bool hasPlus;
    /* Eventually add fields for CORRESPONDING spec here */
} SelectStmt;

定义段

%{
... 定义宏,数据结构,函数及include文件
%}

/* generate a reentrant (pure) parser */
%pure-parser
/* the grammar is expected to have no conflicts */
%expect 0
/* generated symbols start with base_yy instead of yy */
%name-prefix="base_yy"
/* enable token location tracking */
%locations

/* the scanner handle is passed to the parser and on to the lexer */
%parse-param {core_yyscan_t yyscanner}
%lex-param   {core_yyscan_t yyscanner}

/* The set of types a semantic value (yylval) can have */
%union
{
    core_YYSTYPE        core_yystype;
    /* these fields must match core_YYSTYPE: */
    int                 ival;
    char                *str;
    const char          *keyword;

    char                chr;
    bool                boolean;
    JoinType            jtype;
    DropBehavior        dbehavior;
    OnCommitAction      oncommit;
    List                *list;
    Node                *node;
    Value               *value;
    ObjectType          objtype;
    TypeName            *typnam;
    FunctionParameter   *fun_param;
    FunctionParameterMode fun_param_mode;
    FuncWithArgs        *funwithargs;
    DefElem             *defelt;
    SortBy              *sortby;
    WindowDef           *windef;
    JoinExpr            *jexpr;
    IndexElem           *ielem;
    Alias               *alias;
    RangeVar            *range;
    IntoClause          *into;
    WithClause          *with;
    A_Indices           *aind;
    ResTarget           *target;
    struct PrivTarget   *privtarget;
    AccessPriv          *accesspriv;
    InsertStmt          *istmt;
    VariableSetStmt     *vsetstmt;
/* PGXC_BEGIN */
    DistributeBy        *distby;
    PGXCSubCluster      *subclus;
/* PGXC_END */
    ForeignPartState    *foreignpartby;
    MergeWhenClause     *mergewhen;
    UpsertClause *upsert;
    EncryptionType algtype;
}

/*
 * %type gives each nonterminal the %union member that holds its semantic
 * value.  The <member> tag is required: without it the $$ / $n references
 * in the rules have no declared type.
 */
%type <node>    stmt schema_stmt
        AlterDatabaseStmt AlterDatabaseSetStmt AlterDataSourceStmt 
...
/*
 * %token declares the terminal symbols (tokens) that the lexer returns to
 * the parser, again tagged with the %union member carrying their value.
 */
%token <str>    IDENT FCONST SCONST BCONST XCONST Op CmpOp COMMENTSTRING
...
/* Keyword tokens */
%token <keyword> ABORT_P ABSOLUTE_P ACCESS ACCOUNT ACTION ADD_P ADMIN AFTER
...

/*
 * Precedence: lowest to highest
 *
 * Each %left / %right / %nonassoc line below declares one precedence level
 * together with its associativity; a later line binds tighter than an
 * earlier one.
 */
%nonassoc   PARTIAL_EMPTY_PREC
%nonassoc   CLUSTER
%nonassoc   SET             /* see relation_expr_opt_alias */
%left       UNION EXCEPT MINUS_P
%left       INTERSECT
%left       OR
%left       AND
%right      NOT
%right      '='
%nonassoc   '<' '>' CmpOp
%nonassoc   LIKE ILIKE SIMILAR
%nonassoc   ESCAPE
%nonassoc   OVERLAPS
%nonassoc   BETWEEN
%nonassoc   IN_P
%left       POSTFIXOP       /* dummy for postfix Op rules */
/*
 * To support target_el without AS, we must give IDENT an explicit priority
 * between POSTFIXOP and Op.  We can safely assign the same priority to
 * various unreserved keywords as needed to resolve ambiguities (this can't
 * have any bad effects since obviously the keywords will still behave the
 * same as if they weren't keywords).  We need to do this for PARTITION,
 * RANGE, ROWS to support opt_existing_window_name; and for RANGE, ROWS
 * so that they can follow a_expr without creating postfix-operator problems;
 * and for NULL so that it can follow b_expr in ColQualList without creating
 * postfix-operator problems.
 *
 * To support CUBE and ROLLUP in GROUP BY without reserving them, we give them
 * an explicit priority lower than '(', so that a rule with CUBE '(' will shift
 * rather than reducing a conflicting rule that takes CUBE as a function name.
 * Using the same precedence as IDENT seems right for the reasons given above.
 *
 * The frame_bound productions UNBOUNDED PRECEDING and UNBOUNDED FOLLOWING
 * are even messier: since UNBOUNDED is an unreserved keyword (per spec!),
 * there is no principled way to distinguish these from the productions
 * a_expr PRECEDING/FOLLOWING.  We hack this up by giving UNBOUNDED slightly
 * lower precedence than PRECEDING and FOLLOWING.  At present this doesn't
 * appear to cause UNBOUNDED to be treated differently from other unreserved
 * keywords anywhere else in the grammar, but it's definitely risky.  We can
 * blame any funny behavior of UNBOUNDED on the SQL standard, though.
 */
%nonassoc   UNBOUNDED       /* ideally should have same precedence as IDENT */
%nonassoc   IDENT NULL_P PARTITION RANGE ROWS PRECEDING FOLLOWING CUBE ROLLUP
%left       Op OPERATOR     /* multi-character ops and user-defined operators */
%nonassoc   NOTNULL
%nonassoc   ISNULL
%nonassoc   IS              /* sets precedence for IS NULL, etc */
%left       '+' '-'
%left       '*' '/' '%'
%left       '^'
/* Unary Operators */
%left       AT              /* sets precedence for AT TIME ZONE */
%left       COLLATE
%right      UMINUS
%left       '[' ']'
%left       '(' ')'
%left       TYPECAST
%left       '.'
/*
 * These might seem to be low-precedence, but actually they are not part
 * of the arithmetic hierarchy at all in their use as JOIN operators.
 * We make them high-precedence to support their use as function names.
 * They wouldn't be given a precedence at all, were it not that we need
 * left-associativity among the JOIN rules themselves.
 */
%left       JOIN CROSS LEFT FULL RIGHT INNER_P NATURAL ENCRYPTED
/* kluge to keep xml_whitespace_option from causing shift/reduce conflicts */
%right      PRESERVE STRIP_P

规则段

/* When this rule is reduced, store the value of stmtmulti as the parse tree in the parser's extra data. */
stmtblock:  stmtmulti
            {
                pg_yyget_extra(yyscanner)->parsetree = $1;
            }
        ;
...
/* SELECT statement: a select with or without enclosing parentheses; both alternatives take the precedence of UMINUS. */
SelectStmt: select_no_parens            %prec UMINUS
            | select_with_parens        %prec UMINUS
        ;

/* A parenthesized select; any number of nested parentheses is allowed, and the value is that of the inner select. */
select_with_parens:
            '(' select_no_parens ')'                { $$ = $2; }
            | '(' select_with_parens ')'            { $$ = $2; }
        ;

/*
 * A select without enclosing parentheses.  Each alternative other than the
 * first passes the optional sort, locking, limit (offset and count taken
 * from a two-element list) and WITH clauses to insertSelectOptions() and
 * yields the select_clause node.
 */
select_no_parens:
            simple_select                       { $$ = $1; } /* plain query */
            | select_clause sort_clause /* with a sort clause */
                {
                    insertSelectOptions((SelectStmt *) $1, $2, NIL,
                                        NULL, NULL, NULL,
                                        yyscanner);
                    $$ = $1;
                }
            | select_clause opt_sort_clause for_locking_clause opt_select_limit  /* with a locking clause */
                {
                    insertSelectOptions((SelectStmt *) $1, $2, $3,
                                        (Node*)list_nth($4, 0), (Node*)list_nth($4, 1),
                                        NULL,
                                        yyscanner);
                    $$ = $1;
                }
            | select_clause opt_sort_clause select_limit opt_for_locking_clause /* with a limit clause */
                {
                    insertSelectOptions((SelectStmt *) $1, $2, $4,
                                        (Node*)list_nth($3, 0), (Node*)list_nth($3, 1),
                                        NULL,
                                        yyscanner);
                    $$ = $1;
                }
            | with_clause select_clause  /* WITH (CTE) */
                {
                    insertSelectOptions((SelectStmt *) $2, NULL, NIL,
                                        NULL, NULL,
                                        $1,
                                        yyscanner);
                    $$ = $2;
                }
            | with_clause select_clause sort_clause /* CTE + sort clause */
                {
                    insertSelectOptions((SelectStmt *) $2, $3, NIL,
                                        NULL, NULL,
                                        $1,
                                        yyscanner);
                    $$ = $2;
                }
            | with_clause select_clause opt_sort_clause for_locking_clause opt_select_limit  /* CTE + locking clause */
                {
                    insertSelectOptions((SelectStmt *) $2, $3, $4,
                                        (Node*)list_nth($5, 0), (Node*)list_nth($5, 1),
                                        $1,
                                        yyscanner);
                    $$ = $2;
                }
            | with_clause select_clause opt_sort_clause select_limit opt_for_locking_clause /* CTE + limit clause */
                {
                    insertSelectOptions((SelectStmt *) $2, $3, $5,
                                        (Node*)list_nth($4, 0), (Node*)list_nth($4, 1),
                                        $1,
                                        yyscanner);
                    $$ = $2;
                }
        ;

/* The operand of the clauses above and of set operations: a simple select or a parenthesized select. */
select_clause:
            simple_select                           { $$ = $1; }
            | select_with_parens                    { $$ = $1; }
        ;

// Simple query: a plain SELECT, a VALUES list, TABLE name, or a set operation.
simple_select:
            SELECT hint_string opt_distinct target_list
            into_clause from_clause where_clause
            group_clause having_clause window_clause
                {
                    // Build a new SelectStmt node and attach each parsed clause
                    SelectStmt *n = makeNode(SelectStmt);
                    n->distinctClause = $3;
                    n->targetList = $4;
                    n->intoClause = $5;
                    n->fromClause = $6;
                    n->whereClause = $7;
                    n->groupClause = $8;
                    n->havingClause = $9;
                    n->windowClause = $10;
                    n->hintState = create_hintstate($2);
                    n->hasPlus = getOperatorPlusFlag();
                    $$ = (Node *)n;
                }
            | values_clause                         { $$ = $1; }
            | TABLE relation_expr
                {
                    /* same as SELECT * FROM relation_expr */
                    ColumnRef *cr = makeNode(ColumnRef);
                    ResTarget *rt = makeNode(ResTarget);
                    SelectStmt *n = makeNode(SelectStmt);

                    /* the "*" column reference; -1 means no source location */
                    cr->fields = list_make1(makeNode(A_Star));
                    cr->location = -1;

                    /* wrap the column reference in an unnamed target entry */
                    rt->name = NULL;
                    rt->indirection = NIL;
                    rt->val = (Node *)cr;
                    rt->location = -1;

                    n->targetList = list_make1(rt);
                    n->fromClause = list_make1($2);
                    $$ = (Node *)n;
                }
            | select_clause UNION opt_all select_clause // select union select
                {
                    $$ = makeSetOp(SETOP_UNION, $3, $1, $4);
                }
            | select_clause INTERSECT opt_all select_clause // select intersect select
                {
                    $$ = makeSetOp(SETOP_INTERSECT, $3, $1, $4);
                }
            | select_clause EXCEPT opt_all select_clause // select except select
                {
                    $$ = makeSetOp(SETOP_EXCEPT, $3, $1, $4);
                }
            | select_clause MINUS_P opt_all select_clause // select minus select (built as SETOP_EXCEPT)
                {
                    $$ = makeSetOp(SETOP_EXCEPT, $3, $1, $4);
                }
        ;

hint_string: // optional hint text after SELECT; NULL when absent
        COMMENTSTRING
            {
                $$ = $1;
            }
        | /* EMPTY */
            { 
                $$ = NULL;
            }
        ;
/*
 * SQL standard WITH clause looks like:
 *
 * WITH [ RECURSIVE ] <query name> [ (<column>,...) ]
 *      AS (query) [ SEARCH or CYCLE clause ]
 *
 * We don't currently support the SEARCH or CYCLE clause.
 */
with_clause: // common table expressions (CTE)
        WITH cte_list
            {
                /* non-recursive WITH; location is that of the WITH keyword */
                $$ = makeNode(WithClause);
                $$->ctes = $2;
                $$->recursive = false;
                $$->location = @1;
            }
        | WITH RECURSIVE cte_list
            {
                /* WITH RECURSIVE: same node, flagged recursive */
                $$ = makeNode(WithClause);
                $$->ctes = $3;
                $$->recursive = true;
                $$->location = @1;
            }
        ;

/* Comma-separated list of common_table_expr nodes, built left to right. */
cte_list:
        common_table_expr                       { $$ = list_make1($1); }
        | cte_list ',' common_table_expr        { $$ = lappend($1, $3); }
        ;

/* One CTE: name [ (column aliases) ] AS ( statement ). */
common_table_expr:  name opt_name_list AS '(' PreparableStmt ')'
            {
                CommonTableExpr *n = makeNode(CommonTableExpr);
                n->ctename = $1;
                n->aliascolnames = $2;
                n->ctequery = $5;       /* the parenthesized statement */
                n->location = @1;
                n->locator_type = LOCATOR_TYPE_NONE;
                $$ = (Node *) n;
            }
        ;

/* Optional WITH clause; NULL when omitted. */
opt_with_clause:
        with_clause                             { $$ = $1; }
        | /*EMPTY*/                             { $$ = NULL; }
        ;

/* Optional SELECT ... INTO target; yields an IntoClause with default settings, or NULL when omitted. */
into_clause:
            INTO OptTempTableName
                {
                    $$ = makeNode(IntoClause);
                    $$->rel = $2;
                    $$->colNames = NIL;
                    $$->options = NIL;
                    $$->onCommit = ONCOMMIT_NOOP;
                    /* Here $$ is a temp table, so row_compress can be any value. To be safe, REL_CMPRS_PAGE_PLAIN is used. */
                    $$->row_compress = REL_CMPRS_PAGE_PLAIN;
                    $$->tableSpaceName = NULL;
                    $$->skipData = false;
                    $$->relkind = INTO_CLAUSE_RELKIND_DEFAULT;
                }
            | /*EMPTY*/
                { $$ = NULL; }
        ;

/*
 * Redundancy here is needed to avoid shift/reduce conflicts,
 * since TEMP is not a reserved word.  See also OptTemp.
 */
/* INTO target name; each alternative returns the qualified_name with relpersistence set from the prefix keywords. */
OptTempTableName:
            TEMPORARY opt_table qualified_name
                {
                    $$ = $3;
                    $$->relpersistence = RELPERSISTENCE_TEMP;
                }
            | TEMP opt_table qualified_name
                {
                    $$ = $3;
                    $$->relpersistence = RELPERSISTENCE_TEMP;
                }
            | LOCAL TEMPORARY opt_table qualified_name
                {
                    $$ = $4;
                    $$->relpersistence = RELPERSISTENCE_TEMP;
                }
            | LOCAL TEMP opt_table qualified_name
                {
                    $$ = $4;
                    $$->relpersistence = RELPERSISTENCE_TEMP;
                }
            | GLOBAL TEMPORARY opt_table qualified_name
                {
                    $$ = $4;
                    /*
                     * With ENABLE_MULTIPLE_NODES, GLOBAL only raises a warning and
                     * the table is an ordinary temp table; otherwise it is a
                     * global temp table.
                     */
#ifdef ENABLE_MULTIPLE_NODES
                    ereport(WARNING,
                            (errmsg("GLOBAL is deprecated in temporary table creation"),
                             parser_errposition(@1)));
                    $$->relpersistence = RELPERSISTENCE_TEMP;
#else
                    $$->relpersistence = RELPERSISTENCE_GLOBAL_TEMP;
#endif
                }
            | GLOBAL TEMP opt_table qualified_name
                {
                    $$ = $4;
                    /* same handling as GLOBAL TEMPORARY above */
#ifdef ENABLE_MULTIPLE_NODES
                    ereport(WARNING,
                            (errmsg("GLOBAL is deprecated in temporary table creation"),
                             parser_errposition(@1)));
                    $$->relpersistence = RELPERSISTENCE_TEMP;
#else
                    $$->relpersistence = RELPERSISTENCE_GLOBAL_TEMP;
#endif
                }
            | UNLOGGED opt_table qualified_name
                {
                    $$ = $3;
                    $$->relpersistence = RELPERSISTENCE_UNLOGGED;
                }
            | TABLE qualified_name
                {
                    /* no persistence keyword: permanent table */
                    $$ = $2;
                    $$->relpersistence = RELPERSISTENCE_PERMANENT;
                }
            | qualified_name
                {
                    $$ = $1;
                    $$->relpersistence = RELPERSISTENCE_PERMANENT;
                }
        ;

/* Optional TABLE keyword; both alternatives have empty actions and set no value. */
opt_table:  TABLE                                   {}
            | /*EMPTY*/                             {}
        ;

/* Set-operation modifier: TRUE only for an explicit ALL; DISTINCT and no modifier both give FALSE. */
opt_all:    ALL                                     { $$ = TRUE; }
            | DISTINCT                              { $$ = FALSE; }
            | /*EMPTY*/                             { $$ = FALSE; }
        ;

/* We use (NIL) as a placeholder to indicate that all target expressions
 * should be placed in the DISTINCT list during parsetree analysis.
 */
opt_distinct: // DISTINCT clause
            DISTINCT                                { $$ = list_make1(NIL); }   /* one-element (NIL) placeholder list */
            | DISTINCT ON '(' expr_list ')'         { $$ = $4; }                /* explicit DISTINCT ON expressions */
            | ALL                                   { $$ = NIL; }
            | /*EMPTY*/                             { $$ = NIL; }
        ;

/* Optional ORDER BY; NIL when omitted. */
opt_sort_clause:
            sort_clause                             { $$ = $1;}
            | /*EMPTY*/                             { $$ = NIL; }
        ;

sort_clause:  // ORDER BY clause; value is the sortby_list
            ORDER BY sortby_list                    { $$ = $3; }
        ;

/* Comma-separated list of sortby items, in source order. */
sortby_list:
            sortby                                  { $$ = list_make1($1); }
            | sortby_list ',' sortby                { $$ = lappend($1, $3); }
        ;

/* One ORDER BY item; every alternative produces a SortBy node. */
sortby:     a_expr USING qual_all_Op opt_nulls_order
                {
                    /* explicit ordering operator; location points at the operator */
                    $$ = makeNode(SortBy);
                    $$->location = @3;
                    $$->useOp = $3;
                    $$->sortby_dir = SORTBY_USING;
                    $$->sortby_nulls = (SortByNulls)$4;
                    $$->node = $1;
                }
            | a_expr opt_asc_desc opt_nulls_order
                {
                    /* ASC/DESC form: no operator, hence no operator location */
                    $$ = makeNode(SortBy);
                    $$->location = -1;
                    $$->useOp = NIL;
                    $$->sortby_dir = (SortByDir)$2;
                    $$->sortby_nulls = (SortByNulls)$3;
                    $$->node = $1;
                }
            | NLSSORT '(' a_expr ',' Sconst ')' opt_asc_desc opt_nulls_order
                {
                    if (!checkNlssortArgs($5))
                    {
                        /* sort-method string rejected by checkNlssortArgs */
                        $$ = NULL;
                        ereport(ERROR,(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                    errmsg("Sort method %s  is not supported!",$5)));
                    }
                    else
                    {
                        /* sort on convert_to_nocase(expr, 'gbk') instead of the raw expression */
                        FuncCall *call = makeNode(FuncCall);

                        call->funcname = SystemFuncName("convert_to_nocase");
                        call->args = list_make2($3, makeStringConst("gbk", -1));
                        call->location = @1;
                        call->over = NULL;
                        call->agg_order = NIL;
                        call->agg_star = FALSE;
                        call->agg_distinct = FALSE;
                        call->func_variadic = FALSE;
                        call->call_func = false;

                        $$ = makeNode(SortBy);
                        $$->location = @1;
                        $$->useOp = NIL;
                        $$->sortby_dir = (SortByDir)$7;
                        $$->sortby_nulls = (SortByNulls)$8;
                        $$->node = (Node*)call;
                    }
                }
        ;


/*
 * Every alternative yields a two-element list: element 0 is the OFFSET
 * expression and element 1 the LIMIT expression; a part that was not
 * written is NULL.
 */
select_limit: // LIMIT / OFFSET clause
            limit_clause offset_clause              { $$ = list_make2($2, $1); }
            | offset_clause limit_clause                { $$ = list_make2($1, $2); }
            | limit_clause                      { $$ = list_make2(NULL, $1); }
            | limit_offcnt_clause                   { $$ = $1; }
            | offset_clause                     { $$ = list_make2($1, NULL); }
        ;

/* Optional LIMIT/OFFSET; when omitted still yields a two-element list of NULLs so callers can index it. */
opt_select_limit:
            select_limit                        { $$ = $1; }
            | /* EMPTY */                       { $$ = list_make2(NULL,NULL); }
        ;

/*
 * Optional LIMIT for DELETE.  Produces the same two-element list shape as
 * select_limit, with element 0 always NULL and element 1 the LIMIT
 * expression (NULL when omitted).
 */
opt_delete_limit:
            LIMIT a_expr                        { $$ = list_make2(NULL, $2); }
            | /* EMPTY */                       { $$ = list_make2(NULL, NULL); }
        ;


/* Row-count limit, written either as LIMIT n or as FETCH FIRST/NEXT n ROW(S) ONLY; value is the count expression. */
limit_clause:
            LIMIT select_limit_value
                { $$ = $2; }
            /* SQL:2008 syntax */
            | FETCH first_or_next opt_select_fetch_first_value row_or_rows ONLY
                { $$ = $3; }
        ;

/* "LIMIT offset, count" form; yields the list (offset, count). */
limit_offcnt_clause:
            LIMIT select_offset_value ',' select_limit_value
                {
                    $$ = list_make2($2, $4);
                }
        ;

/* OFFSET expression, optionally followed by ROW/ROWS; value is the offset expression. */
offset_clause:
            OFFSET select_offset_value
                { $$ = $2; }
            /* SQL:2008 syntax */
            | OFFSET select_offset_value2 row_or_rows
                { $$ = $2; }
        ;

/* Operand of LIMIT: an arbitrary expression or the keyword ALL. */
select_limit_value:
            a_expr                                  { $$ = $1; }
            | ALL
                {
                    /* LIMIT ALL is represented as a NULL constant */
                    $$ = makeNullAConst(@1);
                }
        ;

/* Operand of OFFSET (and of the first position in "LIMIT offset, count"): any expression. */
select_offset_value:
            a_expr                                  { $$ = $1; }
        ;

/*
 * Allowing full expressions without parentheses causes various parsing
 * problems with the trailing ROW/ROWS key words.  SQL only calls for
 * constants, so we allow the rest only with parentheses.  If omitted,
 * default to 1.
 */
/* Row count for FETCH FIRST/NEXT: signed integer constant, parenthesized expression, or 1 when omitted. */
opt_select_fetch_first_value:
            SignedIconst                        { $$ = makeIntConst($1, @1); }
            | '(' a_expr ')'                    { $$ = $2; }
            | /*EMPTY*/                         { $$ = makeIntConst(1, -1); }
        ;

/*
 * Again, the trailing ROW/ROWS in this case prevent the full expression
 * syntax.  c_expr is the best we can do.
 */
/* Operand of the "OFFSET n ROW(S)" form, restricted to c_expr. */
select_offset_value2:
            c_expr                                  { $$ = $1; }
        ;

/* noise words */
/* ROW and ROWS are interchangeable; the value is always 0. */
row_or_rows: ROW                                    { $$ = 0; }
            | ROWS                                  { $$ = 0; }
        ;

/* FIRST and NEXT are interchangeable; the value is always 0. */
first_or_next: FIRST_P                              { $$ = 0; }
            | NEXT                                  { $$ = 0; }
        ;

/*
 * This syntax for group_clause tries to follow the spec quite closely.
 * However, the spec allows only column references, not expressions,
 * which introduces an ambiguity between implicit row constructors
 * (a,b) and lists of column references.
 *
 * We handle this by using the a_expr production for what the spec calls
 * <ordinary grouping set>, which in the spec represents either one column
 * reference or a parenthesized list of column references. Then, we check the
 * top node of the a_expr to see if it's an implicit RowExpr, and if so, just
 * grab and use the list, discarding the node. (this is done in parse analysis,
 * not here)
 *
 * (we abuse the row_format field of RowExpr to distinguish implicit and
 * explicit row constructors; it's debatable if anyone sanely wants to use them
 * in a group clause, but if they have a reason to, we make it possible.)
 *
 * Each item in the group_clause list is either an expression tree or a
 * GroupingSet node of some type.
 */

group_clause: // GROUP BY clause; NIL when omitted
            GROUP_P BY group_by_list                { $$ = $3; }
            | /*EMPTY*/                             { $$ = NIL; }
        ;
/* Comma-separated list of group_by_item nodes, in source order. */
group_by_list:
            group_by_item                           { $$ = list_make1($1); }
            | group_by_list ',' group_by_item       { $$ = lappend($1,$3); }
        ;

/* One GROUP BY element: a plain expression or one of the grouping-set forms; the value is passed through. */
group_by_item:
            a_expr                                  { $$ = $1; }
            | empty_grouping_set                    { $$ = $1; }
            | cube_clause                           { $$ = $1; }
            | rollup_clause                         { $$ = $1; }
            | grouping_sets_clause                  { $$ = $1; }
        ;

/* "()" in GROUP BY: a GROUPING_SET_EMPTY node with no content, located at the opening parenthesis. */
empty_grouping_set:
            '(' ')'
                {
                    $$ = (Node *) makeGroupingSet(GROUPING_SET_EMPTY, NIL, @1);
                }
        ;
/*
 * These hacks rely on setting precedence of CUBE and ROLLUP below that of '(',
 * so that they shift in these rules rather than reducing the conflicting
 * unreserved_keyword rule.
 */

/* ROLLUP ( expr, ... ): a GROUPING_SET_ROLLUP node over the expression list. */
rollup_clause:
            ROLLUP '(' expr_list ')'
                {
                    $$ = (Node *) makeGroupingSet(GROUPING_SET_ROLLUP, $3, @1);
                }
        ;

/* CUBE ( expr, ... ): a GROUPING_SET_CUBE node over the expression list. */
cube_clause:
            CUBE '(' expr_list ')'
                {
                    $$ = (Node *) makeGroupingSet(GROUPING_SET_CUBE, $3, @1);
                }
        ;

/* GROUPING SETS ( item, ... ): a GROUPING_SET_SETS node; its content is a group_by_list, so items may nest. */
grouping_sets_clause:
            GROUPING_P SETS '(' group_by_list ')'
                {
                    $$ = (Node *) makeGroupingSet(GROUPING_SET_SETS, $4, @1);
                }
        ;


/* Optional HAVING condition; NULL when omitted. */
having_clause:
            HAVING a_expr                           { $$ = $2; }
            | /*EMPTY*/                             { $$ = NULL; }
        ;

for_locking_clause: // row-locking clause; FOR READ ONLY yields an empty list
            for_locking_items                       { $$ = $1; }
            | FOR READ ONLY                         { $$ = NIL; }
        ;

/* Optional row-locking clause; NIL when omitted. */
opt_for_locking_clause:
            for_locking_clause                      { $$ = $1; }
            | /* EMPTY */                           { $$ = NIL; }
        ;

/* One or more for_locking_item nodes written back to back (no separator). */
for_locking_items:
            for_locking_item                        { $$ = list_make1($1); }
            | for_locking_items for_locking_item    { $$ = lappend($1, $2); }
        ;

/* A single FOR UPDATE / FOR SHARE item, returned as a LockingClause node. */
for_locking_item:
            FOR UPDATE locked_rels_list opt_nowait
                {
                    LockingClause *lock = makeNode(LockingClause);

                    lock->forUpdate = TRUE;     /* FOR UPDATE */
                    lock->noWait = $4;
                    lock->lockedRels = $3;
                    $$ = (Node *) lock;
                }
            | FOR SHARE locked_rels_list opt_nowait
                {
                    LockingClause *lock = makeNode(LockingClause);

                    lock->forUpdate = FALSE;    /* FOR SHARE */
                    lock->noWait = $4;
                    lock->lockedRels = $3;
                    $$ = (Node *) lock;
                }
        ;

/* Optional "OF table, ..." list for a locking item; NIL when omitted. */
locked_rels_list:
            OF qualified_name_list                  { $$ = $2; }
            | /* EMPTY */                           { $$ = NIL; }
        ;


values_clause: // VALUES clause: one SelectStmt whose valuesLists holds one entry per row
            VALUES ctext_row
                {
                    /* first row: start a fresh statement */
                    SelectStmt *stmt = makeNode(SelectStmt);

                    stmt->valuesLists = list_make1($2);
                    $$ = (Node *) stmt;
                }
            | values_clause ',' ctext_row
                {
                    /* further row: append to the statement built so far */
                    SelectStmt *stmt = (SelectStmt *) $1;

                    stmt->valuesLists = lappend(stmt->valuesLists, $3);
                    $$ = (Node *) stmt;
                }
        ;


/*****************************************************************************
 *
 *  clauses common to all Optimizable Stmts:
 *      from_clause     - allow list of both JOIN expressions and table names
 *      where_clause    - qualifications for joins or restrictions
 *
 *****************************************************************************/

from_clause: // FROM clause; NIL when omitted
            FROM from_list                          { $$ = $2; }
            | /*EMPTY*/                             { $$ = NIL; }
        ;

/* Comma-separated list of table_ref nodes, in source order. */
from_list:
            table_ref                               { $$ = list_make1($1); }
            | from_list ',' table_ref               { $$ = lappend($1, $3); }
        ;

/*
 * table_ref is where an alias clause can be attached.  Note we cannot make
 * alias_clause have an empty production because that causes parse conflicts
 * between table_ref := '(' joined_table ')' alias_clause
 * and joined_table := '(' joined_table ')'.  So, we must have the
 * redundant-looking productions here instead.
 */
// One item of the FROM list: a relation, function call, subselect or join
table_ref:  relation_expr
                {
                    $$ = (Node *) $1;
                }
            | relation_expr alias_clause
                {
                    $1->alias = $2;
                    $$ = (Node *) $1;
                }
            | relation_expr opt_alias_clause tablesample_clause
                {
                    RangeTableSample *n = (RangeTableSample *) $3;
                    $1->alias = $2;
                    /* relation_expr goes inside the RangeTableSample node */
                    n->relation = (Node *) $1;
                    $$ = (Node *) n;
                }

            | relation_expr PARTITION '(' name ')'
                {
                    /* a single partition selected by name */
                    $1->partitionname = $4;
                    $1->ispartition = true;
                    $$ = (Node *)$1;
                }
            | relation_expr BUCKETS '(' bucket_list ')'
                {
                    $1->buckets = $4;
                    $1->isbucket = true;
                    $$ = (Node *)$1;
                }
            | relation_expr PARTITION_FOR '(' maxValueList ')'
                {
                    /* a partition selected by partition-key values */
                    $1->partitionKeyValuesList = $4;
                    $1->ispartition = true;
                    $$ = (Node *)$1;
                }

            | relation_expr PARTITION '(' name ')' alias_clause
                {
                    $1->partitionname = $4;
                    $1->alias = $6;
                    $1->ispartition = true;
                    $$ = (Node *)$1;
                }

            | relation_expr PARTITION_FOR '(' maxValueList ')' alias_clause
                {
                    $1->partitionKeyValuesList = $4;
                    $1->alias = $6;
                    $1->ispartition = true;
                    $$ = (Node *)$1;
                }

            | func_table
                {
                    RangeFunction *n = makeNode(RangeFunction);
                    n->funccallnode = $1;
                    n->coldeflist = NIL;
                    $$ = (Node *) n;
                }
            | func_table alias_clause
                {
                    RangeFunction *n = makeNode(RangeFunction);
                    n->funccallnode = $1;
                    n->alias = $2;
                    n->coldeflist = NIL;
                    $$ = (Node *) n;
                }
            | func_table AS '(' TableFuncElementList ')'
                {
                    /* column definition list without an alias name */
                    RangeFunction *n = makeNode(RangeFunction);
                    n->funccallnode = $1;
                    n->coldeflist = $4;
                    $$ = (Node *) n;
                }
            | func_table AS ColId '(' TableFuncElementList ')'
                {
                    RangeFunction *n = makeNode(RangeFunction);
                    Alias *a = makeNode(Alias);
                    n->funccallnode = $1;
                    a->aliasname = $3;
                    n->alias = a;
                    n->coldeflist = $5;
                    $$ = (Node *) n;
                }
            | func_table ColId '(' TableFuncElementList ')'
                {
                    RangeFunction *n = makeNode(RangeFunction);
                    Alias *a = makeNode(Alias);
                    n->funccallnode = $1;
                    a->aliasname = $2;
                    n->alias = a;
                    n->coldeflist = $4;
                    $$ = (Node *) n;
                }
            | select_with_parens
                {
                    /*
                     * The SQL spec does not permit a subselect
                     * (<derived_table>) without an alias clause.
                     * For compatibility with "A db" this grammar accepts
                     * it anyway and supplies an alias itself, except for
                     * a bare VALUES list, which still requires an
                     * explicit alias and is rejected with a message
                     * that's better than "syntax error".
                     */
                    $$ = NULL;
                    if (IsA($1, SelectStmt) &&
                        ((SelectStmt *) $1)->valuesLists)
                        ereport(ERROR,
                                (errcode(ERRCODE_SYNTAX_ERROR),
                                 errmsg("VALUES in FROM must have an alias"),
                                 errhint("For example, FROM (VALUES ...) [AS] foo."),
                                 parser_errposition(@1)));
                    else
                    {
                        /*
                         * Give the subquery the default alias name
                         * "__unnamed_subquery__" so that it has a
                         * reference name even though none was written.
                         */
                        RangeSubselect *n = makeNode(RangeSubselect);
                        Alias *a = makeNode(Alias);
                        n->subquery = $1;
                        a->aliasname = pstrdup("__unnamed_subquery__");
                        n->alias = a;
                        $$ = (Node *) n;
                    }
                }
            | select_with_parens alias_clause
                {
                    RangeSubselect *n = makeNode(RangeSubselect);
                    n->subquery = $1;
                    n->alias = $2;
                    $$ = (Node *) n;
                }
            | joined_table
                {
                    $$ = (Node *) $1;
                }
            | '(' joined_table ')' alias_clause
                {
                    /* alias applied to a parenthesized join as a whole */
                    $2->alias = $4;
                    $$ = (Node *) $2;
                }
        ;


/*
 * It may seem silly to separate joined_table from table_ref, but there is
 * method in SQL92's madness: if you don't do it this way you get reduce-
 * reduce conflicts, because it's not clear to the parser generator whether
 * to expect alias_clause after ')' or not.  For the same reason we must
 * treat 'JOIN' and 'join_type JOIN' separately, rather than allowing
 * join_type to expand to empty; if we try it, the parser generator can't
 * figure out when to reduce an empty join_type right after table_ref.
 *
 * Note that a CROSS JOIN is the same as an unqualified
 * INNER JOIN, and an INNER JOIN/ON has the same shape
 * but a qualification expression to limit membership.
 * A NATURAL JOIN implicitly matches column names between
 * tables and the shape is determined by which columns are
 * in common. We'll collect columns during the later transformations.
 */

joined_table: // join expression; every non-parenthesized form builds a JoinExpr
            '(' joined_table ')'
                {
                    $$ = $2;
                }
            | table_ref CROSS JOIN table_ref
                {
                    /* CROSS JOIN: an inner join with neither USING nor ON */
                    JoinExpr *join = makeNode(JoinExpr);

                    join->larg = $1;
                    join->rarg = $4;
                    join->jointype = JOIN_INNER;
                    join->isNatural = FALSE;
                    join->quals = NULL;
                    join->usingClause = NIL;
                    $$ = join;
                }
            | table_ref join_type JOIN table_ref join_qual
                {
                    JoinExpr *join = makeNode(JoinExpr);

                    join->larg = $1;
                    join->rarg = $4;
                    join->jointype = $2;
                    join->isNatural = FALSE;
                    /* join_qual is a List for USING, any other node for ON */
                    if ($5 == NULL || !IsA($5, List))
                        join->quals = $5;
                    else
                        join->usingClause = (List *) $5;
                    $$ = join;
                }
            | table_ref JOIN table_ref join_qual
                {
                    /* bare JOIN needs its own rule: join_type cannot be empty */
                    JoinExpr *join = makeNode(JoinExpr);

                    join->larg = $1;
                    join->rarg = $3;
                    join->jointype = JOIN_INNER;
                    join->isNatural = FALSE;
                    /* join_qual is a List for USING, any other node for ON */
                    if ($4 == NULL || !IsA($4, List))
                        join->quals = $4;
                    else
                        join->usingClause = (List *) $4;
                    $$ = join;
                }
            | table_ref NATURAL join_type JOIN table_ref
                {
                    JoinExpr *join = makeNode(JoinExpr);

                    join->larg = $1;
                    join->rarg = $5;
                    join->jointype = $3;
                    join->isNatural = TRUE;
                    /* matching columns and quals are worked out later */
                    join->quals = NULL;
                    join->usingClause = NIL;
                    $$ = join;
                }
            | table_ref NATURAL JOIN table_ref
                {
                    /* bare NATURAL JOIN needs its own rule: join_type cannot be empty */
                    JoinExpr *join = makeNode(JoinExpr);

                    join->larg = $1;
                    join->rarg = $4;
                    join->jointype = JOIN_INNER;
                    join->isNatural = TRUE;
                    /* matching columns and quals are worked out later */
                    join->quals = NULL;
                    join->usingClause = NIL;
                    $$ = join;
                }
        ;

alias_clause: // alias: [AS] name, optionally followed by a column-name list
            AS ColId '(' name_list ')'
                {
                    $$ = makeNode(Alias);
                    $$->aliasname = $2;
                    $$->colnames = $4;
                }
            | AS ColId
                {
                    $$ = makeNode(Alias);
                    $$->aliasname = $2;
                }
            | ColId '(' name_list ')'
                {
                    /* same as the AS forms, with AS omitted */
                    $$ = makeNode(Alias);
                    $$->aliasname = $1;
                    $$->colnames = $3;
                }
            | ColId
                {
                    $$ = makeNode(Alias);
                    $$->aliasname = $1;
                }
        ;

/* Optional alias; NULL when omitted. */
opt_alias_clause: alias_clause      { $$ = $1; }
            | /*EMPTY*/ { $$ = NULL; }
        ;

/* Explicit join kind keyword(s), mapped to the matching JOIN_* constant. */
join_type:  FULL join_outer                         { $$ = JOIN_FULL; }
            | LEFT join_outer                       { $$ = JOIN_LEFT; }
            | RIGHT join_outer                      { $$ = JOIN_RIGHT; }
            | INNER_P                               { $$ = JOIN_INNER; }
        ;

/* OUTER is just noise... */
/* Optional OUTER keyword; the value is NULL either way. */
join_outer: OUTER_P                                 { $$ = NULL; }
            | /*EMPTY*/                             { $$ = NULL; }
        ;

/* JOIN qualification clauses
 * Possibilities are:
 *  USING ( column list ) allows only unqualified column names,
 *                        which must match between tables.
 *  ON expr allows more general qualifications.
 *
 * We return USING as a List node, while an ON-expr will not be a List.
 */

/* USING returns the name list cast to Node *; ON returns the expression itself. */
join_qual:  USING '(' name_list ')'                 { $$ = (Node *) $3; }
            | ON a_expr                             { $$ = $2; }
        ;


/*
 * A table name with its inheritance option.  Each alternative returns the
 * qualified_name with inhOpt set from the surrounding syntax and alias
 * cleared.
 */
relation_expr:
            qualified_name
                {
                    /* default inheritance */
                    $$ = $1;
                    $$->inhOpt = INH_DEFAULT;
                    $$->alias = NULL;
                }
            | qualified_name '*'
                {
                    /* inheritance query */
                    $$ = $1;
                    $$->inhOpt = INH_YES;
                    $$->alias = NULL;
                }
            | ONLY qualified_name
                {
                    /* no inheritance */
                    $$ = $2;
                    $$->inhOpt = INH_NO;
                    $$->alias = NULL;
                }
            | ONLY '(' qualified_name ')'
                {
                    /* no inheritance, SQL99-style syntax */
                    $$ = $3;
                    $$->inhOpt = INH_NO;
                    $$->alias = NULL;
                }
        ;


/* Comma-separated list of relation_expr nodes, in source order. */
relation_expr_list:
            relation_expr                           { $$ = list_make1($1); }
            | relation_expr_list ',' relation_expr  { $$ = lappend($1, $3); }
        ;

...
where_clause: // WHERE clause; NULL when omitted
            WHERE a_expr                            { $$ = $2; }
            | /*EMPTY*/                             { $$ = NULL; }
        ;
...

程序段

...
/*
 * parser_init()
 *      Reset the parser state in yyext before parsing one query string:
 *      parenthesis depth back to zero, location list and result tree empty.
 */
void
parser_init(base_yy_extra_type *yyext)
{
    yyext->core_yy_extra.paren_depth = 0;
    yyext->core_yy_extra.query_string_locationlist = NIL;

    /* start from an empty result in case the grammar never assigns one */
    yyext->parsetree = NIL;
}
...
  • 词法语法解析流程
    这里用以下查询语句进行分析
postgres=# select * from a where id < 100 order by id;

语句执行流程图如下:


parse
  1. 词法语法解析入口函数raw_parser,调用base_yyparse开始解析
  2. 首先词法解析到SELECT关键字
/*
 * simple_select (excerpt): a SELECT made of the clauses listed below.
 * The action allocates a SelectStmt node and stores each clause's semantic
 * value in the matching field; the node is returned as a generic Node *.
 * No sort clause appears among this rule's symbols.
 */
simple_select:
            SELECT hint_string opt_distinct target_list
            into_clause from_clause where_clause
            group_clause having_clause window_clause
                {
                    SelectStmt *n = makeNode(SelectStmt);
                    n->distinctClause = $3;     /* opt_distinct */
                    n->targetList = $4;         /* target_list */
                    n->intoClause = $5;         /* into_clause */
                    n->fromClause = $6;         /* from_clause */
                    n->whereClause = $7;        /* where_clause */
                    n->groupClause = $8;        /* group_clause */
                    n->havingClause = $9;       /* having_clause */
                    n->windowClause = $10;      /* window_clause */
                    /* hint state is built from the hint_string value ($2) */
                    n->hintState = create_hintstate($2);
                    /* NOTE(review): presumably flags use of the "(+)" join operator -- confirm in getOperatorPlusFlag() */
                    n->hasPlus = getOperatorPlusFlag();
                    $$ = (Node *)n;
                }

(1) 由SELECT关键字匹配到simple_select语法规则
(2) hint_string, opt_distinct 返回空
(3) target_list匹配到 '*' 字符,构建ColumnRef,加入到list
(4) into_clause 返回空
(5) 匹配FROM关键字,匹配表名,构建RangeVar,加入到list
(6) 匹配WHERE关键字,匹配字段名,构建ColumnRef,匹配int常量,匹配<表达式,构建A_Expr
(7) group_clause,having_clause,window_clause 返回空
(8) 最后构建 SelectStmt

  3. 匹配order by
/*
 * sort_clause: ORDER BY followed by a sortby_list.
 * Yields the sortby_list value unchanged.
 */
sort_clause:
            ORDER BY sortby_list                    { $$ = $3; }
        ;
/*
 * sortby_list: one or more sortby items separated by commas.
 * The first item starts a one-element list; later items are appended.
 */
sortby_list:
            sortby                                  { $$ = list_make1($1); }
            | sortby_list ',' sortby                { $$ = lappend($1, $3); }
        ;

/*
 * sortby (excerpt): a single sort key.  Both alternatives build a SortBy
 * node whose "node" field is the a_expr being sorted on.
 *
 *  - "a_expr USING qual_all_Op opt_nulls_order": direction is SORTBY_USING,
 *    the operator ($3) is stored in useOp and its location (@3) is recorded.
 *  - "a_expr opt_asc_desc opt_nulls_order": direction comes from
 *    opt_asc_desc; no operator is stored and location is set to -1.
 */
sortby:     a_expr USING qual_all_Op opt_nulls_order
                {
                    $$ = makeNode(SortBy);
                    $$->node = $1;
                    $$->sortby_dir = SORTBY_USING;
                    $$->sortby_nulls = (SortByNulls)$4;
                    $$->useOp = $3;
                    $$->location = @3;      /* location of the operator */
                }
            | a_expr opt_asc_desc opt_nulls_order
                {
                    $$ = makeNode(SortBy);
                    $$->node = $1;
                    $$->sortby_dir = (SortByDir)$2;
                    $$->sortby_nulls = (SortByNulls)$3;
                    $$->useOp = NIL;
                    $$->location = -1;      /* no operator */
                }
...
/*
 * select_no_parens (excerpt).
 *
 *  - simple_select: the value is passed through unchanged.
 *  - select_clause sort_clause: the sort clause ($2) is handed to
 *    insertSelectOptions() together with the statement ($1, cast to
 *    SelectStmt *); the same $1 is then returned.  The remaining option
 *    arguments are passed empty (NIL/NULL); their meaning is defined by
 *    insertSelectOptions(), which is not shown here.
 */
select_no_parens:
            simple_select                       { $$ = $1; }
            | select_clause sort_clause
                {
                    insertSelectOptions((SelectStmt *) $1, $2, NIL,
                                        NULL, NULL, NULL,
                                        yyscanner);
                    $$ = $1;
                }

(1) 读取到ORDER, BY关键字,匹配sort_clause语法规则
(2) 匹配字段名,构建ColumnRef,构建SortBy节点
(3) 匹配 select_clause sort_clause 规则,调用insertSelectOptions将sort_clause中构建的SortBy节点列表加入到上一步的SelectStmt中

  4. 返回抽象语法树
/*
 * stmtblock: stores the value produced by stmtmulti in the parsetree field
 * of the extra data obtained via pg_yyget_extra(yyscanner).
 */
stmtblock:  stmtmulti
            {
                pg_yyget_extra(yyscanner)->parsetree = $1;
            }
        ;

/*
 * stmtmulti: one or more stmt items separated by ';', collected into a list.
 *
 * the thrashing around here is to discard "empty" statements...
 *
 * For each stmt value:
 *  - NULL: nothing is added (the list so far, or NIL, is kept);
 *  - a List node: its elements are merged into the result
 *    (list_concat, or used directly when it is the first stmt);
 *  - any other node: it is added as a single element.
 */
stmtmulti:  stmtmulti ';' stmt
                {
                    if ($3 != NULL)
                    {
                        /* stmt itself produced a list: splice it in */
                        if (IsA($3, List))
                        {
                            $$ = list_concat($1, (List*)$3);
                        }
                        else
                        {
                        $$ = lappend($1, $3);
                        }
                    }
                    else
                        $$ = $1;    /* empty statement: keep the list unchanged */
                }
            | stmt
                {
                    if ($1 != NULL)
                    {
                        /* stmt already is a list: use it as the result */
                        if (IsA($1, List))
                        {
                            $$ = (List*)$1;
                        }
                        else
                        {
                        $$ = list_make1($1);
                        }
                    }
                    else
                        $$ = NIL;   /* empty statement: start with an empty list */
                }
        ;

(1) stmtmulti规则将上述SelectStmt加入list,stmtblock规则再将该list赋值给pg_yyget_extra(yyscanner)->parsetree
(2) raw_parser函数将parsetree返回给上层调用函数

你可能感兴趣的:(词法语法解析)