PostgreSQL查询SQL的语法分析(1)——词法分析

一、背景

PostgreSQL 命令的词法分析器和语法分析器是按照 Unix 工具 Lex 和 Yacc 的规则文件格式编写的,
实际构建时使用的是它们的 GNU 实现 Flex 和 Bison。

二、代码分析

2.1 源码结构

词法分析和语法分析依赖的文件定义在src/backend/parser下的scan.l和gram.y。其中:

  • 词法分析器在文件 scan.l里定义。负责识别标识符,SQL 关键字等,对于发现的每个关键字或者标识符都会生成一个记号并且传递给分析器;

  • 语法分析器在文件 gram.y里定义。包含一套语法规则和触发规则时执行的动作.

在raw_parser函数(在src/backend/parser/parser.c下)中,主要通过调用Lex和Yacc配合生成的base_yyparse函数来实现词法分析和语法分析的工作。

其他的重要源码文件:

  • kwlist.h:SQL关键字定义,注意:关键字名要小写,按照字符串值顺序定义

  • keywords.h(src/include/common/keywords.h):定义结构体ScanKeyword;

  • keywords.c(src/common/keywords.c):使用kwlist.h初始化关键字数组ScanKeywords,提供ScanKeywordLookup函数,该函数判断输入的字符串是否是关键字,若是则返回单词表中对应单词的指针,采用二分法查找;

  • scansup.c:提供几个词法分析时常用的函数。scanstr函数处理转义字符,downcase_truncate_identifier函数将大写英文字符转换为小写字符,truncate_identifier函数截断超过最大标识符长度的标识符,scanner_isspace函数判断输入字符是否为空白字符。

  • scan.l:定义词法结构,编译生成scan.c;这里会忽略comment等无用信息。

  • gram.y:定义语法结构,编译生成gram.c;分析后生成语法分析树。

  • gram.h:定义关键字的数值编号。

辅助脚本:

  • check_keywords.pl:检查在gram.y 和 kwlist.h 中定义的关键字列表是否一致。
文件调用关系图

2.2 源码分析

  • SQL关键字
    SQL关键字结构体
/* src/include/common/keywords.h */ 

...
/* Keyword categories --- should match lists in gram.y */
#define UNRESERVED_KEYWORD      0
#define COL_NAME_KEYWORD        1
#define TYPE_FUNC_NAME_KEYWORD  2
#define RESERVED_KEYWORD        3

/* One entry of the keyword table searched by the lexer. */
typedef struct ScanKeyword
{
    const char *name;           /* keyword text, in lower case */
    int16       value;          /* grammar's token code; corresponds to the keyword token in gram.y */
    int16       category;       /* keyword category: one of the *_KEYWORD values above */
} ScanKeyword;
...

SQL关键字定义及查找函数

/* src/common/keywords.c */ 

#ifndef FRONTEND
#include "postgres.h"
#else
#include "postgres_fe.h"
#endif

#ifndef FRONTEND

#include "parser/gramparse.h"

#define PG_KEYWORD(a,b,c) {a,b,c},

#else

#include "common/keywords.h"

/*
 * We don't need the token number for frontend uses, so leave it out to avoid
 * requiring backend headers that won't compile cleanly here.
 */
#define PG_KEYWORD(a,b,c) {a,0,c},  

#endif                          /* FRONTEND */


/* The keyword table: one initializer per PG_KEYWORD() line in kwlist.h. */
const ScanKeyword ScanKeywords[] = {
#include "parser/kwlist.h"  // pulls in the keyword list from parser/kwlist.h
};

const int   NumScanKeywords = lengthof(ScanKeywords); // number of entries in the keyword table


/*
 * ScanKeywordLookup - see if a given word is a keyword
 *
 * The input is folded to lower case (only the ASCII letters 'A'..'Z' are
 * touched) and then looked up in the keyword table by binary search, so the
 * table must be sorted by name.
 *
 * Returns a pointer to the matching ScanKeyword entry, or NULL if the word
 * is not in the table.
 */
const ScanKeyword *
ScanKeywordLookup(const char *text,             /* word to look up */
                  const ScanKeyword *keywords,  /* keyword table */
                  int num_keywords)             /* number of table entries */
{
    char        folded[NAMEDATALEN];
    const ScanKeyword *lo;
    const ScanKeyword *hi;
    int         n;
    int         pos;

    /* We assume all keywords are shorter than NAMEDATALEN. */
    n = strlen(text);
    if (n >= NAMEDATALEN)
        return NULL;

    /* The table holds lower-case names only, so fold the input first. */
    for (pos = 0; pos < n; pos++)
    {
        char        c = text[pos];

        folded[pos] = (c >= 'A' && c <= 'Z') ? c + ('a' - 'A') : c;
    }
    folded[n] = '\0';

    /* Binary search over the closed range [lo, hi]. */
    lo = keywords;
    hi = keywords + (num_keywords - 1);
    while (lo <= hi)
    {
        const ScanKeyword *mid = lo + (hi - lo) / 2;
        int         cmp = strcmp(mid->name, folded);

        if (cmp > 0)
            hi = mid - 1;
        else if (cmp < 0)
            lo = mid + 1;
        else
            return mid;
    }

    return NULL;
}

SQL关键字列表
因为查找采用的是二分查找法,所以添加关键字时必须保持列表按name的字符串顺序排列。

/* src/include/parser/kwlist.h */

/* name, value, category */
PG_KEYWORD("abort", ABORT_P, UNRESERVED_KEYWORD)
PG_KEYWORD("absolute", ABSOLUTE_P, UNRESERVED_KEYWORD)
PG_KEYWORD("access", ACCESS, UNRESERVED_KEYWORD)
PG_KEYWORD("action", ACTION, UNRESERVED_KEYWORD)
PG_KEYWORD("add", ADD_P, UNRESERVED_KEYWORD)
PG_KEYWORD("admin", ADMIN, UNRESERVED_KEYWORD)
PG_KEYWORD("after", AFTER, UNRESERVED_KEYWORD)
PG_KEYWORD("aggregate", AGGREGATE, UNRESERVED_KEYWORD)
PG_KEYWORD("all", ALL, RESERVED_KEYWORD)
PG_KEYWORD("also", ALSO, UNRESERVED_KEYWORD)
PG_KEYWORD("alter", ALTER, UNRESERVED_KEYWORD)
PG_KEYWORD("always", ALWAYS, UNRESERVED_KEYWORD)
PG_KEYWORD("analyse", ANALYSE, RESERVED_KEYWORD)        /* British spelling */
PG_KEYWORD("analyze", ANALYZE, RESERVED_KEYWORD)
PG_KEYWORD("and", AND, RESERVED_KEYWORD)
PG_KEYWORD("any", ANY, RESERVED_KEYWORD)
PG_KEYWORD("array", ARRAY, RESERVED_KEYWORD)
PG_KEYWORD("as", AS, RESERVED_KEYWORD)
PG_KEYWORD("asc", ASC, RESERVED_KEYWORD)
PG_KEYWORD("assertion", ASSERTION, UNRESERVED_KEYWORD)
PG_KEYWORD("assignment", ASSIGNMENT, UNRESERVED_KEYWORD)
PG_KEYWORD("asymmetric", ASYMMETRIC, RESERVED_KEYWORD)
PG_KEYWORD("at", AT, UNRESERVED_KEYWORD)
PG_KEYWORD("attach", ATTACH, UNRESERVED_KEYWORD)
PG_KEYWORD("attribute", ATTRIBUTE, UNRESERVED_KEYWORD)
PG_KEYWORD("authorization", AUTHORIZATION, TYPE_FUNC_NAME_KEYWORD)
PG_KEYWORD("backward", BACKWARD, UNRESERVED_KEYWORD)
PG_KEYWORD("before", BEFORE, UNRESERVED_KEYWORD)
PG_KEYWORD("begin", BEGIN_P, UNRESERVED_KEYWORD)
PG_KEYWORD("between", BETWEEN, COL_NAME_KEYWORD)
PG_KEYWORD("bigint", BIGINT, COL_NAME_KEYWORD)
PG_KEYWORD("binary", BINARY, TYPE_FUNC_NAME_KEYWORD)
PG_KEYWORD("bit", BIT, COL_NAME_KEYWORD)
PG_KEYWORD("boolean", BOOLEAN_P, COL_NAME_KEYWORD)
PG_KEYWORD("both", BOTH, RESERVED_KEYWORD)
PG_KEYWORD("by", BY, UNRESERVED_KEYWORD)
PG_KEYWORD("cache", CACHE, UNRESERVED_KEYWORD)
PG_KEYWORD("call", CALL, UNRESERVED_KEYWORD)
PG_KEYWORD("called", CALLED, UNRESERVED_KEYWORD)
PG_KEYWORD("cascade", CASCADE, UNRESERVED_KEYWORD)
PG_KEYWORD("cascaded", CASCADED, UNRESERVED_KEYWORD)
PG_KEYWORD("case", CASE, RESERVED_KEYWORD)
PG_KEYWORD("cast", CAST, RESERVED_KEYWORD)
PG_KEYWORD("catalog", CATALOG_P, UNRESERVED_KEYWORD)
PG_KEYWORD("chain", CHAIN, UNRESERVED_KEYWORD)
PG_KEYWORD("char", CHAR_P, COL_NAME_KEYWORD)
PG_KEYWORD("character", CHARACTER, COL_NAME_KEYWORD)
PG_KEYWORD("characteristics", CHARACTERISTICS, UNRESERVED_KEYWORD)
PG_KEYWORD("check", CHECK, RESERVED_KEYWORD)
PG_KEYWORD("checkpoint", CHECKPOINT, UNRESERVED_KEYWORD)
PG_KEYWORD("class", CLASS, UNRESERVED_KEYWORD)
PG_KEYWORD("close", CLOSE, UNRESERVED_KEYWORD)
PG_KEYWORD("cluster", CLUSTER, UNRESERVED_KEYWORD)
PG_KEYWORD("coalesce", COALESCE, COL_NAME_KEYWORD)
PG_KEYWORD("collate", COLLATE, RESERVED_KEYWORD)
PG_KEYWORD("collation", COLLATION, TYPE_FUNC_NAME_KEYWORD)
PG_KEYWORD("column", COLUMN, RESERVED_KEYWORD)
PG_KEYWORD("columns", COLUMNS, UNRESERVED_KEYWORD)
PG_KEYWORD("comment", COMMENT, UNRESERVED_KEYWORD)
PG_KEYWORD("comments", COMMENTS, UNRESERVED_KEYWORD)
PG_KEYWORD("commit", COMMIT, UNRESERVED_KEYWORD)
PG_KEYWORD("committed", COMMITTED, UNRESERVED_KEYWORD)
PG_KEYWORD("concurrently", CONCURRENTLY, TYPE_FUNC_NAME_KEYWORD)
PG_KEYWORD("configuration", CONFIGURATION, UNRESERVED_KEYWORD)
PG_KEYWORD("conflict", CONFLICT, UNRESERVED_KEYWORD)
PG_KEYWORD("connection", CONNECTION, UNRESERVED_KEYWORD)
PG_KEYWORD("constraint", CONSTRAINT, RESERVED_KEYWORD)
PG_KEYWORD("constraints", CONSTRAINTS, UNRESERVED_KEYWORD)
PG_KEYWORD("content", CONTENT_P, UNRESERVED_KEYWORD)
PG_KEYWORD("continue", CONTINUE_P, UNRESERVED_KEYWORD)
PG_KEYWORD("conversion", CONVERSION_P, UNRESERVED_KEYWORD)
PG_KEYWORD("copy", COPY, UNRESERVED_KEYWORD)
PG_KEYWORD("cost", COST, UNRESERVED_KEYWORD)
PG_KEYWORD("create", CREATE, RESERVED_KEYWORD)
PG_KEYWORD("cross", CROSS, TYPE_FUNC_NAME_KEYWORD)
PG_KEYWORD("csv", CSV, UNRESERVED_KEYWORD)
PG_KEYWORD("cube", CUBE, UNRESERVED_KEYWORD)
PG_KEYWORD("current", CURRENT_P, UNRESERVED_KEYWORD)
PG_KEYWORD("current_catalog", CURRENT_CATALOG, RESERVED_KEYWORD)
PG_KEYWORD("current_date", CURRENT_DATE, RESERVED_KEYWORD)
PG_KEYWORD("current_role", CURRENT_ROLE, RESERVED_KEYWORD)
PG_KEYWORD("current_schema", CURRENT_SCHEMA, TYPE_FUNC_NAME_KEYWORD)
PG_KEYWORD("current_time", CURRENT_TIME, RESERVED_KEYWORD)
PG_KEYWORD("current_timestamp", CURRENT_TIMESTAMP, RESERVED_KEYWORD)
PG_KEYWORD("current_user", CURRENT_USER, RESERVED_KEYWORD)
PG_KEYWORD("cursor", CURSOR, UNRESERVED_KEYWORD)
PG_KEYWORD("cycle", CYCLE, UNRESERVED_KEYWORD)
PG_KEYWORD("data", DATA_P, UNRESERVED_KEYWORD)
PG_KEYWORD("database", DATABASE, UNRESERVED_KEYWORD)
PG_KEYWORD("day", DAY_P, UNRESERVED_KEYWORD)
PG_KEYWORD("deallocate", DEALLOCATE, UNRESERVED_KEYWORD)
PG_KEYWORD("dec", DEC, COL_NAME_KEYWORD)
PG_KEYWORD("decimal", DECIMAL_P, COL_NAME_KEYWORD)
PG_KEYWORD("declare", DECLARE, UNRESERVED_KEYWORD)
PG_KEYWORD("default", DEFAULT, RESERVED_KEYWORD)
PG_KEYWORD("defaults", DEFAULTS, UNRESERVED_KEYWORD)
PG_KEYWORD("deferrable", DEFERRABLE, RESERVED_KEYWORD)
PG_KEYWORD("deferred", DEFERRED, UNRESERVED_KEYWORD)
PG_KEYWORD("definer", DEFINER, UNRESERVED_KEYWORD)
PG_KEYWORD("delete", DELETE_P, UNRESERVED_KEYWORD)
PG_KEYWORD("delimiter", DELIMITER, UNRESERVED_KEYWORD)
PG_KEYWORD("delimiters", DELIMITERS, UNRESERVED_KEYWORD)
PG_KEYWORD("depends", DEPENDS, UNRESERVED_KEYWORD)
PG_KEYWORD("desc", DESC, RESERVED_KEYWORD)
PG_KEYWORD("detach", DETACH, UNRESERVED_KEYWORD)
PG_KEYWORD("dictionary", DICTIONARY, UNRESERVED_KEYWORD)
PG_KEYWORD("disable", DISABLE_P, UNRESERVED_KEYWORD)
PG_KEYWORD("discard", DISCARD, UNRESERVED_KEYWORD)
PG_KEYWORD("distinct", DISTINCT, RESERVED_KEYWORD)
PG_KEYWORD("do", DO, RESERVED_KEYWORD)
PG_KEYWORD("document", DOCUMENT_P, UNRESERVED_KEYWORD)
PG_KEYWORD("domain", DOMAIN_P, UNRESERVED_KEYWORD)
PG_KEYWORD("double", DOUBLE_P, UNRESERVED_KEYWORD)
PG_KEYWORD("drop", DROP, UNRESERVED_KEYWORD)
PG_KEYWORD("each", EACH, UNRESERVED_KEYWORD)
PG_KEYWORD("else", ELSE, RESERVED_KEYWORD)
PG_KEYWORD("enable", ENABLE_P, UNRESERVED_KEYWORD)
PG_KEYWORD("encoding", ENCODING, UNRESERVED_KEYWORD)
PG_KEYWORD("encrypted", ENCRYPTED, UNRESERVED_KEYWORD)
PG_KEYWORD("end", END_P, RESERVED_KEYWORD)
PG_KEYWORD("enum", ENUM_P, UNRESERVED_KEYWORD)
PG_KEYWORD("escape", ESCAPE, UNRESERVED_KEYWORD)
PG_KEYWORD("event", EVENT, UNRESERVED_KEYWORD)
PG_KEYWORD("except", EXCEPT, RESERVED_KEYWORD)
PG_KEYWORD("exclude", EXCLUDE, UNRESERVED_KEYWORD)
PG_KEYWORD("excluding", EXCLUDING, UNRESERVED_KEYWORD)
PG_KEYWORD("exclusive", EXCLUSIVE, UNRESERVED_KEYWORD)
PG_KEYWORD("execute", EXECUTE, UNRESERVED_KEYWORD)
PG_KEYWORD("exists", EXISTS, COL_NAME_KEYWORD)
PG_KEYWORD("explain", EXPLAIN, UNRESERVED_KEYWORD)
PG_KEYWORD("extension", EXTENSION, UNRESERVED_KEYWORD)
PG_KEYWORD("external", EXTERNAL, UNRESERVED_KEYWORD)
PG_KEYWORD("extract", EXTRACT, COL_NAME_KEYWORD)
PG_KEYWORD("false", FALSE_P, RESERVED_KEYWORD)
PG_KEYWORD("family", FAMILY, UNRESERVED_KEYWORD)
PG_KEYWORD("fetch", FETCH, RESERVED_KEYWORD)
PG_KEYWORD("filter", FILTER, UNRESERVED_KEYWORD)
PG_KEYWORD("first", FIRST_P, UNRESERVED_KEYWORD)
PG_KEYWORD("float", FLOAT_P, COL_NAME_KEYWORD)
PG_KEYWORD("following", FOLLOWING, UNRESERVED_KEYWORD)
PG_KEYWORD("for", FOR, RESERVED_KEYWORD)
PG_KEYWORD("force", FORCE, UNRESERVED_KEYWORD)
PG_KEYWORD("foreign", FOREIGN, RESERVED_KEYWORD)
PG_KEYWORD("forward", FORWARD, UNRESERVED_KEYWORD)
PG_KEYWORD("freeze", FREEZE, TYPE_FUNC_NAME_KEYWORD)
PG_KEYWORD("from", FROM, RESERVED_KEYWORD)
PG_KEYWORD("full", FULL, TYPE_FUNC_NAME_KEYWORD)
PG_KEYWORD("function", FUNCTION, UNRESERVED_KEYWORD)
PG_KEYWORD("functions", FUNCTIONS, UNRESERVED_KEYWORD)
PG_KEYWORD("generated", GENERATED, UNRESERVED_KEYWORD)
PG_KEYWORD("global", GLOBAL, UNRESERVED_KEYWORD)
PG_KEYWORD("grant", GRANT, RESERVED_KEYWORD)
PG_KEYWORD("granted", GRANTED, UNRESERVED_KEYWORD)
PG_KEYWORD("greatest", GREATEST, COL_NAME_KEYWORD)
PG_KEYWORD("group", GROUP_P, RESERVED_KEYWORD)
PG_KEYWORD("grouping", GROUPING, COL_NAME_KEYWORD)
PG_KEYWORD("groups", GROUPS, UNRESERVED_KEYWORD)
PG_KEYWORD("handler", HANDLER, UNRESERVED_KEYWORD)
PG_KEYWORD("having", HAVING, RESERVED_KEYWORD)
PG_KEYWORD("header", HEADER_P, UNRESERVED_KEYWORD)
PG_KEYWORD("hold", HOLD, UNRESERVED_KEYWORD)
PG_KEYWORD("hour", HOUR_P, UNRESERVED_KEYWORD)
PG_KEYWORD("identity", IDENTITY_P, UNRESERVED_KEYWORD)
PG_KEYWORD("if", IF_P, UNRESERVED_KEYWORD)
PG_KEYWORD("ilike", ILIKE, TYPE_FUNC_NAME_KEYWORD)
PG_KEYWORD("immediate", IMMEDIATE, UNRESERVED_KEYWORD)
PG_KEYWORD("immutable", IMMUTABLE, UNRESERVED_KEYWORD)
PG_KEYWORD("implicit", IMPLICIT_P, UNRESERVED_KEYWORD)
PG_KEYWORD("import", IMPORT_P, UNRESERVED_KEYWORD)
PG_KEYWORD("in", IN_P, RESERVED_KEYWORD)
PG_KEYWORD("include", INCLUDE, UNRESERVED_KEYWORD)
PG_KEYWORD("including", INCLUDING, UNRESERVED_KEYWORD)
PG_KEYWORD("increment", INCREMENT, UNRESERVED_KEYWORD)
PG_KEYWORD("index", INDEX, UNRESERVED_KEYWORD)
PG_KEYWORD("indexes", INDEXES, UNRESERVED_KEYWORD)
PG_KEYWORD("inherit", INHERIT, UNRESERVED_KEYWORD)
PG_KEYWORD("inherits", INHERITS, UNRESERVED_KEYWORD)
PG_KEYWORD("initially", INITIALLY, RESERVED_KEYWORD)
PG_KEYWORD("inline", INLINE_P, UNRESERVED_KEYWORD)
PG_KEYWORD("inner", INNER_P, TYPE_FUNC_NAME_KEYWORD)
PG_KEYWORD("inout", INOUT, COL_NAME_KEYWORD)
PG_KEYWORD("input", INPUT_P, UNRESERVED_KEYWORD)
PG_KEYWORD("insensitive", INSENSITIVE, UNRESERVED_KEYWORD)
PG_KEYWORD("insert", INSERT, UNRESERVED_KEYWORD)
PG_KEYWORD("instead", INSTEAD, UNRESERVED_KEYWORD)
PG_KEYWORD("int", INT_P, COL_NAME_KEYWORD)
PG_KEYWORD("integer", INTEGER, COL_NAME_KEYWORD)
PG_KEYWORD("intersect", INTERSECT, RESERVED_KEYWORD)
PG_KEYWORD("interval", INTERVAL, COL_NAME_KEYWORD)
PG_KEYWORD("into", INTO, RESERVED_KEYWORD)
PG_KEYWORD("invoker", INVOKER, UNRESERVED_KEYWORD)
PG_KEYWORD("is", IS, TYPE_FUNC_NAME_KEYWORD)
PG_KEYWORD("isnull", ISNULL, TYPE_FUNC_NAME_KEYWORD)
PG_KEYWORD("isolation", ISOLATION, UNRESERVED_KEYWORD)
PG_KEYWORD("join", JOIN, TYPE_FUNC_NAME_KEYWORD)
PG_KEYWORD("key", KEY, UNRESERVED_KEYWORD)
PG_KEYWORD("label", LABEL, UNRESERVED_KEYWORD)
PG_KEYWORD("language", LANGUAGE, UNRESERVED_KEYWORD)
PG_KEYWORD("large", LARGE_P, UNRESERVED_KEYWORD)
PG_KEYWORD("last", LAST_P, UNRESERVED_KEYWORD)
PG_KEYWORD("lateral", LATERAL_P, RESERVED_KEYWORD)
PG_KEYWORD("leading", LEADING, RESERVED_KEYWORD)
PG_KEYWORD("leakproof", LEAKPROOF, UNRESERVED_KEYWORD)
PG_KEYWORD("least", LEAST, COL_NAME_KEYWORD)
PG_KEYWORD("left", LEFT, TYPE_FUNC_NAME_KEYWORD)
PG_KEYWORD("level", LEVEL, UNRESERVED_KEYWORD)
PG_KEYWORD("like", LIKE, TYPE_FUNC_NAME_KEYWORD)
PG_KEYWORD("limit", LIMIT, RESERVED_KEYWORD)
PG_KEYWORD("listen", LISTEN, UNRESERVED_KEYWORD)
PG_KEYWORD("load", LOAD, UNRESERVED_KEYWORD)
PG_KEYWORD("local", LOCAL, UNRESERVED_KEYWORD)
PG_KEYWORD("localtime", LOCALTIME, RESERVED_KEYWORD)
PG_KEYWORD("localtimestamp", LOCALTIMESTAMP, RESERVED_KEYWORD)
PG_KEYWORD("location", LOCATION, UNRESERVED_KEYWORD)
PG_KEYWORD("lock", LOCK_P, UNRESERVED_KEYWORD)
PG_KEYWORD("locked", LOCKED, UNRESERVED_KEYWORD)
PG_KEYWORD("logged", LOGGED, UNRESERVED_KEYWORD)
PG_KEYWORD("mapping", MAPPING, UNRESERVED_KEYWORD)
PG_KEYWORD("match", MATCH, UNRESERVED_KEYWORD)
PG_KEYWORD("materialized", MATERIALIZED, UNRESERVED_KEYWORD)
PG_KEYWORD("maxvalue", MAXVALUE, UNRESERVED_KEYWORD)
PG_KEYWORD("method", METHOD, UNRESERVED_KEYWORD)
PG_KEYWORD("minute", MINUTE_P, UNRESERVED_KEYWORD)
PG_KEYWORD("minvalue", MINVALUE, UNRESERVED_KEYWORD)
PG_KEYWORD("mode", MODE, UNRESERVED_KEYWORD)
PG_KEYWORD("month", MONTH_P, UNRESERVED_KEYWORD)
PG_KEYWORD("move", MOVE, UNRESERVED_KEYWORD)
PG_KEYWORD("name", NAME_P, UNRESERVED_KEYWORD)
PG_KEYWORD("names", NAMES, UNRESERVED_KEYWORD)
PG_KEYWORD("national", NATIONAL, COL_NAME_KEYWORD)
PG_KEYWORD("natural", NATURAL, TYPE_FUNC_NAME_KEYWORD)
PG_KEYWORD("nchar", NCHAR, COL_NAME_KEYWORD)
PG_KEYWORD("new", NEW, UNRESERVED_KEYWORD)
PG_KEYWORD("next", NEXT, UNRESERVED_KEYWORD)
PG_KEYWORD("no", NO, UNRESERVED_KEYWORD)
PG_KEYWORD("none", NONE, COL_NAME_KEYWORD)
PG_KEYWORD("not", NOT, RESERVED_KEYWORD)
PG_KEYWORD("nothing", NOTHING, UNRESERVED_KEYWORD)
PG_KEYWORD("notify", NOTIFY, UNRESERVED_KEYWORD)
PG_KEYWORD("notnull", NOTNULL, TYPE_FUNC_NAME_KEYWORD)
PG_KEYWORD("nowait", NOWAIT, UNRESERVED_KEYWORD)
PG_KEYWORD("null", NULL_P, RESERVED_KEYWORD)
PG_KEYWORD("nullif", NULLIF, COL_NAME_KEYWORD)
PG_KEYWORD("nulls", NULLS_P, UNRESERVED_KEYWORD)
PG_KEYWORD("numeric", NUMERIC, COL_NAME_KEYWORD)
PG_KEYWORD("object", OBJECT_P, UNRESERVED_KEYWORD)
PG_KEYWORD("of", OF, UNRESERVED_KEYWORD)
PG_KEYWORD("off", OFF, UNRESERVED_KEYWORD)
PG_KEYWORD("offset", OFFSET, RESERVED_KEYWORD)
PG_KEYWORD("oids", OIDS, UNRESERVED_KEYWORD)
PG_KEYWORD("old", OLD, UNRESERVED_KEYWORD)
PG_KEYWORD("on", ON, RESERVED_KEYWORD)
PG_KEYWORD("only", ONLY, RESERVED_KEYWORD)
PG_KEYWORD("operator", OPERATOR, UNRESERVED_KEYWORD)
PG_KEYWORD("option", OPTION, UNRESERVED_KEYWORD)
PG_KEYWORD("options", OPTIONS, UNRESERVED_KEYWORD)
PG_KEYWORD("or", OR, RESERVED_KEYWORD)
PG_KEYWORD("order", ORDER, RESERVED_KEYWORD)
PG_KEYWORD("ordinality", ORDINALITY, UNRESERVED_KEYWORD)
PG_KEYWORD("others", OTHERS, UNRESERVED_KEYWORD)
PG_KEYWORD("out", OUT_P, COL_NAME_KEYWORD)
PG_KEYWORD("outer", OUTER_P, TYPE_FUNC_NAME_KEYWORD)
PG_KEYWORD("over", OVER, UNRESERVED_KEYWORD)
PG_KEYWORD("overlaps", OVERLAPS, TYPE_FUNC_NAME_KEYWORD)
PG_KEYWORD("overlay", OVERLAY, COL_NAME_KEYWORD)
PG_KEYWORD("overriding", OVERRIDING, UNRESERVED_KEYWORD)
PG_KEYWORD("owned", OWNED, UNRESERVED_KEYWORD)
PG_KEYWORD("owner", OWNER, UNRESERVED_KEYWORD)
PG_KEYWORD("parallel", PARALLEL, UNRESERVED_KEYWORD)
PG_KEYWORD("parser", PARSER, UNRESERVED_KEYWORD)
PG_KEYWORD("partial", PARTIAL, UNRESERVED_KEYWORD)
PG_KEYWORD("partition", PARTITION, UNRESERVED_KEYWORD)
PG_KEYWORD("passing", PASSING, UNRESERVED_KEYWORD)
PG_KEYWORD("password", PASSWORD, UNRESERVED_KEYWORD)
PG_KEYWORD("placing", PLACING, RESERVED_KEYWORD)
PG_KEYWORD("plans", PLANS, UNRESERVED_KEYWORD)
PG_KEYWORD("policy", POLICY, UNRESERVED_KEYWORD)
PG_KEYWORD("position", POSITION, COL_NAME_KEYWORD)
PG_KEYWORD("preceding", PRECEDING, UNRESERVED_KEYWORD)
PG_KEYWORD("precision", PRECISION, COL_NAME_KEYWORD)
PG_KEYWORD("prepare", PREPARE, UNRESERVED_KEYWORD)
PG_KEYWORD("prepared", PREPARED, UNRESERVED_KEYWORD)
PG_KEYWORD("preserve", PRESERVE, UNRESERVED_KEYWORD)
PG_KEYWORD("primary", PRIMARY, RESERVED_KEYWORD)
PG_KEYWORD("prior", PRIOR, UNRESERVED_KEYWORD)
PG_KEYWORD("privileges", PRIVILEGES, UNRESERVED_KEYWORD)
PG_KEYWORD("procedural", PROCEDURAL, UNRESERVED_KEYWORD)
PG_KEYWORD("procedure", PROCEDURE, UNRESERVED_KEYWORD)
PG_KEYWORD("procedures", PROCEDURES, UNRESERVED_KEYWORD)
PG_KEYWORD("program", PROGRAM, UNRESERVED_KEYWORD)
PG_KEYWORD("publication", PUBLICATION, UNRESERVED_KEYWORD)
PG_KEYWORD("quote", QUOTE, UNRESERVED_KEYWORD)
PG_KEYWORD("range", RANGE, UNRESERVED_KEYWORD)
PG_KEYWORD("read", READ, UNRESERVED_KEYWORD)
PG_KEYWORD("real", REAL, COL_NAME_KEYWORD)
PG_KEYWORD("reassign", REASSIGN, UNRESERVED_KEYWORD)
PG_KEYWORD("recheck", RECHECK, UNRESERVED_KEYWORD)
PG_KEYWORD("recursive", RECURSIVE, UNRESERVED_KEYWORD)
PG_KEYWORD("ref", REF, UNRESERVED_KEYWORD)
PG_KEYWORD("references", REFERENCES, RESERVED_KEYWORD)
PG_KEYWORD("referencing", REFERENCING, UNRESERVED_KEYWORD)
PG_KEYWORD("refresh", REFRESH, UNRESERVED_KEYWORD)
PG_KEYWORD("reindex", REINDEX, UNRESERVED_KEYWORD)
PG_KEYWORD("relative", RELATIVE_P, UNRESERVED_KEYWORD)
PG_KEYWORD("release", RELEASE, UNRESERVED_KEYWORD)
PG_KEYWORD("rename", RENAME, UNRESERVED_KEYWORD)
PG_KEYWORD("repeatable", REPEATABLE, UNRESERVED_KEYWORD)
PG_KEYWORD("replace", REPLACE, UNRESERVED_KEYWORD)
PG_KEYWORD("replica", REPLICA, UNRESERVED_KEYWORD)
PG_KEYWORD("reset", RESET, UNRESERVED_KEYWORD)
PG_KEYWORD("restart", RESTART, UNRESERVED_KEYWORD)
PG_KEYWORD("restrict", RESTRICT, UNRESERVED_KEYWORD)
PG_KEYWORD("returning", RETURNING, RESERVED_KEYWORD)
PG_KEYWORD("returns", RETURNS, UNRESERVED_KEYWORD)
PG_KEYWORD("revoke", REVOKE, UNRESERVED_KEYWORD)
PG_KEYWORD("right", RIGHT, TYPE_FUNC_NAME_KEYWORD)
PG_KEYWORD("role", ROLE, UNRESERVED_KEYWORD)
PG_KEYWORD("rollback", ROLLBACK, UNRESERVED_KEYWORD)
PG_KEYWORD("rollup", ROLLUP, UNRESERVED_KEYWORD)
PG_KEYWORD("routine", ROUTINE, UNRESERVED_KEYWORD)
PG_KEYWORD("routines", ROUTINES, UNRESERVED_KEYWORD)
PG_KEYWORD("row", ROW, COL_NAME_KEYWORD)
PG_KEYWORD("rows", ROWS, UNRESERVED_KEYWORD)
PG_KEYWORD("rule", RULE, UNRESERVED_KEYWORD)
PG_KEYWORD("savepoint", SAVEPOINT, UNRESERVED_KEYWORD)
PG_KEYWORD("schema", SCHEMA, UNRESERVED_KEYWORD)
PG_KEYWORD("schemas", SCHEMAS, UNRESERVED_KEYWORD)
PG_KEYWORD("scroll", SCROLL, UNRESERVED_KEYWORD)
PG_KEYWORD("search", SEARCH, UNRESERVED_KEYWORD)
PG_KEYWORD("second", SECOND_P, UNRESERVED_KEYWORD)
PG_KEYWORD("security", SECURITY, UNRESERVED_KEYWORD)
PG_KEYWORD("select", SELECT, RESERVED_KEYWORD)
PG_KEYWORD("sequence", SEQUENCE, UNRESERVED_KEYWORD)
PG_KEYWORD("sequences", SEQUENCES, UNRESERVED_KEYWORD)
PG_KEYWORD("serializable", SERIALIZABLE, UNRESERVED_KEYWORD)
PG_KEYWORD("server", SERVER, UNRESERVED_KEYWORD)
PG_KEYWORD("session", SESSION, UNRESERVED_KEYWORD)
PG_KEYWORD("session_user", SESSION_USER, RESERVED_KEYWORD)
PG_KEYWORD("set", SET, UNRESERVED_KEYWORD)
PG_KEYWORD("setof", SETOF, COL_NAME_KEYWORD)
PG_KEYWORD("sets", SETS, UNRESERVED_KEYWORD)
PG_KEYWORD("share", SHARE, UNRESERVED_KEYWORD)
PG_KEYWORD("show", SHOW, UNRESERVED_KEYWORD)
PG_KEYWORD("similar", SIMILAR, TYPE_FUNC_NAME_KEYWORD)
PG_KEYWORD("simple", SIMPLE, UNRESERVED_KEYWORD)
PG_KEYWORD("skip", SKIP, UNRESERVED_KEYWORD)
PG_KEYWORD("smallint", SMALLINT, COL_NAME_KEYWORD)
PG_KEYWORD("snapshot", SNAPSHOT, UNRESERVED_KEYWORD)
PG_KEYWORD("some", SOME, RESERVED_KEYWORD)
PG_KEYWORD("sql", SQL_P, UNRESERVED_KEYWORD)
PG_KEYWORD("stable", STABLE, UNRESERVED_KEYWORD)
PG_KEYWORD("standalone", STANDALONE_P, UNRESERVED_KEYWORD)
PG_KEYWORD("start", START, UNRESERVED_KEYWORD)
PG_KEYWORD("statement", STATEMENT, UNRESERVED_KEYWORD)
PG_KEYWORD("statistics", STATISTICS, UNRESERVED_KEYWORD)
PG_KEYWORD("stdin", STDIN, UNRESERVED_KEYWORD)
PG_KEYWORD("stdout", STDOUT, UNRESERVED_KEYWORD)
PG_KEYWORD("storage", STORAGE, UNRESERVED_KEYWORD)
PG_KEYWORD("strict", STRICT_P, UNRESERVED_KEYWORD)
PG_KEYWORD("strip", STRIP_P, UNRESERVED_KEYWORD)
PG_KEYWORD("subscription", SUBSCRIPTION, UNRESERVED_KEYWORD)
PG_KEYWORD("substring", SUBSTRING, COL_NAME_KEYWORD)
PG_KEYWORD("symmetric", SYMMETRIC, RESERVED_KEYWORD)
PG_KEYWORD("sysid", SYSID, UNRESERVED_KEYWORD)
PG_KEYWORD("system", SYSTEM_P, UNRESERVED_KEYWORD)
PG_KEYWORD("table", TABLE, RESERVED_KEYWORD)
PG_KEYWORD("tables", TABLES, UNRESERVED_KEYWORD)
PG_KEYWORD("tablesample", TABLESAMPLE, TYPE_FUNC_NAME_KEYWORD)
PG_KEYWORD("tablespace", TABLESPACE, UNRESERVED_KEYWORD)
PG_KEYWORD("temp", TEMP, UNRESERVED_KEYWORD)
PG_KEYWORD("template", TEMPLATE, UNRESERVED_KEYWORD)
PG_KEYWORD("temporary", TEMPORARY, UNRESERVED_KEYWORD)
PG_KEYWORD("text", TEXT_P, UNRESERVED_KEYWORD)
PG_KEYWORD("then", THEN, RESERVED_KEYWORD)
PG_KEYWORD("ties", TIES, UNRESERVED_KEYWORD)
PG_KEYWORD("time", TIME, COL_NAME_KEYWORD)
PG_KEYWORD("timestamp", TIMESTAMP, COL_NAME_KEYWORD)
PG_KEYWORD("to", TO, RESERVED_KEYWORD)
PG_KEYWORD("trailing", TRAILING, RESERVED_KEYWORD)
PG_KEYWORD("transaction", TRANSACTION, UNRESERVED_KEYWORD)
PG_KEYWORD("transform", TRANSFORM, UNRESERVED_KEYWORD)
PG_KEYWORD("treat", TREAT, COL_NAME_KEYWORD)
PG_KEYWORD("trigger", TRIGGER, UNRESERVED_KEYWORD)
PG_KEYWORD("trim", TRIM, COL_NAME_KEYWORD)
PG_KEYWORD("true", TRUE_P, RESERVED_KEYWORD)
PG_KEYWORD("truncate", TRUNCATE, UNRESERVED_KEYWORD)
PG_KEYWORD("trusted", TRUSTED, UNRESERVED_KEYWORD)
PG_KEYWORD("type", TYPE_P, UNRESERVED_KEYWORD)
PG_KEYWORD("types", TYPES_P, UNRESERVED_KEYWORD)
PG_KEYWORD("unbounded", UNBOUNDED, UNRESERVED_KEYWORD)
PG_KEYWORD("uncommitted", UNCOMMITTED, UNRESERVED_KEYWORD)
PG_KEYWORD("unencrypted", UNENCRYPTED, UNRESERVED_KEYWORD)
PG_KEYWORD("union", UNION, RESERVED_KEYWORD)
PG_KEYWORD("unique", UNIQUE, RESERVED_KEYWORD)
PG_KEYWORD("unknown", UNKNOWN, UNRESERVED_KEYWORD)
PG_KEYWORD("unlisten", UNLISTEN, UNRESERVED_KEYWORD)
PG_KEYWORD("unlogged", UNLOGGED, UNRESERVED_KEYWORD)
PG_KEYWORD("until", UNTIL, UNRESERVED_KEYWORD)
PG_KEYWORD("update", UPDATE, UNRESERVED_KEYWORD)
PG_KEYWORD("user", USER, RESERVED_KEYWORD)
PG_KEYWORD("using", USING, RESERVED_KEYWORD)
PG_KEYWORD("vacuum", VACUUM, UNRESERVED_KEYWORD)
PG_KEYWORD("valid", VALID, UNRESERVED_KEYWORD)
PG_KEYWORD("validate", VALIDATE, UNRESERVED_KEYWORD)
PG_KEYWORD("validator", VALIDATOR, UNRESERVED_KEYWORD)
PG_KEYWORD("value", VALUE_P, UNRESERVED_KEYWORD)
PG_KEYWORD("values", VALUES, COL_NAME_KEYWORD)
PG_KEYWORD("varchar", VARCHAR, COL_NAME_KEYWORD)
PG_KEYWORD("variadic", VARIADIC, RESERVED_KEYWORD)
PG_KEYWORD("varying", VARYING, UNRESERVED_KEYWORD)
PG_KEYWORD("verbose", VERBOSE, TYPE_FUNC_NAME_KEYWORD)
PG_KEYWORD("version", VERSION_P, UNRESERVED_KEYWORD)
PG_KEYWORD("view", VIEW, UNRESERVED_KEYWORD)
PG_KEYWORD("views", VIEWS, UNRESERVED_KEYWORD)
PG_KEYWORD("volatile", VOLATILE, UNRESERVED_KEYWORD)
PG_KEYWORD("when", WHEN, RESERVED_KEYWORD)
PG_KEYWORD("where", WHERE, RESERVED_KEYWORD)
PG_KEYWORD("whitespace", WHITESPACE_P, UNRESERVED_KEYWORD)
PG_KEYWORD("window", WINDOW, RESERVED_KEYWORD)
PG_KEYWORD("with", WITH, RESERVED_KEYWORD)
PG_KEYWORD("within", WITHIN, UNRESERVED_KEYWORD)
PG_KEYWORD("without", WITHOUT, UNRESERVED_KEYWORD)
PG_KEYWORD("work", WORK, UNRESERVED_KEYWORD)
PG_KEYWORD("wrapper", WRAPPER, UNRESERVED_KEYWORD)
PG_KEYWORD("write", WRITE, UNRESERVED_KEYWORD)
PG_KEYWORD("xml", XML_P, UNRESERVED_KEYWORD)
PG_KEYWORD("xmlattributes", XMLATTRIBUTES, COL_NAME_KEYWORD)
PG_KEYWORD("xmlconcat", XMLCONCAT, COL_NAME_KEYWORD)
PG_KEYWORD("xmlelement", XMLELEMENT, COL_NAME_KEYWORD)
PG_KEYWORD("xmlexists", XMLEXISTS, COL_NAME_KEYWORD)
PG_KEYWORD("xmlforest", XMLFOREST, COL_NAME_KEYWORD)
PG_KEYWORD("xmlnamespaces", XMLNAMESPACES, COL_NAME_KEYWORD)
PG_KEYWORD("xmlparse", XMLPARSE, COL_NAME_KEYWORD)
PG_KEYWORD("xmlpi", XMLPI, COL_NAME_KEYWORD)
PG_KEYWORD("xmlroot", XMLROOT, COL_NAME_KEYWORD)
PG_KEYWORD("xmlserialize", XMLSERIALIZE, COL_NAME_KEYWORD)
PG_KEYWORD("xmltable", XMLTABLE, COL_NAME_KEYWORD)
PG_KEYWORD("year", YEAR_P, UNRESERVED_KEYWORD)
PG_KEYWORD("yes", YES_P, UNRESERVED_KEYWORD)
PG_KEYWORD("zone", ZONE, UNRESERVED_KEYWORD)
  • 词法分析器scan.l ——定义段
/* src/backend/parser/scan.l */

/* 定义段 */
...
// %x declares exclusive start conditions
%x xb  /* bit string literal, e.g. b'0101' or B'0101' */
%x xc  /* C-style comment (slash-star ... star-slash) */
%x xd  /* double-quoted identifier, e.g. "colname" */
%x xh  /* hexadecimal string literal, e.g. x'FE5F' or X'FE5F' */
%x xq  /* standard single-quoted string, e.g. 'string' */
%x xe  /* extended single-quoted string with backslash escapes, e.g. E'string \' string2' */
%x xdolq  /* dollar-quoted string, e.g. $foo$ ... $foo$ */
%x xui  /* double-quoted identifier with Unicode escapes; opens with u&" or U&" */
%x xuiend  /* end of an xui identifier */
%x xus    /* single-quoted string with Unicode escapes; opens with u&' or U&' */
%x xusend  /* end of an xus string */
%x xeu  /* Unicode escape inside an xe string, e.g. 'string \uD5EF string2' */

space           [ \t\n\r\f] // 空白字符
horiz_space     [ \t\f]
newline         [\n\r] // 换行字符
non_newline     [^\n\r]
comment         ("--"{non_newline}*) // 行末注释
whitespace      ({space}+|{comment}) // 行末空白字符及注释

special_whitespace      ({space}+|{comment}{newline})
horiz_whitespace        ({horiz_space}|{comment})
whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*)

/*单引号*/
quote           '           /*【取消'】*/
quotestop       {quote}{whitespace}*
quotecontinue   {quote}{whitespace_with_newline}{quote}
quotefail       {quote}{whitespace}*"-"

/*二进制字符串的开始、内部不能含有' */
xbstart         [bB]{quote}
xbinside        [^']*           /*【取消'】*/

/*十六进制字符串的开始、内部不能含有' */
xhstart         [xX]{quote}
xhinside        [^']*          /*【取消'】*/

/*nchar字符串的开始 */
xnstart         [nN]{quote}

/* 扩展的单引号字符串,支持转义字符\  */
xestart         [eE]{quote}
xeinside        [^\\']+          /*【取消'】*/
xeescape        [\\][^0-7]
xeoctesc        [\\][0-7]{1,3}
xehexesc        [\\]x[0-9A-Fa-f]{1,2}
xeunicode       [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
xeunicodefail   [\\](u[0-9A-Fa-f]{0,3}|U[0-9A-Fa-f]{0,7})

/* 基本的单引号字符串,内部不能含有' */
xqstart         {quote}
xqdouble        {quote}{quote}
xqinside        [^']+           /*【取消'】*/

/* $foo$ 样式的括号 */
dolq_start      [A-Za-z\200-\377_]
dolq_cont       [A-Za-z\200-\377_0-9]
dolqdelim       \$({dolq_start}{dolq_cont}*)?\$
dolqfailed      \${dolq_start}{dolq_cont}*
dolqinside      [^$]+

/* 使用双引号括起来的标识符 */
dquote          \"         /*【取消"】*/
xdstart         {dquote}
xdstop          {dquote}
xddouble        {dquote}{dquote}
xdinside        [^"]+        /*【取消"】*/

/* Unicode escapes */
uescape         [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}          /*【取消'】*/
/* error rule to avoid backup */
uescapefail     [uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU]          /*【取消'】*/

/* 使用Unicode括起来的标识符 */
xuistart        [uU]&{dquote}

/* 使用Unicode括起来的字符串 */
xusstart        [uU]&{quote}

/* Optional UESCAPE after a quoted string or identifier with Unicode escapes. */
xustop1     {uescapefail}?
xustop2     {uescape}

/* error rule to avoid backup */
xufailed        [uU]&

/* C语言风格的注释 */
xcstart         \/\*{op_chars}*
xcstop          \*+\/
xcinside        [^*/]+

digit           [0-9]
ident_start     [A-Za-z\200-\377_]
ident_cont      [A-Za-z\200-\377_0-9\$]

/* 标识符 */
identifier      {ident_start}{ident_cont}*

/* 操作符 */
typecast        "::"
dot_dot         \.\.
colon_equals    ":="
equals_greater  "=>"
less_equals     "<="
greater_equals  ">="
less_greater    "<>"
not_equals      "!="

self            [,()\[\].;\:\+\-\*\/\%\^\<\>\=]
op_chars        [\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=]
operator        {op_chars}+

integer         {digit}+
decimal         (({digit}*\.{digit}+)|({digit}+\.{digit}*))
decimalfail     {digit}+\.\.
real            ({integer}|{decimal})[Ee][-+]?{digit}+
realfail1       ({integer}|{decimal})[Ee]
realfail2       ({integer}|{decimal})[Ee][-+]

param           \${integer}

other           .

  • 词法分析器scan.l ——规则段
%%
/* 规则段 */
{whitespace}    {
                    /* Skip whitespace and "--" comments; no token is returned */
                }
/* comment 注释*/
{xcstart}       {
                    /* Opening of an outermost C-style comment */
                    SET_YYLLOC();
                    yyextra->xcdepth = 0;
                    BEGIN(xc);  /* switch to the exclusive xc start condition */
                    /* Put back any characters past slash-star; see above */
                    yyless(2);
                }

<xc>{xcstart}   {
                    /* A comment opener seen inside a comment: one level deeper */
                    (yyextra->xcdepth)++;
                    /* Put back any characters past slash-star; see above */
                    yyless(2);
                }

<xc>{xcstop}    {
                    /* Leave xc only when the outermost comment is closed */
                    if (yyextra->xcdepth <= 0)
                        BEGIN(INITIAL);
                    else
                        (yyextra->xcdepth)--;
                }

<xc>{xcinside}  {
                    /* ignore */
                }

<xc>{op_chars}  {
                    /* ignore */
                }

<xc>\*+         {
                    /* ignore */
                }

<xc><<EOF>>     { yyerror("unterminated /* comment"); }
/* 二进制字符串*/
{xbstart}       {
                    /* Binary bit type.
                     * At some point we should simply pass the string
                     * forward to the parser and label it there.
                     * In the meantime, place a leading "b" on the string
                     * to mark it for the input routine as a binary string.
                     */
                    SET_YYLLOC();
                    BEGIN(xb);  /* enter the bit-string start condition */
                    startlit();
                    addlitchar('b', yyscanner);
                }
<xb>{quotestop} |
<xb>{quotefail} {
                    yyless(1);
                    BEGIN(INITIAL); /* bit string finished: back to the initial state */
                    yylval->str = litbufdup(yyscanner);
                    return BCONST;
                }
<xh>{xhinside}  |
<xb>{xbinside}  {
                    /* Literal body, shared by the hex (xh) and bit (xb) states */
                    addlit(yytext, yyleng, yyscanner);
                }
<xh>{quotecontinue} |
<xb>{quotecontinue} {
                    /* ignore */
                }
<xb><<EOF>>     { yyerror("unterminated bit string literal"); }
/* 十六进制字符串*/
{xhstart}       {
                    /* Hexadecimal bit type.
                     * At some point we should simply pass the string
                     * forward to the parser and label it there.
                     * In the meantime, place a leading "x" on the string
                     * to mark it for the input routine as a hex string.
                     */
                    SET_YYLLOC();
                    BEGIN(xh);  /* enter the hex-string start condition */
                    startlit();
                    addlitchar('x', yyscanner);
                }
<xh>{quotestop} |
<xh>{quotefail} {
                    yyless(1);
                    BEGIN(INITIAL); /* hex string finished: back to the initial state */
                    yylval->str = litbufdup(yyscanner);
                    return XCONST;
                }
<xh><<EOF>>     { yyerror("unterminated hexadecimal string literal"); }
/* nchar 字符串*/
{xnstart}       {
                    /* National character.
                     * We will pass this along as a normal character string,
                     * but preceded with an internally-generated "NCHAR".
                     */
                    const ScanKeyword *keyword;

                    SET_YYLLOC();
                    yyless(1);  /* eat only 'n' this time */

                    keyword = ScanKeywordLookup("nchar",
                                                yyextra->keywords,
                                                yyextra->num_keywords);
                    if (keyword != NULL)
                    {
                        yylval->keyword = keyword->name;
                        return keyword->value;
                    }
                    else
                    {
                        /* If NCHAR isn't a keyword, just return "n" */
                        yylval->str = pstrdup("n");
                        return IDENT;
                    }
                }
/* 基本的单引号字符串*/
{xqstart}       {
                    yyextra->warn_on_first_escape = true;
                    yyextra->saw_non_ascii = false;
                    SET_YYLLOC();
                    if (yyextra->standard_conforming_strings)
                        BEGIN(xq);
                    else
                        BEGIN(xe);
                    startlit();
                }
/* 扩展的单引号字符串 */
{xestart}       {
                    yyextra->warn_on_first_escape = false;
                    yyextra->saw_non_ascii = false;
                    SET_YYLLOC();
                    BEGIN(xe);
                    startlit();
                }
/* 带Unicode转义的字符串,例如:U&'\0441' */
{xusstart}      {
                    SET_YYLLOC();
                    if (!yyextra->standard_conforming_strings)
                        ereport(ERROR,
                                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                                 errmsg("unsafe use of string constant with Unicode escapes"),
                                 errdetail("String constants with Unicode escapes cannot be used when standard_conforming_strings is off."),
                                 lexer_errposition()));
                    BEGIN(xus);
                    startlit();
                }
<xq,xe>{quotestop}  |
<xq,xe>{quotefail} {
                    /* End of a standard or extended quoted string. */
                    yyless(1);
                    BEGIN(INITIAL);
                    /*
                     * check that the data remains valid if it might have been
                     * made invalid by unescaping any chars.
                     */
                    if (yyextra->saw_non_ascii)
                        pg_verifymbstr(yyextra->literalbuf,
                                       yyextra->literallen,
                                       false);
                    yylval->str = litbufdup(yyscanner);
                    return SCONST;
                }
<xus>{quotestop} |
<xus>{quotefail} {
                    /* throw back all but the quote */
                    yyless(1);
                    /* xusend state looks for possible UESCAPE */
                    BEGIN(xusend);
                }
<xusend>{whitespace} {
                    /* stay in xusend state over whitespace */
                }
<xusend><<EOF>> |
<xusend>{other} |
<xusend>{xustop1} {
                    /* no UESCAPE after the quote, throw back everything */
                    yyless(0);
                    BEGIN(INITIAL);
                    /* a backslash is passed as the escape character here */
                    yylval->str = litbuf_udeescape('\\', yyscanner);
                    return SCONST;
                }
<xusend>{xustop2} {
                    /* found UESCAPE after the end quote */
                    BEGIN(INITIAL);
                    /* the escape character is the second-to-last char of the match */
                    if (!check_uescapechar(yytext[yyleng - 2]))
                    {
                        SET_YYLLOC();
                        ADVANCE_YYLLOC(yyleng - 2);
                        yyerror("invalid Unicode escape character");
                    }
                    yylval->str = litbuf_udeescape(yytext[yyleng - 2],
                                                   yyscanner);
                    return SCONST;
                }
<xq,xe,xus>{xqdouble} {
                    /* append one single quote to the literal */
                    addlitchar('\'', yyscanner);
                }
<xq,xus>{xqinside}  {
                    /* body text of a standard or Unicode-escape string */
                    addlit(yytext, yyleng, yyscanner);
                }
<xe>{xeinside}  {
                    /* body text of an extended string */
                    addlit(yytext, yyleng, yyscanner);
                }
<xe>{xeunicode} {
                    /* skip the two leading characters, then parse the hex digits */
                    pg_wchar    c = strtoul(yytext + 2, NULL, 16);

                    check_escape_warning(yyscanner);

                    if (is_utf16_surrogate_first(c))
                    {
                        /* remember the first half and wait for the second in xeu */
                        yyextra->utf16_first_part = c;
                        BEGIN(xeu);
                    }
                    else if (is_utf16_surrogate_second(c))
                        yyerror("invalid Unicode surrogate pair");
                    else
                        addunicode(c, yyscanner);
                }
<xeu>{xeunicode} {
                    /* second half of a surrogate pair started in the xe state */
                    pg_wchar    c = strtoul(yytext + 2, NULL, 16);

                    if (!is_utf16_surrogate_second(c))
                        yyerror("invalid Unicode surrogate pair");

                    c = surrogate_pair_to_codepoint(yyextra->utf16_first_part, c);

                    addunicode(c, yyscanner);

                    BEGIN(xe);
                }
<xeu>.          { yyerror("invalid Unicode surrogate pair"); }
<xeu>\n         { yyerror("invalid Unicode surrogate pair"); }
<xeu><<EOF>>    { yyerror("invalid Unicode surrogate pair"); }
<xe,xeu>{xeunicodefail} {
                    ereport(ERROR,
                            (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                             errmsg("invalid Unicode escape"),
                             errhint("Unicode escapes must be \\uXXXX or \\UXXXXXXXX."),
                             lexer_errposition()));
                }
<xe>{xeescape}  {
                    /* yytext[1] is the character following the backslash */
                    if (yytext[1] == '\'')
                    {
                        if (yyextra->backslash_quote == BACKSLASH_QUOTE_OFF ||
                            (yyextra->backslash_quote == BACKSLASH_QUOTE_SAFE_ENCODING &&
                             PG_ENCODING_IS_CLIENT_ONLY(pg_get_client_encoding())))
                            ereport(ERROR,
                                    (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
                                     errmsg("unsafe use of \\' in a string literal"),
                                     errhint("Use '' to write quotes in strings. \\' is insecure in client-only encodings."),
                                     lexer_errposition()));
                    }
                    check_string_escape_warning(yytext[1], yyscanner);
                    addlitchar(unescape_single_char(yytext[1], yyscanner),
                               yyscanner);
                }
<xe>{xeoctesc}  {
                    /* octal escape: digits start right after the backslash */
                    unsigned char c = strtoul(yytext + 1, NULL, 8);

                    check_escape_warning(yyscanner);
                    addlitchar(c, yyscanner);
                    if (c == '\0' || IS_HIGHBIT_SET(c))
                        yyextra->saw_non_ascii = true;
                }
<xe>{xehexesc}  {
                    /* hex escape: digits start after the two leading characters */
                    unsigned char c = strtoul(yytext + 2, NULL, 16);

                    check_escape_warning(yyscanner);
                    addlitchar(c, yyscanner);
                    if (c == '\0' || IS_HIGHBIT_SET(c))
                        yyextra->saw_non_ascii = true;
                }
<xq,xe,xus>{quotecontinue} {
                    /* ignore */
                }
<xe>.           {
                    /* This is only needed for \ just before EOF */
                    addlitchar(yytext[0], yyscanner);
                }
<xq,xe,xus><<EOF>>      { yyerror("unterminated quoted string"); }

/* 采用$符号括着的字符串,例如:$foo$   */
{dolqdelim}     {
                    SET_YYLLOC();
                    yyextra->dolqstart = pstrdup(yytext);
                    BEGIN(xdolq);
                    startlit();
                }
{dolqfailed}    {
                    SET_YYLLOC();
                    /* throw back all but the initial "$" */
                    yyless(1);
                    /* and treat it as {other} */
                    return yytext[0];
                }
<xdolq>{dolqdelim} {
                    /* A delimiter ends the string only if it equals the opening one. */
                    if (strcmp(yytext, yyextra->dolqstart) == 0)
                    {
                        pfree(yyextra->dolqstart);
                        yyextra->dolqstart = NULL;
                        BEGIN(INITIAL);
                        yylval->str = litbufdup(yyscanner);
                        return SCONST;
                    }
                    else
                    {
                        /*
                         * When we fail to match $...$ to dolqstart, transfer
                         * the $... part to the output, but put back the final
                         * $ for rescanning.  Consider $delim$...$junk$delim$
                         */
                        addlit(yytext, yyleng - 1, yyscanner);
                        yyless(yyleng - 1);
                    }
                }
<xdolq>{dolqinside} {
                    addlit(yytext, yyleng, yyscanner);
                }
<xdolq>{dolqfailed} {
                    addlit(yytext, yyleng, yyscanner);
                }
<xdolq>.        {
                    /* This is only needed for $ inside the quoted text */
                    addlitchar(yytext[0], yyscanner);
                }
<xdolq><<EOF>>  { yyerror("unterminated dollar-quoted string"); }
/*使用双引号括起来的标识符*/
{xdstart}       {
                    SET_YYLLOC();
                    BEGIN(xd);
                    startlit();
                }
/* 带Unicode转义的标识符,例如:U&"\FE5F"   */
{xuistart}      {
                    SET_YYLLOC();
                    BEGIN(xui);
                    startlit();
                }
<xd>{xdstop}    {
                    /* Closing double quote of a delimited identifier. */
                    char       *ident;

                    BEGIN(INITIAL);
                    if (yyextra->literallen == 0)
                        yyerror("zero-length delimited identifier");
                    ident = litbufdup(yyscanner);
                    /* identifiers of NAMEDATALEN or more bytes get truncated */
                    if (yyextra->literallen >= NAMEDATALEN)
                        truncate_identifier(ident, yyextra->literallen, true);
                    yylval->str = ident;
                    return IDENT;
                }
<xui>{dquote} {
                    yyless(1);
                    /* xuiend state looks for possible UESCAPE */
                    BEGIN(xuiend);
                }
<xuiend>{whitespace} {
                    /* stay in xuiend state over whitespace */
                }
<xuiend><<EOF>> |
<xuiend>{other} |
<xuiend>{xustop1} {
                    /* no UESCAPE after the quote, throw back everything */
                    char       *ident;
                    int         identlen;

                    yyless(0);

                    BEGIN(INITIAL);
                    if (yyextra->literallen == 0)
                        yyerror("zero-length delimited identifier");
                    /* a backslash is passed as the escape character here */
                    ident = litbuf_udeescape('\\', yyscanner);
                    identlen = strlen(ident);
                    if (identlen >= NAMEDATALEN)
                        truncate_identifier(ident, identlen, true);
                    yylval->str = ident;
                    return IDENT;
                }
<xuiend>{xustop2}   {
                    /* found UESCAPE after the end quote */
                    char       *ident;
                    int         identlen;

                    BEGIN(INITIAL);
                    if (yyextra->literallen == 0)
                        yyerror("zero-length delimited identifier");
                    /* the escape character is the second-to-last char of the match */
                    if (!check_uescapechar(yytext[yyleng - 2]))
                    {
                        SET_YYLLOC();
                        ADVANCE_YYLLOC(yyleng - 2);
                        yyerror("invalid Unicode escape character");
                    }
                    ident = litbuf_udeescape(yytext[yyleng - 2], yyscanner);
                    identlen = strlen(ident);
                    if (identlen >= NAMEDATALEN)
                        truncate_identifier(ident, identlen, true);
                    yylval->str = ident;
                    return IDENT;
                }
<xd,xui>{xddouble}  {
                    /* append one double quote to the identifier */
                    addlitchar('"', yyscanner);
                }
<xd,xui>{xdinside}  {
                    addlit(yytext, yyleng, yyscanner);
                }
<xd,xui><<EOF>>     { yyerror("unterminated quoted identifier"); }

{xufailed}  {
                    char       *ident;

                    SET_YYLLOC();
                    /* throw back all but the initial u/U */
                    yyless(1);
                    /* and treat it as {identifier} */
                    ident = downcase_truncate_identifier(yytext, yyleng, true);
                    yylval->str = ident;
                    return IDENT;
                }
/* 操作符   */
{typecast}      {
                    SET_YYLLOC();
                    return TYPECAST;
                }

{dot_dot}       {
                    SET_YYLLOC();
                    return DOT_DOT;
                }

{colon_equals}  {
                    SET_YYLLOC();
                    return COLON_EQUALS;
                }

{equals_greater} {
                    SET_YYLLOC();
                    return EQUALS_GREATER;
                }

{less_equals}   {
                    SET_YYLLOC();
                    return LESS_EQUALS;
                }

{greater_equals} {
                    SET_YYLLOC();
                    return GREATER_EQUALS;
                }

{less_greater}  {
                    /* We accept both "<>" and "!=" as meaning NOT_EQUALS */
                    SET_YYLLOC();
                    return NOT_EQUALS;
                }

{not_equals}    {
                    /* We accept both "<>" and "!=" as meaning NOT_EQUALS */
                    SET_YYLLOC();
                    return NOT_EQUALS;
                }

{self}          {
                    SET_YYLLOC();
                    return yytext[0];
                }

{operator}      {
                    /*
                     * Check for embedded slash-star or dash-dash; those
                     * are comment starts, so operator must stop there.
                     * Note that slash-star or dash-dash at the first
                     * character will match a prior rule, not this one.
                     */
                    int         nchars = yyleng;
                    char       *slashstar = strstr(yytext, "/*");
                    char       *dashdash = strstr(yytext, "--");

                    if (slashstar && dashdash)
                    {
                        /* if both appear, take the first one */
                        if (slashstar > dashdash)
                            slashstar = dashdash;
                    }
                    else if (!slashstar)
                        slashstar = dashdash;
                    if (slashstar)
                        nchars = slashstar - yytext;

                    /*
                     * For SQL compatibility, '+' and '-' cannot be the
                     * last char of a multi-char operator unless the operator
                     * contains chars that are not in SQL operators.
                     * The idea is to lex '=-' as two operators, but not
                     * to forbid operator names like '?-' that could not be
                     * sequences of SQL operators.
                     */
                    while (nchars > 1 &&
                           (yytext[nchars - 1] == '+' ||
                            yytext[nchars - 1] == '-'))
                    {
                        int         ic;

                        for (ic = nchars - 2; ic >= 0; ic--)
                        {
                            if (strchr("~!@#^&|`?%", yytext[ic]))
                                break;
                        }
                        if (ic >= 0)
                            break; /* found a char that makes it OK */
                        nchars--; /* else remove the +/-, and check again */
                    }

                    SET_YYLLOC();

                    if (nchars < yyleng)
                    {
                        /* Strip the unwanted chars from the token */
                        yyless(nchars);
                        /*
                         * If what we have left is only one char, and it's
                         * one of the characters matching "self", then
                         * return it as a character token the same way
                         * that the "self" rule would have.
                         */
                        if (nchars == 1 &&
                            strchr(",()[].;:+-*/%^<>=", yytext[0]))
                            return yytext[0];
                    }

                    /*
                     * Complain if operator is too long.  Unlike the case
                     * for identifiers, we make this an error not a notice-
                     * and-truncate, because the odds are we are looking at
                     * a syntactic mistake anyway.
                     */
                    if (nchars >= NAMEDATALEN)
                        yyerror("operator too long");

                    yylval->str = pstrdup(yytext);
                    return Op;
                }
/* 数值类型   */
{param}         {
                    SET_YYLLOC();
                    yylval->ival = atol(yytext + 1);
                    return PARAM;
                }

{integer}       {
                    SET_YYLLOC();
                    return process_integer_literal(yytext, yylval);
                }
{decimal}       {
                    SET_YYLLOC();
                    yylval->str = pstrdup(yytext);
                    return FCONST;
                }
{decimalfail}   {
                    /* throw back the .., and treat as integer */
                    yyless(yyleng - 2);
                    SET_YYLLOC();
                    return process_integer_literal(yytext, yylval);
                }
{real}          {
                    SET_YYLLOC();
                    yylval->str = pstrdup(yytext);
                    return FCONST;
                }
{realfail1}     {
                    /*
                     * throw back the [Ee], and treat as {decimal}.  Note
                     * that it is possible the input is actually {integer},
                     * but since this case will almost certainly lead to a
                     * syntax error anyway, we don't bother to distinguish.
                     */
                    yyless(yyleng - 1);
                    SET_YYLLOC();
                    yylval->str = pstrdup(yytext);
                    return FCONST;
                }
{realfail2}     {
                    /* throw back the [Ee][+-], and proceed as above */
                    yyless(yyleng - 2);
                    SET_YYLLOC();
                    yylval->str = pstrdup(yytext);
                    return FCONST;
                }

/* SQL的关键字标识符   */
{identifier}    {
                    const ScanKeyword *keyword;
                    char       *ident;

                    SET_YYLLOC();

                    /* Is it an SQL keyword? */
                    keyword = ScanKeywordLookup(yytext,
                                                yyextra->keywords,
                                                yyextra->num_keywords);
                    if (keyword != NULL)
                    {
                        yylval->keyword = keyword->name;
                        return keyword->value;
                    }

                    /*
                     * No.  Convert the identifier to lower case, and truncate
                     * if necessary.
                     */
                    ident = downcase_truncate_identifier(yytext, yyleng, true);
                    yylval->str = ident;
                    return IDENT; /* plain identifier */
                }

{other}         {
                    SET_YYLLOC();
                    return yytext[0];
                }

/*结束符*/
<<EOF>>         {
                    /* End of input in the INITIAL state: record location and stop. */
                    SET_YYLLOC();
                    yyterminate();
                }
  • 词法分析器scan.l ——用户子程序段
%%
/*用户子程序段*/
...
/*
 * scanner_init
 *      Prepare the flex scanner; call this before lexing starts.
 *
 * str          NUL-terminated text to scan (copied into a private buffer)
 * yyext        caller-supplied scanner state, filled in here
 * keywords     keyword table stored for later lookups
 * num_keywords number of entries in that table
 *
 * Returns the newly created scanner handle.
 */
core_yyscan_t
scanner_init(const char *str,
             core_yy_extra_type *yyext,
             const ScanKeyword *keywords,
             int num_keywords)
{
    yyscan_t    lexer;
    Size        input_len;
    char       *buf;

    input_len = strlen(str);

    if (yylex_init(&lexer) != 0)
        elog(ERROR, "yylex_init() failed: %m");

    core_yyset_extra(yyext, lexer);

    /* Remember the keyword table to use. */
    yyext->keywords = keywords;
    yyext->num_keywords = num_keywords;

    /* Copy the current settings of these globals into the scanner state. */
    yyext->backslash_quote = backslash_quote;
    yyext->escape_string_warning = escape_string_warning;
    yyext->standard_conforming_strings = standard_conforming_strings;

    /*
     * flex needs the scan buffer to end with two end-of-buffer characters,
     * so allocate two extra bytes and fill them in after the copied text.
     */
    buf = (char *) palloc(input_len + 2);
    memcpy(buf, str, input_len);
    buf[input_len] = YY_END_OF_BUFFER_CHAR;
    buf[input_len + 1] = YY_END_OF_BUFFER_CHAR;
    yyext->scanbuf = buf;
    yyext->scanbuflen = input_len;
    yy_scan_buffer(buf, input_len + 2, lexer);

    /* Start the literal buffer at a modest size; it can be enlarged later. */
    yyext->literalalloc = 1024;
    yyext->literalbuf = (char *) palloc(yyext->literalalloc);
    yyext->literallen = 0;

    return lexer;
}


/*
 * scanner_finish
 *      Call after lexing is done to release what scanner_init() set up.
 */
void
scanner_finish(core_yyscan_t yyscanner)
{
    /*
     * yylex_destroy() is deliberately not called: it would only pfree a
     * small amount of control storage, and leaking that until the parsing
     * context is destroyed is cheaper.
     *
     * The scan buffer and literal buffer are freed only when they are big
     * enough to matter.  The 8K cutoff is arbitrary.
     */
    enum { FREE_CUTOFF = 8192 };

    if (yyextra->scanbuflen >= FREE_CUTOFF)
    {
        pfree(yyextra->scanbuf);
    }
    if (yyextra->literalalloc >= FREE_CUTOFF)
    {
        pfree(yyextra->literalbuf);
    }
}
...

总结

本文主要分析PostgreSQL的词法分析器代码。

你可能感兴趣的:(PostgreSQL查询SQL的语法分析(1)——词法分析)