《编译原理》中提到
编译器的第一个步骤是词法分析(Lexical Analysis)或扫描。词法分析器读入组成源程序的字符流,并且将它们组织成为有意义的词素(lexeme)的序列。对于每个词素,词法分析产生如下形式的词法单元(token)作为输出:
<token-name, attribute-value>
其中 token-name 是一个语法分析步骤要使用的抽象符号,attribute-value 指向符号表中关于这个词法单元的条目。
// Sample Objective-C program used as the lexer input for the token dump shown later.
// NOTE(review): that dump lists `int main(int argc, const char * argv[])`, so it appears
// to have been produced from a variant of this listing — confirm which form is intended.
int main(){
// All work happens inside an autorelease pool block.
@autoreleasepool {
int initial = 8;
int six = 6;
// Build an NSString from a UTF-8 C string literal.
NSString* site = [[NSString alloc] initWithUTF8String:"starming"];
// 8 + 6 = 14
int rank = initial + six;
// Multiplication binds tighter than addition: 8 + (14 * 60) = 848.
int position = initial + rank * 60;
// Logs the site string followed by the value of `position`.
NSLog(@"%@ rank %d", site, position);
}
return 0;
}
## 使用手工编译的clang执行如下指令
## -fmodules Enable the 'modules' language feature
## This will make any modules-enabled software libraries available as modules as well as introducing any modules-specific syntax.
## -E Only run the preprocessor
## 只运行预处理器
## -Xclang Pass to the clang compiler
## -dump-tokens -- man clang / clang --help 里都查不到
## 参考1
## http://clang.llvm.org/doxygen/namespaceclang_1_1driver_1_1options.html
## enum clang::driver::options::ClangFlags
## Flags specifically for clang options.
## 参考2
## Running the plugin
## Using the cc1 command line
## To run a plugin, the dynamic library containing the plugin registry must be loaded via the -load command line option. This will load all plugins that are registered, and you can select the plugins to run by specifying the -plugin option. Additional parameters for the plugins can be passed with -plugin-arg-<plugin-name>.
## Note that those options must reach clang’s cc1 process. There are two ways to do so:
## grep -r "dump-tokens" src/llvm/tools/clang
## src/llvm/tools/clang/include/clang/Driver/CC1Options.td:def dump_tokens : Flag<["-"], "dump-tokens">,
## 根据上面的两个参考链接 + grep的结果确定-dump-tokens应该就是这么来的
## grep -r "dump_tokens" src/llvm/tools/clang
## src/llvm/tools/clang/lib/Frontend/CompilerInvocation.cpp: case OPT_dump_tokens:
## static InputKind ParseFrontendArgs(FrontendOptions &Opts, ArgList &Args,
## DiagnosticsEngine &Diags,
## bool &IsHeaderFile) {
## ...
## case OPT_dump_tokens:
## Opts.ProgramAction = frontend::DumpTokens; break;
~% /opt/llvm/bin/clang -fmodules -E -Xclang -dump-tokens main.m
结果如下
annot_module_include '#import <Foundation/Foundation.h>
int main(int argc, const char * argv[]) {
@autoreleasepool {
int initial = 8;
' Loc=
int 'int' [StartOfLine] Loc=
identifier 'main' [LeadingSpace] Loc=
l_paren '(' Loc=
int 'int' Loc=
identifier 'argc' [LeadingSpace] Loc=
comma ',' Loc=
const 'const' [LeadingSpace] Loc=
char 'char' [LeadingSpace] Loc=
star '*' [LeadingSpace] Loc=
identifier 'argv' [LeadingSpace] Loc=
l_square '[' Loc=
r_square ']' Loc=
r_paren ')' Loc=
l_brace '{' [LeadingSpace] Loc=
at '@' [StartOfLine] [LeadingSpace] Loc=
identifier 'autoreleasepool' Loc=
l_brace '{' [LeadingSpace] Loc=
int 'int' [StartOfLine] [LeadingSpace] Loc=
identifier 'initial' [LeadingSpace] Loc=
equal '=' [LeadingSpace] Loc=
numeric_constant '8' [LeadingSpace] Loc=
semi ';' Loc=
int 'int' [StartOfLine] [LeadingSpace] Loc=
identifier 'six' [LeadingSpace] Loc=
equal '=' [LeadingSpace] Loc=
numeric_constant '6' [LeadingSpace] Loc=
semi ';' Loc=
identifier 'NSString' [StartOfLine] [LeadingSpace] Loc=
star '*' Loc=
identifier 'site' [LeadingSpace] Loc=
equal '=' [LeadingSpace] Loc=
l_square '[' [LeadingSpace] Loc=
l_square '[' Loc=
identifier 'NSString' Loc=
identifier 'alloc' [LeadingSpace] Loc=
r_square ']' Loc=
identifier 'initWithUTF8String' [LeadingSpace] Loc=
colon ':' Loc=
string_literal '"starming"' Loc=
r_square ']' Loc=
semi ';' Loc=
int 'int' [StartOfLine] [LeadingSpace] Loc=
identifier 'rank' [LeadingSpace] Loc=
equal '=' [LeadingSpace] Loc=
identifier 'initial' [LeadingSpace] Loc=
plus '+' [LeadingSpace] Loc=
identifier 'six' [LeadingSpace] Loc=
semi ';' Loc=
int 'int' [StartOfLine] [LeadingSpace] Loc=
identifier 'position' [LeadingSpace] Loc=
equal '=' [LeadingSpace] Loc=
identifier 'initial' [LeadingSpace] Loc=
plus '+' [LeadingSpace] Loc=
identifier 'rank' [LeadingSpace] Loc=
star '*' [LeadingSpace] Loc=
numeric_constant '60' [LeadingSpace] Loc=
semi ';' Loc=
identifier 'NSLog' [StartOfLine] [LeadingSpace] Loc=
l_paren '(' Loc=
at '@' Loc=
string_literal '"%@ rank %d"' Loc=
comma ',' Loc=
identifier 'site' [LeadingSpace] Loc=
comma ',' Loc=
identifier 'position' [LeadingSpace] Loc=
r_paren ')' Loc=
semi ';' Loc=
r_brace '}' [StartOfLine] [LeadingSpace] Loc=
return 'return' [StartOfLine] [LeadingSpace] Loc=
numeric_constant '0' [LeadingSpace] Loc=
semi ';' Loc=
r_brace '}' [StartOfLine] Loc=
eof '' Loc=
## 《编译原理》给的例子
position = initial + rate * 60
## 对应词法序列
<id,1> <=> <id,2> <+> <id,3> <*> <60>
## int position = initial + rank * 60;
int 'int' [StartOfLine] [LeadingSpace] Loc=
identifier 'position' [LeadingSpace] Loc=
equal '=' [LeadingSpace] Loc=
identifier 'initial' [LeadingSpace] Loc=
plus '+' [LeadingSpace] Loc=
identifier 'rank' [LeadingSpace] Loc=
star '*' [LeadingSpace] Loc=
numeric_constant '60' [LeadingSpace] Loc=
semi ';' Loc=
## 可以获得每个 token 的类型、值,还有类似 StartOfLine 的位置类型和 Loc=<main.m:11:1> 这样的具体位置。
## 和《编译原理》有点不同,attribute-value没有指向符号表中关于这个词法单元的条目
入门教程中的 Kaleidoscope 语言
//===----------------------------------------------------------------------===//
// Lexer
//===----------------------------------------------------------------------===//

// Token codes produced by the lexer. Each recognized token kind has one of the
// negative codes below; any other character is returned as its own character
// value in the range [0-255].
enum Token {
  // End of input.
  tok_eof = -1,

  // Commands.
  tok_def = -2,
  tok_extern = -3,

  // Primary expressions.
  tok_identifier = -4,
  tok_number = -5
};

// Spelling of the identifier; filled in when the token is tok_identifier.
static std::string IdentifierStr;
// Numeric value; filled in when the token is tok_number.
static double NumVal;
/// gettok - Return the next token from standard input.
///
/// Yields one of the negative Token codes for end of input, the `def` and
/// `extern` keywords, identifiers and numbers; every other character is
/// returned as its own character value. For identifiers (and keywords) the
/// spelling is left in IdentifierStr; for numbers the parsed value is left in
/// NumVal.
static int gettok() {
  // One character of lookahead that survives between calls. It starts out as
  // a space so that the first call goes straight to reading real input.
  static int Pending = ' ';

  // Discard leading whitespace.
  for (; isspace(Pending); Pending = getchar()) {
  }

  // Identifier or keyword: [a-zA-Z][a-zA-Z0-9]*
  if (isalpha(Pending)) {
    IdentifierStr.assign(1, static_cast<char>(Pending));
    for (Pending = getchar(); isalnum(Pending); Pending = getchar())
      IdentifierStr.push_back(static_cast<char>(Pending));

    // Reserved words are reported with their own token codes.
    if (IdentifierStr == "def")
      return tok_def;
    return IdentifierStr == "extern" ? tok_extern : tok_identifier;
  }

  // Number: [0-9.]+
  if (isdigit(Pending) || Pending == '.') {
    std::string Digits;
    // The guard above guarantees at least one iteration, so this consumes
    // exactly the run of digit and '.' characters.
    while (isdigit(Pending) || Pending == '.') {
      Digits.push_back(static_cast<char>(Pending));
      Pending = getchar();
    }
    NumVal = strtod(Digits.c_str(), 0);
    return tok_number;
  }

  // Comment: '#' up to the end of the line (or the end of the input).
  if (Pending == '#') {
    do {
      Pending = getchar();
    } while (!(Pending == EOF || Pending == '\n' || Pending == '\r'));

    // More input follows the comment, so lex the next real token.
    if (Pending != EOF)
      return gettok();
  }

  // End of input. The EOF is not consumed, so later calls report it again.
  if (Pending == EOF)
    return tok_eof;

  // Anything else is returned as its own character value.
  const int Current = Pending;
  Pending = getchar();
  return Current;
}
正则表达式(Regular Expression,RE)
是一种用来描述正则语言的更紧凑的表达方式。
每个正则表达式 r
定义(表示)一个语言,记为 L(r)
。这个语言也是根据 r
的子表达式所表示的语言递归定义的。
Σ 是给定的有限字符集
ε 是空串(empty string)
归纳基础:
ε 是一个正则表达式,L(ε) = {ε},即该语言只包含空串。
如果 a ∈ Σ,那么 a 是一个正则表达式,且 L(a) = {a},即这个语言仅包含一个长度为 1 的符号串 a。
归纳步骤:假设 r 和 s 都是正则表达式,表示的语言分别是 L(r) 和 L(s),则:
r|s 是一个正则表达式,L(r|s) = L(r)∪L(s)
rs(r 和 s 的连接)是一个正则表达式,L(rs) = L(r)L(s)
r* 是一个正则表达式,L(r*) = (L(r))*
(r) 是一个正则表达式,L((r)) = L(r),表明表达式的两边加上括号并不影响表达式所表示的语言
运算符的优先级(从高到低):*、连接、|
* 号代表字符可以不出现,也可以出现一次或者多次
正则表达式描述的规则人容易理解,但是要解析字符串,还需要将其转化为计算机程序能理解的模型。
有穷自动机(Finite Automata,FA)
是对一类处理系统建立的数学模型。这类系统具有一系列离散的输入输出信息和有穷数目的内部状态。
数学表达:
M = (S,Σ,δ,s0,F)
S:有穷状态集
Σ:字符表
δ:转换函数
s0:初始状态
F:结束/可接受状态集