Clang学习历程 编译过程-词法分析

前言

《编译原理》中提到

编译器的第一个步骤是词法分析(Lexical Analysis)或扫描。词法分析器读入组成源程序的字符流,并且将它们组织成为有意义的词素(lexeme)的序列。对于每个词素,词法分析产生如下形式的词法单元(token)作为输出:

<token-name, attribute-value>

token-name 是一个语法分析步骤要使用的抽象符号
attribute-value指向符号表中关于这个词法单元的条目

实验

// Example program for the lexical-analysis experiment: a handful of
// declarations, one Objective-C message send and one NSLog call.
int main(){
   
    @autoreleasepool {
   
        // Two integer operands used by the arithmetic below.
        int initial = 8;
        int six = 6;
        // Nested message send: allocate an NSString, then initialise it from a C string literal.
        NSString* site = [[NSString alloc] initWithUTF8String:"starming"];
        int rank = initial + six;
        // Multiplication binds tighter than addition: initial + (rank * 60).
        int position = initial + rank * 60;
        // Logs the string object (%@) followed by the computed integer (%d).
        NSLog(@"%@ rank %d", site, position);
    }
    return 0;
}
## 使用手工编译的clang执行如下指令
## -fmodules               Enable the 'modules' language feature
## This will make any modules-enabled software libraries available as modules as well as introducing any modules-specific syntax.
## -E                      Only run the preprocessor
## 只运行预处理器
## -Xclang <arg>      Pass <arg> to the clang compiler
## -dump-tokens  -- man clang / clang --help 都查不到
## 参考1
## http://clang.llvm.org/doxygen/namespaceclang_1_1driver_1_1options.html
## enum clang::driver::options::ClangFlags
## Flags specifically for clang options.

## 参考2
## Running the plugin
## Using the cc1 command line
## To run a plugin, the dynamic library containing the plugin registry must be loaded via the -load command line option. This will load all plugins that are registered, and you can select the plugins to run by specifying the -plugin option. Additional parameters for the plugins can be passed with -plugin-arg-<plugin-name>.
## Note that those options must reach clang’s cc1 process. There are two ways to do so:

## grep -r "dump-tokens" src/llvm/tools/clang
## src/llvm/tools/clang/include/clang/Driver/CC1Options.td:def dump_tokens : Flag<["-"], "dump-tokens">,
## 根据上面的两个参考链接 + grep的结果确定-dump-tokens应该就是这么来的
## grep -r "dump_tokens" src/llvm/tools/clang
## src/llvm/tools/clang/lib/Frontend/CompilerInvocation.cpp:    case OPT_dump_tokens:
## static InputKind ParseFrontendArgs(FrontendOptions &Opts, ArgList &Args,
##                                   DiagnosticsEngine &Diags,
##                                   bool &IsHeaderFile) {
## ...
## case OPT_dump_tokens:
##      Opts.ProgramAction = frontend::DumpTokens; break;                                    
~% /opt/llvm/bin/clang -fmodules -E -Xclang -dump-tokens main.m

结果如下

annot_module_include '#import <Foundation/Foundation.h>

int main(int argc, const char * argv[]) {
    @autoreleasepool {
        int initial = 8;
    '		Loc=
int 'int'	 [StartOfLine]	Loc=
identifier 'main'	 [LeadingSpace]	Loc=
l_paren '('		Loc=
int 'int'		Loc=
identifier 'argc'	 [LeadingSpace]	Loc=
comma ','		Loc=
const 'const'	 [LeadingSpace]	Loc=
char 'char'	 [LeadingSpace]	Loc=
star '*'	 [LeadingSpace]	Loc=
identifier 'argv'	 [LeadingSpace]	Loc=
l_square '['		Loc=
r_square ']'		Loc=
r_paren ')'		Loc=
l_brace '{'	 [LeadingSpace]	Loc=
at '@'	 [StartOfLine] [LeadingSpace]	Loc=
identifier 'autoreleasepool'		Loc=
l_brace '{'	 [LeadingSpace]	Loc=
int 'int'	 [StartOfLine] [LeadingSpace]	Loc=
identifier 'initial'	 [LeadingSpace]	Loc=
equal '='	 [LeadingSpace]	Loc=
numeric_constant '8'	 [LeadingSpace]	Loc=
semi ';'		Loc=
int 'int'	 [StartOfLine] [LeadingSpace]	Loc=
identifier 'six'	 [LeadingSpace]	Loc=
equal '='	 [LeadingSpace]	Loc=
numeric_constant '6'	 [LeadingSpace]	Loc=
semi ';'		Loc=
identifier 'NSString'	 [StartOfLine] [LeadingSpace]	Loc=
star '*'		Loc=
identifier 'site'	 [LeadingSpace]	Loc=
equal '='	 [LeadingSpace]	Loc=
l_square '['	 [LeadingSpace]	Loc=
l_square '['		Loc=
identifier 'NSString'		Loc=
identifier 'alloc'	 [LeadingSpace]	Loc=
r_square ']'		Loc=
identifier 'initWithUTF8String'	 [LeadingSpace]	Loc=
colon ':'		Loc=
string_literal '"starming"'		Loc=
r_square ']'		Loc=
semi ';'		Loc=
int 'int'	 [StartOfLine] [LeadingSpace]	Loc=
identifier 'rank'	 [LeadingSpace]	Loc=
equal '='	 [LeadingSpace]	Loc=
identifier 'initial'	 [LeadingSpace]	Loc=
plus '+'	 [LeadingSpace]	Loc=
identifier 'six'	 [LeadingSpace]	Loc=
semi ';'		Loc=
int 'int'	 [StartOfLine] [LeadingSpace]	Loc=
identifier 'position'	 [LeadingSpace]	Loc=
equal '='	 [LeadingSpace]	Loc=
identifier 'initial'	 [LeadingSpace]	Loc=
plus '+'	 [LeadingSpace]	Loc=
identifier 'rank'	 [LeadingSpace]	Loc=
star '*'	 [LeadingSpace]	Loc=
numeric_constant '60'	 [LeadingSpace]	Loc=
semi ';'		Loc=
identifier 'NSLog'	 [StartOfLine] [LeadingSpace]	Loc=
l_paren '('		Loc=
at '@'		Loc=
string_literal '"%@ rank %d"'		Loc=
comma ','		Loc=
identifier 'site'	 [LeadingSpace]	Loc=
comma ','		Loc=
identifier 'position'	 [LeadingSpace]	Loc=
r_paren ')'		Loc=
semi ';'		Loc=
r_brace '}'	 [StartOfLine] [LeadingSpace]	Loc=
return 'return'	 [StartOfLine] [LeadingSpace]	Loc=
numeric_constant '0'	 [LeadingSpace]	Loc=
semi ';'		Loc=
r_brace '}'	 [StartOfLine]	Loc=
eof ''		Loc=
## 《编译原理》给的例子
position = initial + rate * 60
## 对应词法序列
<id,1> <=> <id,2> <+> <id,3> <*> <60>

Clang学习历程 编译过程-词法分析_第1张图片

## int position = initial + rank * 60;
int 'int'	 [StartOfLine] [LeadingSpace]	Loc=
identifier 'position'	 [LeadingSpace]	Loc=
equal '='	 [LeadingSpace]	Loc=
identifier 'initial'	 [LeadingSpace]	Loc=
plus '+'	 [LeadingSpace]	Loc=
identifier 'rank'	 [LeadingSpace]	Loc=
star '*'	 [LeadingSpace]	Loc=
numeric_constant '60'	 [LeadingSpace]	Loc=
semi ';'		Loc=

## 可以获得每个 token 的类型、值,还有类似 StartOfLine 的位置类型和 Loc=<main.m:11:1> 这样的具体位置。
## 和《编译原理》有点不同,attribute-value没有指向符号表中关于这个词法单元的条目

实现

  1. 定义词元
  2. 遍历字符流 && 输出词元

案例

定义词元

入门教程中的Kaleidoscope语言

//===----------------------------------------------------------------------===//
// Lexer
//===----------------------------------------------------------------------===//

// Token codes produced by the lexer. A character the lexer does not
// recognise is returned as its own value in [0-255]; every recognised
// construct is reported as one of the five negative codes below.
enum Token {
  // end of input
  tok_eof = -1,

  // commands (keywords)
  tok_def = -2,
  tok_extern = -3,

  // primary
  tok_identifier = -4,
  tok_number = -5
};

// Spelling of the identifier most recently scanned by gettok(); meaningful
// when the returned token is tok_identifier.
static std::string IdentifierStr;
// Value of the numeric literal most recently scanned by gettok(); meaningful
// when the returned token is tok_number.
static double NumVal;
遍历字符流 && 输出词元
/// gettok - Return the next token from standard input.
///
/// Returns one of the negative Token codes (tok_eof, tok_def, tok_extern,
/// tok_identifier, tok_number) or, for any other character, the character's
/// own value. As a side effect IdentifierStr is set when an identifier or
/// keyword is scanned, and NumVal is set when a number is scanned.
/// One character of lookahead is kept in a function-local static between calls.
static int gettok() {
  static int LastChar = ' ';

  // Skip any whitespace.
  while (isspace(LastChar))
    LastChar = getchar();

  // Identifier or keyword: [a-zA-Z][a-zA-Z0-9]*
  if (isalpha(LastChar)) {
    IdentifierStr = static_cast<char>(LastChar);
    while (isalnum((LastChar = getchar())))
      IdentifierStr += static_cast<char>(LastChar);

    // Reserved words get their own token codes.
    if (IdentifierStr == "def")
      return tok_def;
    if (IdentifierStr == "extern")
      return tok_extern;
    return tok_identifier;
  }

  // Number: digits containing at most one '.'.
  // A second '.' ends the literal and is left in LastChar for the next call,
  // so input such as "1.2.3" is no longer swallowed whole while strtod only
  // converts the leading "1.2".
  if (isdigit(LastChar) || LastChar == '.') {
    std::string NumStr;
    bool SeenDot = false;
    do {
      if (LastChar == '.')
        SeenDot = true;
      NumStr += static_cast<char>(LastChar);
      LastChar = getchar();
    } while (isdigit(LastChar) || (LastChar == '.' && !SeenDot));

    NumVal = strtod(NumStr.c_str(), 0);
    return tok_number;
  }

  // Comment: '#' up to the end of the line.
  if (LastChar == '#') {
    do
      LastChar = getchar();
    while (LastChar != EOF && LastChar != '\n' && LastChar != '\r');

    // The comment produced no token; scan again for the one after it.
    if (LastChar != EOF)
      return gettok();
  }

  // Check for end of file.  Don't eat the EOF.
  if (LastChar == EOF)
    return tok_eof;

  // Otherwise, just return the character as its ascii value and advance
  // the lookahead past it.
  int ThisChar = LastChar;
  LastChar = getchar();
  return ThisChar;
}

理论

正则表达式

正则表达式(Regular Expression,RE)是一种用来描述正则语言的更紧凑的表达方式。

每个正则表达式 r 定义(表示)一个语言,记为 L(r)。这个语言也是根据 r 的子表达式所表示的语言递归定义的。

Σ 是给定的有限字符集
ε 是空串(empty string)

归纳基础:

  1. ε是一个正则表达式,L(ε) = {ε},即该语言只包含空串
  2. 如果 a ∈ Σ,那么a是一个正则表达式,且L(a) = {a}。即这个语言仅包含一个长度为1的符号串a

归纳步骤:假设 r 和 s 都是正则表达式,表示的语言分别是L(r)和L(s)

  1. r|s 是一个正则表达式,L(r|s) = L(r)∪L(s)
  2. rs(r 和 s 的连接)是一个正则表达式,L(rs)=L(r)L(s)
  3. r* 是一个正则表达式,L(r*) = (L(r))*
  4. (r)是一个正则表达式,L((r)) = L(r),表明表达式的两边加上括号并不影响表达式所表示的语言

运算符的优先级: * , 连接 , |
* 号代表字符可以不出现,也可以出现一次或者多次

有穷自动机

正则表达式描述的规则人容易理解,但是要解析字符串,还需要将其转化为计算机程序能理解的模型。

有穷自动机(Finite Automata,FA)是对一类处理系统建立的数学模型。这类系统具有一系列离散的输入输出信息和有穷数目的内部状态。

数学表达:

M = (S,Σ,δ,s0,F)

S: 有穷状态集
Σ: 字符表
δ: 转换函数
s0: 初始状态
F: 结束/可接受状态集

你可能感兴趣的:(iosugar,Clang,iOX,llvm)