As Wikipedia puts it, lexical analysis is the process in computer science of converting a sequence of characters into a sequence of tokens. A program or function that performs lexical analysis is called a lexical analyzer, or lexer.
Given the following source code:
add_result = 1 + 2
lexical analysis yields the following result:
NAME `add_result` 0, 0
SYMBOL `=` 0, 11
INT `1` 0, 13
SYMBOL `+` 0, 15
INT `2` 0, 17
Arranged as a table:
Token type | Literal | Row | Column |
---|---|---|---|
NAME | add_result | 0 | 0 |
SYMBOL | = | 0 | 11 |
INT | 1 | 0 | 13 |
SYMBOL | + | 0 | 15 |
INT | 2 | 0 | 17 |
We can easily implement a working lexer in Go.
Implementing a lexer in Go
package main
import (
"fmt"
"regexp"
"unicode/utf8"
"os"
)
var exprs = []string{"\\d+", "[\\p{L}\\d_]+", "[\\+\\-=]"}
var names = []string{"INT", "NAME", "SYMBOL"}
func main() {
rules := []*regexp.Regexp{}
for i, expr := range exprs {
		rule := regexp.MustCompile("^" + expr)
rules = append(rules, rule)
fmt.Println(names[i], rule)
}
fmt.Println("--------------------------------")
for row, code := range os.Args[1:] {
position := 0
col := 0
for true {
for position < len(code) && (code[position] == ' ' || code[position] == '\t') {
position += 1
col += 1
}
if position >= len(code) {
break
}
source := ""
tokenType := -1
for i, rule := range rules {
source = rule.FindString(code[position:])
if source != "" {
tokenType = i
break
}
}
if tokenType >= 0 {
fmt.Printf("%s\t`%s`\t%d\t%d\n", names[tokenType], source, row, col)
position += len(source)
col += utf8.RuneCountInString(source)
} else {
fmt.Printf("error in: %d, %d\n", row, col)
break
}
}
}
}
Run a test from the command line:
➜ go run lexer.go "数值 = PI + 100"
INT ^\d+
NAME ^[\p{L}\d_]+
SYMBOL ^[\+\-=]
--------------------------------
NAME `数值` 0 0
SYMBOL `=` 0 3
NAME `PI` 0 5
SYMBOL `+` 0 8
INT `100` 0 10
Explanation of the Go code
Import the required packages:
package main
import (
"fmt"
"regexp"
"unicode/utf8"
"os"
)
- fmt for formatted printing
- regexp for regular expressions
- unicode/utf8 for counting UTF-8 runes
- os for reading the user's command-line input
Define the regular expressions and the corresponding token type names:
var exprs = []string{"\\d+", "[\\p{L}\\d_]+", "[\\+\\-=]"}
var names = []string{"INT", "NAME", "SYMBOL"}
Two string slices hold the regular expressions and their corresponding token type names.
Initialize the token matching rules:
func main() {
rules := []*regexp.Regexp{}
for i, expr := range exprs {
		rule := regexp.MustCompile("^" + expr)
rules = append(rules, rule)
fmt.Println(names[i], rule)
}
Note that a ^ must be prepended to each regular expression. This anchors the match to the leftmost character of the remaining input and prevents the pattern from "skipping ahead" to a match further into the string.
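To see why the anchor matters, here is a minimal standalone sketch; the string "abc123" is only an illustrative input, not part of the lexer:
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Without ^ the pattern skips ahead and finds a match later in the string.
	fmt.Println(regexp.MustCompile(`\d+`).FindString("abc123")) // prints "123"
	// With ^ the pattern must match starting at the very first character, so there is no match here.
	fmt.Println(regexp.MustCompile(`^\d+`).FindString("abc123")) // prints ""
}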
Loop over the input and match tokens:
for row, code := range os.Args[1:] {
position := 0
col := 0
for true {
for position < len(code) && (code[position] == ' ' || code[position] == '\t') {
position += 1
col += 1
}
if position >= len(code) {
break
}
source := ""
tokenType := -1
for i, rule := range rules {
source = rule.FindString(code[position:])
if source != "" {
tokenType = i
break
}
}
if tokenType >= 0 {
fmt.Printf("%s\t`%s`\t%d\t%d\n", names[tokenType], source, row, col)
position += len(source)
col += utf8.RuneCountInString(source)
} else {
fmt.Printf("error in: %d, %d\n", row, col)
break
}
}
}
By iterating over os.Args[1:], each argument the user passes on the command line is treated as one line of code and run through lexical analysis.
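For example, passing two arguments should lex them as rows 0 and 1 (the rule listing at the top of the output is omitted here):
➜ go run lexer.go "a = 1" "b = 2"
NAME `a` 0 0
SYMBOL `=` 0 2
INT `1` 0 4
NAME `b` 1 0
SYMBOL `=` 1 2
INT `2` 1 4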
Skip (ignore) whitespace characters:
for position < len(code) && (code[position] == ' ' || code[position] == '\t') {
position += 1
col += 1
}
Because our regular expressions must match starting at the leftmost character, we first skip over whitespace characters, which usually carry no meaning.
Check whether the loop should end:
if position >= len(code) {
break
}
Try each matching rule in turn:
source := ""
tokenType := -1
for i, rule := range rules {
source = rule.FindString(code[position:])
if source != "" {
tokenType = i
break
}
}
We loop over the configured rules and attempt a match. On success, the rule's index is stored in tokenType; if no rule ever matches, tokenType keeps its default value of -1. Note that the order of the rules matters here: the NAME pattern [\p{L}\d_]+ would also match pure digits, so the INT rule must be tried before it.
Decide what to do next based on the match result:
if tokenType >= 0 {
fmt.Printf("%s\t`%s`\t%d\t%d\n", names[tokenType], source, row, col)
position += len(source)
col += utf8.RuneCountInString(source)
} else {
fmt.Printf("error in: %d, %d\n", row, col)
break
}
If tokenType is not -1, the match succeeded: we print the token type name, the literal, and the row and column, then advance position past the current token. Note that the increment of col, the column where the next token starts, must be obtained with the UTF-8 rune-counting function; otherwise the column would drift whenever a token contains multi-byte UTF-8 characters, because position counts bytes while col counts characters.
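A quick sketch of the difference between byte length and rune count; the sample string is only for illustration:
package main

import (
	"fmt"
	"unicode/utf8"
)

func main() {
	s := "数值" // two Chinese characters, six UTF-8 bytes
	fmt.Println(len(s))                    // 6: len counts bytes
	fmt.Println(utf8.RuneCountInString(s)) // 2: counts runes (characters)
}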
Python users can implement the same thing just as easily.
A lexer in Python
import re
import sys
exprs = ['\\d+', '\\w+', '[\\+\\-=]']
names = ['INT', 'NAME', 'SYMBOL']
def main():
rules = []
for i, expr in enumerate(exprs):
rules.append(re.compile('^' + expr))
print(names[i], rules[-1].pattern)
print('-' * 32)
for row, code in enumerate(sys.argv[1:]):
position = 0
while True:
while position < len(code) and (code[position] == ' ' or code[position] == '\t'):
position += 1
if position >= len(code):
break
source = ''
tokenType = -1
for i, rule in enumerate(rules):
result = rule.findall(code[position:])
if len(result) > 0:
source = result[0]
tokenType = i
break
if tokenType >= 0:
print(f'{names[tokenType]}\t`{source}`\t{row}\t{position}')
position += len(source)
else:
print(f'error in {row}, {position}')
break
if __name__ == "__main__":
main()
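Assuming the script is saved as lexer.py (the file name is an assumption), running it on the same input should produce the same tokens. Because Python 3 strings are sequences of characters rather than bytes, position already serves as the column number, so no separate rune counting is needed:
➜ python3 lexer.py "数值 = PI + 100"
INT ^\d+
NAME ^\w+
SYMBOL ^[\+\-=]
--------------------------------
NAME `数值` 0 0
SYMBOL `=` 0 3
NAME `PI` 0 5
SYMBOL `+` 0 8
INT `100` 0 10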
As a supplement, here is a C++ version as well.
A lexer in C++
#include <codecvt>
#include <cstdio>
#include <locale>
#include <regex>
#include <string>
#include <vector>
std::vector<std::wstring> exprs{L"\\d+", L"\\w+", L"[\\+\\-=]"};
std::vector<std::string> names{"INT", "NAME", "SYMBOL"};
int main(int argc, char *argv[]) {
std::locale old;
std::locale::global(std::locale("en_US.UTF-8"));
    std::wstring_convert<std::codecvt_utf8<wchar_t>> codecvt_utf8;
    std::vector<std::wregex> rules;
for (size_t i = 0, count = exprs.size(); i < count; ++i) {
rules.push_back(std::wregex(L"^" + exprs[i]));
printf("%s ^%s\n", names[i].c_str(), codecvt_utf8.to_bytes(exprs[i]).c_str());
}
printf("--------------------------------\n");
for (int row = 0; row < argc - 1; ++row) {
std::wstring code = codecvt_utf8.from_bytes(argv[row + 1]);
size_t position = 0;
while (true) {
while (position < code.size() && (code[position] == L' ' || code[position] == L'\t'))
position += 1;
if (position >= code.size())
break;
auto subcode = code.substr(position);
std::wsmatch match;
int tokenType = -1;
for (size_t i = 0, count = rules.size(); i < count; ++i) {
if (std::regex_search(subcode, match, rules[i])) {
tokenType = i;
break;
}
}
if (tokenType >= 0) {
auto source = match.str(0);
printf("%s\t`%s`\t%d\t%ld\n",
names[tokenType].c_str(), codecvt_utf8.to_bytes(source).c_str(), row, position);
position += source.size();
} else {
printf("error in: %d, %ld\n", row, position);
break;
}
}
}
std::locale::global(old);
return 0;
}
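One possible way to build and run it; the file name lexer.cpp and the compiler flags are assumptions, the en_US.UTF-8 locale must be available on the system, and std::wstring_convert is deprecated since C++17 but still usable:
➜ g++ -std=c++14 -o lexer lexer.cpp
➜ ./lexer "数值 = PI + 100"
The token output should match the Go version's, since positions here are indices into the wide string and therefore already count characters rather than bytes.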