python实现一个编译器_一个编译器最简前端的python实现(二)

全部代码在这里:

import queue

from typing import Set

# Augmented grammar as (head, body) pairs. A production's index in this list
# is the number used by the "r<n>" reduce actions in the parse table.
grammar = [
    ("startsup", ("start",)),
    ("start", ("stmt",)),
    ("stmt", ("if", "(", "C", ")", "S1", "else", "S2")),
]

# Terminal symbols; '$' is the end-of-input marker.
terminals = ("if", "(", "C", ")", "S1", "else", "S2", "$")

# Non-terminal symbols; "startsup" is the augmented start symbol.
n_terminals = ("startsup", "start", "stmt")

# Column order of the parse table: terminals first, then non-terminals.
all_symbols = terminals + n_terminals

class Item(object):
    """A canonical LR(1) item: a production, a dot position and a lookahead.

    :param symbol: str, the head (left part) of the production.
    :param body: sequence of str, the symbols of the right part of the
        production (the grammar table stores these as tuples).
    :param dot: int, index in ``body`` that the dot sits in front of.
    :param follow: str, the lookahead symbol for this configuration.
    """

    def __init__(self, symbol, body, dot, follow):
        self.symbol = symbol
        self.body = body
        self.pos = dot
        self.follow = follow

    def __str__(self):
        # Render the body with a marker inserted at the dot position.
        p = list(self.body)
        p.insert(self.pos, '◆')
        pr = ' '.join(p)
        return "[{}] {} -> {}".format(self.follow, self.symbol, pr)

    def __repr__(self):
        # Include the rendered item so containers of items are readable
        # when printed or inspected in a debugger.
        return "<Item {}>".format(self.__str__())

    def __eq__(self, other):
        if not isinstance(other, Item):
            return False
        return ((self.symbol == other.symbol) and
                (self.body == other.body) and
                (self.pos == other.pos) and
                (self.follow == other.follow))

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        # Hash exactly the fields __eq__ compares, so equal items always
        # share a hash without building a formatted string first.
        return hash((self.symbol, tuple(self.body), self.pos, self.follow))

class Closure(object):
    """A set of LR(1) items that together form one parser state.

    :param sets: the set of items belonging to this state.
    :param label: int state number, or None while the state is unnumbered.
    """

    # The annotation is a string so the class can be defined before (or
    # without) the Item class being importable.
    def __init__(self, sets: "Set[Item]", label: int = None):
        self.label = label
        self.sets = sets
        # Outgoing transitions: grammar symbol -> label of the target state.
        self.goto = dict()  # type: dict[str, int]

    def __len__(self):
        # An empty closure is falsy, which callers use to skip dead gotos.
        return len(self.sets)

    def __iter__(self):
        return self.sets.__iter__()

    def __str__(self):
        return "\n".join([i.__str__() for i in self.sets])

    def __repr__(self):
        return ":{}\n{}\n\n".format(self.label,
                                    self.__str__())

    def __eq__(self, other):
        # Two closures are the same state when they hold the same items;
        # label and goto table do not take part in the comparison.
        if isinstance(other, Closure):
            return self.sets == other.sets
        return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        # Equal sets may iterate in different orders, so hashing the joined
        # text could give equal closures different hashes. A frozenset hash
        # is order-independent and therefore consistent with __eq__.
        return hash(frozenset(self.sets))

    def __contains__(self, item):
        return item in self.sets

def isnterm(symbol):
    """Tell whether ``symbol`` is listed among the grammar's non-terminals."""
    listed = symbol in n_terminals
    return listed

def isterm(symbol):
    """Tell whether ``symbol`` is listed among the grammar's terminals."""
    listed = symbol in terminals
    return listed

def produce_epsilon(none_terminal):
    """Tell whether some production of ``none_terminal`` has body 'EPSILON'.

    Only a body that is exactly the value 'EPSILON' counts; a tuple that
    merely contains it does not.
    """
    bodies = [rule[1] for rule in grammar if rule[0] == none_terminal]
    return 'EPSILON' in bodies

def first(symbol):
    """Return the set of terminals that can begin a string derived from symbol.

    :param symbol: str, a terminal or non-terminal of the grammar.
    :return: set of str. For a terminal this is ``{symbol}``. For a
        non-terminal it is built from its productions and contains
        'EPSILON' when the non-terminal has an epsilon production.

    Self-references (``A -> A ...``) are skipped, but indirect left
    recursion is not detected and would recurse without end.
    """
    if isterm(symbol):
        # A set literal, not set(symbol): set("if") would be {'i', 'f'}.
        return {symbol}

    first_sets = set()

    if produce_epsilon(symbol):
        # add() keeps the marker whole; union('EPSILON') would add letters.
        first_sets.add('EPSILON')

    # Checked independently of the epsilon test so that a nullable
    # non-terminal still contributes its non-empty alternatives.
    if isnterm(symbol):
        for rule in grammar:
            if rule[0] != symbol:
                continue
            body = rule[1]
            if body == 'EPSILON':
                # Already accounted for above; it has no symbols to scan.
                continue
            # Walk the body left to right while the prefix can vanish.
            for part in body:
                if part != symbol:
                    first_sets = first_sets.union(first(part))
                if not produce_epsilon(part):
                    break

    return first_sets

def firsts(suffix):
    """Return the FIRST set of a non-empty sequence of grammar symbols.

    The leading symbol's FIRST set is always included; the rest of the
    sequence contributes only when the leading symbol has an epsilon
    production.
    """
    head = suffix[0]
    if len(suffix) == 1 or not produce_epsilon(head):
        return first(head)
    return first(head).union(firsts(suffix[1:]))

def get_closure(cl: Closure, label: int) -> Closure:
    """Expand a set of kernel items into the full LR(1) closure.

    For every item whose dot stands before a non-terminal, each production
    of that non-terminal is added with the dot at position 0, once per
    lookahead in FIRST(symbols after the non-terminal + item lookahead).

    :param cl: Closure holding the starting items.
    :param label: state number to give the resulting closure (may be None).
    :return: a new Closure containing the starting and implied items.
    """

    def next_nonterminal(item):
        # The symbol right after the dot, if it is a non-terminal.
        if item.pos < len(item.body):
            candidate = item.body[item.pos]
            if isnterm(candidate):
                return candidate
        return None

    item_set = set(cl.sets)
    # Worklist of items whose implied items still have to be generated.
    pending = list(item_set)

    while pending:
        item = pending.pop()
        symbol = next_nonterminal(item)
        if not symbol:
            continue

        products = [rule for rule in grammar if rule[0] == symbol]
        # Wrap the lookahead as a one-element tuple: tuple(item.follow)
        # would split a multi-character terminal into single characters.
        suffix = item.body[item.pos + 1:] + (item.follow,)
        termins = firsts(suffix)

        for product in products:
            for terminal in termins:
                new_item = Item(symbol, product[1], 0, terminal)
                if new_item not in item_set:
                    item_set.add(new_item)
                    pending.append(new_item)

    return Closure(item_set, label)

def goto(clos: Closure, letter: str) -> Closure:
    """Compute the state reached from ``clos`` after reading ``letter``.

    :param clos: the current closure.
    :param letter: the grammar symbol being consumed.
    :return: the unlabelled closure of all items of ``clos`` that had the
        dot in front of ``letter``, with the dot moved one symbol forward.
        The result is empty when no item expects ``letter``.
    """
    advanced = {
        Item(it.symbol, it.body, it.pos + 1, it.follow)
        for it in clos.sets
        if it.pos < len(it.body) and it.body[it.pos] == letter
    }
    return get_closure(Closure(advanced), label=None)

def closure_groups():
    """Build every LR(1) state reachable from the augmented start item.

    States are discovered breadth-first. Each new state gets the next
    integer label, and every transition is recorded in the source state's
    ``goto`` mapping (grammar symbol -> target label).

    :return: set of labelled Closure objects; the start state has label 0.
    """
    label = 0
    start_item = Item('startsup', ('start',), 0, '$')
    start = get_closure(Closure({start_item}), label)

    group = {start}
    # Index the known states by their item set. The lookup is then
    # independent of item iteration order and needs no linear rescan.
    known = {frozenset(start.sets): start}

    q = queue.Queue()
    q.put(start)

    while not q.empty():
        c = q.get()
        for literal in all_symbols:  # terminals + n_terminals
            go_clos = goto(c, literal)
            if not go_clos:
                # No item of c expects this symbol.
                continue

            key = frozenset(go_clos.sets)
            target = known.get(key)
            if target is None:
                label += 1
                go_clos.label = label
                known[key] = go_clos
                group.add(go_clos)
                q.put(go_clos)
                target = go_clos

            # Record the edge unconditionally; a truthiness test on the
            # label would drop transitions into state 0.
            c.goto[literal] = target.label

    return group

def get_states_map(closure_group):
    """Turn the labelled states into the parse table.

    :param closure_group: collection of labelled closures.
    :return: list indexed by state label; each entry is a row with one
        cell per symbol of ``all_symbols``. Cells hold "." (no action),
        "s<n>" (shift), "<n>" (goto), "r<n>" (reduce) or "$" (accept).
    """

    def get_state_map(closure):
        """Build the table row for one state."""
        row = ["."] * len(all_symbols)

        # Shift actions for terminals and GOTO entries for non-terminals.
        for symbol, target in closure.goto.items():
            column = all_symbols.index(symbol)
            # Only fill the cell when some item has the dot before symbol.
            expected = any(
                it.pos < len(it.body) and it.body[it.pos] == symbol
                for it in closure
            )
            if not expected:
                continue
            if symbol in n_terminals:
                row[column] = str(target)
            elif symbol in terminals:
                row[column] = "s" + str(target)

        # Reduce actions: completed items, entered under their lookahead.
        # The augmented start production is handled as "accept" below.
        for it in closure:
            if it.pos != len(it.body) or it.symbol == 'startsup':
                continue
            for column, symbol in enumerate(all_symbols):
                if it.follow == symbol:
                    production_num = grammar.index((it.symbol, it.body))
                    row[column] = 'r' + str(production_num)

        # Accept when the state contains 'startsup -> start. , $'.
        if Item('startsup', ('start',), 1, '$') in closure:
            row[all_symbols.index('$')] = '$'

        return row

    state_map = [None] * len(closure_group)
    for closure in closure_group:
        state_map[closure.label] = get_state_map(closure)
    return state_map

def generate_syntax_table():
    """Build the states of the grammar and return their parse table."""
    return get_states_map(closure_groups())

看下结果:

from parser import *

# Build the parse table: one row per state label, one cell per entry of
# all_symbols. Evaluating `n` in the REPL then displays the table.
n = generate_syntax_table()

n

state if ( C ) S1 else S2 $ startsup start stmt

0 s1 . . . . . . . . 2 3

1 . s4 . . . . . . . . .

2 . . . . . . . $ . . .

3 . . . . . . . r1 . . .

4 . . s5 . . . . . . . .

5 . . . s6 . . . . . . .

6 . . . . s7 . . . . . .

7 . . . . . s8 . . . . .

8 . . . . . . s9 . . . .

9 . . . . . . . r2 . . .

语法分析和翻译

语法分析

语法分析器在一个状态栈上工作,这个栈存储了移入的状态,它代表了已经输入,尚未规约的词法单元。语法分析器对token_stream(经过词法器解析后的代码)的词法单元逐个进行4种操作。分析器在分析开始前移入状态0。分析器以状态栈上的最后一个状态(栈顶)为当前状态,并且根据输入字符查分析表,来获得当前操作。

四种分析操作:移入,将目标状态移入到状态栈顶。进入下一个词法单元。

规约,按目标产生式进行规约,当前词法单元不变,继续查表进行下一个操作,直到当前词法单元被移入。

接受,当前状态含有增广文法开始符号产生式的项 [startsup -> start◆, '$'],且当前输入为 '$' 时,分析成功,进入接受状态并结束。

错误, 目前我们忽略错误处理。

代码如下:

class SDT:
    """Table-driven LR parser working on a stack of states.

    ``state_stack`` starts with state 0 and ``accept`` becomes True once
    the accept action has been reached.
    """

    def __init__(self):
        self.syntax_table = generate_syntax_table()
        self.state_stack = [0]
        self.accept = False

    def get_action(self, state, literal):
        """Return the table cell for ``state`` under symbol ``literal``."""
        return self.syntax_table[state][all_symbols.index(literal)]

    def ahead(self, token):
        """Process one token: apply pending reductions, then shift or accept.

        :param token: object whose ``typ`` attribute names a grammar symbol.
        :raises SyntaxError: when the table has no action for the token.
        """
        # A reduce does not consume the token, so keep consulting the table
        # with the same token until it is shifted or accepted.
        while True:
            action = self.get_action(self.state_stack[-1], token.typ)

            if action[0] == 's':
                # shift: push the target state
                self.state_stack.append(int(action[1:]))
                return

            if action[0] == '$':
                self.accept = True  # success
                return

            if action[0] != 'r':
                raise SyntaxError(f"Not a correct token '{token.__str__()}'.")

            # reduce: look up the production by its number in grammar
            head, body = grammar[int(action[1:])]
            # pop one state per symbol of the production body
            for _ in body:
                self.state_stack.pop()
            # push the state given by GOTO(top of stack, head)
            state = self.get_action(self.state_stack[-1], head)
            self.state_stack.append(int(state))

    def parse(self, token_stream):
        """Feed every token of ``token_stream`` to the parser, then '$'.

        :param token_stream: any iterable of tokens.
        """
        # A plain for-loop accepts any iterable and ends only when the
        # stream itself is exhausted, never on a StopIteration raised
        # while a token is being processed.
        for token in token_stream:
            self.ahead(token)
        # Append "$" to the end of the token stream to match the augmented
        # grammar. NOTE(review): Token is not defined in this listing;
        # presumably it comes from the tokenizer module.
        self.ahead(Token("$", "$"))

它接受一个词法单元流,并且分析,如果分析成功,accept就设置为True

from tokenizer import tokenizer

# Tokenize the sample statement, run the parser over the tokens, then
# inspect the accept flag (True after a successful parse).
token_stream = tokenizer("if (C) S1 else S2")

sdt = SDT()

sdt.parse(token_stream)

sdt.accept

Out[8]: True

翻译方案

翻译方案一般插入到分析过程当中。

每个非终结符号都会形成一个函数,我们这里暂时在代码中预定义好非终结符号的翻译函数。

因为LR分析器是自底向上进行规约的,而在移入的时候并不判断目前在哪个产生式的内部,因此翻译方案用后缀翻译来实现,就是在规约的时候翻译。产生式头部的名称作为函数名,规约的内容作为参数来进行调用,向上返回函数的结果。

建立一个参数栈:

self.arg_stack = []

token在移入的时候作为值移入到栈中。

self.push_arg(token)

规约时,将值移出,作为规约函数的参数。返回的结果,就是非终结符号的值,移入到栈中。

# Translation step of a reduce (excerpt from the body of SDT.ahead):
# pop one value per symbol of the production body, restoring their
# left-to-right order by inserting each popped value at the front.

args = []

for _ in body:

arg = self.arg_stack.pop()

args.insert(0, arg)

# Call the module-level function named after the production head with the
# collected values, then push its result as the value of the head.

translation = globals().get(head).__call__(*args)

self.arg_stack.append(translation)

然而后缀翻译方案只适用于综合属性(S属性),对于继承属性并不适用。比如 stmt -> if (C) S1 else S2 大致会形成如下翻译方案:

C.code

S1.code

goto stmt.next

label L1

S2.code

其中,stmt.next 由外部传入,是stmt作为产生式的体时的继承属性,LL分析器通过预测分析表已经获取了头部,所以可以预先分配一个值。这里由于分析器是规约方式的,因此尚不知道继承属性的值。一般采取用一个空产生式来替代翻译内容并先生成继承属性的方法来解决,不过会带来语法分析时的复杂性。

我们在这里采用延迟调用的方法,就是 stmt 规约完成后并不直接返回翻译的字符串值(因为还有一些属性不知道), 而是返回一个函数,通过将未知的内容包装成参数向上返回,在进行规约 start -> stmt 时, 再将start 生成的必要值作为参数来调用 stmt 规约的返回值,就可以获得正确的翻译方案了。

def stmt(IF, LPAR, c, RPAR, s1, ELSE, s2):
    """Translation for ``stmt -> if ( C ) S1 else S2``.

    The code cannot be produced at reduce time because the label that
    follows the statement is not known yet. A function is returned
    instead; calling it with that label yields the intermediate code.
    """
    template = "\n\n{}\n\n{}\n\ngoto {}\n\nlabel {}\n\n{}"

    def call(next_label):
        # The false-branch label is allocated first, then the three parts
        # are generated in source order.
        false_label = get_label()
        cond_part = c.code(f_cond=false_label)
        then_part = s1.code()
        else_part = s2.code()
        return template.format(
            cond_part, then_part, next_label, false_label, else_part)

    return call

添加对结束状态的处理,和一些其他必要动作。这样,分析和翻译方案就变成了:

class SDT:
    """Table-driven LR parser that translates while it reduces.

    ``state_stack`` holds parser states and ``arg_stack`` holds the values
    of the symbols recognised so far: shifted tokens, and the results of
    the translation functions called on each reduce. On accept the final
    translation is stored in ``translation`` and ``accept`` is set.
    """

    def __init__(self):
        self.syntax_table = generate_syntax_table()
        self.state_stack = [0]
        self.arg_stack = []
        self.accept = False
        self.translation = ''

    def get_action(self, state, literal):
        """Return the table cell for ``state`` under symbol ``literal``."""
        return self.syntax_table[state][all_symbols.index(literal)]

    def ahead(self, token):
        """Process one token: apply pending reductions, then shift or accept.

        :param token: object whose ``typ`` attribute names a grammar symbol.
        :raises SyntaxError: when the table has no action for the token.
        :raises NameError: when a production head has no module-level
            translation function.
        """
        # A reduce does not consume the token, so keep consulting the table
        # with the same token until it is shifted or accepted.
        while True:
            action = self.get_action(self.state_stack[-1], token.typ)

            if action[0] == 's':
                # shift: push the target state and the token's value
                self.state_stack.append(int(action[1:]))
                self.push_arg(token)
                return

            if action[0] == '$':
                # accept: the value on top of the stack is that of 'start'
                self.translation = startsup(self.arg_stack[-1])
                self.accept = True  # success
                print('SUCCESS')
                print(self.translation)
                return

            if action[0] != 'r':
                raise SyntaxError(f"Not a correct token '{token.__str__()}'.")

            # reduce: look up the production by its number in grammar
            head, body = grammar[int(action[1:])]

            # pop one state per symbol of the production body
            for _ in body:
                self.state_stack.pop()
            # push the state given by GOTO(top of stack, head)
            state = self.get_action(self.state_stack[-1], head)
            self.state_stack.append(int(state))

            # translation: collect the body's values in left-to-right order
            args = [self.arg_stack.pop() for _ in body]
            args.reverse()
            # The translation function is the module-level callable named
            # after the production head.
            handler = globals().get(head)
            if handler is None:
                raise NameError(
                    f"no translation function defined for '{head}'")
            self.arg_stack.append(handler(*args))

    def parse(self, token_stream):
        """Feed every token of ``token_stream`` to the parser, then '$'.

        :param token_stream: any iterable of tokens.
        """
        # A plain for-loop accepts any iterable and ends only when the
        # stream itself is exhausted, never on a StopIteration raised
        # while a token is being processed.
        for token in token_stream:
            self.ahead(token)
        # Append "$" to the end of the token stream to match the augmented
        # grammar. NOTE(review): Token is not defined in this listing;
        # presumably it comes from the tokenizer module.
        self.ahead(Token("$", "$"))

    def push_arg(self, token):
        """Attach a placeholder ``code`` generator to C/S1/S2 tokens and
        push the token onto the argument stack."""
        if token.typ == 'C':
            token.code = lambda f_cond: 'Ccode Cfalse = {}'.format(f_cond)
        elif token.typ == 'S1':
            token.code = lambda: 'S1code'
        elif token.typ == 'S2':
            token.code = lambda: 'S2code'
        self.arg_stack.append(token)

# Every label handed out so far, in allocation order.
all_labels = []


def get_label():
    """Allocate and return a fresh label: 'L0', 'L1', ... in call order."""
    fresh = f"L{len(all_labels)}"
    all_labels.append(fresh)
    return fresh

def stmt(IF, LPAR, c, RPAR, s1, ELSE, s2):
    """Translation for ``stmt -> if ( C ) S1 else S2``.

    The code cannot be produced at reduce time because the label that
    follows the statement is not known yet. A function is returned
    instead; calling it with that label yields the intermediate code.
    """
    template = "\n\n{}\n\n{}\n\ngoto {}\n\nlabel {}\n\n{}"

    def call(next_label):
        # The false-branch label is allocated first, then the three parts
        # are generated in source order.
        false_label = get_label()
        cond_part = c.code(f_cond=false_label)
        then_part = s1.code()
        else_part = s2.code()
        return template.format(
            cond_part, then_part, next_label, false_label, else_part)

    return call

def start(stmt):
    """Translation for ``start -> stmt``.

    ``stmt`` is the deferred translation of the statement. The returned
    function allocates the label that follows the statement and passes it
    to ``stmt`` to obtain the finished code.
    """
    def call():
        return stmt(get_label())

    return call

def startsup(f):
    """Translation for the augmented start symbol: run the deferred
    translation ``f`` and return whatever it produces."""
    result = f()
    return result

运行一下,

from parser import SDT

from tokenizer import tokenizer

# Tokenize the sample statement and parse it; on accept the parser prints
# 'SUCCESS' followed by the generated intermediate code.
token_stream = tokenizer('if (C) S1 else S2')

sdt = SDT()

sdt.parse(token_stream)

成功翻译:

Ccode Cfalse = L1

S1code

goto L0

label L1

S2code

这是个简陋的过程,但是核心功能完整,我们可以在之后的过程中,逐步完善它。

通常,词法规则和语法规则是由单独的文件定义的。所以需要对词法规则和语法规则进行解析的构件,来完成从源文本到python对象的转换。翻译方案通常嵌入到语法规则中。

错误处理可以在适当的情况引入到编译过程当中。

另外,二义性文法,空产生式等情况的转换在语法添加的过程当中会浮现。 当然还有为语法规则添加基本的语句,使之逐渐成为一个完善的编译前端。

不论如何,我们已经完成了编译前端从源语言到目标语言的全部流程,是一个成功的开始。

你可能感兴趣的:(python实现一个编译器)