全部代码在这里:
import queue
from typing import Optional, Set
# Augmented grammar: list of (head, body) productions.
# "startsup" is the extra start symbol so acceptance can be detected on
# the single item [startsup -> start◆, '$'].
grammar = [("startsup", ("start", )),
           ("start", ("stmt", )),
           ("stmt", ("if", "(", "C", ")", "S1", "else", "S2")),
           ]
# Terminal symbols; '$' is the end-of-input marker.
terminals = ("if", "(", "C", ")", "S1", "else", "S2", '$')
# Nonterminal symbols.
n_terminals = ("startsup", "start", "stmt")
# Column order of each parsing-table row.
all_symbols = terminals + n_terminals
class Item(object):
    """The Canonical LR(1) Item definition.

    :param symbol: str, the left part of production.
    :param body: tuple, the right part of production.
    :param dot: int, current position in the item.
    :param follow: str, possible input for the current configuration.
    """

    def __init__(self, symbol, body, dot, follow):
        self.symbol = symbol
        self.body = body
        self.pos = dot
        self.follow = follow

    def __str__(self):
        # Render the production with '◆' marking the dot position.
        p = list(self.body)
        p.insert(self.pos, '◆')
        pr = ' '.join(p)
        return "[{}] {} -> {}".format(self.follow, self.symbol, pr)

    def __repr__(self):
        # BUGFIX: the original was `"\n".format(...)` — the format string
        # had no placeholder, so every Item printed as a blank line.
        return "{}\n".format(self.__str__())

    def __eq__(self, other):
        if isinstance(other, Item):
            return ((self.symbol == other.symbol) and
                    (self.body == other.body) and
                    (self.pos == other.pos) and
                    (self.follow == other.follow))
        else:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        # __str__ uniquely encodes (follow, symbol, body, pos), so this
        # hash is consistent with __eq__.
        return hash(self.__str__())
class Closure(object):
    """A set of LR(1) Items plus the GOTO transitions found for it.

    :param sets: the Items belonging to this closure.
    :param label: the state number, assigned by closure_groups().
    """

    def __init__(self, sets: Set["Item"], label: Optional[int] = None):
        self.label = label
        self.sets = sets
        self.goto = dict()  # symbol -> target state label

    def __len__(self):
        return len(self.sets)

    def __iter__(self):
        return self.sets.__iter__()

    def __str__(self):
        return "\n".join([i.__str__() for i in self.sets])

    def __repr__(self):
        return ":{}\n{}\n\n".format(self.label,
                                    self.__str__())

    def __eq__(self, other):
        # BUGFIX: guard the type, so comparing against a non-Closure
        # returns False instead of raising AttributeError.
        if isinstance(other, Closure):
            return self.sets == other.sets
        return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        # BUGFIX: hash the frozen item set rather than __str__, whose
        # line order follows set iteration order; equal closures must
        # hash equally for the `in group` checks to be reliable.
        return hash(frozenset(self.sets))

    def __contains__(self, item):
        return item in self.sets
def isnterm(symbol):
    """Return True when *symbol* is one of the grammar's nonterminals."""
    return any(nt == symbol for nt in n_terminals)
def isterm(symbol):
    """Return True when *symbol* is one of the grammar's terminals."""
    return any(t == symbol for t in terminals)
def produce_epsilon(none_terminal):
    """Return True if *none_terminal* has an epsilon production.

    NOTE(review): production bodies are tuples elsewhere in this file,
    so this matches only a body that is literally the string 'EPSILON' —
    confirm that is the intended encoding of epsilon productions.
    """
    bodies = [body for head, body in grammar if head == none_terminal]
    return 'EPSILON' in bodies
def first(symbol):
    """Return the FIRST set: terminals that may begin a string derived
    from *symbol*.

    BUGFIX: the original used set(symbol) / union('EPSILON'), which
    split multi-character names into characters (set('if') == {'i','f'});
    it also chained with elif, so an epsilon-producing nonterminal never
    contributed the FIRST sets of its other productions.  Both slips were
    masked in this grammar because the computed lookaheads are all '$'.
    """
    first_sets = set()
    if isterm(symbol):
        return {symbol}
    if produce_epsilon(symbol):
        first_sets.add('EPSILON')
    if isnterm(symbol):
        for head, body in grammar:
            if head == symbol:
                epsilons = True
                current = 0
                while epsilons is True and current < len(body):
                    # Skip direct left recursion to avoid infinite descent.
                    if body[current] != symbol:
                        first_sets = first_sets.union(first(body[current]))
                    if not produce_epsilon(body[current]):
                        epsilons = False
                    current += 1
    return first_sets
def firsts(suffix):
    """Return the FIRST set of a sentential-form *suffix* (a sequence
    of symbols), walking left to right and continuing to the next symbol
    whenever the current one can derive epsilon.
    """
    # Robustness fix: an empty suffix has an empty FIRST set; the
    # original recursed into suffix[0] and raised IndexError when every
    # symbol of a multi-symbol suffix derived epsilon.
    if not suffix:
        return set()
    if len(suffix) == 1:
        return first(suffix[0])
    if not produce_epsilon(suffix[0]):
        return first(suffix[0])
    return first(suffix[0]).union(firsts(suffix[1:]))
def get_closure(cl: Closure, label: int) -> Closure:
    """get all Item of a Closure from given Items, by adding implied Items.

    The implied Items are the productions of the None terminals after the
    current position, which put a dot on the head.
    """
    def get_nterm(item):
        # The nonterminal immediately after the dot, or None.
        pos, prod = (item.pos, item.body)
        if pos < len(prod):
            symbol = prod[pos]
            if isnterm(symbol):
                return symbol
        return None

    item_set = set()
    q = queue.Queue()
    for i in cl.sets:
        item_set.add(i)
        q.put(i)
    while not q.empty():
        item = q.get()
        symbol = get_nterm(item)
        if symbol:
            products = [i for i in grammar if i[0] == symbol]
            # Lookaheads come from FIRST(beta a) for an item [A -> α.Bβ, a].
            # BUGFIX: append the follow as one symbol; tuple(item.follow)
            # split a multi-character follow such as 'if' into ('i', 'f').
            suffix = item.body[item.pos+1:] + (item.follow,)
            termins = firsts(suffix)
            for product in products:
                for terminal in termins:
                    new_item = Item(symbol, product[1], 0, terminal)
                    if new_item not in item_set:
                        item_set.add(new_item)
                        q.put(new_item)
    c = Closure(item_set, label)
    return c
def goto(clos: Closure, letter: str) -> Closure:
    """a closure that could get from the current closure by input a letter.

    :param clos: the current closure.
    :param letter: the input letter.
    :return: Closure.
    """
    # Advance the dot over `letter` in every item expecting it, then
    # complete the resulting kernel into a full (unlabeled) closure.
    moved = set()
    for item in clos.sets:
        dot, prod = (item.pos, item.body)
        if dot < len(prod) and prod[dot] == letter:
            moved.add(Item(item.symbol, item.body, item.pos + 1, item.follow))
    return get_closure(Closure(moved), label=None)
def closure_groups():
    """Build the canonical collection of LR(1) closures (the states).

    Returns a set of Closure objects, each labeled with its state number
    and carrying its GOTO transitions in closure.goto.
    """
    def find_label(closure, group):
        # State number of an already-known equal closure, or None.
        for i in group:
            if closure == i:
                return i.label
        return None

    group = set()
    label = 0
    start_item = Item('startsup', ('start',), 0, '$')
    start = get_closure(Closure({start_item}), label)
    q = queue.Queue()
    q.put(start)
    group.add(start)
    while not q.empty():
        c = q.get()
        for literal in all_symbols:
            go_clos = goto(c, literal)
            if go_clos:  # empty closure is falsy -> no transition
                if go_clos not in group:
                    label += 1
                    go_clos.label = label
                    q.put(go_clos)
                    group.add(go_clos)
                    c.goto[literal] = label
                else:
                    go_label = find_label(go_clos, group)
                    # BUGFIX: `if go_label:` silently dropped transitions
                    # into state 0, because the label 0 is falsy.
                    if go_label is not None:
                        c.goto[literal] = go_label
    return group
def get_states_map(closure_group):
    """Build the LR(1) parsing table: one row per state, one column per
    symbol in all_symbols.

    Cell encoding: '.' no action, 'sN' shift to state N, 'rN' reduce by
    production N, bare digits GOTO for a nonterminal, '$' accept.
    """
    def get_state_map(closure):
        """table row like all_symbols list state maps."""
        row = ["." for i in all_symbols]
        # None terminals GOTO action and Terminals shift action.
        # (renamed `input` -> `sym`: never shadow the builtin input())
        for sym, goto_label in closure.goto.items():
            row_pos = all_symbols.index(sym)
            for item in closure:
                if item.pos < len(item.body):  # shape like [A -> ⍺.aβ b]
                    if item.body[item.pos] == sym:
                        # None terminals GOTO state
                        if sym in n_terminals:
                            row[row_pos] = str(goto_label)
                        # Terminals action shift state
                        elif sym in terminals:
                            row[row_pos] = "s" + str(goto_label)
        # Terminals reduce action. shape like [A -> ⍺. a]
        for row_pos, sym in enumerate(all_symbols):
            for item in closure:
                if item.pos == len(item.body) and \
                        item.follow == sym and \
                        item.symbol != 'startsup':
                    production_num = grammar.index((item.symbol, item.body))
                    row[row_pos] = 'r' + str(production_num)
        # accept condition 'startsup -> start. , $'
        acc_item = Item('startsup', ('start',), 1, '$')
        if acc_item in closure:
            row[all_symbols.index('$')] = '$'
        return row

    state_map = [None for i in range(len(closure_group))]
    for closure in closure_group:
        state_map[closure.label] = get_state_map(closure)
    return state_map
def generate_syntax_table():
    """Convenience wrapper: build the closure group, derive the table."""
    return get_states_map(closure_groups())
看下结果:
from parser import *
n = generate_syntax_table()
n
state if ( C ) S1 else S2 $ startsup start stmt
0 s1 . . . . . . . . 2 3
1 . s4 . . . . . . . . .
2 . . . . . . . $ . . .
3 . . . . . . . r1 . . .
4 . . s5 . . . . . . . .
5 . . . s6 . . . . . . .
6 . . . . s7 . . . . . .
7 . . . . . s8 . . . . .
8 . . . . . . s9 . . . .
9 . . . . . . . r2 . . .
语法分析和翻译
语法分析
语法分析器在一个状态栈上工作,这个栈存储了移入的状态,它代表了已经输入,尚未规约的词法单元。语法分析器对token_stream(经过词法器解析后的代码)的词法单元逐个进行4种操作。分析器在分析开始前移入状态0。分析器以状态栈上的最后一个状态(栈顶)为当前状态,并且根据输入字符查分析表,来获得当前操作。
四种分析操作:移入,将目标状态移入到状态栈顶。进入下一个词法单元。
规约,规约目标产生式,当前词法单元不变,继续查表进行下一个操作,直到当前词法单元被移入。
接受,在含有增广文法开始符号产生式的项 [startsup -> start◆, '$'],如果当前输入为 '$', 分析成功进入接受状态,并结束。
错误, 目前我们忽略错误处理。
代码如下:
class SDT:
    """LR(1) parser driver (syntax analysis only, no translation).

    Keeps a stack of states; shifts and reduces according to the table
    produced by generate_syntax_table().  State 0 is pushed before
    parsing begins.
    """

    def __init__(self):
        self.syntax_table = generate_syntax_table()
        self.state_stack = [0]
        self.accept = False

    def get_action(self, state, literal):
        """Look up the table action for (state, input symbol)."""
        return self.syntax_table[state][all_symbols.index(literal)]

    def ahead(self, token):
        """Apply one table action for *token* at the top-of-stack state."""
        action = self.get_action(self.state_stack[-1], token.typ)
        # shift action push a current state into state_stack
        if action[0] == 's':
            current_state = int(action[1:])
            self.state_stack.append(current_state)
        elif action[0] == '$':
            self.accept = True  # success
        # reduce action reduct a production and push
        elif action[0] == 'r':
            # get the production in grammar
            number = int(action[1:])
            production = grammar[number]
            head, body = production
            # pop the states of production body
            for _ in body:
                self.state_stack.pop()
            # push the state of head GOTO(I,X)
            state = self.get_action(self.state_stack[-1], head)
            self.state_stack.append(int(state))
            # reduce actions does not consume a token,
            # only when shifting, a token was consume and passed
            self.ahead(token)
        else:
            # '.' (no action) and anything else is a parse error.
            raise SyntaxError(f"Not a correct token '{token.__str__()}'.")

    def parse(self, token_stream):
        """Consume the token iterator, then feed the '$' end marker."""
        while True:
            try:
                token = next(token_stream)
                self.ahead(token)
            except StopIteration:
                # patch "$" in the end of token stream
                # to match the augmented grammar
                # NOTE(review): Token is expected from the tokenizer
                # module — it is not defined in this file.
                self.ahead(Token("$", "$"))
                break
它接受一个词法单元流,并且分析,如果分析成功,accept就设置为True
from tokenizer import tokenizer
token_stream = tokenizer("if (C) S1 else S2")
sdt = SDT()
sdt.parse(token_stream)
sdt.accept
Out[8]: True
翻译方案
翻译方案一般插入到分析过程当中。
每个非终结符号都会形成一个函数,我们这里暂时在代码中预定义好非终结符号的翻译函数。
因为LR分析器是自底向上规约(从左到右扫描、最右推导的逆序),而在移入的时候并不判断目前在哪个产生式的内部,因此翻译方案用后缀翻译来实现,就是在规约的时候翻译。产生式头部的名称作为函数名,规约的内容作为参数来进行调用,向上返回函数的结果。
建立一个参数栈:
self.arg_stack = []
token在移入的时候作为值移入到栈中。
self.push_arg(token)
规约时,将值移出,作为规约函数的参数。返回的结果,就是非终结符号的值,移入到栈中。
# translations
args = []
for _ in body:
arg = self.arg_stack.pop()
args.insert(0, arg)
translation = globals().get(head).__call__(*args)
self.arg_stack.append(translation)
然而后缀翻译方案只适用于综合属性(S属性),对于继承属性并不适用。比如 stmt -> if (C) S1 else S2 大致会形成如下翻译方案:
C.code
S1.scode
goto stmt.next
label L1
S2.code
其中,stmt.next 由外部传入,是stmt作为产生式的体时的继承属性,LL分析器通过预测分析表已经获取了头部,所以可以预先分配一个值。这里由于分析器是规约方式的,因此尚不知道继承属性的值。一般采取用一个空产生式来替代翻译内容并先生成继承属性的方法来解决,不过会带来语法分析时的复杂性。
我们在这里采用延迟调用的方法,就是 stmt 规约完成后并不直接返回翻译的字符串值(因为还有一些属性不知道), 而是返回一个函数,通过将未知的内容包装成参数向上返回,在进行规约 start -> stmt 时, 再将start 生成的必要值作为参数来调用 stmt 规约的返回值,就可以获得正确的翻译方案了。
def stmt(IF, LPAR, c, RPAR, s1, ELSE, s2):
    """Translation action for stmt -> if ( C ) S1 else S2.

    Returns a deferred function: the inherited attribute next_label is
    not known at reduce time, so the caller supplies it later.
    """
    def call(next_label):
        L1 = get_label()  # label for the else branch
        C_code = c.code(f_cond=L1)
        S1_code = s1.code()
        S2_code = s2.code()
        inter_code = """
{}
{}
goto {}
label {}
{}""".format(C_code, S1_code, next_label, L1, S2_code)
        return inter_code
    return call
添加对结束状态的处理,和一些其他必要动作。这样,分析和翻译方案就变成了:
class SDT:
    """LR(1) parser driver with postfix (reduce-time) translation.

    Besides the state stack, keeps an argument stack: tokens are pushed
    on shift; on reduce, one value per body symbol is popped and passed
    to the global function named after the production head, whose result
    is pushed back.
    """

    def __init__(self):
        self.syntax_table = generate_syntax_table()
        self.state_stack = [0]
        self.arg_stack = []  # value stack for the translation scheme
        self.accept = False
        self.translation = ''

    def get_action(self, state, literal):
        """Look up the table action for (state, input symbol)."""
        return self.syntax_table[state][all_symbols.index(literal)]

    def ahead(self, token):
        """Apply one table action for *token* at the top-of-stack state."""
        action = self.get_action(self.state_stack[-1], token.typ)
        # shift action push a current state into state_stack
        if action[0] == 's':
            current_state = int(action[1:])
            self.state_stack.append(current_state)
            self.push_arg(token)
        elif action[0] == '$':
            # Accept: invoke the deferred translation of the start symbol.
            self.translation = startsup(self.arg_stack[-1])
            self.accept = True  # success
            print('SUCCESS')
            print(self.translation)
        # reduce action reduct a production and push
        elif action[0] == 'r':
            # get the production in grammar
            number = int(action[1:])
            production = grammar[number]
            head, body = production
            # pop the states of production body
            for _ in body:
                self.state_stack.pop()
            # push the state of head GOTO(I,X)
            state = self.get_action(self.state_stack[-1], head)
            self.state_stack.append(int(state))
            # translations: pop one value per body symbol (restored to
            # source order) and call the head's translation function,
            # looked up by name in the module globals.
            args = []
            for _ in body:
                arg = self.arg_stack.pop()
                args.insert(0, arg)
            translation = globals().get(head).__call__(*args)
            self.arg_stack.append(translation)
            # reduce actions does not consume a token,
            # only when shifting, a token was consume and passed
            self.ahead(token)
        else:
            # '.' (no action) and anything else is a parse error.
            raise SyntaxError(f"Not a correct token '{token.__str__()}'.")

    def parse(self, token_stream):
        """Consume the token iterator, then feed the '$' end marker."""
        while True:
            try:
                token = next(token_stream)
                self.ahead(token)
            except StopIteration:
                # patch "$" in the end of token stream
                # to match the augmented grammar
                # NOTE(review): Token is expected from the tokenizer
                # module — it is not defined in this file.
                self.ahead(Token("$", "$"))
                break

    def push_arg(self, token):
        """Shift-time value: attach code emitters to semantic tokens."""
        if token.typ == 'C':
            token.code = lambda f_cond: 'Ccode Cfalse = {}'.format(f_cond)
        elif token.typ == 'S1':
            token.code = lambda : 'S1code'
        elif token.typ == 'S2':
            token.code = lambda : 'S2code'
        self.arg_stack.append(token)
# Every label handed out so far; its length drives the next label name.
all_labels = []


def get_label():
    """Allocate the next sequential label name ('L0', 'L1', ...)."""
    label = "L{}".format(len(all_labels))
    all_labels.append(label)
    return label
def stmt(IF, LPAR, c, RPAR, s1, ELSE, s2):
    """Translation action for stmt -> if ( C ) S1 else S2.

    Returns a deferred emitter: the inherited attribute next_label is
    unknown at reduce time, so the caller supplies it later.
    """
    def call(next_label):
        false_label = get_label()
        cond_code = c.code(f_cond=false_label)
        then_code = s1.code()
        else_code = s2.code()
        return "\n{}\n{}\ngoto {}\nlabel {}\n{}".format(
            cond_code, then_code, next_label, false_label, else_code)
    return call
def start(stmt):
    """Translation action for start -> stmt: defer allocating the outer
    label and invoking the stmt emitter until acceptance."""
    def call():
        return stmt(get_label())
    return call
def startsup(f):
    """Translation action for startsup -> start: run the deferred
    start emitter and return the finished translation."""
    result = f()
    return result
运行一下,
from parser import SDT
from tokenizer import tokenizer
token_stream = tokenizer('if (C) S1 else S2')
sdt = SDT()
sdt.parse(token_stream)
成功翻译:
Ccode Cfalse = L1
S1code
goto L0
label L1
S2code
这是个简陋的过程,但是核心功能完整,我们可以在之后的过程中,逐步完善它。
通常,词法规则和语法规则是由单独的文件定义的。所以需要对词法规则和语法规则进行解析的构件,来完成从源文本到python对象的转换。翻译方案通常嵌入到语法规则中。
错误处理可以在适当的情况引入到编译过程当中。
另外,二义性文法,空产生式等情况的转换在语法添加的过程当中会浮现。 当然还有为语法规则添加基本的语句,使之逐渐成为一个完善的编译前端。
不论如何,我们已经完成了编译前端从源语言到目标语言的全部流程,是一个成功的开始。