一个模型Model拥有两个元类型:Term(终结符类型)和Uterm(非终结符类型),对应于文法中的概念 $G(N,T,P,S)$,如下:
// Grammar model corresponding to G(N, T, P, S):
//   sym_table ~ N ∪ T, prods ~ P, begin_symbol ~ S.
// It only loads a grammar description into memory; it does not build
// any concrete parsing structure itself.
template<typename term_t = int32_t, typename uterm_t = int32_t>
class Model {
	friend class Processor;
public:
	using string = std::string;
	using strvec = std::vector<string>;
	using symbol_t = Symbol<term_t, uterm_t>;
	using model = Model<term_t, uterm_t>;
	// N ∪ T: every grammar symbol, keyed by its textual name.
	std::map<string, symbol_t> sym_table;
	// P: the production rules.
	std::vector<Production<symbol_t>> prods;
	// S: the start symbol.
	symbol_t begin_symbol;
}; // FIX(review): added the missing ';' — a class definition must end with one.
其中sym_table对应 $N\cup T$,prods对应 $P$,begin_symbol对应 $S$。
这个模型类只负责将文件内的模型读入到内存中,不负责具体的语法构建。
递归调用分析写出的代码只针对具体一种语言,因此实用性不强。代码如下:
// Hand-written recursive-descent parser. From the parse functions below,
// the grammar it hard-codes is (review: inferred from the code):
//   E  -> T ED        ED -> ('+'|'-') T ED | eps
//   T  -> F TD        TD -> ('*'|'/') F TD | eps
//   F  -> '(' E ')' | num
// token_t: lexer token type; Source: token stream type; TokenTable names
// the terminal constants (add, sub, mul, div, lbr, rbr, num, eof).
template<typename token_t, class Source, class TokenTable>
class RecursiveAParser {
using istream = Source;
using result_t = Result<token_t, UTerm>;
using node_t = ASTNode<token_t, UTerm>;
istream &ref;      // token source
token_t token;     // one-token lookahead
result_t *result;  // result being built; ownership passes to the caller of parse()
public:
RecursiveAParser(istream &ref): ref(ref) {}
// Parse the whole stream. Returns a heap-allocated result the caller owns.
result_t* parse() {
result = new result_t();
result->code = ResultCode::Ok;
auto hdl = result;
read();
// The tree root is an E node; parseE fills it in.
parseE(result->rt = result->alloc(UTerm::E, true));
result = nullptr;
return hdl;
}
private:
// Advance the one-token lookahead.
void read() {
ref >> token;
}
// Record a syntax error; parsing continues after reporting.
void error() {
result->code = ResultCode::Error;
std::cout << "error " << token << std::endl;
}
// E -> T ED (the ED part only when input remains).
void parseE(node_t* &rt) {
parseT(rt->insert(result->alloc(UTerm::T, true)));
if (token != TokenTable::eof) {
parseED(rt->insert(result->alloc(UTerm::ED, true)));
}
}
// ED -> ('+'|'-') T ED | eps
void parseED(node_t* &rt) {
if (token == TokenTable::add || token == TokenTable::sub) {
rt->insert(result->alloc(token));
read();
parseT(rt->insert(result->alloc(UTerm::T, true)));
parseED(rt->insert(result->alloc(UTerm::ED, true)));
}
}
// T -> F TD (the TD part only when input remains).
void parseT(node_t* &rt) {
parseF(rt->insert(result->alloc(UTerm::F, true)));
if (token != TokenTable::eof) {
parseTD(rt->insert(result->alloc(UTerm::TD, true)));
}
}
// TD -> ('*'|'/') F TD | eps
void parseTD(node_t* &rt) {
if (token == TokenTable::mul || token == TokenTable::div) {
rt->insert(result->alloc(token));
read();
parseF(rt->insert(result->alloc(UTerm::F, true)));
parseTD(rt->insert(result->alloc(UTerm::TD, true)));
}
}
// F -> '(' E ')' | num; anything else is a syntax error.
void parseF(node_t* &rt) {
if (token == TokenTable::lbr) {
rt->insert(result->alloc(token));
read();
parseE(rt->insert(result->alloc(UTerm::E, true)));
if (token == TokenTable::rbr) {
rt->insert(result->alloc(token));
read();
} else {
error();
}
} else if (token == TokenTable::num) {
rt->insert(result->alloc(token));
read();
} else {
error();
}
}
};
istream
是lexer
的输出流,经过递归函数组织以后生成语法树。生成结果与树节点数据结构如下:
template<typename term_t, typename uterm_t>
struct Result {
using node_t = ASTNode<term_t, uterm_t>;
node_t *rt;
ResultCode code;
}
// One node of the syntax tree: the grammar symbol at this node plus its
// direct children in derivation order. Child pointers are non-owning here;
// nodes are allocated via Result::alloc (see the parser code).
template<typename term_t, typename uterm_t>
struct ASTNode {
using symbol_t = Symbol<term_t, uterm_t>;
symbol_t symbol;           // terminal or nonterminal
std::vector<ASTNode*> ch;  // direct children, left to right
};
( num + num / ( num + num * num ) )
{ code: Ok, node: {
{ut,E}, {{{ut,T}, {{{ut,F}, {
{{t,Lbr}, {}},
{{ut,E}, {{{ut,T}, {
{{ut,F}, {
{{t,Num}, {}}, }},
{{ut,TD}, {}}, }},
{{ut,ED}, {
{{t,Add}, {}},
{{ut,T}, {{{ut,F},
{{{t,Num}, {}}, }}, {{ut,TD},
{{{t,Div}, {}},{{ut,F}, {
{{t,Lbr}, {}},
{{ut,E}, {{{ut,T}, {{{ut,F}, {
{{t,Num}, {}}, }}, {{ut,TD}, {}}, }}, {{ut,ED}, {
{{t,Add}, {}}, {{ut,T}, {{{ut,F}, {{
{t,Num}, {}}, }}, {{ut,TD}, {
{{t,Mul}, {}}, {{ut,F}, {
{{t,Num}, {}}, }}, {{ut,TD}, {}}, }}, }},
{{ut,ED}, {}}, }}, }},
{{t,Rbr}, {}}, }}, {{ut,TD}, {}}, }}, }},
{{ut,ED}, {}}, }}, }},
{{t,Rbr}, {}}, }}, }}, }}}
每个结点都以{{t/ut, symbol}, {ch1, ch2, ...}}
表示,其中t表示它是个终结符结点,否则是个非终结符结点,symbol
是以字符串表示的符号,ch1
、ch2
…是它的第一个直接儿子、第二个直接儿子…。虽然因为并非抽取非终结符以后的抽象语法树,所以结果非常杂乱,但保存了完整的推导过程。
在介绍LL(1)分析和LR(1)之前,先给出几个计算函数,这是所有语法通用的算法函数。
求First集合的伪算法如下:
for symbol in Symbols do
let First[symbol] = { symbol } if symbol is term else {}
end
do
for symbol in uterm Symbols do
let F be First[symbol]
for production in Productions do
let production be A -> X1 X2 ... Xn where A = symbol
for i in 1 ... n do
F = F merge (First[Xi] - {epsilon})
if epsilon not in First[Xi] then
break
end
end
if epsilon in First[Xi] for all i in 1 ... n then
F = F merge {epsilon}
end
end
end
while any First[symbol] changed
实现代码如下:
// Fixed-point computation of the First sets for grammar g.
// Initializes First[t] = {t} for terminals and {} for nonterminals, then
// repeatedly grows each nonterminal's set from its productions until no
// set changes. grammar_traits::epsilon marks the empty symbol.
// NOTE(review): the std::set objects are new'd here and never freed — g.first
// keeps the raw pointers; calling this twice would leak the first generation.
template<class Grammar, class grammar_traits>
void calculate_first_fixed_point(Grammar &g) {
using symbol_t = typename Grammar::symbol_t;
for (auto &x : g.sym_table) {
auto &sym = x.second;
auto s = new std::set<symbol_t>();
if (!sym.is_unterm()) {
s->insert(sym);
}
g.first[sym] = s;
}
bool changed;
do {
changed = false;
for (auto &symset : g.first) {
if (!symset.first.is_unterm()) {
continue;
}
auto &sym = symset.first;
auto set = symset.second;
size_t ls = set->size();  // size before this round, to detect growth below
for (auto &prod : g.prods) {
if (prod.lhs == sym) {
// flag stays true while every rhs symbol seen so far derives epsilon.
bool flag = true;
for (auto &rsym : prod.rhs) {
auto &rset = *g.first[rsym];
// Merge First[rsym] but keep epsilon out unless the set already
// contained it; epsilon is only added via the flag path below.
if (rset.count(grammar_traits::epsilon) && !set->count(grammar_traits::epsilon)) {
set->insert(rset.begin(), rset.end());
set->erase(grammar_traits::epsilon);
} else {
set->insert(rset.begin(), rset.end());
}
// Stop at the first non-nullable rhs symbol.
if (!rset.count(grammar_traits::epsilon)) {
flag = false;
break;
}
}
// Whole rhs nullable => epsilon belongs to First[sym].
if (flag) {
set->insert(grammar_traits::epsilon);
}
}
}
if (set->size() != ls) {
changed = true;
}
}
} while (changed);
}
求Follow集合的伪算法如下:
for symbol in Symbols do
let Follow[symbol] = { $ } if symbol is begin symbol else {}
end
do
for production in Productions do
let production be A -> X1 X2 ... Xn
for i in 1 ... n do
Follow[Xi] = Follow[Xi] merge (First[Xi+1 ... Xn] - {epsilon})
end
let production be any A -> X B Y
if epsilon in First[Y] then
Follow[B] = Follow[B] merge Follow[A]
end
end
while any Follow[symbol] changed
实现代码如下:
// Fixed-point computation of the Follow sets for grammar g.
// Requires g.first to be populated first (calculate_first_fixed_point).
// Each production is walked right-to-left; mset always holds what follows
// the current position: initially Follow(lhs) (valid while the remaining
// suffix is nullable), then First of the suffix already visited.
// NOTE(review): same ownership caveat as the First computation — the sets
// are new'd and handed to g.follow as raw pointers.
template<class Grammar, class grammar_traits>
void calculate_follow_fixed_point(Grammar &g) {
using symbol_t = typename Grammar::symbol_t;
auto &beg = g.begin_symbol;
// Initialization: Follow[S] = {$}, everything else empty.
for (auto &x : g.sym_table) {
auto &sym = x.second;
auto s = new std::set<symbol_t>();
if (sym == beg) {
s->insert(grammar_traits::dollar);
}
g.follow[sym] = s;
}
bool changed;
do {
changed = false;
for (auto &prod : g.prods) {
auto &lhs = prod.lhs;
auto &rhs = prod.rhs;
std::set<symbol_t> mset(*g.follow[lhs]);
// Signed index so the loop terminates when i drops below zero
// (rhs.size() - 1 on an empty rhs wraps, then converts to -1).
for (typename std::make_signed<size_t>::type
i = rhs.size() - 1; i >= 0; i--) {
auto &rsym = rhs[i];
if (rsym.is_unterm()) {
auto sz = g.follow[rsym]->size();
g.follow[rsym]->insert(mset.begin(), mset.end());
// Growth detection: insert into a set can only keep or raise size.
if (g.follow[rsym]->size() - sz) {
changed = true;
}
}
// Once rsym is not nullable, Follow(lhs)/older suffix no longer
// shines through; restart mset from First[rsym].
if (!g.first[rsym]->count(grammar_traits::epsilon)) {
mset.clear();
}
mset.insert(g.first[rsym]->begin(), g.first[rsym]->end());
mset.erase(grammar_traits::epsilon);
}
}
} while (changed);
}
LL(1)语法的结构体如下:
// LL(1) grammar: the model's data plus First/Follow sets and the
// prediction table. Policy (default BasicLLGrammar) supplies the runtime
// parse driver that consumes `table`.
template<class grammar_traits, class Policy=BasicLLGrammar<grammar_traits>>
class LL1Grammar : public Policy {
public:
using model_t = typename grammar_traits::model_t;
using string = typename grammar_traits::string;
using strvec = typename grammar_traits::strvec;
using symbol_t = typename grammar_traits::symbol_t;
using production_t = typename grammar_traits::production_t;
using grammar_t = LL1Grammar<grammar_traits>;
private:
// (article elision below: the sym_table / prods / begin_symbol members of Model)
... the same as model
std::map<symbol_t, std::set<symbol_t>* > first;   // First sets (raw owned pointers)
std::map<symbol_t, std::set<symbol_t>* > follow;  // Follow sets (raw owned pointers)
using action_map = std::map<symbol_t, action_space::action*>;
// Prediction table: nonterminal -> (lookahead -> action).
std::map<symbol_t, action_map*> table;
};
LL(1)语法构造的伪算法如下:
for production in Productions do
let production be A -> B
for x in First[B] do
Action[A, x] = use production A -> B
end
if epsilon in First[B] do
for x in Follow[A] do
Action[A, x] = use production A -> B
end
end
end
abort if any Action[A, x] have conflict items
对应C++算法如下:
// Build the LL(1) prediction table: for each production A -> B and every
// lookahead x in First(B) (extended by Follow(A) when B derives epsilon),
// Action[A, x] uses that production. A duplicate entry means the grammar
// is not LL(1), which is reported by throwing.
for (auto &c : sym_table) {
	table[c.second] = new action_map();
}
std::set<symbol_t> mset;
for (auto &prod : prods) {
	// NOTE(review): assumes get_first_follow1 resets mset before filling it
	// with prod's prediction set — confirm, otherwise lookaheads leak
	// across productions.
	get_first_follow1<grammar_t, grammar_traits>(*this, prod, mset);
	auto &acmp = *table[prod.lhs];
	for (auto &sym : mset) {
		if (acmp.count(sym)) {
			// FIX(review): std::stringstream s("conflict ") does not seek to the
			// end (no ios::ate), so the first print overwrote the "conflict "
			// prefix; stream the prefix instead so the message is intact.
			std::stringstream s;
			s << "conflict ";
			print::print("prod:", false, s);
			print::print(prod, false, s);
			print::print("mset:", false, s);
			print::print(mset, false, s);
			print::print("symbol:", false, s);
			print::print(sym, false, s);
			throw std::logic_error(s.str());
		}
		action_space::action *a =
			new action_space::replace_action1<symbol_t>(prod.lhs, prod.rhs);
		acmp[sym] = a;
	}
}
注意BasicLLGrammar
是默认的LL语法分析器,其对应语法推导过程,伪算法如下:
let stack = [ <begin symbol, AST root> ]
for symbol flow from LexerResult do
if Top[stack].Symbol is term then
if Top[stack] not equal to symbol then
error()
end
else
Pop(stack)
consume this symbol
end
end
else
let action be Action[Top[stack].Symbol, symbol]
if action exists then
let action be A -> B
let AstTree be Top[stack].AstRoot
alloc node ch1, ch2, ... for B
let ch1, ch2, ... be AstTree's children
Pop(stack)
Push(stack, reverse(B))
remain this symbol in flow
else
error()
end
end
end
output result
对应C++代码如下:
// LL(1) table-driven parse loop (BasicLLGrammar driver).
// The stack holds tree nodes; a nonterminal on top is expanded through the
// prediction table, a terminal on top is matched against the input symbol.
// Returns a heap-allocated result the caller owns.
template<class IStream>
result_t *work(IStream &is) {
	reset();
	auto result = new result_t();
	auto rt = result->alloc(begin_symbol);
	result->rt = rt;
	stack.push(rt);
	read(is, next_symbol);
	while (stack.size()) {
		if (next_symbol == grammar_traits::eof) {
			// FIX(review): mirror the LR driver — surface accumulated errors in
			// the result code; previously this path returned the default code
			// even after recovery steps.
			if (error_count != 0) {
				result->code = ResultCode::Error;
			}
			// NOTE(review): the stack may still hold unexpanded symbols here;
			// that is not flagged as an error — confirm this is intended.
			return result;
		}
		auto state = stack.top();
		if (state->symbol.is_unterm()) {
			auto &acmp = *(*table)[state->symbol];
			if (acmp.count(next_symbol)) {
				auto d0 = dynamic_cast<
					action_space::replace_action1<symbol_t>*>(acmp[next_symbol]);
				if (d0 != nullptr) {
					auto &prod = *d0;
					if (state->symbol != prod.reduce) {
						// Table entry does not expand this nonterminal: panic-mode
						// recovery — pop on a follow symbol, otherwise skip input.
						if (follow[state->symbol]->count(next_symbol)) {
							stack.pop();
						} else {
							read(is, next_symbol);
						}
						error_count++;
					} else {
						// Expand: pop the nonterminal, attach rhs nodes as children
						// in order, push them reversed so the leftmost is on top.
						stack.pop();
						for (auto &sym : prod.produce) {
							state->insert(result->alloc(sym));
						}
						for (auto iter = state->ch.rbegin(); iter != state->ch.rend();
							iter++) {
							stack.push(*iter);
						}
					}
				} else {
					auto d1 = dynamic_cast<action_space::error_action*>(acmp[next_symbol]);
					if (d1 != nullptr) {
						// Explicit error entry: count it and keep going.
						// (review: removed the unused `auto &info = *d1;` local)
						error_count++;
					} else {
						auto d2 = dynamic_cast<
							action_space::synch_action*>(acmp[next_symbol]);
						if (d2 != nullptr) {
							// Synchronization entry: drop the nonterminal.
							stack.pop();
							error_count++;
						}
					}
				}
			} else {
				// No table entry for this lookahead: skip the offending symbol.
				read(is, next_symbol);
				error_count++;
			}
		} else {
			// Terminal on top: match (pop) or report, then always consume input.
			if (next_symbol != state->symbol) {
				error_count++;
			} else {
				stack.pop();
			}
			read(is, next_symbol);
		}
	}
	// FIX(review): also reflect errors on the stack-exhausted exit path.
	if (error_count != 0) {
		result->code = ResultCode::Error;
	}
	return result;
}
( num + num / ( num + num * num ) )
{ code: Ok, node: {{ut,S}, {
{{ut,E}, {
{{t,Lbr}, {}}, {{ut,E}, {
{{t,Num}, {}}, {{ut,ED}, {
{{t,Add}, {}}, {{ut,T}, {
{{t,Num}, {}}, {{ut,TD}, {
{{t,Div}, {}}, {{ut,F}, {
{{t,Lbr}, {}}, {{ut,E}, {
{{t,Num}, {}}, {{ut,ED}, {
{{t,Add}, {}}, {{ut,T}, {
{{t,Num}, {}}, {{ut,TD}, {
{{t,Mul}, {}}, {{ut,F}, {
{{t,Num}, {}}, }}, {{ut,TD}, {}}, }}, }}, {{ut,ED}, {}}, }}, }},
{{t,Rbr}, {}}, }}, {{ut,TD}, {}}, }}, }}, {{ut,ED}, {}}, }}, }},
{{t,Rbr}, {}}, {{ut,ED}, {}}, }}, }}}
可见结果相比递归调用分析的要简化很多,减少了很多不必要的推导过程。
LR(1)语法的结构体如下:
// LR(1) grammar: borrows the model's symbol table and productions by
// reference, holds the First sets and the action/goto table keyed by
// automaton state. Policy (default BasicLRGrammar) supplies the runtime
// shift-reduce driver.
template<class grammar_traits, class Policy=BasicLRGrammar<grammar_traits> >
class LR1Grammar : public Policy {
public:
	using model_t = typename grammar_traits::model_t;
	using string = typename grammar_traits::string;
	using strvec = typename grammar_traits::strvec;
	using symbol_t = typename grammar_traits::symbol_t;
	using production_t = typename grammar_traits::production_t;
	using state_id_t = typename grammar_traits::state_id_t;
	using grammar_t = LR1Grammar<grammar_traits>;
private:
	std::map<string, symbol_t> &sym_table;  // N ∪ T, shared with the model
	std::vector<production_t> &prods;       // P, shared with the model
	symbol_t begin_symbol;                  // S
	std::map<symbol_t, std::set<symbol_t>* > first;  // First sets (raw owned pointers)
	using action_map = std::map<symbol_t, action_space::action*>;
	std::map<state_id_t, action_map*> table;  // state id -> (symbol -> action)
}; // FIX(review): added the missing ';' — required after a class definition.
伪算法如下:
do
if Items is empty then
Items = { <S' -> · S, Dollar> }
end
for I in Items do
let J be move(I, x)
do
for {A -> B · C X, L} in J do
for b in First [X L] do
for production in Productions do
let production be U -> V where U = C
J = J merge { <C -> · V, b> }
end
end
end
while J changed
if J in Items then
Let K be Item in Items where K = J
let go(I, x) = K
end
else
Items = Items merge J
let go(I, x) = J
end
end
while Items extended
计算函数的入口函数如下:
template<typename grammar_traits>
void calculate_LR_1_items(
std::vector<typename grammar_traits::production_t> &prods,
std::map<typename grammar_traits::symbol_t,
std::set<typename grammar_traits::symbol_t>* > &first,
typename grammar_traits::symbol_t &begin_symbol,
const std::function<void(LR1ActionCalculationContext<grammar_traits>&)> &callback) {
LR1ActionCalculationContext<grammar_traits>(prods, first, begin_symbol).build().
callback(callback);
}
具体的计算函数太过复杂就不贴出来了,但我们也把它们的函数签名贴出来并做分析:
// Working state for the LR(1) item-set (canonical collection) construction.
// An item is ((production index, cursor offset), lookahead symbol); a state
// is a set of items. States are deduplicated through a hash (seed/seed2/mod
// presumably parameterize calculate_hash) and connected in a graph whose
// edge weight is the transition symbol.
template<class grammar_traits>
struct LR1ActionCalculationContext {
using context_t = LR1ActionCalculationContext<grammar_traits>;
using symbol_t = typename grammar_traits::symbol_t;
using state_id_t = typename grammar_traits::state_id_t;
using production_t = typename grammar_traits::production_t;
using action_map = std::map<symbol_t, action_space::action*>;
// ((production index, cursor offset), lookahead symbol)
using item_t = std::pair<std::pair<int, int>, symbol_t>;
using hashed_item_t = int64_t;
using state_set = std::set<item_t>;
std::vector<item_t> items;
std::vector<state_set*> state;                 // state id -> its item set
std::map<hashed_item_t, state_id_t> hash_set;  // dedup: state hash -> state id
// LR automaton: nodes are states, edges carry the transition symbol.
graph::WeightedGraph<state_id_t, symbol_t, 500, 500 * 10> automa;
int64_t seed, seed2;
const int64_t mod = 1000000000 + 9;
std::vector<production_t> &prods;
std::map<symbol_t, std::set<symbol_t>* > first;
symbol_t &begin_symbol;
// Seeds state 0 with the start item and drives the construction; returns *this.
context_t &build() {...}
// Invokes cb with the finished context; returns *this.
context_t &callback(const std::function<void(context_t &)> &cb) {...}
private:
// Moves the cursor over each symbol of state idx, visiting new states.
void walk(state_id_t idx) {...}
// Closure of state idx; returns the existing id if the set was seen before.
state_id_t extend(state_id_t idx) {...}
// Adds items for each production of next_sym under the given lookaheads.
void extend_to(state_set &mset, std::set<symbol_t> lookahead,
const symbol_t &next_sym) {...}
hashed_item_t calculate_hash(state_set &mset) {...}
};
build
函数内先将 $S' \to \cdot S,\ \$$ 初始化为第0个状态。然后调用`extend`函数扩展状态,再调用`walk`函数拓展后继状态。`walk`函数对于每个未发现的项目集移动游标,对每个新的项目集调用`extend`函数再调用自身。`extend`函数对于每个可推导的非终结符都用`extend_to`和计算出的 $\mathrm{First}(\beta a)$ 展望符集合拓展,如果之前出现过该项目集,则删除新拓展的项目集并返回旧项目集编号,否则保留新项目集。
伪算法如下:
for I in LR(1) Items do
let <A -> B · a C, b> be Item in I where a is term
and let J = go(I, a)
then Action[I, a] = Shift a and Goto J
let <A -> B · U C, b> be Item in I where U is uterm
and let J = go(I, U)
then Action[I, U] = Goto J
let <A -> B ·, b> be Item in I where b is not Dollar
then Action[I, b] = Reduce B to A
let <S' -> S ·, Dollar> be Item in I where S is begin symbol
then Action[I, Dollar] = Accept
end
c++代码如下:
// Convert the LR(1) automaton into the action table:
//   edge on nonterminal -> goto, edge on terminal -> shift,
//   completed item -> reduce (accept for the start production's item).
calculate_LR_1_items<grammar_traits>(prods, first, begin_symbol, [&](
	context_t &context) {
	// FIX(review): size_t loop index avoids the signed/unsigned comparison
	// against context.state.size().
	for (size_t i = 0; i < context.state.size(); i++) {
		auto acmp = new action_map();
		table[i] = acmp;
#ifdef DEBUG
		// Hoisted: identical conflict diagnostics were duplicated verbatim in
		// both branches below and in the reduce loop.
		auto report_conflict = [&](const symbol_t &s) {
			if (acmp->count(s)) {
				std::cout << "conflict item"; print::print(s, true);
			}
		};
#endif
		for (auto j : context.automa.at_e(i)) {
			auto &sym = j.w;
#ifdef DEBUG
			report_conflict(sym);
#endif
			if (sym.is_unterm()) {
				(*acmp)[sym] = new action_space::goto_action<state_id_t>(j.to);
			} else {
				(*acmp)[sym] = new action_space::shift_action<state_id_t>(j.to);
			}
		}
		for (auto &item : *context.state[i]) {
			// Cursor at the end of the rhs => the item is complete.
			// FIX(review): explicit cast silences the int/size_t comparison.
			if (static_cast<size_t>(item.first.second) ==
					prods[item.first.first].rhs.size()) {
#ifdef DEBUG
				report_conflict(item.second);
#endif
				if (prods[item.first.first].lhs == begin_symbol) {
					(*acmp)[item.second] = new action_space::accept_action();
				} else {
					(*acmp)[item.second] =
						new action_space::replace_action1<symbol_t>(
							prods[item.first.first].lhs, prods[item.first.first].rhs);
				}
			}
		}
	}
});
注意BasicLRGrammar
是默认的LR分析器,其对应的语法推导过程伪算法如下:
let stack = [ <root, state 0> ]
for symbol flow from LexerResult do
let S, I, R be extracted from Top[stack]
let action = Action[I, symbol]
if action is in form of Shift a and Goto J
Push(stack, <node of symbol, J>)
end
else if action is in form of Reduce B to A
let chs = []
for Top[stack] matching B do
let T, J, ch be extracted from Top[stack]
chs = [ch, chs...]
end
let T, J, Rt be extracted from Top[stack]
Push(stack, <node of A with children chs, Goto[J, A]>)
end
else if action is in form of Accept
return Bottom[stack].AstRoot
end
end
else
error()
end
end
对应的c++代码如下:
// LR(1) shift-reduce parse driver (BasicLRGrammar).
// The stack holds (tree node, automaton state id) pairs; actions come from
// the state's action_map. Returns a heap-allocated result the caller owns.
template<class IStream>
result_t *work(IStream &is) {
reset();
auto result = new result_t();
auto rt = result->alloc(begin_symbol);
result->rt = rt;
stack.push(std::make_pair(rt, 0));  // state 0 paired with the preallocated root
read(is, next_symbol);
while (stack.size()) {
// print::print(stack.top(), true);
auto acmp = (*table)[stack.top().second];
if (!acmp->count(next_symbol)) {
// No action for this lookahead: count the error and skip the symbol.
// NOTE(review): if read() keeps yielding eof here this loops forever —
// confirm the stream's behavior at end of input.
error_count++;
read(is, next_symbol);
continue;
}
auto norm_action = (*acmp)[next_symbol];
if (auto action = dynamic_cast<action_space::shift_action<state_id_t>*>(
norm_action)) {
// Shift: consume the terminal and enter the target state.
stack.push({result->alloc(next_symbol), action->to_state});
read(is, next_symbol);
} else if (auto action = dynamic_cast<action_space::replace_action1<symbol_t>*>(
norm_action)) {
// Reduce A -> X1..Xn: pop n nodes (none for an epsilon production,
// since the signed index starts at -1), attach them as children of a
// new A node, then push the goto target of the uncovered state.
auto mrt = result->alloc(action->reduce);
for (typename std::make_signed<size_t>::type i = action->produce.size() - 1;
i >= 0; i--) {
mrt->ch.push_back(stack.top().first);
stack.pop();
}
std::reverse(mrt->ch.begin(), mrt->ch.end());  // popped right-to-left
auto acmp2 = (*table)[stack.top().second];
stack.push({mrt, (dynamic_cast<action_space::goto_action<state_id_t>*>(
(*acmp2)[action->reduce]))->to_state});
} else if (auto action = dynamic_cast<action_space::accept_action*>(norm_action)) {
// Accept: hang the finished tree under the preallocated root, report
// whether any recovery happened, and finish.
result->rt->insert(stack.top().first);
// std::cout << "stack size " << stack.size() << std::endl;
if (error_count != 0) {
result->code = ResultCode::Error;
}
return result;
}
}
if (error_count != 0) {
result->code = ResultCode::Error;
}
return result;
}
测试结果如下:
( num + num / ( num + num * num ) )
{ code: Ok, node: {{ut,S}, {{{ut,E}, {{{ut,T}, {{
{ut,F}, {
{{t,Lbr}, {}}, {{ut,E}, {{{ut,E}, {{{ut,T}, {{{ut,F}, {
{{t,Num}, {}}, }}, }}, }},
{{t,Add}, {}}, {{ut,T}, {{{ut,T}, {{{ut,F}, {
{{t,Num}, {}}, }}, }},
{{t,Div}, {}}, {{ut,F}, {
{{t,Lbr}, {}}, {{ut,E}, {{{ut,E}, {{{ut,T}, {{{ut,F}, {
{{t,Num}, {}}, }}, }}, }},
{{t,Add}, {}}, {{ut,T}, {{{ut,T}, {{{ut,F}, {
{{t,Num}, {}}, }}, }},
{{t,Mul}, {}}, {{ut,F}, {
{{t,Num}, {}}, }}, }}, }},
{{t,Rbr}, {}}, }}, }}, }},
{{t,Rbr}, {}}, }}, }}, }}, }}}