上文介绍了rds_la.proto生成的头文件rds_la_pb.hrl和源文件rds_la_pb.erl的内容,这里将继续分析生成它们的原理。
首先看看protoc-erl的内容:
#!/usr/bin/env escript
%% -*- erlang -*-
%%! -sasl errlog_type error -boot start_sasl -noshell
%% escript entry point.
%% With exactly one argument, treats it as the .proto file and hands it to
%% protobuffs_compile:generate_source/1. With any other argument list, prints
%% a usage line (using this script's own base name) and halts with status 1.
main([ProtoFile]) ->
    protobuffs_compile:generate_source(ProtoFile);
main(_Args) ->
    ScriptName = filename:basename(escript:script_name()),
    io:format("usage: ~s <protofile>~n", [ScriptName]),
    halt(1).
这是一个escript,调用核心函数protobuffs_compile:generate_source/1生成文件。
protobuffs_compile.erl
%% Generates the Erlang source for a .proto file.
%%   ProtoFile - path of the .proto file (must be a list/string).
%%   Options   - proplist; `imports_dir` may hold extra import search dirs.
%% Steps: derive the output base name ("<name>_pb"), parse the file, expand
%% its imports, collect messages/enums, resolve types, then emit the source.
%% Returns whatever output_source/4 returns.
generate_source(ProtoFile, Options) when is_list(ProtoFile) ->
    OutputBase = filename:basename(ProtoFile, ".proto") ++ "_pb",
    {ok, Ast} = parse(ProtoFile),
    %% "./" and "src/" are always searched before any user-supplied dirs.
    ExtraDirs = proplists:get_value(imports_dir, Options, []),
    SearchPath = ["./", "src/" | ExtraDirs],
    FullAst = parse_imports(Ast, SearchPath),
    Collected = collect_full_messages(FullAst),
    Enums = Collected#collected.enum,
    Resolved = resolve_types(Collected#collected.msg, Enums),
    output_source(OutputBase, Resolved, Enums, Options).
generate_source首先计算生成文件的basename,即在proto文件的basename后加上"_pb",然后开始一个parse过程:
%% Scans and parses one .proto file.
%%   FileName - path of the file to read.
%% Opens the file, tokenizes it with loop/2 and passes the token list to
%% protobuffs_parser:parse/1, whose result is returned unchanged.
%% Crashes with badmatch if the file cannot be opened; loop/2 exits with
%% scanning_error on a lexical error.
parse(FileName) ->
    {ok, InFile} = file:open(FileName, [read]),
    Tokens =
        try
            loop(InFile, [])
        after
            %% Close the handle even when loop/2 exits, so a scanning error
            %% does not leak the open file.
            file:close(InFile)
        end,
    protobuffs_parser:parse(Tokens).
%% Reads every token from InFile and returns Acc followed by those tokens,
%% in source order.
%%   InFile - an open I/O device.
%%   Acc    - tokens already collected (normally []).
%% Exits with scanning_error when the I/O request reports {error, token}.
loop(InFile, Acc) ->
    %% Keep the accumulator reversed internally so each token is prepended in
    %% O(1); appending with `Acc ++ [Token]` is quadratic in the token count.
    loop_reversed(InFile, lists:reverse(Acc)).

%% Worker for loop/2: RevAcc holds the tokens seen so far, newest first.
loop_reversed(InFile, RevAcc) ->
    %% get_until lets the I/O server feed characters to
    %% protobuffs_scanner:token until one token is complete; 1 is the extra
    %% argument passed to the scanner.
    case io:request(InFile, {get_until, prompt, protobuffs_scanner, token, [1]}) of
        {ok, Token, _EndLine} ->
            loop_reversed(InFile, [Token | RevAcc]);
        {error, token} ->
            exit(scanning_error);
        {eof, _} ->
            lists:reverse(RevAcc)
    end.
可以看出,这里利用io:request读取proto文件的内容:读取时由I/O服务器回调protobuffs_scanner:token/3(参数依次为续体、读到的字符以及请求中给出的起始行号1)进行词法分析,收集到全部token后,交由protobuffs_parser:parse/1进行语法分析。
protobuffs_scanner与protobuffs_parser分别由leex和yecc生成,因此可以直接分析其对应的protobuffs_scanner.xrl和protobuffs_parser.yrl文件。
protobuffs_scanner.xrl
Definitions.

% L: characters allowed in an identifier (letters, underscore and dot).
L = [A-Za-z_\.]
% D: a decimal digit.
D = [0-9]
% F: float literal with optional sign, mandatory fraction, optional exponent.
F = (\+|-)?[0-9]+\.[0-9]+((E|e)(\+|-)?[0-9]+)?
% HEX: hexadecimal literal. The digit class includes A-F/a-f; with [0-9] only,
% an input such as 0x1F was split into the integer 0x1 and the identifier F.
% NOTE(review): hex_to_int/1 lives in the elided "Erlang code." section;
% confirm it accepts the letters A-F.
HEX = 0x[0-9A-Fa-f]+
% WS: control characters and space, plus "%" up to the end of the line.
WS = ([\000-\s]|%.*)
% S: single-character punctuation; each becomes its own token.
S = [\(\)\]\[\{\};=]

Rules.

% Identifier: starts with L, continues with L or D; emitted as an atom.
{L}({L}|{D})* : {token, {var, TokenLine, list_to_atom(TokenChars)}}.
% Single-quoted string; strip/2 is applied to the quoted text.
'({L}|{D})+' : S = strip(TokenChars, TokenLen),
               {token, {string, TokenLine, S}}.
% Double-quoted string; may additionally contain "/".
"({L}|{D}|/)+" : S = strip(TokenChars, TokenLen),
                 {token, {string, TokenLine, S}}.
% Punctuation: the token category is the character itself, as an atom.
{S} : {token, {list_to_atom(TokenChars), TokenLine}}.
% Whitespace, line comments and block comments produce no token.
{WS}+ : skip_token.
//.* : skip_token.
/\*([^\*]|\*[^/])*\*/ : skip_token.
% Numeric literals.
{D}+ : {token, {integer, TokenLine, list_to_integer(TokenChars)}}.
{F} : {token, {float, TokenLine, list_to_float(TokenChars)}}.
{HEX} : {token, {integer, TokenLine, hex_to_int(TokenChars)}}.
...
可以看出://开头的内容和/* */内部的内容是注释;()[]{};=这些符号各自作为以自身为名的原子token;'...'和"..."都将作为字符串;以[A-Za-z_\.]开头、后续由[A-Za-z_\.]或[0-9]组成的字符序列作为标识符。其余的读者可自行分析。值得注意的是,标识符都将被转换为atom。
接下来看protobuffs_parser.yrl中的语法定义:
%% Grammar of the .proto parser. No keyword is a terminal: every identifier
%% arrives as a `var` token and is kept as an atom in the resulting tree.
Rootsymbol g_protobuffs.
Endsymbol '$end'.
%% A file is a possibly empty list of headers and messages, in source order.
g_protobuffs -> '$empty' : [].
g_protobuffs -> g_header g_protobuffs : ['$1'|'$2'].
g_protobuffs -> g_message g_protobuffs : ['$1'|'$2'].
%% Header statements:
%%   <ident> <string> ;          -> {Ident, String}
%%   <ident> <ident> ;           -> {Ident, SafeString}
%%   <ident> <ident> = <value> ; -> {Ident, Ident2, Value}
g_header -> g_var string ';' : {'$1', unwrap('$2')}.
g_header -> g_var g_var ';' : {'$1', safe_string('$2')}.
g_header -> g_var g_var '=' g_value ';' : {'$1', '$2', '$4'}.
%% Braced definition: <ident> <name> { ... } -> {Ident, SafeName, Body},
%% where Body is a list of elements or a list of rpcs.
g_message -> g_var g_var '{' g_elements '}' : {'$1', safe_string('$2'), '$4'}.
g_message -> g_var g_var '{' g_rpcs '}' : {'$1', safe_string('$2'), '$4'}.
%% One or more rpc declarations.
g_rpcs -> g_rpc : ['$1'].
g_rpcs -> g_rpc g_rpcs : ['$1' | '$2'].
%% <ident> <name> ( <type> ) <ident> ( <type> ) ;
%%   -> {Ident, SafeName, SafeInputType, SafeOutputType}; the sixth symbol
%%   is not part of the result.
g_rpc -> g_var g_var '(' g_var ')' g_var '(' g_var ')' ';' : {'$1', safe_string('$2'), safe_string('$4'),
safe_string('$8')}.
%% One or more elements inside a braced body.
g_elements -> g_element : ['$1'].
g_elements -> g_element g_elements : ['$1' | '$2'].
%% Field: <rule> <type> <name> = <id> [option] ;
%%   -> {Id, Rule (or repeated_packed), SafeType, SafeName, Default}
g_element -> g_var g_var g_var '=' integer g_default ';' : {unwrap('$5'),
pack_repeated('$1','$6'),
safe_string('$2'),
safe_string('$3'),
default('$6')}.
%% <ident> = <integer> ; -> {Ident, Integer}
%% (presumably an enum value -- confirm against the semantic pass).
g_element -> g_var '=' integer ';' : {'$1', unwrap('$3')}.
%% <ident> <integer> <ident> <integer|ident> ; -> {Ident, Int, IntOrIdent};
%% the third symbol is dropped
%% (presumably an `extensions N to M;` range -- confirm).
g_element -> g_var integer g_var integer ';' : {'$1', unwrap('$2'), unwrap('$4')}.
g_element -> g_var integer g_var g_var ';' : {'$1', unwrap('$2'), '$4'}.
%% <ident> <ident> = <value> ; inside a body -> {Ident, Ident2, Value}
g_element -> g_var g_var '=' g_value ';' : {'$1', '$2', '$4'}.
%% Nested braced definition.
g_element -> g_message : '$1'.
%% Identifier: the atom carried by a `var` token.
g_var -> var : unwrap('$1').
%% A value is an identifier or an integer/string/float literal.
g_value -> g_var : '$1'.
g_value -> integer : unwrap('$1').
g_value -> string : unwrap('$1').
g_value -> float : unwrap('$1').
%% Optional field option: [ <ident> = <value> ] -> {Ident, Value};
%% `none` when absent.
g_default -> '$empty' : none.
g_default -> '[' g_var '=' g_value ']' : {'$2', '$4'}.
%% Converts an identifier atom to a string, renaming it when it collides
%% with an Erlang reserved word (see make_safe/1).
safe_string(A) ->
    make_safe(atom_to_list(A)).

%% Words that must not be used verbatim as generated identifiers.
reserved_words() ->
    ["after", "and", "andalso", "band", "begin", "bnot", "bor", "bsl",
     "bsr", "bxor", "case", "catch", "cond", "div", "end", "fun",
     "if", "let", "not", "of", "or", "orelse", "query", "receive",
     "rem", "try", "when", "xor"].

%% Returns String unchanged, or prefixed with "pb_" when it is one of
%% reserved_words/0.
make_safe(String) ->
    case lists:member(String, reserved_words()) of
        true -> "pb_" ++ String;
        false -> String
    end.
%% Extracts the payload of a scanner token:
%%   {Category, Line, Value} -> Value
%%   {Symbol, Line}          -> Symbol
unwrap({_Category, _Line, Value}) ->
    Value;
unwrap({Symbol, _Line}) ->
    Symbol.
%% Returns the default value carried by a field option of the form
%% {default, Value}; any other option (including `none`) yields `none`.
default(Option) ->
    case Option of
        {default, Value} -> Value;
        _Other -> none
    end.
%% Maps the field rule `repeated` to `repeated_packed` when the field option
%% is exactly {packed, true}; every other rule/option pair keeps its rule.
pack_repeated(Rule, Option) ->
    case {Rule, Option} of
        {repeated, {packed, true}} -> repeated_packed;
        _Other -> Rule
    end.
protobuffs_parser的语法定义与通常的yacc/yecc定义有点不同:它没有把任何关键字定义为终结符,仅仅通过()[]{}=;这些符号来区分不同的语法结构;诸如message等protocol buffers的保留字一概没有出现在语法中,而是和普通标识符一样被转换成atom,留给后续的语义分析过程处理。
今天的主角是message,因此着重观察与它相关的语法,类似于
required string name = 1;
optional int32 id = 2;
repeated int64 ts = 3;
等的语法,都将被
%% Field rule: <rule> <type> <name> = <id> [option] ;
%%   -> {Id, Rule (or repeated_packed), SafeType, SafeName, Default}
g_element -> g_var g_var g_var '=' integer g_default ';' : {unwrap('$5'),
pack_repeated('$1','$6'),
safe_string('$2'),
safe_string('$3'),
default('$6')}.
匹配,生成一条记录FieldRecord = {FieldID,required/optional/repeated/repeated_packed,FieldType,FieldName,DefaultValue},
并最终归并到message体中,生成message的原始语法树:
[Message1 = {message,MessageName,[FieldRecord1,FieldRecord2,...,FieldRecordn]},Message2,...,Messagen]。
protobuffs_compile.erl。
此时需要注意两点:
1. 任何proto文件中的保留字,如message、import、package等,都已经被转换为atom了,将来可以直接使用;
2. 如果用户定义的标识符"Identifier"与erlang的保留字冲突,则将被替换为"pb_Identifier"。例如la_record的query域便与erlang的query保留字冲突,因此被替换成了pb_query。
%% Generates the Erlang source for a .proto file.
%%   ProtoFile - path of the .proto file (must be a list/string).
%%   Options   - proplist; `imports_dir` may hold extra import search dirs.
%% Steps: derive the output base name ("<name>_pb"), parse the file, expand
%% its imports, collect messages/enums, resolve types, then emit the source.
%% Returns whatever output_source/4 returns.
generate_source(ProtoFile, Options) when is_list(ProtoFile) ->
    OutputBase = filename:basename(ProtoFile, ".proto") ++ "_pb",
    {ok, Ast} = parse(ProtoFile),
    %% "./" and "src/" are always searched before any user-supplied dirs.
    ExtraDirs = proplists:get_value(imports_dir, Options, []),
    SearchPath = ["./", "src/" | ExtraDirs],
    FullAst = parse_imports(Ast, SearchPath),
    Collected = collect_full_messages(FullAst),
    Enums = Collected#collected.enum,
    Resolved = resolve_types(Collected#collected.msg, Enums),
    output_source(OutputBase, Resolved, Enums, Options).
%% Expands every {import, File} entry of a parsed .proto tree.
%%   Parsed - list of top-level items produced by parse/1.
%%   Path   - list of directories searched for imported files.
%% Returns the items in their original order, with the contents of each
%% imported file spliced in right after its import entry (the entry itself
%% is kept). Imports found inside imported files are expanded as well.
parse_imports(Parsed, Path) ->
    parse_imports(Parsed, Path, []).

%% Worker: Acc holds the already processed items, newest first.
parse_imports([{import, File} = Import | Rest], Path, Acc) ->
    Pending = expand_import(File, Path, Rest),
    parse_imports(Pending, Path, [Import | Acc]);
parse_imports([Item | Rest], Path, Acc) ->
    parse_imports(Rest, Path, [Item | Acc]);
parse_imports([], _Path, Acc) ->
    lists:reverse(Acc).

%% Returns the items still to be processed after an import entry: the parsed
%% contents of File followed by Rest, or just Rest (after logging an error
%% report) when File cannot be opened anywhere on Path.
expand_import(File, Path, Rest) ->
    case file:path_open(Path, File, [read]) of
        {ok, Handle, Fullname} ->
            %% path_open is only used to locate the file; parse/1 reopens it.
            file:close(Handle),
            {ok, Imported} = parse(Fullname),
            lists:append(Imported, Rest);
        {error, Error} ->
            error_logger:error_report([
                "Could not do import",
                {import, File},
                {error, Error},
                {path, Path}
            ]),
            Rest
    end.
在parse完目标proto文件后,转而对proto文件中的任何import声明进行处理,对import指明的文件也进行parse,然后合并到已经parse的语法树中。
至此proto文件的scan和parse过程也分析完了,此处已经收集到了所有proto文件及其import文件的语法树。
未完待续...