xmerl is a full-featured XML library for Erlang, with a lot of features such as XPath, XSLT, event_function, acc_function, etc. For now, though, I just want to parse iCalendar data into the form of an xmerl tree, which will contain #xmlElement, #xmlAttribute, #xmlText, etc., so that XPath can easily be applied to it.
How about an approach in which the parser just generates SAX events, and a callback state machine attached to it builds a JSON tree, an XML tree, or anything else?
I had hoped xmerl worked like this, i.e. a parser that generates SAX events and a state machine that accepts the events and builds the XML tree. I dug into xmerl's code but, unfortunately, the parser and the state machine are coupled together.
So I wrote a simple state machine which just receives SAX events and builds an xmerl-compatible XML tree, and I applied it to iCalendar.
I like this idea: by using SAX events as the common interface, I only need to write another state machine for JSON later, and the result will be a JSON representation of the iCalendar data. Both can share the same parser, which just generates SAX events.
Here's the code. It is not complete yet; it is just meant to show how much a SAX interface can do.
%%% A state machine which receives SAX events and builds an xmerl-compatible tree.
%%%
%%% Events understood by state/2:
%%%   {startDocument}
%%%   {endDocument}
%%%   {startElement, Uri, LocalName, QName, Attrs}   Attrs :: [{Name, Value}]
%%%   {endElement, Uri, LocalName, QName}
%%%   {characters, Characters}
%%% Uri and QName are accepted but ignored.
-module(xml_sm).

-include_lib("xmerl/include/xmerl.hrl").

-export([state/2]).
-export([test/0]).

%% One frame of the state stack. {startDocument} pushes a frame with all
%% defaults; every {startElement, ...} pushes a frame for the element it opens.
%%   name       - name of the element (undefined for the document frame)
%%   attributes - the element's #xmlAttribute{} records, in document order
%%   content    - child nodes collected so far, most recent first
%%   parents    - [{Name, Counter} | Ancestors]; the head describes this
%%                element and its Counter is bumped by every characters event
-record(xmlsmState, {
    name = undefined,
    attributes = [],
    content = [],
    parents = []
}).

%% Streams a list of events through state/2.
%% Returns {ok, TopElement, RemainingEvents} as soon as the top-level element
%% is closed, {ok, [], []} when the events run out before that happens, or
%% {error, Reason} when state/2 reports an error.
receive_events(Events) ->
    receive_events(Events, undefined).

receive_events([], _States) ->
    {ok, [], []};
receive_events([Event | T], States) ->
    case state(Event, States) of
        {ok, TopElement} -> {ok, TopElement, T};
        {error, Reason} -> {error, Reason};
        States1 -> receive_events(T, States1)
    end.

%% Applies one SAX event to the state stack.
%%
%% Returns the new state stack (a list of #xmlsmState{} frames) while the tree
%% is under construction, {ok, TopElement} once the top-level element has been
%% closed, or {error, Reason} when {endDocument} arrives before that.
%% Throws a flat string when an endElement name does not match the open element.
-spec state(Event :: tuple(), StateStack :: term()) ->
    [#xmlsmState{}] | {ok, #xmlElement{}} | {error, string()}.
state({startDocument}, _StateStack) ->
    [#xmlsmState{}];
state({endDocument}, StateStack) ->
    case StateStack of
        {ok, TopElement} -> {ok, TopElement};
        _ -> {error, "Bad element match"}
    end;
state({startElement, _Uri, LocalName, _QName, Attrs}, StateStack) ->
    %% Peek at the enclosing frame; it stays on the stack untouched.
    [#xmlsmState{parents = Parents} | _] = StateStack,
    {_Count, RevAttributes} = lists:foldl(
        fun({Name, Value}, {Pos, AccAttrs}) ->
            Pos1 = Pos + 1,
            Attr = #xmlAttribute{
                name = Name,
                value = Value,
                parents = [{LocalName, Pos1} | Parents]
            },
            {Pos1, [Attr | AccAttrs]}
        end,
        {0, []},
        Attrs
    ),
    %% The fold prepends, so reverse to keep the attributes in the order they
    %% were given (they used to come out back to front).
    NewState = #xmlsmState{
        name = LocalName,
        attributes = lists:reverse(RevAttributes),
        content = [],
        parents = [{LocalName, 0} | Parents]
    },
    [NewState | StateStack];
state({endElement, _Uri, LocalName, _QName}, StateStack) ->
    [State | StatesPrev] = StateStack,
    #xmlsmState{
        name = Name,
        attributes = Attributes,
        content = Content,
        parents = Parents
    } = State,
    if
        LocalName == undefined ->
            %% The caller does not care which element is being closed.
            undefined;
        LocalName /= Name ->
            throw(
                lists:flatten(
                    io_lib:format(
                        "Element name match error: ~p should be ~p~n",
                        [LocalName, Name]
                    )
                )
            );
        true ->
            undefined
    end,
    %% The head of Parents describes this element itself; only the ancestors
    %% go into the finished record.
    [_ | ParentsPrev] = Parents,
    Element = #xmlElement{
        name = Name,
        attributes = Attributes,
        content = lists:reverse(Content),
        parents = ParentsPrev
    },
    case StatesPrev of
        [_DocumentFrame] ->
            %% Only the document frame is left: this was the top-level element.
            {ok, Element};
        [ParentState | Other] ->
            #xmlsmState{content = ParentContent} = ParentState,
            [ParentState#xmlsmState{content = [Element | ParentContent]} | Other]
    end;
state({characters, Characters}, StateStack) ->
    [State | StatesPrev] = StateStack,
    #xmlsmState{content = Content, parents = Parents} = State,
    [{Parent, Pos} | ParentsPrev] = Parents,
    %% Bump the counter of the current element; the text node and the updated
    %% frame both carry the new value.
    Parents1 = [{Parent, Pos + 1} | ParentsPrev],
    Text = #xmlText{value = Characters, parents = Parents1},
    [State#xmlsmState{content = [Text | Content], parents = Parents1} | StatesPrev].
%% Manual smoke test: builds the same small feed document twice, once through
%% receive_events/1 and once by folding state/2 over the events, and prints
%% both results to the user device.
test() ->
    Events = [
        {startDocument},
        {startElement, [], feed, [], [{link, "http://lightpole.net"}, {author, "Caoyuan"}]},
        {characters, "feed text"},
        {startElement, [], entry, [], [{tag, "Erlang, Function"}]},
        {characters, "Entry1's text"},
        {endElement, [], entry, []},
        {startElement, [], entry, [], []},
        {characters, "Entry2's text"},
        {endElement, [], entry, []},
        {endElement, [], feed, []},
        {endDocument}
    ],

    %% Streaming: hand over the whole event list at once.
    {ok, Streamed, _Unconsumed} = receive_events(Events),
    io:fwrite(user, "Streaming Result: ~n~p~n", [Streamed]),

    %% Stepped: thread the state through one event at a time.
    {ok, Stepped} = lists:foldl(fun xml_sm:state/2, undefined, Events),
    Exported = xmerl:export_simple([Stepped], xmerl_xml),
    io:fwrite(user, "Stepped Result: ~n~p~n", [lists:flatten(Exported)]).
And the primary icalendar front end:
%%% iCalendar front end: scans iCalendar text and reports what it finds as SAX
%%% events to the callback named by ?stateMachine.
-module(ical_parser).

-include_lib("xmerl/include/xmerl.hrl").

-export([parse/1]).
-export([test/0]).

%% The event consumer. It is called as Fun(Event, States) and its return value
%% is threaded into the next call as States.
-define(stateMachine, fun xml_sm:state/2).

%% Parses iCalendar Text (a string) by wrapping the events produced by
%% parse_line/3 between {startDocument} and {endDocument}.
%% Returns whatever the state machine returns for {endDocument}; with
%% xml_sm:state/2 that is {ok, TopElement} or {error, Reason}.
-spec parse(string()) -> {ok, term()} | {error, term()}.
parse(Text) ->
    States1 = ?stateMachine({startDocument}, undefined),
    States2 = parse_line(skip_ws(Text), 0, States1),
    ?stateMachine({endDocument}, States2).

%% Consumes the text one content line at a time:
%%   BEGIN:<name>  -> startElement for the component
%%   END:<name>    -> endElement for the component
%%   anything else -> a property, emitted as startElement (with its parameters
%%                    as attributes), characters (its value) and endElement
%% Line counts the newlines consumed so far. Returns the state machine's state
%% after the last event, or the atom error when BEGIN/END is not followed by
%% a colon.
parse_line([], _Line, States) ->
    States;
parse_line([$\s | T], Line, States) ->
    parse_line(T, Line, States);
parse_line([$\t | T], Line, States) ->
    parse_line(T, Line, States);
parse_line([$\r | T], Line, States) ->
    parse_line(T, Line, States);
parse_line([$\n | T], Line, States) ->
    parse_line(T, Line + 1, States);
parse_line("BEGIN" ++ T, Line, States) ->
    parse_component(T, Line, States, fun(Name) -> {startElement, [], Name, [], []} end);
parse_line("END" ++ T, Line, States) ->
    parse_component(T, Line, States, fun(Name) -> {endElement, [], Name, []} end);
parse_line(Text, Line, States) ->
    {Rest, Line1, {Name, Params}, Value} =
        parse_prop(skip_ws(Text), Line, States, {[], []}),
    States1 = ?stateMachine({startElement, [], Name, [], Params}, States),
    States2 = ?stateMachine({characters, Value}, States1),
    States3 = ?stateMachine({endElement, [], Name, []}, States2),
    parse_line(skip_ws(Rest), Line1, States3).

%% Shared tail of the BEGIN and END clauses: expects ":<component name>",
%% sends the event built by ToEvent(Name) and carries on with the next line.
parse_component(T, Line, States, ToEvent) ->
    case skip_ws(T) of
        [$: | T1] ->
            {Rest, Line1, Name} = parse_component_name(skip_ws(T1), Line, States, []),
            States1 = ?stateMachine(ToEvent(Name), States),
            parse_line(skip_ws(Rest), Line1, States1);
        _ ->
            error
    end.
%% Reads a component name (the part after "BEGIN:" / "END:") up to the end of
%% the line. Carriage returns are dropped, and a newline followed by a space or
%% tab is treated as a folded line and joined. Whitespace following a name
%% character is skipped. Name is the accumulator, most recent character first.
%% Returns {Rest, Line, NameAtom} with the name lower-cased.
parse_component_name([], Line, _States, Name) ->
    %% The input ended without a trailing newline: the name is complete.
    {[], Line, to_name_atom(Name)};
parse_component_name([$\r | T], Line, States, Name) ->
    parse_component_name(T, Line, States, Name);
parse_component_name([$\n | T], Line, States, Name) ->
    case unfolding_line(T) of
        {true, Rest} -> parse_component_name(Rest, Line, States, Name);
        {false, Rest} -> {Rest, Line + 1, to_name_atom(Name)}
    end;
parse_component_name([H | T], Line, States, Name) ->
    parse_component_name(skip_ws(T), Line, States, [H | Name]).

%% Reads one property: its name, any ";Param=Value" parameters, and the value
%% after the colon. The accumulator is {ReversedName, ReversedParams}.
%% Returns {Rest, Line, {PropName, Params}, Value}, where PropName is a
%% lower-cased atom and Params is a list of {ParamName, ParamValue}.
parse_prop([$: | T], Line, States, {Name, NameParams}) ->
    PropName = to_name_atom(Name),
    PropNameParams = lists:reverse(NameParams),
    {Rest, Line1, Value} = parse_prop_value(T, Line, States, []),
    {Rest, Line1, {PropName, PropNameParams}, Value};
parse_prop([$; | T], Line, States, {Name, NameParams}) ->
    {Rest, Line1, ParamName, ParamValue} = parse_param(T, Line, States, []),
    parse_prop(Rest, Line1, States, {Name, [{ParamName, ParamValue} | NameParams]});
parse_prop([H | T], Line, States, {Name, NameParams}) ->
    parse_prop(skip_ws(T), Line, States, {[H | Name], NameParams}).

%% Reads a property value up to the end of the (possibly folded) line.
%% Carriage returns are dropped; every other character is kept as is.
%% Returns {Rest, Line, Value}.
parse_prop_value([], Line, _States, Value) ->
    %% The input ended without a trailing newline: the value is complete.
    {[], Line, lists:reverse(Value)};
parse_prop_value([$\r | T], Line, States, Value) ->
    parse_prop_value(T, Line, States, Value);
parse_prop_value([$\n | T], Line, States, Value) ->
    case unfolding_line(T) of
        {true, Rest} -> parse_prop_value(Rest, Line, States, Value);
        {false, Rest} -> {Rest, Line + 1, lists:reverse(Value)}
    end;
parse_prop_value([H | T], Line, States, Value) ->
    parse_prop_value(T, Line, States, [H | Value]).

%% Reads a parameter name up to "=" and then its value.
%% Returns {Rest, Line, ParamName, ParamValue} with ParamName a lower-cased atom.
parse_param([$= | T], Line, States, Name) ->
    ParamName = to_name_atom(Name),
    {Rest, Line1, Value} = parse_param_value(T, Line, States, []),
    {Rest, Line1, ParamName, Value};
parse_param([H | T], Line, States, Name) ->
    parse_param(skip_ws(T), Line, States, [H | Name]).

%% Turns a reversed list of name characters into a lower-cased atom.
%% NOTE(review): list_to_atom/1 creates a new atom for every distinct name in
%% the input and atoms are never garbage collected; reconsider this before
%% feeding the parser untrusted calendars.
to_name_atom(ReversedChars) ->
    list_to_atom(string:to_lower(lists:reverse(ReversedChars))).
%% Reads a parameter value. Value is the accumulator, most recent character
%% first. The value ends at the first ";" (another parameter follows) or ":"
%% (the property value follows) that is not inside double quotes.
%% Returns {Rest, Line, Value}. The terminating character is left at the head
%% of Rest so that parse_prop/4 can dispatch on it; consuming the ";" here made
%% the next parameter's text run into the property name. Double quotes are kept
%% as part of Value.
parse_param_value([$" | T], Line, States, Value) ->
    {Rest, Value1} = take_quoted(T, [$" | Value]),
    parse_param_value(Rest, Line, States, Value1);
parse_param_value([$; | _] = Text, Line, _States, Value) ->
    %% keep $; so that parse_prop/4 starts the next parameter
    {Text, Line, lists:reverse(Value)};
parse_param_value([$: | _] = Text, Line, _States, Value) ->
    %% keep $: for end of prop name
    {Text, Line, lists:reverse(Value)};
parse_param_value([H | T], Line, States, Value) ->
    parse_param_value(T, Line, States, [H | Value]).

%% Copies characters onto Acc up to and including the closing double quote.
%% Returns {Rest, Acc}; Rest is [] when the quote is never closed.
take_quoted([$" | T], Acc) -> {T, [$" | Acc]};
take_quoted([H | T], Acc) -> take_quoted(T, [H | Acc]);
take_quoted([], Acc) -> {[], Acc}.

%% Called with the text that follows a newline. A leading space or tab means
%% the line is a continuation of the previous one: returns {true, Rest} with
%% that one character removed, otherwise {false, Chars} unchanged.
unfolding_line([$\s | T]) -> {true, T};    %% space
unfolding_line([$\t | T]) -> {true, T};    %% htab
unfolding_line(Chars) -> {false, Chars}.

%% Drops leading spaces and tabs.
skip_ws([$\s | T]) -> skip_ws(T);
skip_ws([$\t | T]) -> skip_ws(T);
skip_ws(Text) -> Text.

%% Manual smoke test: parses a sample calendar and prints the input and the
%% exported XML to the user device.
%% NOTE(review): in this copy of the source the sample appears to have lost its
%% line breaks (the properties are separated by single spaces), while the
%% parser ends names and values at newlines - restore one property per line
%% before relying on this test.
test() ->
    Text = " BEGIN:VCALENDAR METHOD:PUBLISH X-WR-CALNAME:Mi's Calendar VERSION:2.0 PRODID:Spongecell CALSCALE:GREGORIAN BEGIN:VEVENT DTSTART;TZID=America/Los_Angeles:20061206T120000 DTSTAMP:20070728T004842 LOCATION:Gordon Biersch, 640 Emerson St, Palo Alto, CA URL: UID:295803:spongecell.com SUMMARY:All hands meeting RRULE:FREQ=WEEKLY;INTERVAL=1 DTEND;TZID=America/Los_Angeles:20061206T130000 DESCRIPTION: END:VEVENT BEGIN:VEVENT DTSTART;TZID=America/Los_Angeles:20061207T120000 DTSTAMP:20070728T004842 LOCATION:395 ano nuevo ave\, sunnyvale\, ca URL: UID:295802:spongecell.com SUMMARY:Company lunch RRULE:FREQ=WEEKLY;INTERVAL=1 DTEND;TZID=America/Los_Angeles:20061207T130000 DESCRIPTION:Let's have lots of beer!! (well\, and some code review :) END:VEVENT BEGIN:VEVENT DTSTART;TZID=America/Los_Angeles:20061213T123000 DTSTAMP:20070728T004842 LOCATION:369 S California Ave\, Palo Alto\, CA URL: UID:295714:spongecell.com SUMMARY:Ben is back.. want to meet again DTEND;TZID=America/Los_Angeles:20061213T133000 DESCRIPTION:Re: Ben is back.. 
want to meet again\n Marc END:VEVENT BEGIN:VEVENT DTSTART;TZID=America/Los_Angeles:20070110T200000 DTSTAMP:20070728T004842 LOCATION: URL: UID:304529:spongecell.com SUMMARY:flight back home DTEND;TZID=America/Los_Angeles:20070110T210000 DESCRIPTION: END:VEVENT BEGIN:VTIMEZONE TZID:America/Los_Angeles BEGIN:STANDARD DTSTART:20071104T000000 TZNAME:PST RRULE:FREQ=YEARLY;BYMONTH=11;BYDAY=1SU TZOFFSETFROM:-0700 TZOFFSETTO:-0800 END:STANDARD BEGIN:DAYLIGHT DTSTART:20070311T000000 TZNAME:PDT RRULE:FREQ=YEARLY;BYMONTH=3;BYDAY=1SU TZOFFSETFROM:-0800 TZOFFSETTO:-0700 END:DAYLIGHT END:VTIMEZONE END:VCALENDAR ",
    io:fwrite(user, "Text: ~s~n", [Text]),
    {ok, Xml} = parse(Text),
    XmlText = lists:flatten(xmerl:export_simple([Xml], xmerl_xml)),
    io:fwrite(user, "Parsed: ~n~p~n", [XmlText]).
As you may have noticed, the ?stateMachine macro can be pointed to a json_machine:state/2 some day, and we will then get a JSON result without any other modification to the iCalendar parser module (ical_parser).
This can also be applied to JSON<->XML transformation. Actually, I think SAX events are a good interface for transforming data objects between various formats. It's also a bit Erlang-style (event passing): the parser and the state machine can run as two separate processes and communicate via SAX events using send/receive.