A Simple XML State Machine Accepting SAX Events to Build an xmerl-Compatible XML Tree: icalendar demo

xmerl is a full-featured XML library for Erlang, with many features such as XPath, XSLT, event_function, acc_function, etc. For now, I just want to parse iCalendar into an xmerl tree, which contains #xmlElement, #xmlAttribute, #xmlText, etc., so that XPath can easily be applied to it.

How about an approach in which the parser just generates SAX events, and a callback state machine attached to it builds a JSON or XML tree, or anything else?

I hoped xmerl worked like this, i.e. a parser that generates SAX events and a state machine that accepts the events and builds the XML tree. I dug into xmerl's code but, unfortunately, the parser and the state machine are coupled together.

So I wrote a simple state machine that just receives SAX events and builds an xmerl-compatible XML tree, and I applied it to iCalendar.

I like this idea: by using SAX events as the common interface, I only need to write another state machine for JSON later, and the result will be a JSON representation of the iCalendar data. Both can share the same parser, which just generates SAX events.

Here's the code, which is not complete yet; it is just meant to show how much a SAX interface can do.

%%% A state machine which receives SAX events and builds an xmerl-compatible tree


-module(xml_sm).

-include_lib("xmerl/include/xmerl.hrl").

-export([state/2]).

-export([test/0
        ]).

%% One frame of the state stack handled by state/2: the element that is
%% currently open and still being assembled. The frame created for
%% startDocument keeps all defaults.
-record(xmlsmState, {
    %% Element name, taken from LocalName of the startElement event.
    name = undefined,
    %% #xmlAttribute{} records built from the startElement attribute list.
    attributes = [],
    %% Child nodes (#xmlText{} / #xmlElement{}), newest first; reversed
    %% when the element is closed.
    content = [],
    %% [{Name, Pos}] path, innermost first. The head is this element's own
    %% entry; its Pos is bumped for every characters event received.
    parents = []
}).

%% @doc Stream a list of SAX events through state/2, starting with no state.
%%
%% Returns `{ok, TopElement, RemainingEvents}' as soon as state/2 reports a
%% finished top element, `{error, Reason}' when state/2 reports an error, and
%% `{ok, [], []}' when the event list is exhausted before either happens.
receive_events(Events) ->
    receive_events(Events, undefined).

%% Same as receive_events/1, but resumes from an existing state stack.
receive_events([Event | Remaining], States) ->
    continue_events(state(Event, States), Remaining);
receive_events([], _States) ->
    {ok, [], []}.

%% Decide what to do with the value state/2 produced for the last event:
%% stop on a final result or an error, otherwise keep consuming events with
%% the returned state stack.
continue_events({ok, TopElement}, Remaining) ->
    {ok, TopElement, Remaining};
continue_events({error, Reason}, _Remaining) ->
    {error, Reason};
continue_events(NextStates, Remaining) ->
    receive_events(Remaining, NextStates).

%% @doc Advance the tree-building state machine by one SAX event.
%%
%% Events:
%%   {startDocument}                              - start with a fresh stack
%%   {startElement, Uri, LocalName, QName, Attrs} - open an element; Attrs is
%%                                                  a list of {Name, Value}
%%   {characters, Characters}                     - add a text node to the
%%                                                  currently open element
%%   {endElement, Uri, LocalName, QName}          - close the open element
%%   {endDocument}                                - finish the document
%%
%% The second argument is whatever the previous call returned (anything for
%% startDocument). The result is either a new state stack (a list of
%% #xmlsmState{} frames, innermost first), `{ok, TopElement}' once the
%% outermost element has been closed, or `{error, Reason}' when endDocument
%% arrives before that happened.
%%
%% Throws a flat string when an endElement names a different element than the
%% one that is open. Passing `undefined' as LocalName skips that check.
state({startDocument}, _StateStack) ->
    %% The bottom frame stands for the document itself and never becomes an
    %% element; it only marks the bottom of the stack.
    [#xmlsmState{}];
state({endDocument}, StateStack) ->
    %% A well-formed event stream has already produced {ok, TopElement} when
    %% the outermost element was closed; anything else means unbalanced tags.
    case StateStack of
        {ok, TopElement} -> {ok, TopElement};
        _ -> {error, "Bad element match"}
    end;
state({startElement, _Uri, LocalName, _QName, Attrs}, StateStack) ->
    [#xmlsmState{parents = Parents} | _] = StateStack,
    %% mapfoldl keeps the attributes in document order while numbering them
    %% from 1. (Accumulating with [Attr | Acc] and never reversing used to
    %% hand the attribute list back-to-front to the finished element.)
    {Attributes, _LastPos} = lists:mapfoldl(
        fun({Name, Value}, Pos) ->
                Pos1 = Pos + 1,
                Attr = #xmlAttribute{name = Name,
                                     value = Value,
                                     parents = [{LocalName, Pos1} | Parents]},
                {Attr, Pos1}
        end, 0, Attrs),
    %% Push a frame for the new element; its own {Name, Pos} entry heads the
    %% parents path and is dropped again when the element is closed.
    NewState = #xmlsmState{name = LocalName,
                           attributes = Attributes,
                           content = [],
                           parents = [{LocalName, 0} | Parents]},
    [NewState | StateStack];
state({endElement, _Uri, LocalName, _QName}, StateStack) ->
    [State | StatesPrev] = StateStack,
    #xmlsmState{name = Name,
                attributes = Attributes,
                content = Content,
                parents = Parents} = State,
    %% An undefined LocalName means the caller does not care which element
    %% is being closed; otherwise it must match the open element.
    if
        LocalName == undefined; LocalName == Name ->
            ok;
        true ->
            throw(lists:flatten(io_lib:format(
                "Element name match error: ~p should be ~p~n", [LocalName, Name])))
    end,
    %% Drop the element's own entry so that parents lists ancestors only.
    [_Self | Ancestors] = Parents,
    Element = #xmlElement{name = Name,
                          attributes = Attributes,
                          content = lists:reverse(Content),
                          parents = Ancestors},
    case StatesPrev of
        [_DocumentFrame] ->
            %% Only the document frame is left: this was the top element.
            {ok, Element};
        [ParentState | Other] ->
            %% Attach the finished element to its parent and step back to it.
            ParentContent = ParentState#xmlsmState.content,
            [ParentState#xmlsmState{content = [Element | ParentContent]} | Other]
    end;
state({characters, Characters}, StateStack) ->
    [State | StatesPrev] = StateStack,
    #xmlsmState{content = Content, parents = Parents} = State,
    %% Each text node bumps the position counter stored in the open
    %% element's own parents entry. Child elements do not touch it.
    [{Parent, Pos} | Ancestors] = Parents,
    Parents1 = [{Parent, Pos + 1} | Ancestors],
    Text = #xmlText{value = Characters, parents = Parents1},
    [State#xmlsmState{content = [Text | Content], parents = Parents1} | StatesPrev].

%% @doc Demo: build the same small feed document twice from one list of SAX
%% events - once through receive_events/1 and once by folding state/2 over
%% the events - and print both results to the `user' device.
test() ->
    FeedAttrs = [{link, "http://lightpole.net"}, {author, "Caoyuan"}],
    EntryAttrs = [{tag, "Erlang, Function"}],
    Events =
        [{startDocument},
         {startElement, [], feed, [], FeedAttrs},
         {characters, "feed text"},
         {startElement, [], entry, [], EntryAttrs},
         {characters, "Entry1's text"},
         {endElement, [], entry, []},
         {startElement, [], entry, [], []},
         {characters, "Entry2's text"},
         {endElement, [], entry, []},
         {endElement, [], feed, []},
         {endDocument}],

    %% Streaming: hand over the whole list and let receive_events/1 drive.
    {ok, Streamed, _Unconsumed} = receive_events(Events),
    io:fwrite(user, "Streaming Result: ~n~p~n", [Streamed]),

    %% Stepped: the caller drives the machine one event at a time through
    %% the exported callback.
    {ok, Stepped} = lists:foldl(fun xml_sm:state/2, undefined, Events),
    Exported = xmerl:export_simple([Stepped], xmerl_xml),
    io:fwrite(user, "Stepped Result: ~n~p~n", [lists:flatten(Exported)]).

And the primary icalendar front end:

-module(ical_parser).

-include_lib("xmerl/include/xmerl.hrl").

-export([parse/1
        ]).

-export([test/0
        ]).

-define(stateMachine, fun xml_sm:state/2).

%% @doc Parse iCalendar text by turning it into SAX events for ?stateMachine.
%%
%% Sends startDocument, lets parse_line/3 emit the events for every content
%% line, then sends endDocument and returns whatever the state machine
%% answers to that last event.
parse(Text) ->
    Opened = ?stateMachine({startDocument}, undefined),
    Parsed = parse_line(skip_ws(Text), 0, Opened),
    ?stateMachine({endDocument}, Parsed).

%% @doc Consume the text one content line at a time, feeding ?stateMachine.
%%
%% Line counts the newlines seen so far and States is the state machine's
%% current value. "BEGIN:NAME" becomes a startElement event, "END:NAME" an
%% endElement event, and any other line a startElement/characters/endElement
%% triple for the property. Returns the state machine value once the text is
%% exhausted, or the atom `error' when BEGIN/END is not followed by a colon.
parse_line([], _Line, States) ->
    States;
parse_line([$\n | Rest], Line, States) ->
    parse_line(Rest, Line + 1, States);
parse_line([Blank | Rest], Line, States)
  when Blank =:= $\s; Blank =:= $\t; Blank =:= $\r ->
    parse_line(Rest, Line, States);
parse_line("BEGIN" ++ Rest, Line, States) ->
    parse_component_edge(opening, Rest, Line, States);
parse_line("END" ++ Rest, Line, States) ->
    parse_component_edge(closing, Rest, Line, States);
parse_line(Text, Line, States) ->
    {Rest, NextLine, {Name, Params}, Value} =
        parse_prop(skip_ws(Text), Line, States, {[], []}),
    %% A property is reported as an element that holds a single text node.
    Events = [{startElement, [], Name, [], Params},
              {characters, Value},
              {endElement, [], Name, []}],
    NextStates = lists:foldl(?stateMachine, States, Events),
    parse_line(skip_ws(Rest), NextLine, NextStates).

%% Handle the remainder of a BEGIN/END line: expect a colon, read the
%% component name and emit the matching element event.
parse_component_edge(Kind, AfterKeyword, Line, States) ->
    case skip_ws(AfterKeyword) of
        [$: | AfterColon] ->
            {Rest, NextLine, Name} =
                parse_component_name(skip_ws(AfterColon), Line, States, []),
            Event = case Kind of
                        opening -> {startElement, [], Name, [], []};
                        closing -> {endElement, [], Name, []}
                    end,
            parse_line(skip_ws(Rest), NextLine, ?stateMachine(Event, States));
        _ ->
            error
    end.

%% @doc Read a component name (the text after "BEGIN:" / "END:") up to the
%% end of its content line.
%%
%% Name is the reversed accumulator of the characters read so far; States is
%% only passed along. Carriage returns and spaces/tabs after a name character
%% are dropped, and a newline followed by a space or tab is treated as a
%% folded line and does not end the name.
%%
%% Returns `{Rest, Line1, NameAtom}' where NameAtom is the lower-cased name.
%% End of input also terminates the name, so a final "END:..." line without a
%% trailing newline no longer crashes with function_clause.
%%
%% NOTE(review): list_to_atom/1 is applied to text from the input; atoms are
%% never garbage collected, so feeding untrusted calendars can exhaust the
%% atom table. The atom is kept because the xmerl records built downstream
%% use atom names.
parse_component_name([], Line, _States, Name) ->
    {[], Line, component_name_to_atom(Name)};
parse_component_name([$\r|T], Line, States, Name) ->
    parse_component_name(T, Line, States, Name);
parse_component_name([$\n|T], Line, States, Name) ->
    case unfolding_line(T) of
        {true,  Rest} -> parse_component_name(Rest, Line, States, Name);
        {false, Rest} -> {Rest, Line + 1, component_name_to_atom(Name)}
    end;
parse_component_name([H|T], Line, States, Name) ->
    parse_component_name(skip_ws(T), Line, States, [H|Name]).

%% Turn the reversed name accumulator into a lower-case atom.
component_name_to_atom(ReversedName) ->
    list_to_atom(string:to_lower(lists:reverse(ReversedName))).
    
%% @doc Parse one property content line: NAME *(";" PARAM "=" VALUE) ":" VALUE.
%%
%% The last argument is `{ReversedName, ReversedParams}', the accumulator for
%% the property name characters and the `{ParamName, ParamValue}' pairs seen
%% so far. A `;' starts a parameter (delegated to parse_param/4), a `:' ends
%% the name part and the rest of the line is read by parse_prop_value/4.
%% Spaces and tabs following a name character are skipped.
%%
%% Returns `{Rest, Line1, {NameAtom, Params}, Value}' with the name
%% lower-cased and the parameters in input order.
parse_prop([$; | AfterSemi], Line, States, {NameAcc, ParamsAcc}) ->
    {Remaining, NextLine, Key, Val} = parse_param(AfterSemi, Line, States, []),
    parse_prop(Remaining, NextLine, States, {NameAcc, [{Key, Val} | ParamsAcc]});
parse_prop([$: | AfterColon], Line, States, {NameAcc, ParamsAcc}) ->
    NameAtom = list_to_atom(string:to_lower(lists:reverse(NameAcc))),
    Params = lists:reverse(ParamsAcc),
    {Remaining, NextLine, PropValue} = parse_prop_value(AfterColon, Line, States, []),
    {Remaining, NextLine, {NameAtom, Params}, PropValue};
parse_prop([Ch | More], Line, States, {NameAcc, ParamsAcc}) ->
    parse_prop(skip_ws(More), Line, States, {[Ch | NameAcc], ParamsAcc}).

%% @doc Read a property value up to the end of its (possibly folded) line.
%%
%% Value is the reversed accumulator of the characters read so far; States is
%% only passed along. Carriage returns are dropped. A newline followed by a
%% space or tab is a folded line: the value continues after that character.
%% Any other newline ends the value.
%%
%% Returns `{Rest, Line1, Value}'. End of input also terminates the value, so
%% a last property line without a trailing newline no longer crashes with
%% function_clause.
parse_prop_value([], Line, _States, Value) ->
    {[], Line, lists:reverse(Value)};
parse_prop_value([$\r|T], Line, States, Value) ->
    parse_prop_value(T, Line, States, Value);
parse_prop_value([$\n|T], Line, States, Value) ->
    case unfolding_line(T) of
        {true,  Rest} -> parse_prop_value(Rest, Line, States, Value);
        {false, Rest} -> {Rest, Line + 1, lists:reverse(Value)}
    end;
parse_prop_value([H|T], Line, States, Value) ->
    parse_prop_value(T, Line, States, [H|Value]).

%% @doc Parse one property parameter of the form NAME "=" VALUE.
%%
%% Name is the reversed accumulator of the parameter name characters read so
%% far; spaces and tabs following a name character are skipped. Once `=' is
%% reached the value is read by parse_param_value/4.
%%
%% Returns `{Rest, Line1, ParamNameAtom, ParamValue}' with the name
%% lower-cased.
parse_param([Ch | More], Line, States, NameAcc) when Ch =/= $= ->
    parse_param(skip_ws(More), Line, States, [Ch | NameAcc]);
parse_param([$= | AfterEq], Line, States, NameAcc) ->
    Key = list_to_atom(string:to_lower(lists:reverse(NameAcc))),
    {Remaining, NextLine, ParamValue} = parse_param_value(AfterEq, Line, States, []),
    {Remaining, NextLine, Key, ParamValue}.

%% @doc Read a parameter value up to the `;' or `:' that terminates it.
%%
%% Value is the reversed accumulator of the characters read so far; Line is
%% returned unchanged and the States argument is ignored.
%%
%% Returns `{Rest, Line, Value}'. The terminator is left at the head of Rest
%% in both cases, because parse_prop/4 dispatches on it: `;' starts the next
%% parameter and `:' starts the property value. (Consuming the `;' made
%% parse_prop/4 append the following parameter to the property name.)
parse_param_value([$;|_] = Rest, Line, _States, Value) ->
    {Rest, Line, lists:reverse(Value)};
parse_param_value([$:|_] = Rest, Line, _States, Value) ->
    {Rest, Line, lists:reverse(Value)};
parse_param_value([H|T], Line, States, Value) ->
    parse_param_value(T, Line, States, [H|Value]).


%% @doc Check whether the text right after a newline continues a folded line.
%%
%% A leading space or horizontal tab marks a continuation: the marker is
%% dropped and `{true, Rest}' is returned. Anything else yields
%% `{false, Chars}' with the input untouched.
unfolding_line([Marker | Rest]) when Marker =:= $\s; Marker =:= $\t ->
    {true, Rest};
unfolding_line(Chars) ->
    {false, Chars}.
    
%% @doc Drop leading spaces and horizontal tabs; newlines are not skipped.
skip_ws([Blank | Rest]) when Blank =:= $\s; Blank =:= $\t ->
    skip_ws(Rest);
skip_ws(Text) ->
    Text.


%% @doc Demo: parse a sample calendar (VCALENDAR with four VEVENTs and one
%% VTIMEZONE), export the resulting tree with xmerl and print both the input
%% text and the exported XML to the `user' device.
%%
%% NOTE(review): the sample is an Erlang string literal, so backslash
%% sequences in it (such as the "\n" in the third event's DESCRIPTION) are
%% interpreted by the Erlang scanner before parse/1 ever sees the text -
%% confirm that this is intended.
test() ->
    Text = "
BEGIN:VCALENDAR
METHOD:PUBLISH
X-WR-CALNAME:Mi's Calendar
VERSION:2.0
PRODID:Spongecell
CALSCALE:GREGORIAN
BEGIN:VEVENT
DTSTART;TZID=America/Los_Angeles:20061206T120000
DTSTAMP:20070728T004842
LOCATION:Gordon Biersch, 640 Emerson St, Palo Alto, CA
URL:
UID:295803:spongecell.com
SUMMARY:All hands meeting
RRULE:FREQ=WEEKLY;INTERVAL=1
DTEND;TZID=America/Los_Angeles:20061206T130000
DESCRIPTION:
END:VEVENT
BEGIN:VEVENT
DTSTART;TZID=America/Los_Angeles:20061207T120000
DTSTAMP:20070728T004842
LOCATION:395 ano nuevo ave\, sunnyvale\, ca
URL:
UID:295802:spongecell.com
SUMMARY:Company lunch
RRULE:FREQ=WEEKLY;INTERVAL=1
DTEND;TZID=America/Los_Angeles:20061207T130000
DESCRIPTION:Let's have lots of beer!! (well\, and some code review :)
END:VEVENT
BEGIN:VEVENT
DTSTART;TZID=America/Los_Angeles:20061213T123000
DTSTAMP:20070728T004842
LOCATION:369 S California Ave\, Palo Alto\, CA
URL:
UID:295714:spongecell.com
SUMMARY:Ben is back.. want to meet again
DTEND;TZID=America/Los_Angeles:20061213T133000
DESCRIPTION:Re: Ben is back.. want to meet again\n Marc
END:VEVENT
BEGIN:VEVENT
DTSTART;TZID=America/Los_Angeles:20070110T200000
DTSTAMP:20070728T004842
LOCATION:
URL:
UID:304529:spongecell.com
SUMMARY:flight back home
DTEND;TZID=America/Los_Angeles:20070110T210000
DESCRIPTION:
END:VEVENT
BEGIN:VTIMEZONE
TZID:America/Los_Angeles
BEGIN:STANDARD
DTSTART:20071104T000000
TZNAME:PST
RRULE:FREQ=YEARLY;BYMONTH=11;BYDAY=1SU
TZOFFSETFROM:-0700
TZOFFSETTO:-0800
END:STANDARD
BEGIN:DAYLIGHT
DTSTART:20070311T000000
TZNAME:PDT
RRULE:FREQ=YEARLY;BYMONTH=3;BYDAY=1SU
TZOFFSETFROM:-0800
TZOFFSETTO:-0700
END:DAYLIGHT
END:VTIMEZONE
END:VCALENDAR


",
    %% Echo the raw input first so it can be compared with the result.
    io:fwrite(user, "Text: ~s~n", [Text]),
    %% Assertive match: anything but {ok, Tree} from parse/1 crashes here.
    {ok, Xml} = parse(Text),
    %% export_simple/2 returns deep character data; flatten it so that ~p
    %% prints one plain string.
    XmlText = lists:flatten(xmerl:export_simple([Xml], xmerl_xml)),
    io:fwrite(user, "Parsed: ~n~p~n", [XmlText]).

As you may have noticed, ?stateMachine can be pointed to a json_machine:state/2 some day, and we can get a JSON result without modifying ical_parser.erl.

This can also be applied to JSON<->XML transformation. Actually, I think SAX events are a good interface for transforming data objects between various formats. It's also a bit Erlang-style (event passing): the parser and the state machine can communicate via SAX events as two separate processes using send/receive.

你可能感兴趣的:(xml,json,erlang,REST,idea)