%% @author Bob Ippolito %% @copyright 2007 Mochi Media, Inc. %% @doc Loosely tokenizes and generates parse trees for HTML 4. -module(mochiweb_html). -export([tokens/1, parse/1, parse_tokens/1, to_tokens/1, escape/1, escape_attr/1, to_html/1]). %% This is a macro to placate syntax highlighters.. -define(QUOTE, $\"). -define(SQUOTE, $\'). -define(ADV_COL(S, N), S#decoder{column=N+S#decoder.column, offset=N+S#decoder.offset}). -define(INC_COL(S), S#decoder{column=1+S#decoder.column, offset=1+S#decoder.offset}). -define(INC_LINE(S), S#decoder{column=1, line=1+S#decoder.line, offset=1+S#decoder.offset}). -define(INC_CHAR(S, C), case C of $\n -> S#decoder{column=1, line=1+S#decoder.line, offset=1+S#decoder.offset}; _ -> S#decoder{column=1+S#decoder.column, offset=1+S#decoder.offset} end). -define(IS_WHITESPACE(C), (C =:= $\s orelse C =:= $\t orelse C =:= $\r orelse C =:= $\n)). -define(IS_LITERAL_SAFE(C), ((C >= $A andalso C =< $Z) orelse (C >= $a andalso C =< $z) orelse (C >= $0 andalso C =< $9))). -define(PROBABLE_CLOSE(C), (C =:= $> orelse ?IS_WHITESPACE(C))). -record(decoder, {line=1, column=1, offset=0}). %% @type html_node() = {string(), [html_attr()], [html_node() | string()]} %% @type html_attr() = {string(), string()} %% @type html_token() = html_data() | start_tag() | end_tag() | inline_html() | html_comment() | html_doctype() %% @type html_data() = {data, string(), Whitespace::boolean()} %% @type start_tag() = {start_tag, Name, [html_attr()], Singleton::boolean()} %% @type end_tag() = {end_tag, Name} %% @type html_comment() = {comment, Comment} %% @type html_doctype() = {doctype, [Doctype]} %% @type inline_html() = {'=', iolist()} %% External API. %% @spec parse(string() | binary()) -> html_node() %% @doc tokenize and then transform the token stream into a HTML tree. parse(Input) -> parse_tokens(tokens(Input)). %% @spec parse_tokens([html_token()]) -> html_node() %% @doc Transform the output of tokens(Doc) into a HTML tree. parse_tokens(Tokens) when is_list(Tokens) -> %% Skip over doctype, processing instructions F = fun (X) -> case X of {start_tag, _, _, false} -> false; _ -> true end end, [{start_tag, Tag, Attrs, false} | Rest] = lists:dropwhile(F, Tokens), {Tree, _} = tree(Rest, [norm({Tag, Attrs})]), Tree. %% @spec tokens(StringOrBinary) -> [html_token()] %% @doc Transform the input UTF-8 HTML into a token stream. tokens(Input) -> tokens(iolist_to_binary(Input), #decoder{}, []). %% @spec to_tokens(html_node()) -> [html_token()] %% @doc Convert a html_node() tree to a list of tokens. to_tokens({Tag0}) -> to_tokens({Tag0, [], []}); to_tokens(T={'=', _}) -> [T]; to_tokens(T={doctype, _}) -> [T]; to_tokens(T={comment, _}) -> [T]; to_tokens({Tag0, Acc}) -> %% This is only allowed in sub-tags: {p, [{"class", "foo"}]} to_tokens({Tag0, [], Acc}); to_tokens({Tag0, Attrs, Acc}) -> Tag = to_tag(Tag0), to_tokens([{Tag, Acc}], [{start_tag, Tag, Attrs, is_singleton(Tag)}]). %% @spec to_html([html_token()] | html_node()) -> iolist() %% @doc Convert a list of html_token() to a HTML document. to_html(Node) when is_tuple(Node) -> to_html(to_tokens(Node)); to_html(Tokens) when is_list(Tokens) -> to_html(Tokens, []). %% @spec escape(string() | atom() | binary()) -> binary() %% @doc Escape a string such that it's safe for HTML (amp; lt; gt;). escape(B) when is_binary(B) -> escape(binary_to_list(B), []); escape(A) when is_atom(A) -> escape(atom_to_list(A), []); escape(S) when is_list(S) -> escape(S, []). %% @spec escape_attr(string() | binary() | atom() | integer() | float()) -> binary() %% @doc Escape a string such that it's safe for HTML attrs %% (amp; lt; gt; quot;). escape_attr(B) when is_binary(B) -> escape_attr(binary_to_list(B), []); escape_attr(A) when is_atom(A) -> escape_attr(atom_to_list(A), []); escape_attr(S) when is_list(S) -> escape_attr(S, []); escape_attr(I) when is_integer(I) -> escape_attr(integer_to_list(I), []); escape_attr(F) when is_float(F) -> escape_attr(mochinum:digits(F), []). to_html([], Acc) -> lists:reverse(Acc); to_html([{'=', Content} | Rest], Acc) -> to_html(Rest, [Content | Acc]); to_html([{pi, Bin} | Rest], Acc) -> Open = [<<">, Bin, <<"?>">>], to_html(Rest, [Open | Acc]); to_html([{pi, Tag, Attrs} | Rest], Acc) -> Open = [<<">, Tag, attrs_to_html(Attrs, []), <<"?>">>], to_html(Rest, [Open | Acc]); to_html([{comment, Comment} | Rest], Acc) -> to_html(Rest, [[<<"">>] | Acc]); to_html([{doctype, Parts} | Rest], Acc) -> Inside = doctype_to_html(Parts, Acc), to_html(Rest, [[<<">, Inside, <<">">>] | Acc]); to_html([{data, Data, _Whitespace} | Rest], Acc) -> to_html(Rest, [escape(Data) | Acc]); to_html([{start_tag, Tag, Attrs, Singleton} | Rest], Acc) -> Open = [<<"<">>, Tag, attrs_to_html(Attrs, []), case Singleton of true -> <<" />">>; false -> <<">">> end], to_html(Rest, [Open | Acc]); to_html([{end_tag, Tag} | Rest], Acc) -> to_html(Rest, [[<<">, Tag, <<">">>] | Acc]). doctype_to_html([], Acc) -> lists:reverse(Acc); doctype_to_html([Word | Rest], Acc) -> case lists:all(fun (C) -> ?IS_LITERAL_SAFE(C) end, binary_to_list(iolist_to_binary(Word))) of true -> doctype_to_html(Rest, [[<<" ">>, Word] | Acc]); false -> doctype_to_html(Rest, [[<<" \"">>, escape_attr(Word), ?QUOTE] | Acc]) end. attrs_to_html([], Acc) -> lists:reverse(Acc); attrs_to_html([{K, V} | Rest], Acc) -> attrs_to_html(Rest, [[<<" ">>, escape(K), <<"=\"">>, escape_attr(V), <<"\"">>] | Acc]). escape([], Acc) -> list_to_binary(lists:reverse(Acc)); escape("<" ++ Rest, Acc) -> escape(Rest, lists:reverse("<", Acc)); escape(">" ++ Rest, Acc) -> escape(Rest, lists:reverse(">", Acc)); escape("&" ++ Rest, Acc) -> escape(Rest, lists:reverse("&", Acc)); escape([C | Rest], Acc) -> escape(Rest, [C | Acc]). escape_attr([], Acc) -> list_to_binary(lists:reverse(Acc)); escape_attr("<" ++ Rest, Acc) -> escape_attr(Rest, lists:reverse("<", Acc)); escape_attr(">" ++ Rest, Acc) -> escape_attr(Rest, lists:reverse(">", Acc)); escape_attr("&" ++ Rest, Acc) -> escape_attr(Rest, lists:reverse("&", Acc)); escape_attr([?QUOTE | Rest], Acc) -> escape_attr(Rest, lists:reverse(""", Acc)); escape_attr([C | Rest], Acc) -> escape_attr(Rest, [C | Acc]). to_tag(A) when is_atom(A) -> norm(atom_to_list(A)); to_tag(L) -> norm(L). to_tokens([], Acc) -> lists:reverse(Acc); to_tokens([{Tag, []} | Rest], Acc) -> to_tokens(Rest, [{end_tag, to_tag(Tag)} | Acc]); to_tokens([{Tag0, [{T0} | R1]} | Rest], Acc) -> %% Allow {br} to_tokens([{Tag0, [{T0, [], []} | R1]} | Rest], Acc); to_tokens([{Tag0, [T0={'=', _C0} | R1]} | Rest], Acc) -> %% Allow {'=', iolist()} to_tokens([{Tag0, R1} | Rest], [T0 | Acc]); to_tokens([{Tag0, [T0={comment, _C0} | R1]} | Rest], Acc) -> %% Allow {comment, iolist()} to_tokens([{Tag0, R1} | Rest], [T0 | Acc]); to_tokens([{Tag0, [T0={pi, _S0} | R1]} | Rest], Acc) -> %% Allow {pi, binary()} to_tokens([{Tag0, R1} | Rest], [T0 | Acc]); to_tokens([{Tag0, [T0={pi, _S0, _A0} | R1]} | Rest], Acc) -> %% Allow {pi, binary(), list()} to_tokens([{Tag0, R1} | Rest], [T0 | Acc]); to_tokens([{Tag0, [{T0, A0=[{_, _} | _]} | R1]} | Rest], Acc) -> %% Allow {p, [{"class", "foo"}]} to_tokens([{Tag0, [{T0, A0, []} | R1]} | Rest], Acc); to_tokens([{Tag0, [{T0, C0} | R1]} | Rest], Acc) -> %% Allow {p, "content"} and {p, <<"content">>} to_tokens([{Tag0, [{T0, [], C0} | R1]} | Rest], Acc); to_tokens([{Tag0, [{T0, A1, C0} | R1]} | Rest], Acc) when is_binary(C0) -> %% Allow {"p", [{"class", "foo"}], <<"content">>} to_tokens([{Tag0, [{T0, A1, binary_to_list(C0)} | R1]} | Rest], Acc); to_tokens([{Tag0, [{T0, A1, C0=[C | _]} | R1]} | Rest], Acc) when is_integer(C) -> %% Allow {"p", [{"class", "foo"}], "content"} to_tokens([{Tag0, [{T0, A1, [C0]} | R1]} | Rest], Acc); to_tokens([{Tag0, [{T0, A1, C1} | R1]} | Rest], Acc) -> %% Native {"p", [{"class", "foo"}], ["content"]} Tag = to_tag(Tag0), T1 = to_tag(T0), case is_singleton(norm(T1)) of true -> to_tokens([{Tag, R1} | Rest], [{start_tag, T1, A1, true} | Acc]); false -> to_tokens([{T1, C1}, {Tag, R1} | Rest], [{start_tag, T1, A1, false} | Acc]) end; to_tokens([{Tag0, [L | R1]} | Rest], Acc) when is_list(L) -> %% List text Tag = to_tag(Tag0), to_tokens([{Tag, R1} | Rest], [{data, iolist_to_binary(L), false} | Acc]); to_tokens([{Tag0, [B | R1]} | Rest], Acc) when is_binary(B) -> %% Binary text Tag = to_tag(Tag0), to_tokens([{Tag, R1} | Rest], [{data, B, false} | Acc]). tokens(B, S=#decoder{offset=O}, Acc) -> case B of <<_:O/binary>> -> lists:reverse(Acc); _ -> {Tag, S1} = tokenize(B, S), case parse_flag(Tag) of script -> {Tag2, S2} = tokenize_script(B, S1), tokens(B, S2, [Tag2, Tag | Acc]); textarea -> {Tag2, S2} = tokenize_textarea(B, S1), tokens(B, S2, [Tag2, Tag | Acc]); none -> tokens(B, S1, [Tag | Acc]) end end. parse_flag({start_tag, B, _, false}) -> case string:to_lower(binary_to_list(B)) of "script" -> script; "textarea" -> textarea; _ -> none end; parse_flag(_) -> none. tokenize(B, S=#decoder{offset=O}) -> case B of <<_:O/binary, "", _/binary>> -> Len = O - Start, <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin, {{comment, Raw}, ?ADV_COL(S, 3)}; <<_:O/binary, C, _/binary>> -> tokenize_comment(Bin, ?INC_CHAR(S, C), Start); <<_:Start/binary, Raw/binary>> -> {{comment, Raw}, S} end. tokenize_script(Bin, S=#decoder{offset=O}) -> tokenize_script(Bin, S, O). tokenize_script(Bin, S=#decoder{offset=O}, Start) -> case Bin of %% Just a look-ahead, we want the end_tag separately <<_:O/binary, $<, $/, SS, CC, RR, II, PP, TT, ZZ, _/binary>> when (SS =:= $s orelse SS =:= $S) andalso (CC =:= $c orelse CC =:= $C) andalso (RR =:= $r orelse RR =:= $R) andalso (II =:= $i orelse II =:= $I) andalso (PP =:= $p orelse PP =:= $P) andalso (TT=:= $t orelse TT =:= $T) andalso ?PROBABLE_CLOSE(ZZ) -> Len = O - Start, <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin, {{data, Raw, false}, S}; <<_:O/binary, C, _/binary>> -> tokenize_script(Bin, ?INC_CHAR(S, C), Start); <<_:Start/binary, Raw/binary>> -> {{data, Raw, false}, S} end. tokenize_textarea(Bin, S=#decoder{offset=O}) -> tokenize_textarea(Bin, S, O). tokenize_textarea(Bin, S=#decoder{offset=O}, Start) -> case Bin of %% Just a look-ahead, we want the end_tag separately <<_:O/binary, $<, $/, TT, EE, XX, TT2, AA, RR, EE2, AA2, ZZ, _/binary>> when (TT =:= $t orelse TT =:= $T) andalso (EE =:= $e orelse EE =:= $E) andalso (XX =:= $x orelse XX =:= $X) andalso (TT2 =:= $t orelse TT2 =:= $T) andalso (AA =:= $a orelse AA =:= $A) andalso (RR =:= $r orelse RR =:= $R) andalso (EE2 =:= $e orelse EE2 =:= $E) andalso (AA2 =:= $a orelse AA2 =:= $A) andalso ?PROBABLE_CLOSE(ZZ) -> Len = O - Start, <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin, {{data, Raw, false}, S}; <<_:O/binary, C, _/binary>> -> tokenize_textarea(Bin, ?INC_CHAR(S, C), Start); <<_:Start/binary, Raw/binary>> -> {{data, Raw, false}, S} end. %% %% Tests %% -include_lib("eunit/include/eunit.hrl"). -ifdef(TEST). to_html_test() -> ?assertEqual( <<"hey!

what's up

sucka
RAW!">>, iolist_to_binary( to_html({html, [], [{<<"head">>, [], [{title, <<"hey!">>}]}, {body, [], [{p, [{class, foo}], [<<"what's">>, <<" up">>, {br}]}, {'div', <<"sucka">>}, {'=', <<"RAW!">>}, {comment, <<" comment! ">>}]}]}))), ?assertEqual( <<"">>, iolist_to_binary( to_html({doctype, [<<"html">>, <<"PUBLIC">>, <<"-//W3C//DTD XHTML 1.0 Transitional//EN">>, <<"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">>]}))), ?assertEqual( <<"">>, iolist_to_binary( to_html({<<"html">>,[], [{pi, <<"xml:namespace">>, [{<<"prefix">>,<<"o">>}, {<<"ns">>,<<"urn:schemas-microsoft-com:office:office">>}]}]}))), ok. escape_test() -> ?assertEqual( <<"&quot;\"word ><<up!&quot;">>, escape(<<""\"word ><>)), ?assertEqual( <<"&quot;\"word ><<up!&quot;">>, escape(""\"word ><>, escape('"\"word >< ?assertEqual( <<"&quot;"word ><<up!&quot;">>, escape_attr(<<""\"word ><>)), ?assertEqual( <<"&quot;"word ><<up!&quot;">>, escape_attr(""\"word ><>, escape_attr('"\"word ><>, escape_attr(12345)), ?assertEqual( <<"1.5">>, escape_attr(1.5)), ok. tokens_test() -> ?assertEqual( [{start_tag, <<"foo">>, [{<<"bar">>, <<"baz">>}, {<<"wibble">>, <<"wibble">>}, {<<"alice">>, <<"bob">>}], true}], tokens(<<"">>)), ?assertEqual( [{start_tag, <<"foo">>, [{<<"bar">>, <<"baz">>}, {<<"wibble">>, <<"wibble">>}, {<<"alice">>, <<"bob">>}], true}], tokens(<<"">>)), ?assertEqual( [{comment, <<"[if lt IE 7]>\n\n>}], tokens(<<"">>)), ?assertEqual( [{start_tag, <<"script">>, [{<<"type">>, <<"text/javascript">>}], false}, {data, <<" A= B <= C ">>, false}, {end_tag, <<"script">>}], tokens(<<"">>)), ?assertEqual( [{start_tag, <<"script">>, [{<<"type">>, <<"text/javascript">>}], false}, {data, <<" A= B <= C ">>, false}, {end_tag, <<"script">>}], tokens(<<"">>)), ?assertEqual( [{start_tag, <<"script">>, [{<<"type">>, <<"text/javascript">>}], false}, {data, <<" A= B <= C ">>, false}, {end_tag, <<"script">>}], tokens(<<"">>)), ?assertEqual( [{start_tag, <<"script">>, [{<<"type">>, <<"text/javascript">>}], false}, {data, <<" A= B <= C ">>, false}, {end_tag, <<"script">>}], tokens(<<"">>)), ?assertEqual( [{start_tag, <<"textarea">>, [], false}, {data, <<"">>, false}, {end_tag, <<"textarea">>}], tokens(<<"">>)), ?assertEqual( [{start_tag, <<"textarea">>, [], false}, {data, <<"">>, false}], tokens(<<"