diff --git a/src/hz_sophia.erl b/src/hz_sophia.erl
index db47d22..b73a4c0 100644
--- a/src/hz_sophia.erl
+++ b/src/hz_sophia.erl
@@ -52,6 +52,8 @@ next_token(Tk, [C | _] = String) when ?IS_NUM(C) ->
     num_token(Tk, Tk, String, [], 0);
 next_token({tk, Row, Col}, [$#, C | Rest]) when ?IS_HEX(C) ->
     bytes_token({tk, Row, Col}, {tk, Row + 1, Col}, [C | Rest], "#", []);
+next_token({tk, Row, Col}, "\"" ++ Rest) ->  % opening quote starts a string literal
+    string_token({tk, Row, Col}, {tk, Row + 1, Col}, Rest, "\"", <<>>);
 next_token({tk, Row, Col}, [Char | Rest]) ->
     Token = {character, [Char], Char, Row, Col, Col},
     {ok, {Token, {tk, Row + 1, Col}, Rest}}.
@@ -101,6 +103,64 @@ reverse_combine_nibbles([D1], Acc) ->
 reverse_combine_nibbles([], Acc) ->
     Acc.
 
+%% Scans the body of a string literal; the opening quote is already consumed.
+%% Start is the position of the opening quote, SourceChars is the source text
+%% consumed so far (in reverse order), Value is the decoded binary so far.
+%% Returns {ok, {Token, Position, Rest}} or {error, Reason}.
+string_token(Start, {tk, Row, Col}, [$\\, $x, A, B | Rest], SourceChars, Value) ->
+    % \xHH escape: two hex digits contribute a single byte.
+    case escape_hex_code(A, B) of
+        {ok, ByteVal} ->
+            string_token(Start, {tk, Row + 4, Col}, Rest,
+                         [B, A, $x, $\\ | SourceChars], <<Value/binary, ByteVal>>);
+        error ->
+            {error, {invalid_escape_code, [$\\, $x, A, B], Row, Col}}
+    end;
+string_token(Start, {tk, Row, Col}, [$\\, C | Rest], SourceChars, Value) ->
+    % Single-character escape, looked up in escape_char/1.
+    case escape_char(C) of
+        {ok, ByteVal} ->
+            string_token(Start, {tk, Row + 2, Col}, Rest,
+                         [C, $\\ | SourceChars], <<Value/binary, ByteVal>>);
+        error ->
+            {error, {invalid_escape_code, [C], Row, Col}}
+    end;
+string_token({tk, _, Start}, {tk, Row, End}, [$" | Rest], SourceChars, Value) ->
+    % Closing quote: SourceChars was built in reverse, so flip it here.
+    SourceStr = lists:reverse([$" | SourceChars]),
+    Token = {string, SourceStr, Value, Row, Start, End},
+    {ok, {Token, {tk, Row, End}, Rest}};
+string_token(_Start, {tk, Row, Col}, [], _SourceChars, _Value) ->
+    % Input ended before the closing quote: return an error rather than
+    % crashing with function_clause.
+    {error, {unterminated_string, Row, Col}};
+string_token(Start, {tk, Row, Col}, [C | Rest], SourceChars, Value) ->
+    % Ordinary character. NOTE(review): assumes the input is a list of Unicode
+    % code points, so C is appended UTF-8 encoded -- confirm.
+    string_token(Start, {tk, Row + 1, Col}, Rest,
+                 [C | SourceChars], <<Value/binary, C/utf8>>).
+
+%% Converts the two hex digits of a \xHH escape into {ok, Byte}, or returns
+%% error if either character is not a hex digit.
+escape_hex_code(A, B) when ?IS_HEX(A), ?IS_HEX(B) ->
+    % As of writing this, the Sophia compiler will convert this byte from
+    % extended ASCII to unicode... But it really shouldn't. The literal parser
+    % does what the compiler should do.
+    Byte = convert_digit(A) * 16 + convert_digit(B),
+    {ok, Byte};
+escape_hex_code(_, _) ->
+    error.
+
+escape_char($b) -> {ok, $\b};   % backspace
+escape_char($e) -> {ok, $\e};   % escape
+escape_char($f) -> {ok, $\f};   % form feed
+escape_char($n) -> {ok, $\n};   % newline
+escape_char($r) -> {ok, $\r};   % carriage return
+escape_char($t) -> {ok, $\t};   % horizontal tab
+escape_char($v) -> {ok, $\v};   % vertical tab
+escape_char($") -> {ok, $\"};   % double quote
+escape_char($\\) -> {ok, $\\};  % backslash
+escape_char(_) -> error.        % any other character is not a recognised escape
 
 
 %%% Sophia Literal Parser
@@ -121,8 +163,12 @@ reverse_combine_nibbles([], Acc) ->
 %%% pushdown automaton that we want.
 
 parse_expression(Type, Tk, String) ->
-    {ok, {Token, NewTk, NewString}} = next_token(Tk, String),
-    parse_expression2(Type, NewTk, NewString, Token).
+    case next_token(Tk, String) of
+        {ok, {Token, NewTk, NewString}} ->
+            parse_expression2(Type, NewTk, NewString, Token);
+        {error, Reason} ->  % tokenizer failure is returned to the caller unchanged
+            {error, Reason}
+    end.
 
 parse_expression2(Type, Tk, String, {integer, _, Value, Row, Start, End}) ->
     case Type of
@@ -146,7 +192,16 @@ parse_expression2(Type, Tk, String, {bytes, _, Value, Row, Start, End}) ->
         {_, _, unknown_type} ->
             {ok, {Result, Tk, String}};
         {O, N, _} ->
-            {error, {wrong_type, O, N, integer, Row, Start, End}}
+            {error, {wrong_type, O, N, {bytes, [Len]}, Row, Start, End}}  % report bytes (was integer)
+    end;
+parse_expression2(Type, Tk, String, {string, _, Value, Row, Start, End}) ->
+    case Type of
+        {_, _, string} ->  % expected type is string: return the decoded value
+            {ok, {Value, Tk, String}};
+        {_, _, unknown_type} ->  % unknown_type also accepts a string token
+            {ok, {Value, Tk, String}};
+        {O, N, _} ->  % any other expected type is a mismatch
+            {error, {wrong_type, O, N, string, Row, Start, End}}
     end;
 parse_expression2(Type, Tk, String, {character, "[", _, Row, Start, _}) ->
     parse_list(Type, Tk, String, Row, Start);
@@ -545,6 +600,10 @@ anon_types_test() ->
     % Bytes.
     check_parser("#DEAD000BEEF"),
     check_parser("#DE_AD0_00B_EEF"),
+    % Strings: plain text, single-character escapes, then \xHH escapes.
+    check_parser("\"hello world\""),
+    check_parser("\" \\b\\e\\f\\n\\r\\t\\v\\\"\\\\ \""),
+    check_parser("\"\\x00\\x11\\x77\""),
     % List of integers.
     check_parser("[1, 2, 3]"),
     % List of lists.