diff --git a/src/hz_sophia.erl b/src/hz_sophia.erl
index e84e7a7..c73ef06 100644
--- a/src/hz_sophia.erl
+++ b/src/hz_sophia.erl
@@ -29,6 +29,14 @@ parse_literal2(Result, Tk, String) ->
 
 %%% Tokenizer
 
+%% Character-class predicates for the tokenizer. Each one expands to a boolean
+%% expression made only of comparisons, so it can be used inside a guard.
+%% Note that IS_ALPHA also accepts the underscore.
+-define(IS_ALPHA(C), ((((C) >= $A) andalso ((C) =< $Z)) orelse (((C) >= $a) andalso ((C) =< $z)) orelse ((C) =:= $_))).
+-define(IS_NUM(C), (((C) >= $0) andalso ((C) =< $9))).
+-define(IS_ALPHANUM(C), (?IS_ALPHA(C) orelse ?IS_NUM(C))).
+-define(IS_HEX(C), (?IS_NUM(C) orelse (((C) >= $A) andalso ((C) =< $F)) orelse (((C) >= $a) andalso ((C) =< $f)))).
+
 next_token({tk, Row, Col}, []) ->
     {ok, {{eof, "", Row, Col, Col}, {tk, Row, Col}, []}};
 next_token({tk, Row, Col}, " " ++ Rest) ->
@@ -41,46 +49,42 @@ next_token({tk, _, Col}, "\r" ++ Rest) ->
     next_token({tk, 1, Col + 1}, Rest);
 next_token({tk, _, Col}, "\n" ++ Rest) ->
     next_token({tk, 1, Col + 1}, Rest);
-next_token(Tk, [N | _] = String) when N >= $A, N =< $Z ->
+next_token(Tk, [C | _] = String) when ?IS_ALPHA(C) ->
     alphanum_token(Tk, Tk, String, []);
-next_token(Tk, [N | _] = String) when N >= $a, N =< $z ->
-    alphanum_token(Tk, Tk, String, []);
-next_token(Tk, "_" ++ _ = String) ->
-    alphanum_token(Tk, Tk, String, []);
-next_token(Tk, [N | _] = String) when N >= $0, N =< $9 ->
+next_token(Tk, [C | _] = String) when ?IS_NUM(C) ->
     num_token(Tk, Tk, String, []);
-next_token({tk, Row, Col}, "#" ++ Rest) ->
-    bytes_token({tk, Row, Col}, {tk, Row + 1, col}, Rest, "#");
+%% '#' starts a bytes token only when a hex digit follows it; otherwise it is
+%% handled by the single-character clause below.
+next_token({tk, Row, Col}, [$#, C | Rest]) when ?IS_HEX(C) ->
+    bytes_token({tk, Row, Col}, {tk, Row + 2, Col}, Rest, [C, $#]);
 next_token({tk, Row, Col}, [Char | Rest]) ->
     Token = {character, [Char], Row, Col, Col},
     {ok, {Token, {tk, Row + 1, Col}, Rest}}.
 
-alphanum_token(Start, {tk, Row, Col}, [C | Rest], Acc) when C >= $A, C =< $Z ->
+alphanum_token(Start, {tk, Row, Col}, [C | Rest], Acc) when ?IS_ALPHANUM(C) ->
     alphanum_token(Start, {tk, Row, Col}, Rest, [C | Acc]);
-alphanum_token(Start, {tk, Row, Col}, [C | Rest], Acc) when C >= $a, C =< $z ->
-    alphanum_token(Start, {tk, Row, Col}, Rest, [C | Acc]);
-alphanum_token(Start, {tk, Row, Col}, [C | Rest], Acc) when C >= $0, C =< $9 ->
-    alphanum_token(Start, {tk, Row, Col}, Rest, [C | Acc]);
-alphanum_token(Start, {tk, Row, Col}, [$_ | Rest], Acc) ->
-    alphanum_token(Start, {tk, Row, Col}, Rest, [$_ | Acc]);
 alphanum_token({tk, _, Start}, {tk, Row, End}, String, Acc) ->
     AlphaString = lists:reverse(Acc),
     Token = {alphanum, AlphaString, Row, Start, End},
     {ok, {Token, {tk, Row, End}, String}}.
 
-num_token(Start, {tk, Row, Col}, [N | Rest], Acc) when N >= $0, N =< $9 ->
-    num_token(Start, {tk, Row + 1, Col}, Rest, [N | Acc]);
+num_token(Start, {tk, Row, Col}, [C | Rest], Acc) when ?IS_NUM(C) ->
+    num_token(Start, {tk, Row + 1, Col}, Rest, [C | Acc]);
+%% An underscore is a digit separator: it is kept in the token text, but only
+%% when a digit follows it directly.
+num_token(Start, {tk, Row, Col}, [$_, C | Rest], Acc) when ?IS_NUM(C) ->
+    num_token(Start, {tk, Row + 2, Col}, Rest, [C, $_ | Acc]);
 num_token({tk, _, Start}, {tk, Row, End}, String, Acc) ->
     NumString = lists:reverse(Acc),
     Token = {integer, NumString, Row, Start, End},
     {ok, {Token, {tk, Row, End}, String}}.
 
-bytes_token(Start, {tk, Row, Col}, [N | Rest], Acc) when N >= $0, N =< $9 ->
-    bytes_token(Start, {tk, Row + 1, Col}, Rest, [N | Acc]);
-bytes_token(Start, {tk, Row, Col}, [N | Rest], Acc) when N >= $A, N =< $F ->
-    bytes_token(Start, {tk, Row + 1, Col}, Rest, [N | Acc]);
-bytes_token(Start, {tk, Row, Col}, [N | Rest], Acc) when N >= $a, N =< $f ->
-    bytes_token(Start, {tk, Row + 1, Col}, Rest, [N | Acc]);
+bytes_token(Start, {tk, Row, Col}, [C | Rest], Acc) when ?IS_HEX(C) ->
+    bytes_token(Start, {tk, Row + 1, Col}, Rest, [C | Acc]);
+%% Same separator rule as num_token/4. Two characters are consumed here, so
+%% the position advances by two.
+bytes_token(Start, {tk, Row, Col}, [$_, C | Rest], Acc) when ?IS_HEX(C) ->
+    bytes_token(Start, {tk, Row + 2, Col}, Rest, [C, $_ | Acc]);
 bytes_token({tk, _, Start}, {tk, Row, End}, String, Acc) ->
     BytesString = lists:reverse(Acc),
     Token = {bytes, BytesString, Row, Start, End},
@@ -110,7 +114,7 @@ parse_expression(Type, Tk, String) ->
     parse_expression2(Type, NewTk, NewString, Token).
 
 parse_expression2(Type, Tk, String, {integer, S, Row, Start, End}) ->
-    Value = list_to_integer(S),
+    Value = convert_int(S),
     case Type of
         {_, _, integer} ->
             {ok, {Value, Tk, String}};
@@ -159,15 +163,44 @@ expect_tokens([Str | Rest], Tk, String) ->
             {error, {unexpected_token, Actual, Row, Start, End}}
     end.
 
+%% Convert the text of an integer token to its value, skipping the underscore
+%% separators that num_token/4 leaves in the token text.
+convert_int(Chars) ->
+    convert_int(Chars, 0).
+
+convert_int("_" ++ Chars, Result) ->
+    convert_int(Chars, Result);
+convert_int([N | Chars], Result) ->
+    Digit = N - $0,
+    NewResult = Result * 10 + Digit,
+    convert_int(Chars, NewResult);
+convert_int([], Result) ->
+    Result.
+
 convert_bytes(Chars) ->
-    Digits = lists:foldl(fun(C, Acc) -> [convert_nibble(C) | Acc] end, [], Chars),
+    % We do this as two reversing foldl type loops. One removes underscores and
+    % converts the ASCII into integers, and the other peels off pairs of
+    % numbers to form bytes.
+    Digits = reverse_convert_digits(Chars, []),
     reverse_combine_nibbles(Digits, <<>>).
 
-convert_nibble(C) when C >= $0, C =< $9 ->
+%% Turn hex characters into digit values, dropping underscore separators. The
+%% result is in reverse order of the input.
+reverse_convert_digits("_" ++ Rest, Acc) ->
+    reverse_convert_digits(Rest, Acc);
+reverse_convert_digits([C | Rest], Acc) ->
+    Digit = convert_digit(C),
+    reverse_convert_digits(Rest, [Digit | Acc]);
+reverse_convert_digits([], Acc) ->
+    Acc.
+
+%% Value (0..15) of one hexadecimal digit. Any other character is a
+%% function_clause error instead of a value too large for a nibble.
+convert_digit(C) when C >= $0, C =< $9 ->
     C - $0;
-convert_nibble(C) when C >= $A, C =< $Z ->
+convert_digit(C) when C >= $A, C =< $F ->
     C - $A + 10;
-convert_nibble(C) when C >= $a, C =< $z ->
+convert_digit(C) when C >= $a, C =< $f ->
     C - $a + 10.
 
 reverse_combine_nibbles([D1, D2 | Rest], Acc) ->
@@ -544,23 +577,23 @@ check_parser_with_typedef(Typedef, Sophia) ->
     % definitions in the AACI, so untyped parses probably don't work.
     check_sophia_to_fate(Type, Sophia, Fate).
 
-int_test() ->
-    check_parser("123").
+anon_types_test() ->
+    % Integers.
+    check_parser("123"),
+    check_parser("1_2_3"),
+    % Bytes.
+    check_parser("#DEAD000BEEF"),
+    check_parser("#DE_AD0_00B_EEF"),
+    % List of integers.
+    check_parser("[1, 2, 3]"),
+    % List of lists.
+    check_parser("[[], [1], [2, 3]]"),
+    % Tuple.
+    check_parser("(1, [2, 3], (4, 5))"),
+    % Map.
+    check_parser("{[1] = 2, [3] = 4}"),
 
-list_test() ->
-    check_parser("[1, 2, 3]").
-
-list_of_lists_test() ->
-    check_parser("[[], [1], [2, 3]]").
-
-tuple_test() ->
-    check_parser("(1, [2, 3], (4, 5))").
-
-maps_test() ->
-    check_parser("{[1] = 2, [3] = 4}").
-
-bytes_test() ->
-    check_parser("#DEAD000BEEF").
+    ok.
 
 records_test() ->
     TypeDef = "record pair = {x: int, y: int}",