Handle underscores in integers/bytes

This forces us to test for alpha/num/hex enough times that it's now worth making macros for these things.
This commit is contained in:
Jarvis Carroll 2026-01-29 03:03:11 +00:00
parent f1696e2b9e
commit fe182a5233

View File

@ -29,6 +29,11 @@ parse_literal2(Result, Tk, String) ->
%%% Tokenizer %%% Tokenizer
%% Character-class predicates used by the tokenizer.
%%
%% Each macro expands to a boolean expression made only of comparisons, so it
%% is usable in a guard (e.g. `when ?IS_ALPHA(C)`). The argument is expanded
%% more than once, so pass a plain variable or literal.

%% True for an ASCII letter of either case, or an underscore.
-define(IS_ALPHA(C),
        ((((C) >= $A) andalso ((C) =< $Z))
         orelse (((C) >= $a) andalso ((C) =< $z))
         orelse ((C) == $_))).

%% True for an ASCII decimal digit.
-define(IS_NUM(C),
        (((C) >= $0) andalso ((C) =< $9))).

%% True for anything accepted by IS_ALPHA or IS_NUM.
-define(IS_ALPHANUM(C),
        (?IS_ALPHA(C) orelse ?IS_NUM(C))).

%% True for a hexadecimal digit: 0-9, A-F or a-f.
-define(IS_HEX(C),
        (?IS_NUM(C)
         orelse (((C) >= $A) andalso ((C) =< $F))
         orelse (((C) >= $a) andalso ((C) =< $f)))).
next_token({tk, Row, Col}, []) -> next_token({tk, Row, Col}, []) ->
{ok, {{eof, "", Row, Col, Col}, {tk, Row, Col}, []}}; {ok, {{eof, "", Row, Col, Col}, {tk, Row, Col}, []}};
next_token({tk, Row, Col}, " " ++ Rest) -> next_token({tk, Row, Col}, " " ++ Rest) ->
@ -41,46 +46,36 @@ next_token({tk, _, Col}, "\r" ++ Rest) ->
next_token({tk, 1, Col + 1}, Rest); next_token({tk, 1, Col + 1}, Rest);
next_token({tk, _, Col}, "\n" ++ Rest) -> next_token({tk, _, Col}, "\n" ++ Rest) ->
next_token({tk, 1, Col + 1}, Rest); next_token({tk, 1, Col + 1}, Rest);
next_token(Tk, [N | _] = String) when N >= $A, N =< $Z -> next_token(Tk, [C | _] = String) when ?IS_ALPHA(C) ->
alphanum_token(Tk, Tk, String, []); alphanum_token(Tk, Tk, String, []);
next_token(Tk, [N | _] = String) when N >= $a, N =< $z -> next_token(Tk, [C | _] = String) when ?IS_NUM(C) ->
alphanum_token(Tk, Tk, String, []);
next_token(Tk, "_" ++ _ = String) ->
alphanum_token(Tk, Tk, String, []);
next_token(Tk, [N | _] = String) when N >= $0, N =< $9 ->
num_token(Tk, Tk, String, []); num_token(Tk, Tk, String, []);
next_token({tk, Row, Col}, "#" ++ Rest) -> next_token({tk, Row, Col}, [$#, C | Rest]) when ?IS_HEX(C) ->
bytes_token({tk, Row, Col}, {tk, Row + 1, col}, Rest, "#"); bytes_token({tk, Row, Col}, {tk, Row + 2, Col}, Rest, [C, $#]);
next_token({tk, Row, Col}, [Char | Rest]) -> next_token({tk, Row, Col}, [Char | Rest]) ->
Token = {character, [Char], Row, Col, Col}, Token = {character, [Char], Row, Col, Col},
{ok, {Token, {tk, Row + 1, Col}, Rest}}. {ok, {Token, {tk, Row + 1, Col}, Rest}}.
alphanum_token(Start, {tk, Row, Col}, [C | Rest], Acc) when C >= $A, C =< $Z -> alphanum_token(Start, {tk, Row, Col}, [C | Rest], Acc) when ?IS_ALPHANUM(C) ->
alphanum_token(Start, {tk, Row, Col}, Rest, [C | Acc]); alphanum_token(Start, {tk, Row, Col}, Rest, [C | Acc]);
alphanum_token(Start, {tk, Row, Col}, [C | Rest], Acc) when C >= $a, C =< $z ->
alphanum_token(Start, {tk, Row, Col}, Rest, [C | Acc]);
alphanum_token(Start, {tk, Row, Col}, [C | Rest], Acc) when C >= $0, C =< $9 ->
alphanum_token(Start, {tk, Row, Col}, Rest, [C | Acc]);
alphanum_token(Start, {tk, Row, Col}, [$_ | Rest], Acc) ->
alphanum_token(Start, {tk, Row, Col}, Rest, [$_ | Acc]);
alphanum_token({tk, _, Start}, {tk, Row, End}, String, Acc) -> alphanum_token({tk, _, Start}, {tk, Row, End}, String, Acc) ->
AlphaString = lists:reverse(Acc), AlphaString = lists:reverse(Acc),
Token = {alphanum, AlphaString, Row, Start, End}, Token = {alphanum, AlphaString, Row, Start, End},
{ok, {Token, {tk, Row, End}, String}}. {ok, {Token, {tk, Row, End}, String}}.
num_token(Start, {tk, Row, Col}, [N | Rest], Acc) when N >= $0, N =< $9 -> num_token(Start, {tk, Row, Col}, [C | Rest], Acc) when ?IS_NUM(C) ->
num_token(Start, {tk, Row + 1, Col}, Rest, [N | Acc]); num_token(Start, {tk, Row + 1, Col}, Rest, [C | Acc]);
num_token(Start, {tk, Row, Col}, [$_, C | Rest], Acc) when ?IS_NUM(C) ->
num_token(Start, {tk, Row + 2, Col}, Rest, [C, $_ | Acc]);
num_token({tk, _, Start}, {tk, Row, End}, String, Acc) -> num_token({tk, _, Start}, {tk, Row, End}, String, Acc) ->
NumString = lists:reverse(Acc), NumString = lists:reverse(Acc),
Token = {integer, NumString, Row, Start, End}, Token = {integer, NumString, Row, Start, End},
{ok, {Token, {tk, Row, End}, String}}. {ok, {Token, {tk, Row, End}, String}}.
bytes_token(Start, {tk, Row, Col}, [N | Rest], Acc) when N >= $0, N =< $9 -> bytes_token(Start, {tk, Row, Col}, [C | Rest], Acc) when ?IS_HEX(C) ->
bytes_token(Start, {tk, Row + 1, Col}, Rest, [N | Acc]); bytes_token(Start, {tk, Row + 1, Col}, Rest, [C | Acc]);
bytes_token(Start, {tk, Row, Col}, [N | Rest], Acc) when N >= $A, N =< $F -> bytes_token(Start, {tk, Row, Col}, [$_, C | Rest], Acc) when ?IS_HEX(C) ->
bytes_token(Start, {tk, Row + 1, Col}, Rest, [N | Acc]); bytes_token(Start, {tk, Row + 1, Col}, Rest, [C, $_ | Acc]);
bytes_token(Start, {tk, Row, Col}, [N | Rest], Acc) when N >= $a, N =< $f ->
bytes_token(Start, {tk, Row + 1, Col}, Rest, [N | Acc]);
bytes_token({tk, _, Start}, {tk, Row, End}, String, Acc) -> bytes_token({tk, _, Start}, {tk, Row, End}, String, Acc) ->
BytesString = lists:reverse(Acc), BytesString = lists:reverse(Acc),
Token = {bytes, BytesString, Row, Start, End}, Token = {bytes, BytesString, Row, Start, End},
@ -110,7 +105,7 @@ parse_expression(Type, Tk, String) ->
parse_expression2(Type, NewTk, NewString, Token). parse_expression2(Type, NewTk, NewString, Token).
parse_expression2(Type, Tk, String, {integer, S, Row, Start, End}) -> parse_expression2(Type, Tk, String, {integer, S, Row, Start, End}) ->
Value = list_to_integer(S), Value = convert_int(S),
case Type of case Type of
{_, _, integer} -> {_, _, integer} ->
{ok, {Value, Tk, String}}; {ok, {Value, Tk, String}};
@ -159,15 +154,38 @@ expect_tokens([Str | Rest], Tk, String) ->
{error, {unexpected_token, Actual, Row, Start, End}} {error, {unexpected_token, Actual, Row, Start, End}}
end. end.
%% Convert the text of an integer token into its numeric value.
%%
%% Chars is a string of decimal digits, optionally containing underscore
%% separators (e.g. "1_000"). Underscores are skipped; every other character
%% is treated as a decimal digit. An empty string yields 0.
convert_int(Chars) ->
    convert_int(Chars, 0).

%% Accumulating worker: Total is the value of the digits consumed so far.
convert_int([], Total) ->
    Total;
convert_int([$_ | Tail], Total) ->
    % Separator only; contributes nothing to the value.
    convert_int(Tail, Total);
convert_int([Char | Tail], Total) ->
    % Shift the running total one decimal place and add the new digit.
    convert_int(Tail, Total * 10 + (Char - $0)).
convert_bytes(Chars) -> convert_bytes(Chars) ->
Digits = lists:foldl(fun(C, Acc) -> [convert_nibble(C) | Acc] end, [], Chars), % We do this as two reversing foldl type loops. One removes underscores and
% converts the ASCII into integers, and the other peels off pairs of
% numbers to form bytes.
Digits = reverse_convert_digits(Chars, []),
reverse_combine_nibbles(Digits, <<>>). reverse_combine_nibbles(Digits, <<>>).
convert_nibble(C) when C >= $0, C =< $9 -> reverse_convert_digits("_" ++ Rest, Acc) ->
reverse_convert_digits(Rest, Acc);
reverse_convert_digits([C | Rest], Acc) ->
Digit = convert_digit(C),
reverse_convert_digits(Rest, [Digit | Acc]);
reverse_convert_digits([], Acc) ->
Acc.
convert_digit(C) when C >= $0, C =< $9 ->
C - $0; C - $0;
convert_nibble(C) when C >= $A, C =< $Z -> convert_digit(C) when C >= $A, C =< $Z ->
C - $A + 10; C - $A + 10;
convert_nibble(C) when C >= $a, C =< $z -> convert_digit(C) when C >= $a, C =< $z ->
C - $a + 10. C - $a + 10.
reverse_combine_nibbles([D1, D2 | Rest], Acc) -> reverse_combine_nibbles([D1, D2 | Rest], Acc) ->
@ -544,23 +562,23 @@ check_parser_with_typedef(Typedef, Sophia) ->
% definitions in the AACI, so untyped parses probably don't work. % definitions in the AACI, so untyped parses probably don't work.
check_sophia_to_fate(Type, Sophia, Fate). check_sophia_to_fate(Type, Sophia, Fate).
int_test() -> anon_types_test() ->
check_parser("123"). % Integers.
check_parser("123"),
check_parser("1_2_3"),
% Bytes.
check_parser("#DEAD000BEEF"),
check_parser("#DE_AD0_00B_EEF"),
% List of integers.
check_parser("[1, 2, 3]"),
% List of lists.
check_parser("[[], [1], [2, 3]]"),
% Tuple.
check_parser("(1, [2, 3], (4, 5))"),
% Map.
check_parser("{[1] = 2, [3] = 4}"),
list_test() -> ok.
check_parser("[1, 2, 3]").
list_of_lists_test() ->
check_parser("[[], [1], [2, 3]]").
tuple_test() ->
check_parser("(1, [2, 3], (4, 5))").
maps_test() ->
check_parser("{[1] = 2, [3] = 4}").
bytes_test() ->
check_parser("#DEAD000BEEF").
records_test() -> records_test() ->
TypeDef = "record pair = {x: int, y: int}", TypeDef = "record pair = {x: int, y: int}",