Handle underscores in integers/bytes

This forces us to test for alpha/num/hex enough times that it's now worth making macros for these things.
2026-01-29 03:03:11 +00:00
parent f1696e2b9e
commit fe182a5233
1 changed files with 62 additions and 44 deletions
@@ -29,6 +29,11 @@ parse_literal2(Result, Tk, String) ->
 %%% Tokenizer
 % Character-class tests written with comparisons and and/or only, so they can
 % be used inside `when` guards. Each macro textually expands its argument
 % several times, so pass a plain variable rather than an expression.
 % IS_ALPHA: an ASCII letter (A-Z or a-z) or an underscore.
 -define(IS_ALPHA(C), ((((C) >= $A) and ((C) =< $Z)) or (((C) >= $a) and ((C) =< $z)) or ((C) == $_))).
 % IS_NUM: a decimal digit 0-9.
 -define(IS_NUM(C), (((C) >= $0) and ((C) =< $9))).
 % IS_ALPHANUM: anything IS_ALPHA or IS_NUM accepts (so underscore is included).
 -define(IS_ALPHANUM(C), (?IS_ALPHA(C) or ?IS_NUM(C))).
 % IS_HEX: a decimal digit, or a letter A-F in either case.
 -define(IS_HEX(C), (?IS_NUM(C) or (((C) >= $A) and ((C) =< $F)) or (((C) >= $a) and ((C) =< $f)))).
 next_token({tk, Row, Col}, []) ->
    {ok, {{eof, "", Row, Col, Col}, {tk, Row, Col}, []}};
 next_token({tk, Row, Col}, " " ++ Rest) ->
@@ -41,46 +46,36 @@ next_token({tk, _, Col}, "\r" ++ Rest) ->
    next_token({tk, 1, Col + 1}, Rest);
 next_token({tk, _, Col}, "\n" ++ Rest) ->
    next_token({tk, 1, Col + 1}, Rest);
-next_token(Tk, [N | _] = String) when N >= $A, N =< $Z ->
+next_token(Tk, [C | _] = String) when ?IS_ALPHA(C) ->
    alphanum_token(Tk, Tk, String, []);
-next_token(Tk, [N | _] = String) when N >= $a, N =< $z ->
+next_token(Tk, [C | _] = String) when ?IS_NUM(C) ->
    alphanum_token(Tk, Tk, String, []);
 next_token(Tk, "_" ++ _ = String) ->
    alphanum_token(Tk, Tk, String, []);
 next_token(Tk, [N | _] = String) when N >= $0, N =< $9 ->
    num_token(Tk, Tk, String, []);
-next_token({tk, Row, Col}, "#" ++ Rest) ->
+next_token({tk, Row, Col}, [$#, C | Rest]) when ?IS_HEX(C) ->
-    bytes_token({tk, Row, Col}, {tk, Row + 1, col}, Rest, "#");
+    bytes_token({tk, Row, Col}, {tk, Row + 2, Col}, Rest, [C, $#]);
 next_token({tk, Row, Col}, [Char | Rest]) ->
    Token = {character, [Char], Row, Col, Col},
    {ok, {Token, {tk, Row + 1, Col}, Rest}}.
-alphanum_token(Start, {tk, Row, Col}, [C | Rest], Acc) when C >= $A, C =< $Z ->
+alphanum_token(Start, {tk, Row, Col}, [C | Rest], Acc) when ?IS_ALPHANUM(C) ->
    alphanum_token(Start, {tk, Row, Col}, Rest, [C | Acc]);
 alphanum_token(Start, {tk, Row, Col}, [C | Rest], Acc) when C >= $a, C =< $z ->
    alphanum_token(Start, {tk, Row, Col}, Rest, [C | Acc]);
 alphanum_token(Start, {tk, Row, Col}, [C | Rest], Acc) when C >= $0, C =< $9 ->
    alphanum_token(Start, {tk, Row, Col}, Rest, [C | Acc]);
 alphanum_token(Start, {tk, Row, Col}, [$_ | Rest], Acc) ->
    alphanum_token(Start, {tk, Row, Col}, Rest, [$_ | Acc]);
 alphanum_token({tk, _, Start}, {tk, Row, End}, String, Acc) ->
    AlphaString = lists:reverse(Acc),
    Token = {alphanum, AlphaString, Row, Start, End},
    {ok, {Token, {tk, Row, End}, String}}.
-num_token(Start, {tk, Row, Col}, [N | Rest], Acc) when N >= $0, N =< $9 ->
+num_token(Start, {tk, Row, Col}, [C | Rest], Acc) when ?IS_NUM(C) ->
-    num_token(Start, {tk, Row + 1, Col}, Rest, [N | Acc]);
+    num_token(Start, {tk, Row + 1, Col}, Rest, [C | Acc]);
 num_token(Start, {tk, Row, Col}, [$_, C | Rest], Acc) when ?IS_NUM(C) ->
    num_token(Start, {tk, Row + 2, Col}, Rest, [C, $_ | Acc]);
 num_token({tk, _, Start}, {tk, Row, End}, String, Acc) ->
    NumString = lists:reverse(Acc),
    Token = {integer, NumString, Row, Start, End},
    {ok, {Token, {tk, Row, End}, String}}.
-bytes_token(Start, {tk, Row, Col}, [N | Rest], Acc) when N >= $0, N =< $9 ->
+bytes_token(Start, {tk, Row, Col}, [C | Rest], Acc) when ?IS_HEX(C) ->
-    bytes_token(Start, {tk, Row + 1, Col}, Rest, [N | Acc]);
+    bytes_token(Start, {tk, Row + 1, Col}, Rest, [C | Acc]);
-bytes_token(Start, {tk, Row, Col}, [N | Rest], Acc) when N >= $A, N =< $F ->
+bytes_token(Start, {tk, Row, Col}, [$_, C | Rest], Acc) when ?IS_HEX(C) ->
-    bytes_token(Start, {tk, Row + 1, Col}, Rest, [N | Acc]);
+    bytes_token(Start, {tk, Row + 1, Col}, Rest, [C, $_ | Acc]);
 bytes_token(Start, {tk, Row, Col}, [N | Rest], Acc) when N >= $a, N =< $f ->
    bytes_token(Start, {tk, Row + 1, Col}, Rest, [N | Acc]);
 bytes_token({tk, _, Start}, {tk, Row, End}, String, Acc) ->
    BytesString = lists:reverse(Acc),
    Token = {bytes, BytesString, Row, Start, End},
@@ -110,7 +105,7 @@ parse_expression(Type, Tk, String) ->
    parse_expression2(Type, NewTk, NewString, Token).
 parse_expression2(Type, Tk, String, {integer, S, Row, Start, End}) ->
-    Value = list_to_integer(S),
+    Value = convert_int(S),
    case Type of
        {_, _, integer} ->
            {ok, {Value, Tk, String}};
@@ -159,15 +154,38 @@ expect_tokens([Str | Rest], Tk, String) ->
            {error, {unexpected_token, Actual, Row, Start, End}}
    end.
 % Converts the text of an integer token into its value. Underscores are
 % treated as digit separators and skipped; every other character must be a
 % decimal digit. An empty string yields 0.
 convert_int(Chars) ->
    convert_int(Chars, 0).
 % Walks the characters left to right, folding each decimal digit into Result.
 convert_int("_" ++ Chars, Result) ->
    convert_int(Chars, Result);
 convert_int([N | Chars], Result) when N >= $0, N =< $9 ->
    % The guard makes a stray character fail with function_clause instead of
    % being silently folded into the result as a bogus "digit".
    Digit = N - $0,
    NewResult = Result * 10 + Digit,
    convert_int(Chars, NewResult);
 convert_int([], Result) ->
    Result.
 convert_bytes(Chars) ->
-    Digits = lists:foldl(fun(C, Acc) -> [convert_nibble(C) | Acc] end, [], Chars),
+    % We do this as two reversing foldl type loops. One removes underscores and
    % converts the ASCII into integers, and the other peels off pairs of
    % numbers to form bytes.
    Digits = reverse_convert_digits(Chars, []),
    reverse_combine_nibbles(Digits, <<>>).
-convert_nibble(C) when C >= $0, C =< $9 ->
+reverse_convert_digits("_" ++ Rest, Acc) ->
    reverse_convert_digits(Rest, Acc);
 reverse_convert_digits([C | Rest], Acc) ->
    Digit = convert_digit(C),
    reverse_convert_digits(Rest, [Digit | Acc]);
 reverse_convert_digits([], Acc) ->
    Acc.
 convert_digit(C) when C >= $0, C =< $9 ->
    C - $0;
-convert_nibble(C) when C >= $A, C =< $Z ->
+convert_digit(C) when C >= $A, C =< $Z ->
    C - $A + 10;
-convert_nibble(C) when C >= $a, C =< $z ->
+convert_digit(C) when C >= $a, C =< $z ->
    C - $a + 10.
 reverse_combine_nibbles([D1, D2 | Rest], Acc) ->
@@ -544,23 +562,23 @@ check_parser_with_typedef(Typedef, Sophia) ->
    % definitions in the AACI, so untyped parses probably don't work.
    check_sophia_to_fate(Type, Sophia, Fate).
-int_test() ->
+anon_types_test() ->
-    check_parser("123").
+    % Integers.
    check_parser("123"),
    check_parser("1_2_3"),
    % Bytes.
    check_parser("#DEAD000BEEF"),
    check_parser("#DE_AD0_00B_EEF"),
    % List of integers.
    check_parser("[1, 2, 3]"),
    % List of lists.
    check_parser("[[], [1], [2, 3]]"),
    % Tuple.
    check_parser("(1, [2, 3], (4, 5))"),
    % Map.
    check_parser("{[1] = 2, [3] = 4}"),
-list_test() ->
+    ok.
    check_parser("[1, 2, 3]").
 % List of lists: passes a nested list literal with an empty, a one-element
 % and a two-element inner list to check_parser.
 list_of_lists_test() ->
    check_parser("[[], [1], [2, 3]]").
 % Tuple: passes a tuple literal that mixes an integer, a list and a nested
 % tuple to check_parser.
 tuple_test() ->
    check_parser("(1, [2, 3], (4, 5))").
 % Map: passes a two-entry map literal with list keys and integer values to
 % check_parser.
 maps_test() ->
    check_parser("{[1] = 2, [3] = 4}").
 % Bytes: passes a '#'-prefixed hex literal (no underscore separators) to
 % check_parser.
 bytes_test() ->
    check_parser("#DEAD000BEEF").
 records_test() ->
    TypeDef = "record pair = {x: int, y: int}",