Bytes lexing

I don't handle underscores in bytes correctly... Nor in integers, for that matter.
2026-01-29 02:01:16 +00:00
parent 2bf384ca82
commit f1696e2b9e
1 changed file with 75 additions and 15 deletions
@@ -4,6 +4,8 @@
 -copyright("Jarvis Carroll <spiveehere@gmail.com>").
 -license("GPL-3.0-or-later").
 -export([check_parser/1]).
 -include_lib("eunit/include/eunit.hrl").
 parse_literal(Type, String) ->
@@ -33,25 +35,26 @@ next_token({tk, Row, Col}, " " ++ Rest) ->
    next_token({tk, Row + 1, Col}, Rest);
 next_token({tk, Row, Col}, "\t" ++ Rest) ->
    next_token({tk, Row + 1, Col}, Rest);
-next_token(Tk, [N | _] = String) when N >= $0, N =< $9 ->
+next_token({tk, _, Col}, "\r\n" ++ Rest) ->
-    num_token(Tk, Tk, String, []);
+    next_token({tk, 1, Col + 1}, Rest);
 next_token({tk, _, Col}, "\r" ++ Rest) ->
    next_token({tk, 1, Col + 1}, Rest);
 next_token({tk, _, Col}, "\n" ++ Rest) ->
    next_token({tk, 1, Col + 1}, Rest);
 next_token(Tk, [N | _] = String) when N >= $A, N =< $Z ->
    alphanum_token(Tk, Tk, String, []);
 next_token(Tk, [N | _] = String) when N >= $a, N =< $z ->
    alphanum_token(Tk, Tk, String, []);
-next_token(Tk, [$_ | _] = String) ->
+next_token(Tk, "_" ++ _ = String) ->
    alphanum_token(Tk, Tk, String, []);
 next_token(Tk, [N | _] = String) when N >= $0, N =< $9 ->
    num_token(Tk, Tk, String, []);
 %% A '#' starts a bytes literal. The '#' itself is consumed here: the current
 %% position is advanced by one and the accumulator is seeded with "#" so that
 %% the finished token text still carries the prefix.
 %% The current position must reuse the variable Col; the atom `col` would
 %% end up in the token's end field and in the position handed back to the
 %% caller.
 next_token({tk, Row, Col}, "#" ++ Rest) ->
    bytes_token({tk, Row, Col}, {tk, Row + 1, Col}, Rest, "#");
 next_token({tk, Row, Col}, [Char | Rest]) ->
    Token = {character, [Char], Row, Col, Col},
    {ok, {Token, {tk, Row + 1, Col}, Rest}}.
 %% Lex an integer literal.
 %% While the next character is a decimal digit it is pushed onto Digits
 %% (kept in reverse order) and the second field of the current `{tk, _, _}`
 %% position is advanced by one. On the first non-digit (or end of input) the
 %% digits are reversed and returned as
 %% `{ok, {{integer, Text, Pos, First, Last}, CurrentPosition, Remaining}}`,
 %% where First and Last are the third fields of the starting and current
 %% positions respectively.
 num_token(Origin, {tk, Pos, Anchor}, [Digit | Tail], Digits) when $0 =< Digit, Digit =< $9 ->
    Advanced = {tk, Pos + 1, Anchor},
    num_token(Origin, Advanced, Tail, [Digit | Digits]);
 num_token({tk, _, First}, {tk, Pos, Last} = Here, Remaining, Digits) ->
    Text = lists:reverse(Digits),
    {ok, {{integer, Text, Pos, First, Last}, Here, Remaining}}.
 alphanum_token(Start, {tk, Row, Col}, [C | Rest], Acc) when C >= $A, C =< $Z ->
    alphanum_token(Start, {tk, Row, Col}, Rest, [C | Acc]);
 alphanum_token(Start, {tk, Row, Col}, [C | Rest], Acc) when C >= $a, C =< $z ->
@@ -65,6 +68,24 @@ alphanum_token({tk, _, Start}, {tk, Row, End}, String, Acc) ->
    Token = {alphanum, AlphaString, Row, Start, End},
    {ok, {Token, {tk, Row, End}, String}}.
 %% Lex an integer literal.
 %% While the next character is a decimal digit it is pushed onto Digits
 %% (kept in reverse order) and the second field of the current `{tk, _, _}`
 %% position is advanced by one. On the first non-digit (or end of input) the
 %% digits are reversed and returned as
 %% `{ok, {{integer, Text, Pos, First, Last}, CurrentPosition, Remaining}}`,
 %% where First and Last are the third fields of the starting and current
 %% positions respectively.
 num_token(Origin, {tk, Pos, Anchor}, [Digit | Tail], Digits) when $0 =< Digit, Digit =< $9 ->
    Advanced = {tk, Pos + 1, Anchor},
    num_token(Origin, Advanced, Tail, [Digit | Digits]);
 num_token({tk, _, First}, {tk, Pos, Last} = Here, Remaining, Digits) ->
    Text = lists:reverse(Digits),
    {ok, {{integer, Text, Pos, First, Last}, Here, Remaining}}.
 %% Lex the body of a bytes literal.
 %% Every leading character in 0-9, A-F or a-f is pushed onto Seen (kept in
 %% reverse order) and the second field of the current `{tk, _, _}` position
 %% is advanced by one. On the first other character (or end of input) the
 %% accumulated text is reversed and returned as
 %% `{ok, {{bytes, Text, Pos, First, Last}, CurrentPosition, Remaining}}`.
 %% Whatever the caller seeded Seen with ends up at the front of Text.
 bytes_token(Origin, {tk, Pos, Anchor}, [Hex | Tail], Seen) when
        $0 =< Hex, Hex =< $9;
        $A =< Hex, Hex =< $F;
        $a =< Hex, Hex =< $f ->
    bytes_token(Origin, {tk, Pos + 1, Anchor}, Tail, [Hex | Seen]);
 bytes_token({tk, _, First}, {tk, Pos, Last} = Here, Remaining, Seen) ->
    Text = lists:reverse(Seen),
    {ok, {{bytes, Text, Pos, First, Last}, Here, Remaining}}.
 %%% Sophia Literal Parser
@@ -98,6 +119,22 @@ parse_expression2(Type, Tk, String, {integer, S, Row, Start, End}) ->
        {O, N, _} ->
            {error, {wrong_type, O, N, integer, Row, Start, End}}
    end;
 %% Bytes literal: the token text is "#" followed by hex digits. The digits
 %% are converted to a binary and checked against the expected type:
 %%   * {bytes, [any]} and unknown_type accept any length;
 %%   * {bytes, [Len]} accepts only a literal of exactly that byte length
 %%     (Len is already bound below, so the pattern is an equality check);
 %%   * any other {bytes, [ExpectedLen]} is a size mismatch;
 %%   * every other type is a type mismatch, reported with `bytes` as the
 %%     kind of literal that was found.
 parse_expression2(Type, Tk, String, {bytes, "#" ++ S, Row, Start, End}) ->
    Value = convert_bytes(S),
    Len = byte_size(Value),
    Result = {bytes, Value},
    case Type of
        {_, _, {bytes, [any]}} ->
            {ok, {Result, Tk, String}};
        {_, _, {bytes, [Len]}} ->
            {ok, {Result, Tk, String}};
        {_, _, {bytes, [ExpectedLen]}} ->
            {error, {bytes_wrong_size, ExpectedLen, Len, Row, Start, End}};
        {_, _, unknown_type} ->
            {ok, {Result, Tk, String}};
        {O, N, _} ->
            {error, {wrong_type, O, N, bytes, Row, Start, End}}
    end;
 parse_expression2(Type, Tk, String, {character, "[", Row, Start, _}) ->
    parse_list(Type, Tk, String, Row, Start);
 parse_expression2(Type, Tk, String, {character, "(", Row, Start, _}) ->
@@ -122,6 +159,25 @@ expect_tokens([Str | Rest], Tk, String) ->
            {error, {unexpected_token, Actual, Row, Start, End}}
    end.
 %% Turn a string of hex digit characters into a binary.
 %% Each character is mapped through convert_nibble/1; the nibble list is then
 %% reversed (last character first) and handed to reverse_combine_nibbles/2
 %% together with an empty accumulator.
 convert_bytes(Chars) ->
    Nibbles = lists:map(fun convert_nibble/1, Chars),
    reverse_combine_nibbles(lists:reverse(Nibbles), <<>>).
 %% Convert a single hexadecimal digit character to its numeric value (0..15).
 %% Only 0-9, A-F and a-f are accepted. Any other character raises
 %% function_clause rather than producing a value above 15, which would be
 %% silently truncated once packed into a 4-bit segment.
 convert_nibble(C) when C >= $0, C =< $9 ->
    C - $0;
 convert_nibble(C) when C >= $A, C =< $F ->
    C - $A + 10;
 convert_nibble(C) when C >= $a, C =< $f ->
    C - $a + 10.
 %% Pack a list of 4-bit values, given least-significant nibble first, into a
 %% binary. Nibbles are taken two at a time: the first of each pair becomes the
 %% low half and the second the high half of a byte that is prepended to the
 %% accumulator. A single leftover nibble is padded with a zero high half.
 reverse_combine_nibbles([], Packed) ->
    Packed;
 reverse_combine_nibbles([Low], Packed) ->
    <<0:4, Low:4, Packed/binary>>;
 reverse_combine_nibbles([Low, High | More], Packed) ->
    reverse_combine_nibbles(More, <<High:4, Low:4, Packed/binary>>).
 %%% List Parsing
 parse_list({_, _, {list, [Inner]}}, Tk, String, Row, Start) ->
@@ -430,12 +486,13 @@ wrap_error(Reason, _) -> Reason.
 %%% Tests
 %% Test helper: parse the Sophia literal with the given Type and require the
 %% result to be exactly Fate.
 %% Returns ok on a match. Otherwise raises
 %% {to_fate_failed, Sophia, Fate, Outcome}, where Outcome is the full
 %% {ok, _} or {error, _} result of parse_literal/2, so a parse failure is
 %% reported with the same shape as a wrong value.
 check_sophia_to_fate(Type, Sophia, Fate) ->
    case parse_literal(Type, Sophia) of
        %% Fate is already bound, so this clause matches only an equal result.
        {ok, Fate} ->
            ok;
        {ok, FateActual} ->
            erlang:error({to_fate_failed, Sophia, Fate, {ok, FateActual}});
        {error, Reason} ->
            erlang:error({to_fate_failed, Sophia, Fate, {error, Reason}})
    end.
 compile_entrypoint_code_and_type(Source, Entrypoint) ->
@@ -502,6 +559,9 @@ tuple_test() ->
 %% EUnit case: runs check_parser/1 on a map literal with two entries.
 maps_test() ->
    Literal = "{[1] = 2, [3] = 4}",
    check_parser(Literal).
 %% EUnit case: runs check_parser/1 on a bytes literal with an odd number of
 %% hex digits.
 bytes_test() ->
    Literal = "#DEAD000BEEF",
    check_parser(Literal).
 records_test() ->
    TypeDef = "record pair = {x: int, y: int}",
    Sophia = "{x = 1, y = 2}",