Bytes lexing

I don't handle underscores in bytes correctly... Nor in integers, for that matter.
This commit is contained in:
Jarvis Carroll 2026-01-29 02:01:16 +00:00
parent 2bf384ca82
commit f1696e2b9e

View File

@ -4,6 +4,8 @@
-copyright("Jarvis Carroll <spiveehere@gmail.com>"). -copyright("Jarvis Carroll <spiveehere@gmail.com>").
-license("GPL-3.0-or-later"). -license("GPL-3.0-or-later").
-export([check_parser/1]).
-include_lib("eunit/include/eunit.hrl"). -include_lib("eunit/include/eunit.hrl").
parse_literal(Type, String) -> parse_literal(Type, String) ->
@ -33,25 +35,26 @@ next_token({tk, Row, Col}, " " ++ Rest) ->
next_token({tk, Row + 1, Col}, Rest); next_token({tk, Row + 1, Col}, Rest);
next_token({tk, Row, Col}, "\t" ++ Rest) -> next_token({tk, Row, Col}, "\t" ++ Rest) ->
next_token({tk, Row + 1, Col}, Rest); next_token({tk, Row + 1, Col}, Rest);
next_token(Tk, [N | _] = String) when N >= $0, N =< $9 -> next_token({tk, _, Col}, "\r\n" ++ Rest) ->
num_token(Tk, Tk, String, []); next_token({tk, 1, Col + 1}, Rest);
next_token({tk, _, Col}, "\r" ++ Rest) ->
next_token({tk, 1, Col + 1}, Rest);
next_token({tk, _, Col}, "\n" ++ Rest) ->
next_token({tk, 1, Col + 1}, Rest);
next_token(Tk, [N | _] = String) when N >= $A, N =< $Z -> next_token(Tk, [N | _] = String) when N >= $A, N =< $Z ->
alphanum_token(Tk, Tk, String, []); alphanum_token(Tk, Tk, String, []);
next_token(Tk, [N | _] = String) when N >= $a, N =< $z -> next_token(Tk, [N | _] = String) when N >= $a, N =< $z ->
alphanum_token(Tk, Tk, String, []); alphanum_token(Tk, Tk, String, []);
next_token(Tk, [$_ | _] = String) -> next_token(Tk, "_" ++ _ = String) ->
alphanum_token(Tk, Tk, String, []); alphanum_token(Tk, Tk, String, []);
next_token(Tk, [N | _] = String) when N >= $0, N =< $9 ->
num_token(Tk, Tk, String, []);
%% A leading "#" starts a bytes literal. The "#" itself is consumed here, so
%% the running position handed to bytes_token/4 is already advanced by one,
%% and the accumulator is seeded with "#" so the token text keeps its prefix.
%% The third field of the running position must be the variable Col (not the
%% atom col): bytes_token/4 copies it into the token and into the position it
%% returns, and later clauses do arithmetic on it.
next_token({tk, Row, Col}, "#" ++ Rest) ->
    bytes_token({tk, Row, Col}, {tk, Row + 1, Col}, Rest, "#");
next_token({tk, Row, Col}, [Char | Rest]) -> next_token({tk, Row, Col}, [Char | Rest]) ->
Token = {character, [Char], Row, Col, Col}, Token = {character, [Char], Row, Col, Col},
{ok, {Token, {tk, Row + 1, Col}, Rest}}. {ok, {Token, {tk, Row + 1, Col}, Rest}}.
num_token(Start, {tk, Row, Col}, [N | Rest], Acc) when N >= $0, N =< $9 ->
num_token(Start, {tk, Row + 1, Col}, Rest, [N | Acc]);
num_token({tk, _, Start}, {tk, Row, End}, String, Acc) ->
NumString = lists:reverse(Acc),
Token = {integer, NumString, Row, Start, End},
{ok, {Token, {tk, Row, End}, String}}.
alphanum_token(Start, {tk, Row, Col}, [C | Rest], Acc) when C >= $A, C =< $Z -> alphanum_token(Start, {tk, Row, Col}, [C | Rest], Acc) when C >= $A, C =< $Z ->
alphanum_token(Start, {tk, Row, Col}, Rest, [C | Acc]); alphanum_token(Start, {tk, Row, Col}, Rest, [C | Acc]);
alphanum_token(Start, {tk, Row, Col}, [C | Rest], Acc) when C >= $a, C =< $z -> alphanum_token(Start, {tk, Row, Col}, [C | Rest], Acc) when C >= $a, C =< $z ->
@ -65,6 +68,24 @@ alphanum_token({tk, _, Start}, {tk, Row, End}, String, Acc) ->
Token = {alphanum, AlphaString, Row, Start, End}, Token = {alphanum, AlphaString, Row, Start, End},
{ok, {Token, {tk, Row, End}, String}}. {ok, {Token, {tk, Row, End}, String}}.
%% Lexes a run of decimal digits into an `integer` token.
%%
%% The first argument is the position tuple at which the token began; the
%% second is the running position, whose middle field is incremented once per
%% digit consumed. Digits are accumulated in reverse and flipped at the end.
%%
%% Returns {ok, {Token, Position, Remaining}}, where Token is
%% {integer, Digits, Middle, From, To}: Middle is the middle field of the
%% final running position, From is the last field of the starting position and
%% To is the last field of the final running position.
num_token(Origin, {tk, Step, Mark}, [Digit | Tail], Seen) when $0 =< Digit, Digit =< $9 ->
    num_token(Origin, {tk, Step + 1, Mark}, Tail, [Digit | Seen]);
num_token({tk, _, From}, {tk, Step, To} = Here, Remaining, Seen) ->
    Digits = lists:reverse(Seen),
    {ok, {{integer, Digits, Step, From, To}, Here, Remaining}}.
%% Lexes a run of hexadecimal digits (0-9, A-F, a-f) into a `bytes` token.
%%
%% The first argument is the position tuple at which the token began; the
%% second is the running position, whose middle field is incremented once per
%% hex digit consumed. The accumulator holds the token text in reverse; the
%% caller may pre-seed it (e.g. with "#") so the prefix ends up at the front
%% once the text is reversed.
%%
%% Returns {ok, {Token, Position, Remaining}}, where Token is
%% {bytes, Text, Middle, From, To}: Middle is the middle field of the final
%% running position, From is the last field of the starting position and To is
%% the last field of the final running position.
bytes_token(Origin, {tk, Step, Mark}, [Hex | Tail], Seen)
  when Hex >= $0, Hex =< $9;
       Hex >= $A, Hex =< $F;
       Hex >= $a, Hex =< $f ->
    bytes_token(Origin, {tk, Step + 1, Mark}, Tail, [Hex | Seen]);
bytes_token({tk, _, From}, {tk, Step, To} = Here, Remaining, Seen) ->
    Text = lists:reverse(Seen),
    {ok, {{bytes, Text, Step, From, To}, Here, Remaining}}.
%%% Sophia Literal Parser %%% Sophia Literal Parser
@ -98,6 +119,22 @@ parse_expression2(Type, Tk, String, {integer, S, Row, Start, End}) ->
{O, N, _} -> {O, N, _} ->
{error, {wrong_type, O, N, integer, Row, Start, End}} {error, {wrong_type, O, N, integer, Row, Start, End}}
end; end;
%% Bytes literal. The token text is "#" followed by hex digits; the digits are
%% converted to a binary and wrapped as {bytes, Binary}. The expected type
%% (third element of Type) decides the outcome:
%%   {bytes, [any]}   - any length is accepted;
%%   {bytes, [N]}     - accepted only when the literal is exactly N bytes,
%%                      otherwise {error, {bytes_wrong_size, ...}};
%%   unknown_type     - accepted as-is;
%%   anything else    - {error, {wrong_type, ...}} reporting a `bytes` literal.
parse_expression2(Type, Tk, String, {bytes, "#" ++ S, Row, Start, End}) ->
    Value = convert_bytes(S),
    Len = byte_size(Value),
    Result = {bytes, Value},
    case Type of
        {_, _, {bytes, [any]}} ->
            {ok, {Result, Tk, String}};
        {_, _, {bytes, [Len]}} ->
            % Len is already bound, so this clause only matches the exact size.
            {ok, {Result, Tk, String}};
        {_, _, {bytes, [ExpectedLen]}} ->
            {error, {bytes_wrong_size, ExpectedLen, Len, Row, Start, End}};
        {_, _, unknown_type} ->
            {ok, {Result, Tk, String}};
        {O, N, _} ->
            % The literal found is a bytes literal, not an integer.
            {error, {wrong_type, O, N, bytes, Row, Start, End}}
    end;
parse_expression2(Type, Tk, String, {character, "[", Row, Start, _}) -> parse_expression2(Type, Tk, String, {character, "[", Row, Start, _}) ->
parse_list(Type, Tk, String, Row, Start); parse_list(Type, Tk, String, Row, Start);
parse_expression2(Type, Tk, String, {character, "(", Row, Start, _}) -> parse_expression2(Type, Tk, String, {character, "(", Row, Start, _}) ->
@ -122,6 +159,25 @@ expect_tokens([Str | Rest], Tk, String) ->
{error, {unexpected_token, Actual, Row, Start, End}} {error, {unexpected_token, Actual, Row, Start, End}}
end. end.
%% Converts a string of hex digit characters into a binary.
%%
%% Each character is mapped to its nibble value, the nibble list is reversed
%% (least significant digit first) and handed to reverse_combine_nibbles/2,
%% which packs it into bytes starting from an empty binary.
convert_bytes(Chars) ->
    Nibbles = [convert_nibble(Char) || Char <- Chars],
    reverse_combine_nibbles(lists:reverse(Nibbles), <<>>).
%% Maps a single hexadecimal digit character to its value, 0..15.
%%
%% Accepts 0-9, A-F and a-f only. Any other character crashes with
%% function_clause rather than producing a value above 15, which would not
%% fit in the 4-bit segments the result is packed into.
convert_nibble(C) when C >= $0, C =< $9 ->
    C - $0;
convert_nibble(C) when C >= $A, C =< $F ->
    C - $A + 10;
convert_nibble(C) when C >= $a, C =< $f ->
    C - $a + 10.
%% Packs a list of nibbles, given least significant first, into a binary.
%%
%% Nibbles are taken two at a time; each pair forms one byte (second element
%% in the high four bits, first in the low four) that is prepended to the
%% accumulator. A single leftover nibble becomes a byte whose high four bits
%% are zero. With no nibbles left the accumulator is returned unchanged.
reverse_combine_nibbles([], Packed) ->
    Packed;
reverse_combine_nibbles([Low], Packed) ->
    <<0:4, Low:4, Packed/binary>>;
reverse_combine_nibbles([Low, High | Remaining], Packed) ->
    reverse_combine_nibbles(Remaining, <<High:4, Low:4, Packed/binary>>).
%%% List Parsing %%% List Parsing
parse_list({_, _, {list, [Inner]}}, Tk, String, Row, Start) -> parse_list({_, _, {list, [Inner]}}, Tk, String, Row, Start) ->
@ -430,12 +486,13 @@ wrap_error(Reason, _) -> Reason.
%%% Tests %%% Tests
check_sophia_to_fate(Type, Sophia, Fate) -> check_sophia_to_fate(Type, Sophia, Fate) ->
{ok, FateActual} = parse_literal(Type, Sophia), case parse_literal(Type, Sophia) of
case FateActual of {ok, Fate} ->
Fate ->
ok; ok;
_ -> {ok, FateActual} ->
erlang:error({to_fate_failed, Fate, FateActual}) erlang:error({to_fate_failed, Sophia, Fate, {ok, FateActual}});
{error, Reason} ->
erlang:error({to_fate_failed, Sophia, Fate, {error, Reason}})
end. end.
compile_entrypoint_code_and_type(Source, Entrypoint) -> compile_entrypoint_code_and_type(Source, Entrypoint) ->
@ -502,6 +559,9 @@ tuple_test() ->
maps_test() -> maps_test() ->
check_parser("{[1] = 2, [3] = 4}"). check_parser("{[1] = 2, [3] = 4}").
%% EUnit case for a bytes literal. The literal has eleven hex digits, i.e. an
%% odd count.
%% NOTE(review): check_parser/1 is defined outside this view; presumably it
%% runs the literal through the parser and checks the result - confirm.
bytes_test() ->
check_parser("#DEAD000BEEF").
records_test() -> records_test() ->
TypeDef = "record pair = {x: int, y: int}", TypeDef = "record pair = {x: int, y: int}",
Sophia = "{x = 1, y = 2}", Sophia = "{x = 1, y = 2}",