From f1696e2b9ef76300c8bf7cfa4bd64118f0811d64 Mon Sep 17 00:00:00 2001 From: Jarvis Carroll Date: Thu, 29 Jan 2026 02:01:16 +0000 Subject: [PATCH] Bytes lexing I don't handle underscores in bytes correctly... Nor in integers, for that matter. --- src/hz_sophia.erl | 90 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 75 insertions(+), 15 deletions(-) diff --git a/src/hz_sophia.erl b/src/hz_sophia.erl index a158ba8..e84e7a7 100644 --- a/src/hz_sophia.erl +++ b/src/hz_sophia.erl @@ -4,6 +4,8 @@ -copyright("Jarvis Carroll "). -license("GPL-3.0-or-later"). +-export([check_parser/1]). + -include_lib("eunit/include/eunit.hrl"). parse_literal(Type, String) -> @@ -33,25 +35,26 @@ next_token({tk, Row, Col}, " " ++ Rest) -> next_token({tk, Row + 1, Col}, Rest); next_token({tk, Row, Col}, "\t" ++ Rest) -> next_token({tk, Row + 1, Col}, Rest); -next_token(Tk, [N | _] = String) when N >= $0, N =< $9 -> - num_token(Tk, Tk, String, []); +next_token({tk, _, Col}, "\r\n" ++ Rest) -> + next_token({tk, 1, Col + 1}, Rest); +next_token({tk, _, Col}, "\r" ++ Rest) -> + next_token({tk, 1, Col + 1}, Rest); +next_token({tk, _, Col}, "\n" ++ Rest) -> + next_token({tk, 1, Col + 1}, Rest); next_token(Tk, [N | _] = String) when N >= $A, N =< $Z -> alphanum_token(Tk, Tk, String, []); next_token(Tk, [N | _] = String) when N >= $a, N =< $z -> alphanum_token(Tk, Tk, String, []); -next_token(Tk, [$_ | _] = String) -> +next_token(Tk, "_" ++ _ = String) -> alphanum_token(Tk, Tk, String, []); +next_token(Tk, [N | _] = String) when N >= $0, N =< $9 -> + num_token(Tk, Tk, String, []); +next_token({tk, Row, Col}, "#" ++ Rest) -> + bytes_token({tk, Row, Col}, {tk, Row + 1, Col}, Rest, "#"); next_token({tk, Row, Col}, [Char | Rest]) -> Token = {character, [Char], Row, Col, Col}, {ok, {Token, {tk, Row + 1, Col}, Rest}}. 
-num_token(Start, {tk, Row, Col}, [N | Rest], Acc) when N >= $0, N =< $9 -> - num_token(Start, {tk, Row + 1, Col}, Rest, [N | Acc]); -num_token({tk, _, Start}, {tk, Row, End}, String, Acc) -> - NumString = lists:reverse(Acc), - Token = {integer, NumString, Row, Start, End}, - {ok, {Token, {tk, Row, End}, String}}. - alphanum_token(Start, {tk, Row, Col}, [C | Rest], Acc) when C >= $A, C =< $Z -> alphanum_token(Start, {tk, Row, Col}, Rest, [C | Acc]); alphanum_token(Start, {tk, Row, Col}, [C | Rest], Acc) when C >= $a, C =< $z -> @@ -65,6 +68,24 @@ alphanum_token({tk, _, Start}, {tk, Row, End}, String, Acc) -> Token = {alphanum, AlphaString, Row, Start, End}, {ok, {Token, {tk, Row, End}, String}}. +num_token(Start, {tk, Row, Col}, [N | Rest], Acc) when N >= $0, N =< $9 -> + num_token(Start, {tk, Row + 1, Col}, Rest, [N | Acc]); +num_token({tk, _, Start}, {tk, Row, End}, String, Acc) -> + NumString = lists:reverse(Acc), + Token = {integer, NumString, Row, Start, End}, + {ok, {Token, {tk, Row, End}, String}}. + +bytes_token(Start, {tk, Row, Col}, [N | Rest], Acc) when N >= $0, N =< $9 -> + bytes_token(Start, {tk, Row + 1, Col}, Rest, [N | Acc]); +bytes_token(Start, {tk, Row, Col}, [N | Rest], Acc) when N >= $A, N =< $F -> + bytes_token(Start, {tk, Row + 1, Col}, Rest, [N | Acc]); +bytes_token(Start, {tk, Row, Col}, [N | Rest], Acc) when N >= $a, N =< $f -> + bytes_token(Start, {tk, Row + 1, Col}, Rest, [N | Acc]); +bytes_token({tk, _, Start}, {tk, Row, End}, String, Acc) -> + BytesString = lists:reverse(Acc), + Token = {bytes, BytesString, Row, Start, End}, + {ok, {Token, {tk, Row, End}, String}}. 
+ %%% Sophia Literal Parser @@ -98,6 +119,22 @@ parse_expression2(Type, Tk, String, {integer, S, Row, Start, End}) -> {O, N, _} -> {error, {wrong_type, O, N, integer, Row, Start, End}} end; +parse_expression2(Type, Tk, String, {bytes, "#" ++ S, Row, Start, End}) -> + Value = convert_bytes(S), + Len = byte_size(Value), + Result = {bytes, Value}, + case Type of + {_, _, {bytes, [any]}} -> + {ok, {Result, Tk, String}}; + {_, _, {bytes, [Len]}} -> + {ok, {Result, Tk, String}}; + {_, _, {bytes, [ExpectedLen]}} -> + {error, {bytes_wrong_size, ExpectedLen, Len, Row, Start, End}}; + {_, _, unknown_type} -> + {ok, {Result, Tk, String}}; + {O, N, _} -> + {error, {wrong_type, O, N, integer, Row, Start, End}} + end; parse_expression2(Type, Tk, String, {character, "[", Row, Start, _}) -> parse_list(Type, Tk, String, Row, Start); parse_expression2(Type, Tk, String, {character, "(", Row, Start, _}) -> @@ -122,6 +159,25 @@ expect_tokens([Str | Rest], Tk, String) -> {error, {unexpected_token, Actual, Row, Start, End}} end. +convert_bytes(Chars) -> + Digits = lists:foldl(fun(C, Acc) -> [convert_nibble(C) | Acc] end, [], Chars), + reverse_combine_nibbles(Digits, <<>>). + +convert_nibble(C) when C >= $0, C =< $9 -> + C - $0; +convert_nibble(C) when C >= $A, C =< $Z -> + C - $A + 10; +convert_nibble(C) when C >= $a, C =< $z -> + C - $a + 10. + +reverse_combine_nibbles([D1, D2 | Rest], Acc) -> + NewAcc = <<D2:4, D1:4, Acc/binary>>, + reverse_combine_nibbles(Rest, NewAcc); +reverse_combine_nibbles([D1], Acc) -> + <<0:4, D1:4, Acc/binary>>; +reverse_combine_nibbles([], Acc) -> + Acc. + %%% List Parsing parse_list({_, _, {list, [Inner]}}, Tk, String, Row, Start) -> @@ -430,12 +486,13 @@ wrap_error(Reason, _) -> Reason. 
%%% Tests check_sophia_to_fate(Type, Sophia, Fate) -> - {ok, FateActual} = parse_literal(Type, Sophia), - case FateActual of - Fate -> + case parse_literal(Type, Sophia) of + {ok, Fate} -> ok; - _ -> - erlang:error({to_fate_failed, Fate, FateActual}) + {ok, FateActual} -> + erlang:error({to_fate_failed, Sophia, Fate, {ok, FateActual}}); + {error, Reason} -> + erlang:error({to_fate_failed, Sophia, Fate, {error, Reason}}) end. compile_entrypoint_code_and_type(Source, Entrypoint) -> @@ -502,6 +559,9 @@ tuple_test() -> maps_test() -> check_parser("{[1] = 2, [3] = 4}"). +bytes_test() -> + check_parser("#DEAD000BEEF"). + records_test() -> TypeDef = "record pair = {x: int, y: int}", Sophia = "{x = 1, y = 2}",