From 9bc0ffafd1fb2d25b60288ee0a07f968c85425e5 Mon Sep 17 00:00:00 2001 From: Jarvis Carroll Date: Fri, 13 Feb 2026 05:52:27 +0000 Subject: [PATCH] bool/char literals Character literals were the main complexity here, but I threw booleans in as well, since that covers all the major literals. --- src/hz_sophia.erl | 131 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 102 insertions(+), 29 deletions(-) diff --git a/src/hz_sophia.erl b/src/hz_sophia.erl index 13ba1e7..ef6a55e 100644 --- a/src/hz_sophia.erl +++ b/src/hz_sophia.erl @@ -65,6 +65,8 @@ next_token({Row, Col}, [$#, C | Rest]) when ?IS_HEX(C) -> bytes_token({Row, Col}, {Row, Col + 1}, [C | Rest], "#", []); next_token({Row, Col}, "\"" ++ Rest) -> string_token({Row, Col}, {Row, Col + 1}, Rest, "\"", <<>>); +next_token({Row, Col}, "'" ++ Rest) -> + character_token({Row, Col}, {Row, Col + 1}, Rest, "'"); next_token({Row, Col}, [Char | Rest]) -> Token = {character, [Char], Char, Row, Col, Col}, {ok, {Token, {Row, Col + 1}, Rest}}. @@ -115,41 +117,70 @@ reverse_combine_nibbles([D1], Acc) -> reverse_combine_nibbles([], Acc) -> Acc. -string_token(Start, {Row, Col}, "\\x" ++ String, SourceChars, Value) -> - case escape_hex_code({Row, Col}, {Row, Col + 2}, String, "x\\" ++ SourceChars) of - {ok, {Codepoint, NewSourceChars, NewPos, NewString}} -> - NewValue = <>, - string_token(Start, NewPos, NewString, NewSourceChars, NewValue); - {error, Reason} -> - {error, Reason} - end; -string_token(Start, {Row, Col}, [$\\, C | Rest], SourceChars, Value) -> - case escape_char(C) of - {ok, ByteVal} -> - string_token(Start, {Row, Col + 2}, Rest, [C, $\ | SourceChars], <>); - error -> - {error, {invalid_escape_code, [C], Row, Col}} - end; string_token({_, Start}, {Row, Col}, [$" | Rest], SourceChars, Value) -> SourceStr = lists:reverse([$" | SourceChars]), Token = {string, SourceStr, Value, Row, Start, Col}, {ok, {Token, {Row, Col + 1}, Rest}}; -string_token(Start, {Row, Col}, [C | Rest], SourceChars, Value) -> - % TODO: ERTS probably had to convert this FROM utf8 at some point, so why - % bother, if we need to convert it back? I guess we could accept iolists if - % we really wanted to waste time on this point... - string_token(Start, {Row, Col + 1}, Rest, [C | SourceChars], <>). +string_token({_, Start}, {Row, Col}, [], SourceChars, _) -> + SourceStr = lists:reverse(SourceChars), + {error, {unclosed_string_literal, SourceStr, Start, Row, Col - 1}}; +string_token({_, Start}, {Row, Col}, [$\r | _], SourceChars, _) -> + SourceStr = lists:reverse(SourceChars), + {error, {unclosed_string_literal, SourceStr, Start, Row, Col - 1}}; +string_token({_, Start}, {Row, Col}, [$\n | _], SourceChars, _) -> + SourceStr = lists:reverse(SourceChars), + {error, {unclosed_string_literal, SourceStr, Start, Row, Col - 1}}; +string_token(Start, Pos, String, SourceChars, Value) -> + case parse_char(Start, Pos, String, SourceChars) of + {ok, {Char, NewSourceChars, NewPos, NewString}} -> + % TODO: ERTS probably had to convert this FROM utf8 at some point, + % so why bother, if we need to convert it back? I guess we could + % accept iolists if we really wanted to waste time on this point... + NewValue = <>, + string_token(Start, NewPos, NewString, NewSourceChars, NewValue); + {error, Reason} -> + {error, Reason} + end. -escape_hex_code(Start, {Row, Col}, "{" ++ String, SourceChars) -> - escape_long_hex_code(Start, {Row, Col + 1}, String, "{" ++ SourceChars, 0); -escape_hex_code(_, {Row, Col}, [A, B | String], SourceChars) when ?IS_HEX(A), ?IS_HEX(B) -> - % As of writing this, the Sophia compiler will convert this byte from - % extended ASCII to unicode... But it really shouldn't. The literal parser - % does what the compiler should do. +character_token({_, Start}, {Row, Col}, [], SourceChars) -> + SourceStr = lists:reverse(SourceChars), + {error, {unclosed_character_literal, SourceStr, Start, Row, Col - 1}}; +character_token({_, Start}, {Row, Col}, [$\r | _], SourceChars) -> + SourceStr = lists:reverse(SourceChars), + {error, {unclosed_character_literal, SourceStr, Start, Row, Col - 1}}; +character_token({_, Start}, {Row, Col}, [$\n | _], SourceChars) -> + SourceStr = lists:reverse(SourceChars), + {error, {unclosed_character_literal, SourceStr, Start, Row, Col - 1}}; +character_token(Start, Pos, String, SourceChars) -> + case parse_char(Start, Pos, String, SourceChars) of + {ok, {Char, NewSourceChars, NewPos, NewString}} -> + character_token2(Start, NewPos, NewString, NewSourceChars, Char); + {error, Reason} -> + {error, Reason} + end. + +character_token2({_, Start}, {Row, Col}, [$' | Rest], SourceChars, Value) -> + SourceStr = lists:reverse([$' | SourceChars]), + Token = {char_literal, SourceStr, Value, Row, Start, Col}, + {ok, {Token, {Row, Col + 1}, Rest}}; +character_token2({_, Start}, {Row, Col}, _, SourceChars, _) -> + SourceStr = lists:reverse(SourceChars), + {error, {unclosed_character_literal, SourceStr, Start, Row, Col - 1}}. + +parse_char(Start, {Row, Col}, "\\x{" ++ String, SourceChars) -> + escape_long_hex_code(Start, {Row, Col + 3}, String, "{x\\" ++ SourceChars, 0); +parse_char(_, {Row, Col}, [$\\, $x, A, B | String], SourceChars) when ?IS_HEX(A), ?IS_HEX(B) -> Byte = convert_digit(A) * 16 + convert_digit(B), - {ok, {Byte, [B, A | SourceChars], {Row, Col + 2}, String}}; -escape_hex_code({Row1, Col1}, _, _, _) -> - {error, {invalid_escape_code, "\\x", Row1, Col1}}. + {ok, {Byte, [B, A, $x, $\\ | SourceChars], {Row, Col + 4}, String}}; +parse_char({Row, Start}, {Row, Col}, [$\\, C | Rest], SourceChars) -> + case escape_char(C) of + {ok, ByteVal} -> + {ok, {ByteVal, [C, $\ | SourceChars], {Row, Col + 2}, Rest}}; + error -> + {error, {invalid_escape_code, [$\\, C], Row, Start, Col + 1}} + end; +parse_char(_, {Row, Col}, [C | Rest], SourceChars) -> + {ok, {C, [C | SourceChars], {Row, Col + 1}, Rest}}. escape_long_hex_code(_, {Row, Col}, "}" ++ String, SourceChars, Value) -> {ok, {Value, "}" ++ SourceChars, {Row, Col + 1}, String}}; @@ -171,7 +202,10 @@ escape_char($n) -> {ok, $\n}; escape_char($r) -> {ok, $\r}; escape_char($t) -> {ok, $\t}; escape_char($v) -> {ok, $\v}; +% Technically \" and \' are only valid inside their own quote characters, not +% each other, but whatever, we will just be permissive here. escape_char($") -> {ok, $\"}; +escape_char($') -> {ok, $\'}; escape_char($\\) -> {ok, $\\}; escape_char(_) -> error. @@ -234,6 +268,15 @@ parse_expression2(Type, Pos, String, {string, _, Value, Row, Start, End}) -> {O, N, _} -> {error, {wrong_type, O, N, string, Row, Start, End}} end; +parse_expression2(Type, Pos, String, {char_literal, _, Value, Row, Start, End}) -> + case Type of + {_, _, char} -> + {ok, {Value, Pos, String}}; + {_, _, unknown_type} -> + {ok, {Value, Pos, String}}; + {O, N, _} -> + {error, {wrong_type, O, N, char, Row, Start, End}} + end; parse_expression2(Type, Pos, String, {character, "[", _, Row, Start, _}) -> parse_list(Type, Pos, String, Row, Start); parse_expression2(Type, Pos, String, {character, "(", _, _, _, _}) -> @@ -276,6 +319,10 @@ unexpected_token({_, S, _, Row, Start, End}) -> %%% Ambiguous Chain Object vs Identifier Parsing +parse_alphanum(Type, Pos, String, ["true"], Row, Start, End) -> + typecheck_bool(Type, Pos, String, true, Row, Start, End); +parse_alphanum(Type, Pos, String, ["false"], Row, Start, End) -> + typecheck_bool(Type, Pos, String, false, Row, Start, End); parse_alphanum(Type, Pos, String, [[C | _] = S], Row, Start, End) when ?IS_LATIN_LOWER(C) -> % From a programming perspective, we are trying to parse a constant, so % an alphanum token can really only be a constructor, or a chain object. @@ -303,6 +350,13 @@ parse_alphanum(Type, Pos, String, Path, Row, Start, End) -> % must be a variant constructor, or invalid. parse_variant(Type, Pos, String, Path, Row, Start, End). +typecheck_bool({_, _, unknown_type}, Pos, String, Value, _, _, _) -> + {ok, {Value, Pos, String}}; +typecheck_bool({_, _, boolean}, Pos, String, Value, _, _, _) -> + {ok, {Value, Pos, String}}; +typecheck_bool({O, N, _}, _, _, _, Row, Start, End) -> + {error, {wrong_type, O, N, boolean, Row, Start, End}}. + typecheck_address({_, _, address}, Pos, String, Data, _, _, _) -> {ok, {{address, Data}, Pos, String}}; typecheck_address({_, _, contract}, Pos, String, Data, _, _, _) -> @@ -885,11 +939,23 @@ anon_types_test() -> % Integers. check_parser("123"), check_parser("1_2_3"), + % Booleans. + check_parser("true"), + check_parser("false"), + check_parser("[true, false]"), % Bytes. check_parser("#DEAD000BEEF"), check_parser("#DE_AD0_00B_EEF"), % Strings. check_parser("\"hello world\""), + % The Sophia compiler doesn't handle this right, but we should still. + %check_parser("\"ÿ\""), + %check_parser("\"♣\""), + % Characters. + check_parser("'A'"), + check_parser("['a', ' ', '[']"), + %check_parser("'ÿ'"), + %check_parser("'♣'"), % List of integers. check_parser("[1, 2, 3]"), % List of lists. @@ -905,6 +971,13 @@ string_escape_codes_test() -> check_parser("\" \\b\\e\\f\\n\\r\\t\\v\\\"\\\\ \""), check_parser("\"\\x00\\x11\\x77\\x4a\\x4A\""), check_parser("\"\\x{0}\\x{7}\\x{7F}\\x{07F}\\x{007F}\\x{0007F}\\x{0000007F}\""), + check_parser("\"'\""), + + check_parser("['\\b', '\\e', '\\f', '\\n', '\\r', '\\t', '\\v', '\"', '\\'', '\\\\']"), + check_parser("['\\x00', '\\x11', '\\x77', '\\x4a', '\\x4A']"), + check_parser("['\\x{0}', '\\x{7}', '\\x{7F}', '\\x{07F}', '\\x{007F}', '\\x{0007F}', '\\x{0000007F}']"), + check_parser("'\"'"), + ok. records_test() ->