Compare commits

..

2 Commits

Author SHA1 Message Date
Jarvis Carroll
78c9c67f38 typecheck bits
Sophia bitstrings aren't really something you initialize manually, so we have to make up a literal format for them. Failing that, we just accept arbitrary integers and bytearrays as bitstrings.
2026-02-13 06:25:24 +00:00
Jarvis Carroll
9bc0ffafd1 bool/char literals
Character literals were the main complexity here, but I threw booleans in as well, since that covers all the major literals.
2026-02-13 06:25:24 +00:00

View File

@ -65,6 +65,8 @@ next_token({Row, Col}, [$#, C | Rest]) when ?IS_HEX(C) ->
bytes_token({Row, Col}, {Row, Col + 1}, [C | Rest], "#", []);
next_token({Row, Col}, "\"" ++ Rest) ->
string_token({Row, Col}, {Row, Col + 1}, Rest, "\"", <<>>);
next_token({Row, Col}, "'" ++ Rest) ->
character_token({Row, Col}, {Row, Col + 1}, Rest, "'");
next_token({Row, Col}, [Char | Rest]) ->
Token = {character, [Char], Char, Row, Col, Col},
{ok, {Token, {Row, Col + 1}, Rest}}.
@ -115,41 +117,70 @@ reverse_combine_nibbles([D1], Acc) ->
reverse_combine_nibbles([], Acc) ->
Acc.
string_token(Start, {Row, Col}, "\\x" ++ String, SourceChars, Value) ->
case escape_hex_code({Row, Col}, {Row, Col + 2}, String, "x\\" ++ SourceChars) of
{ok, {Codepoint, NewSourceChars, NewPos, NewString}} ->
NewValue = <<Value/binary, Codepoint/utf8>>,
string_token(Start, NewPos, NewString, NewSourceChars, NewValue);
{error, Reason} ->
{error, Reason}
end;
string_token(Start, {Row, Col}, [$\\, C | Rest], SourceChars, Value) ->
case escape_char(C) of
{ok, ByteVal} ->
string_token(Start, {Row, Col + 2}, Rest, [C, $\ | SourceChars], <<Value/binary, ByteVal>>);
error ->
{error, {invalid_escape_code, [C], Row, Col}}
end;
string_token({_, Start}, {Row, Col}, [$" | Rest], SourceChars, Value) ->
SourceStr = lists:reverse([$" | SourceChars]),
Token = {string, SourceStr, Value, Row, Start, Col},
{ok, {Token, {Row, Col + 1}, Rest}};
string_token(Start, {Row, Col}, [C | Rest], SourceChars, Value) ->
% TODO: ERTS probably had to convert this FROM utf8 at some point, so why
% bother, if we need to convert it back? I guess we could accept iolists if
% we really wanted to waste time on this point...
string_token(Start, {Row, Col + 1}, Rest, [C | SourceChars], <<Value/binary, C/utf8>>).
string_token({_, Start}, {Row, Col}, [], SourceChars, _) ->
SourceStr = lists:reverse(SourceChars),
{error, {unclosed_string_literal, SourceStr, Start, Row, Col - 1}};
string_token({_, Start}, {Row, Col}, [$\r | _], SourceChars, _) ->
SourceStr = lists:reverse(SourceChars),
{error, {unclosed_string_literal, SourceStr, Start, Row, Col - 1}};
string_token({_, Start}, {Row, Col}, [$\n | _], SourceChars, _) ->
SourceStr = lists:reverse(SourceChars),
{error, {unclosed_string_literal, SourceStr, Start, Row, Col - 1}};
string_token(Start, Pos, String, SourceChars, Value) ->
case parse_char(Start, Pos, String, SourceChars) of
{ok, {Char, NewSourceChars, NewPos, NewString}} ->
% TODO: ERTS probably had to convert this FROM utf8 at some point,
% so why bother, if we need to convert it back? I guess we could
% accept iolists if we really wanted to waste time on this point...
NewValue = <<Value/binary, Char/utf8>>,
string_token(Start, NewPos, NewString, NewSourceChars, NewValue);
{error, Reason} ->
{error, Reason}
end.
escape_hex_code(Start, {Row, Col}, "{" ++ String, SourceChars) ->
escape_long_hex_code(Start, {Row, Col + 1}, String, "{" ++ SourceChars, 0);
escape_hex_code(_, {Row, Col}, [A, B | String], SourceChars) when ?IS_HEX(A), ?IS_HEX(B) ->
% As of writing this, the Sophia compiler will convert this byte from
% extended ASCII to unicode... But it really shouldn't. The literal parser
% does what the compiler should do.
character_token({_, Start}, {Row, Col}, [], SourceChars) ->
SourceStr = lists:reverse(SourceChars),
{error, {unclosed_character_literal, SourceStr, Start, Row, Col - 1}};
character_token({_, Start}, {Row, Col}, [$\r | _], SourceChars) ->
SourceStr = lists:reverse(SourceChars),
{error, {unclosed_character_literal, SourceStr, Start, Row, Col - 1}};
character_token({_, Start}, {Row, Col}, [$\n | _], SourceChars) ->
SourceStr = lists:reverse(SourceChars),
{error, {unclosed_character_literal, SourceStr, Start, Row, Col - 1}};
character_token(Start, Pos, String, SourceChars) ->
case parse_char(Start, Pos, String, SourceChars) of
{ok, {Char, NewSourceChars, NewPos, NewString}} ->
character_token2(Start, NewPos, NewString, NewSourceChars, Char);
{error, Reason} ->
{error, Reason}
end.
character_token2({_, Start}, {Row, Col}, [$' | Rest], SourceChars, Value) ->
SourceStr = lists:reverse([$' | SourceChars]),
Token = {char_literal, SourceStr, Value, Row, Start, Col},
{ok, {Token, {Row, Col + 1}, Rest}};
character_token2({_, Start}, {Row, Col}, _, SourceChars, _) ->
SourceStr = lists:reverse(SourceChars),
{error, {unclosed_character_literal, SourceStr, Start, Row, Col - 1}}.
parse_char(Start, {Row, Col}, "\\x{" ++ String, SourceChars) ->
escape_long_hex_code(Start, {Row, Col + 3}, String, "{x\\" ++ SourceChars, 0);
parse_char(_, {Row, Col}, [$\\, $x, A, B | String], SourceChars) when ?IS_HEX(A), ?IS_HEX(B) ->
Byte = convert_digit(A) * 16 + convert_digit(B),
{ok, {Byte, [B, A | SourceChars], {Row, Col + 2}, String}};
escape_hex_code({Row1, Col1}, _, _, _) ->
{error, {invalid_escape_code, "\\x", Row1, Col1}}.
{ok, {Byte, [B, A, $x, $\\ | SourceChars], {Row, Col + 4}, String}};
parse_char({Row, Start}, {Row, Col}, [$\\, C | Rest], SourceChars) ->
case escape_char(C) of
{ok, ByteVal} ->
{ok, {ByteVal, [C, $\ | SourceChars], {Row, Col + 2}, Rest}};
error ->
{error, {invalid_escape_code, [$\\, C], Row, Start, Col + 1}}
end;
parse_char(_, {Row, Col}, [C | Rest], SourceChars) ->
{ok, {C, [C | SourceChars], {Row, Col + 1}, Rest}}.
escape_long_hex_code(_, {Row, Col}, "}" ++ String, SourceChars, Value) ->
{ok, {Value, "}" ++ SourceChars, {Row, Col + 1}, String}};
@ -171,7 +202,10 @@ escape_char($n) -> {ok, $\n};
escape_char($r) -> {ok, $\r};
escape_char($t) -> {ok, $\t};
escape_char($v) -> {ok, $\v};
% Technically \" and \' are only valid inside their own quote characters, not
% each other, but whatever, we will just be permissive here.
escape_char($") -> {ok, $\"};
escape_char($') -> {ok, $\'};
escape_char($\\) -> {ok, $\\};
escape_char(_) -> error.
@ -202,13 +236,13 @@ parse_expression(Type, Pos, String) ->
end.
parse_expression2(Type, Pos, String, {integer, _, Value, Row, Start, End}) ->
case Type of
{_, _, integer} ->
{ok, {Value, Pos, String}};
{_, _, unknown_type} ->
{ok, {Value, Pos, String}};
{O, N, _} ->
{error, {wrong_type, O, N, integer, Row, Start, End}}
typecheck_integer(Type, Pos, String, Value, Row, Start, End);
parse_expression2(Type, Pos, String, {character, "-", _, _, _, _}) ->
case next_token(Pos, String) of
{ok, {{integer, _, Value, Row, Start, End}, NewPos, NewString}} ->
typecheck_integer(Type, NewPos, NewString, -Value, Row, Start, End);
{error, Reason} ->
{error, Reason}
end;
parse_expression2(Type, Pos, String, {bytes, _, Value, Row, Start, End}) ->
Len = byte_size(Value),
@ -220,6 +254,10 @@ parse_expression2(Type, Pos, String, {bytes, _, Value, Row, Start, End}) ->
{ok, {Result, Pos, String}};
{_, _, {bytes, [ExpectedLen]}} ->
{error, {bytes_wrong_size, ExpectedLen, Len, Row, Start, End}};
{_, _, bits} ->
Size = bit_size(Value),
<<IntValue:Size>> = Value,
{ok, {{bits, IntValue}, Pos, String}};
{_, _, unknown_type} ->
{ok, {Result, Pos, String}};
{O, N, _} ->
@ -234,6 +272,15 @@ parse_expression2(Type, Pos, String, {string, _, Value, Row, Start, End}) ->
{O, N, _} ->
{error, {wrong_type, O, N, string, Row, Start, End}}
end;
parse_expression2(Type, Pos, String, {char_literal, _, Value, Row, Start, End}) ->
case Type of
{_, _, char} ->
{ok, {Value, Pos, String}};
{_, _, unknown_type} ->
{ok, {Value, Pos, String}};
{O, N, _} ->
{error, {wrong_type, O, N, char, Row, Start, End}}
end;
parse_expression2(Type, Pos, String, {character, "[", _, Row, Start, _}) ->
parse_list(Type, Pos, String, Row, Start);
parse_expression2(Type, Pos, String, {character, "(", _, _, _, _}) ->
@ -276,6 +323,14 @@ unexpected_token({_, S, _, Row, Start, End}) ->
%%% Ambiguous Chain Object vs Identifier Parsing
parse_alphanum(Type, Pos, String, ["true"], Row, Start, End) ->
typecheck_bool(Type, Pos, String, true, Row, Start, End);
parse_alphanum(Type, Pos, String, ["false"], Row, Start, End) ->
typecheck_bool(Type, Pos, String, false, Row, Start, End);
parse_alphanum(Type, Pos, String, ["Bits", "all"], Row, Start, End) ->
typecheck_bits(Type, Pos, String, -1, Row, Start, End);
parse_alphanum(Type, Pos, String, ["Bits", "none"], Row, Start, End) ->
typecheck_bits(Type, Pos, String, 0, Row, Start, End);
parse_alphanum(Type, Pos, String, [[C | _] = S], Row, Start, End) when ?IS_LATIN_LOWER(C) ->
% From a programming perspective, we are trying to parse a constant, so
% an alphanum token can really only be a constructor, or a chain object.
@ -303,6 +358,29 @@ parse_alphanum(Type, Pos, String, Path, Row, Start, End) ->
% must be a variant constructor, or invalid.
parse_variant(Type, Pos, String, Path, Row, Start, End).
typecheck_integer({_, _, integer}, Pos, String, Value, _, _, _) ->
{ok, {Value, Pos, String}};
typecheck_integer({_, _, unknown_type}, Pos, String, Value, _, _, _) ->
{ok, {Value, Pos, String}};
typecheck_integer({_, _, bits}, Pos, String, Value, _, _, _) ->
{ok, {{bits, Value}, Pos, String}};
typecheck_integer({O, N, _}, _, _, _, Row, Start, End) ->
{error, {wrong_type, O, N, integer, Row, Start, End}}.
typecheck_bool({_, _, unknown_type}, Pos, String, Value, _, _, _) ->
{ok, {Value, Pos, String}};
typecheck_bool({_, _, boolean}, Pos, String, Value, _, _, _) ->
{ok, {Value, Pos, String}};
typecheck_bool({O, N, _}, _, _, _, Row, Start, End) ->
{error, {wrong_type, O, N, boolean, Row, Start, End}}.
typecheck_bits({_, _, unknown_type}, Pos, String, Value, _, _, _) ->
{ok, {{bits, Value}, Pos, String}};
typecheck_bits({_, _, bits}, Pos, String, Value, _, _, _) ->
{ok, {{bits, Value}, Pos, String}};
typecheck_bits({O, N, _}, _, _, _, Row, Start, End) ->
{error, {wrong_type, O, N, bits, Row, Start, End}}.
typecheck_address({_, _, address}, Pos, String, Data, _, _, _) ->
{ok, {{address, Data}, Pos, String}};
typecheck_address({_, _, contract}, Pos, String, Data, _, _, _) ->
@ -885,11 +963,24 @@ anon_types_test() ->
% Integers.
check_parser("123"),
check_parser("1_2_3"),
check_parser("-123"),
% Booleans.
check_parser("true"),
check_parser("false"),
check_parser("[true, false]"),
% Bytes.
check_parser("#DEAD000BEEF"),
check_parser("#DE_AD0_00B_EEF"),
% Strings.
check_parser("\"hello world\""),
% The Sophia compiler doesn't handle this right, but we should still.
%check_parser("\"ÿ\""),
%check_parser("\"\""),
% Characters.
check_parser("'A'"),
check_parser("['a', ' ', '[']"),
%check_parser("'ÿ'"),
%check_parser("'♣'"),
% List of integers.
check_parser("[1, 2, 3]"),
% List of lists.
@ -905,6 +996,13 @@ string_escape_codes_test() ->
check_parser("\" \\b\\e\\f\\n\\r\\t\\v\\\"\\\\ \""),
check_parser("\"\\x00\\x11\\x77\\x4a\\x4A\""),
check_parser("\"\\x{0}\\x{7}\\x{7F}\\x{07F}\\x{007F}\\x{0007F}\\x{0000007F}\""),
check_parser("\"'\""),
check_parser("['\\b', '\\e', '\\f', '\\n', '\\r', '\\t', '\\v', '\"', '\\'', '\\\\']"),
check_parser("['\\x00', '\\x11', '\\x77', '\\x4a', '\\x4A']"),
check_parser("['\\x{0}', '\\x{7}', '\\x{7F}', '\\x{07F}', '\\x{007F}', '\\x{0007F}', '\\x{0000007F}']"),
check_parser("'\"'"),
ok.
records_test() ->
@ -965,6 +1063,15 @@ chain_objects_test() ->
ok.
bits_test() ->
check_parser("Bits.all"),
check_parser("Bits.none"),
{_, Type} = compile_entrypoint_value_and_type("contract C = entrypoint f() = Bits.all", "f"),
check_sophia_to_fate(Type, "5", {bits, 5}),
check_sophia_to_fate(Type, "-5", {bits, -5}),
check_sophia_to_fate(Type, "#123", {bits, 256 + 32 + 3}),
ok.
singleton_records_test() ->
TypeDef = "record singleton('a) = {it: 'a}",
check_parser_with_typedef(TypeDef, "{it = 123}"),