typecheck bits

Sophia bitstrings aren't really something you initialize manually, so we have to make up a literal format for them. Failing that, we just accept arbitrary integers and bytearrays as bitstrings.
bool/char literals
2026-02-13 06:25:24 +00:00 · 2026-02-13 06:25:24 +00:00
1 changed files with 143 additions and 36 deletions
@@ -65,6 +65,8 @@ next_token({Row, Col}, [$#, C | Rest]) when ?IS_HEX(C) ->
    bytes_token({Row, Col}, {Row, Col + 1}, [C | Rest], "#", []);
 next_token({Row, Col}, "\"" ++ Rest) ->
    string_token({Row, Col}, {Row, Col + 1}, Rest, "\"", <<>>);
+next_token({Row, Col}, "'" ++ Rest) ->
+    character_token({Row, Col}, {Row, Col + 1}, Rest, "'");
 next_token({Row, Col}, [Char | Rest]) ->
    Token = {character, [Char], Char, Row, Col, Col},
    {ok, {Token, {Row, Col + 1}, Rest}}.
@@ -115,41 +117,70 @@ reverse_combine_nibbles([D1], Acc) ->
 reverse_combine_nibbles([], Acc) ->
    Acc.

-string_token(Start, {Row, Col}, "\\x" ++ String, SourceChars, Value) ->
-    case escape_hex_code({Row, Col}, {Row, Col + 2}, String, "x\\" ++ SourceChars) of
-        {ok, {Codepoint, NewSourceChars, NewPos, NewString}} ->
-            NewValue = <<Value/binary, Codepoint/utf8>>,
-            string_token(Start, NewPos, NewString, NewSourceChars, NewValue);
-        {error, Reason} ->
-            {error, Reason}
-    end;
-string_token(Start, {Row, Col}, [$\\, C | Rest], SourceChars, Value) ->
-    case escape_char(C) of
-        {ok, ByteVal} ->
-            string_token(Start, {Row, Col + 2}, Rest, [C, $\ | SourceChars], <<Value/binary, ByteVal>>);
-        error ->
-            {error, {invalid_escape_code, [C], Row, Col}}
-    end;
 string_token({_, Start}, {Row, Col}, [$" | Rest], SourceChars, Value) ->
    SourceStr = lists:reverse([$" | SourceChars]),
    Token = {string, SourceStr, Value, Row, Start, Col},
    {ok, {Token, {Row, Col + 1}, Rest}};
-string_token(Start, {Row, Col}, [C | Rest], SourceChars, Value) ->
-    % TODO: ERTS probably had to convert this FROM utf8 at some point, so why
-    % bother, if we need to convert it back? I guess we could accept iolists if
-    % we really wanted to waste time on this point...
-    string_token(Start, {Row, Col + 1}, Rest, [C | SourceChars], <<Value/binary, C/utf8>>).
+string_token({_, Start}, {Row, Col}, [], SourceChars, _) ->
+    SourceStr = lists:reverse(SourceChars),
+    {error, {unclosed_string_literal, SourceStr, Start, Row, Col - 1}};
+string_token({_, Start}, {Row, Col}, [$\r | _], SourceChars, _) ->
+    SourceStr = lists:reverse(SourceChars),
+    {error, {unclosed_string_literal, SourceStr, Start, Row, Col - 1}};
+string_token({_, Start}, {Row, Col}, [$\n | _], SourceChars, _) ->
+    SourceStr = lists:reverse(SourceChars),
+    {error, {unclosed_string_literal, SourceStr, Start, Row, Col - 1}};
+string_token(Start, Pos, String, SourceChars, Value) ->
+    case parse_char(Start, Pos, String, SourceChars) of
+        {ok, {Char, NewSourceChars, NewPos, NewString}} ->
+            % TODO: ERTS probably had to convert this FROM utf8 at some point,
+            % so why bother, if we need to convert it back? I guess we could
+            % accept iolists if we really wanted to waste time on this point...
+            NewValue = <<Value/binary, Char/utf8>>,
+            string_token(Start, NewPos, NewString, NewSourceChars, NewValue);
+        {error, Reason} ->
+            {error, Reason}
+    end.

-escape_hex_code(Start, {Row, Col}, "{" ++ String, SourceChars) ->
-    escape_long_hex_code(Start, {Row, Col + 1}, String, "{" ++ SourceChars, 0);
-escape_hex_code(_, {Row, Col}, [A, B | String], SourceChars) when ?IS_HEX(A), ?IS_HEX(B) ->
-    % As of writing this, the Sophia compiler will convert this byte from
-    % extended ASCII to unicode... But it really shouldn't. The literal parser
-    % does what the compiler should do.
+character_token({_, Start}, {Row, Col}, [], SourceChars) ->
+    SourceStr = lists:reverse(SourceChars),
+    {error, {unclosed_character_literal, SourceStr, Start, Row, Col - 1}};
+character_token({_, Start}, {Row, Col}, [$\r | _], SourceChars) ->
+    SourceStr = lists:reverse(SourceChars),
+    {error, {unclosed_character_literal, SourceStr, Start, Row, Col - 1}};
+character_token({_, Start}, {Row, Col}, [$\n | _], SourceChars) ->
+    SourceStr = lists:reverse(SourceChars),
+    {error, {unclosed_character_literal, SourceStr, Start, Row, Col - 1}};
+character_token(Start, Pos, String, SourceChars) ->
+    case parse_char(Start, Pos, String, SourceChars) of
+        {ok, {Char, NewSourceChars, NewPos, NewString}} ->
+            character_token2(Start, NewPos, NewString, NewSourceChars, Char);
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+character_token2({_, Start}, {Row, Col}, [$' | Rest], SourceChars, Value) ->
+    SourceStr = lists:reverse([$' | SourceChars]),
+    Token = {char_literal, SourceStr, Value, Row, Start, Col},
+    {ok, {Token, {Row, Col + 1}, Rest}};
+character_token2({_, Start}, {Row, Col}, _, SourceChars, _) ->
+    SourceStr = lists:reverse(SourceChars),
+    {error, {unclosed_character_literal, SourceStr, Start, Row, Col - 1}}.
+
+parse_char(Start, {Row, Col}, "\\x{" ++ String, SourceChars) ->
+    escape_long_hex_code(Start, {Row, Col + 3}, String, "{x\\" ++ SourceChars, 0);
+parse_char(_, {Row, Col}, [$\\, $x, A, B | String], SourceChars) when ?IS_HEX(A), ?IS_HEX(B) ->
    Byte = convert_digit(A) * 16 + convert_digit(B),
-    {ok, {Byte, [B, A | SourceChars], {Row, Col + 2}, String}};
-escape_hex_code({Row1, Col1}, _, _, _) ->
-    {error, {invalid_escape_code, "\\x", Row1, Col1}}.
+    {ok, {Byte, [B, A, $x, $\\ | SourceChars], {Row, Col + 4}, String}};
+parse_char({Row, Start}, {Row, Col}, [$\\, C | Rest], SourceChars) ->
+    case escape_char(C) of
+        {ok, ByteVal} ->
+            {ok, {ByteVal, [C, $\ | SourceChars], {Row, Col + 2}, Rest}};
+        error ->
+            {error, {invalid_escape_code, [$\\, C], Row, Start, Col + 1}}
+    end;
+parse_char(_, {Row, Col}, [C | Rest], SourceChars) ->
+    {ok, {C, [C | SourceChars], {Row, Col + 1}, Rest}}.

 escape_long_hex_code(_, {Row, Col}, "}" ++ String, SourceChars, Value) ->
    {ok, {Value, "}" ++ SourceChars, {Row, Col + 1}, String}};
@@ -171,7 +202,10 @@ escape_char($n)  -> {ok, $\n};
 escape_char($r)  -> {ok, $\r};
 escape_char($t)  -> {ok, $\t};
 escape_char($v)  -> {ok, $\v};
+% Technically \" and \' are only valid inside their own quote characters, not
+% each other, but whatever, we will just be permissive here.
 escape_char($")  -> {ok, $\"};
+escape_char($')  -> {ok, $\'};
 escape_char($\\) -> {ok, $\\};
 escape_char(_)   -> error.

@@ -202,13 +236,13 @@ parse_expression(Type, Pos, String) ->
    end.

 parse_expression2(Type, Pos, String, {integer, _, Value, Row, Start, End}) ->
-    case Type of
-        {_, _, integer} ->
-            {ok, {Value, Pos, String}};
-        {_, _, unknown_type} ->
-            {ok, {Value, Pos, String}};
-        {O, N, _} ->
-            {error, {wrong_type, O, N, integer, Row, Start, End}}
+    typecheck_integer(Type, Pos, String, Value, Row, Start, End);
+parse_expression2(Type, Pos, String, {character, "-", _, _, _, _}) ->
+    case next_token(Pos, String) of
+        {ok, {{integer, _, Value, Row, Start, End}, NewPos, NewString}} ->
+            typecheck_integer(Type, NewPos, NewString, -Value, Row, Start, End);
+        {error, Reason} ->
+            {error, Reason}
    end;
 parse_expression2(Type, Pos, String, {bytes, _, Value, Row, Start, End}) ->
    Len = byte_size(Value),
@@ -220,6 +254,10 @@ parse_expression2(Type, Pos, String, {bytes, _, Value, Row, Start, End}) ->
            {ok, {Result, Pos, String}};
        {_, _, {bytes, [ExpectedLen]}} ->
            {error, {bytes_wrong_size, ExpectedLen, Len, Row, Start, End}};
+        {_, _, bits} ->
+            Size = bit_size(Value),
+            <<IntValue:Size>> = Value,
+            {ok, {{bits, IntValue}, Pos, String}};
        {_, _, unknown_type} ->
            {ok, {Result, Pos, String}};
        {O, N, _} ->
@@ -234,6 +272,15 @@ parse_expression2(Type, Pos, String, {string, _, Value, Row, Start, End}) ->
        {O, N, _} ->
            {error, {wrong_type, O, N, string, Row, Start, End}}
    end;
+parse_expression2(Type, Pos, String, {char_literal, _, Value, Row, Start, End}) ->
+    case Type of
+        {_, _, char} ->
+            {ok, {Value, Pos, String}};
+        {_, _, unknown_type} ->
+            {ok, {Value, Pos, String}};
+        {O, N, _} ->
+            {error, {wrong_type, O, N, char, Row, Start, End}}
+    end;
 parse_expression2(Type, Pos, String, {character, "[", _, Row, Start, _}) ->
    parse_list(Type, Pos, String, Row, Start);
 parse_expression2(Type, Pos, String, {character, "(", _, _, _, _}) ->
@@ -276,6 +323,14 @@ unexpected_token({_, S, _, Row, Start, End}) ->

 %%% Ambiguous Chain Object vs Identifier Parsing

+parse_alphanum(Type, Pos, String, ["true"], Row, Start, End) ->
+    typecheck_bool(Type, Pos, String, true, Row, Start, End);
+parse_alphanum(Type, Pos, String, ["false"], Row, Start, End) ->
+    typecheck_bool(Type, Pos, String, false, Row, Start, End);
+parse_alphanum(Type, Pos, String, ["Bits", "all"], Row, Start, End) ->
+    typecheck_bits(Type, Pos, String, -1, Row, Start, End);
+parse_alphanum(Type, Pos, String, ["Bits", "none"], Row, Start, End) ->
+    typecheck_bits(Type, Pos, String, 0, Row, Start, End);
 parse_alphanum(Type, Pos, String, [[C | _] = S], Row, Start, End) when ?IS_LATIN_LOWER(C) ->
    % From a programming perspective, we are trying to parse a constant, so
    % an alphanum token can really only be a constructor, or a chain object.
@@ -303,6 +358,29 @@ parse_alphanum(Type, Pos, String, Path, Row, Start, End) ->
    % must be a variant constructor, or invalid.
    parse_variant(Type, Pos, String, Path, Row, Start, End).

+typecheck_integer({_, _, integer}, Pos, String, Value, _, _, _) ->
+    {ok, {Value, Pos, String}};
+typecheck_integer({_, _, unknown_type}, Pos, String, Value, _, _, _) ->
+    {ok, {Value, Pos, String}};
+typecheck_integer({_, _, bits}, Pos, String, Value, _, _, _) ->
+    {ok, {{bits, Value}, Pos, String}};
+typecheck_integer({O, N, _}, _, _, _, Row, Start, End) ->
+    {error, {wrong_type, O, N, integer, Row, Start, End}}.
+
+typecheck_bool({_, _, unknown_type}, Pos, String, Value, _, _, _) ->
+    {ok, {Value, Pos, String}};
+typecheck_bool({_, _, boolean}, Pos, String, Value, _, _, _) ->
+    {ok, {Value, Pos, String}};
+typecheck_bool({O, N, _}, _, _, _, Row, Start, End) ->
+    {error, {wrong_type, O, N, boolean, Row, Start, End}}.
+
+typecheck_bits({_, _, unknown_type}, Pos, String, Value, _, _, _) ->
+    {ok, {{bits, Value}, Pos, String}};
+typecheck_bits({_, _, bits}, Pos, String, Value, _, _, _) ->
+    {ok, {{bits, Value}, Pos, String}};
+typecheck_bits({O, N, _}, _, _, _, Row, Start, End) ->
+    {error, {wrong_type, O, N, bits, Row, Start, End}}.
+
 typecheck_address({_, _, address}, Pos, String, Data, _, _, _) ->
    {ok, {{address, Data}, Pos, String}};
 typecheck_address({_, _, contract}, Pos, String, Data, _, _, _) ->
@@ -885,11 +963,24 @@ anon_types_test() ->
    % Integers.
    check_parser("123"),
    check_parser("1_2_3"),
+    check_parser("-123"),
+    % Booleans.
+    check_parser("true"),
+    check_parser("false"),
+    check_parser("[true, false]"),
    % Bytes.
    check_parser("#DEAD000BEEF"),
    check_parser("#DE_AD0_00B_EEF"),
    % Strings.
    check_parser("\"hello world\""),
+    % The Sophia compiler doesn't handle this right, but we should still.
+    %check_parser("\"ÿ\""),
+    %check_parser("\"♣\""),
+    % Characters.
+    check_parser("'A'"),
+    check_parser("['a', ' ', '[']"),
+    %check_parser("'ÿ'"),
+    %check_parser("'♣'"),
    % List of integers.
    check_parser("[1, 2, 3]"),
    % List of lists.
@@ -905,6 +996,13 @@ string_escape_codes_test() ->
    check_parser("\"  \\b\\e\\f\\n\\r\\t\\v\\\"\\\\  \""),
    check_parser("\"\\x00\\x11\\x77\\x4a\\x4A\""),
    check_parser("\"\\x{0}\\x{7}\\x{7F}\\x{07F}\\x{007F}\\x{0007F}\\x{0000007F}\""),
+    check_parser("\"'\""),
+
+    check_parser("['\\b', '\\e', '\\f', '\\n', '\\r', '\\t', '\\v', '\"', '\\'', '\\\\']"),
+    check_parser("['\\x00', '\\x11', '\\x77', '\\x4a', '\\x4A']"),
+    check_parser("['\\x{0}', '\\x{7}', '\\x{7F}', '\\x{07F}', '\\x{007F}', '\\x{0007F}', '\\x{0000007F}']"),
+    check_parser("'\"'"),
+
    ok.

 records_test() ->
@@ -965,6 +1063,15 @@ chain_objects_test() ->

    ok.

+bits_test() ->
+    check_parser("Bits.all"),
+    check_parser("Bits.none"),
+    {_, Type} = compile_entrypoint_value_and_type("contract C = entrypoint f() = Bits.all", "f"),
+    check_sophia_to_fate(Type, "5", {bits, 5}),
+    check_sophia_to_fate(Type, "-5", {bits, -5}),
+    check_sophia_to_fate(Type, "#123", {bits, 256 + 32 + 3}),
+    ok.
+
 singleton_records_test() ->
    TypeDef = "record singleton('a) = {it: 'a}",
    check_parser_with_typedef(TypeDef, "{it = 123}"),