1 changed files with 36 additions and 143 deletions
--- a/src/hz_sophia.erl
+++ b/src/hz_sophia.erl
@ -65,8 +65,6 @@ next_token({Row, Col}, [$#, C | Rest]) when ?IS_HEX(C) ->
    bytes_token({Row, Col}, {Row, Col + 1}, [C | Rest], "#", []);
 next_token({Row, Col}, "\"" ++ Rest) ->
    string_token({Row, Col}, {Row, Col + 1}, Rest, "\"", <<>>);
-next_token({Row, Col}, "'" ++ Rest) ->
-    character_token({Row, Col}, {Row, Col + 1}, Rest, "'");
 next_token({Row, Col}, [Char | Rest]) ->
    Token = {character, [Char], Char, Row, Col, Col},
    {ok, {Token, {Row, Col + 1}, Rest}}.
@ -117,70 +115,56 @ reverse_combine_nibbles([D1], Acc) ->
 reverse_combine_nibbles([], Acc) ->
    Acc.

+% Scan the rest of a string literal once the opening quote has been consumed.
+% SourceChars accumulates the raw source text in reverse and Value the decoded
+% UTF-8 binary. Escape sequences are decoded here; a literal that reaches a
+% line break or the end of input before its closing quote is reported as
+% unclosed_string_literal instead of crashing with function_clause.
+string_token(Start, {Row, Col}, "\\x" ++ String, SourceChars, Value) ->
+    case escape_hex_code({Row, Col}, {Row, Col + 2}, String, "x\\" ++ SourceChars) of
+        {ok, {Codepoint, NewSourceChars, NewPos, NewString}} ->
+            NewValue = <<Value/binary, Codepoint/utf8>>,
+            string_token(Start, NewPos, NewString, NewSourceChars, NewValue);
+        {error, Reason} ->
+            {error, Reason}
+    end;
+string_token(Start, {Row, Col}, [$\\, C | Rest], SourceChars, Value) ->
+    case escape_char(C) of
+        {ok, ByteVal} ->
+            % Record the backslash itself: "$\ " would be an escaped space.
+            string_token(Start, {Row, Col + 2}, Rest, [C, $\\ | SourceChars], <<Value/binary, ByteVal>>);
+        error ->
+            {error, {invalid_escape_code, [C], Row, Col}}
+    end;
 string_token({_, Start}, {Row, Col}, [$" | Rest], SourceChars, Value) ->
    SourceStr = lists:reverse([$" | SourceChars]),
    Token = {string, SourceStr, Value, Row, Start, Col},
    {ok, {Token, {Row, Col + 1}, Rest}};
 string_token({_, Start}, {Row, Col}, [], SourceChars, _) ->
    SourceStr = lists:reverse(SourceChars),
    {error, {unclosed_string_literal, SourceStr, Start, Row, Col - 1}};
 string_token({_, Start}, {Row, Col}, [$\r | _], SourceChars, _) ->
    SourceStr = lists:reverse(SourceChars),
    {error, {unclosed_string_literal, SourceStr, Start, Row, Col - 1}};
 string_token({_, Start}, {Row, Col}, [$\n | _], SourceChars, _) ->
    SourceStr = lists:reverse(SourceChars),
    {error, {unclosed_string_literal, SourceStr, Start, Row, Col - 1}};
-string_token(Start, Pos, String, SourceChars, Value) ->
-    case parse_char(Start, Pos, String, SourceChars) of
-        {ok, {Char, NewSourceChars, NewPos, NewString}} ->
-            % TODO: ERTS probably had to convert this FROM utf8 at some point,
-            % so why bother, if we need to convert it back? I guess we could
-            % accept iolists if we really wanted to waste time on this point...
-            NewValue = <<Value/binary, Char/utf8>>,
-            string_token(Start, NewPos, NewString, NewSourceChars, NewValue);
-        {error, Reason} ->
-            {error, Reason}
-    end.
+string_token(Start, {Row, Col}, [C | Rest], SourceChars, Value) ->
+    % TODO: ERTS probably had to convert this FROM utf8 at some point, so why
+    % bother, if we need to convert it back? I guess we could accept iolists if
+    % we really wanted to waste time on this point...
+    string_token(Start, {Row, Col + 1}, Rest, [C | SourceChars], <<Value/binary, C/utf8>>).

-character_token({_, Start}, {Row, Col}, [], SourceChars) ->
-    SourceStr = lists:reverse(SourceChars),
-    {error, {unclosed_character_literal, SourceStr, Start, Row, Col - 1}};
-character_token({_, Start}, {Row, Col}, [$\r | _], SourceChars) ->
-    SourceStr = lists:reverse(SourceChars),
-    {error, {unclosed_character_literal, SourceStr, Start, Row, Col - 1}};
-character_token({_, Start}, {Row, Col}, [$\n | _], SourceChars) ->
-    SourceStr = lists:reverse(SourceChars),
-    {error, {unclosed_character_literal, SourceStr, Start, Row, Col - 1}};
-character_token(Start, Pos, String, SourceChars) ->
-    case parse_char(Start, Pos, String, SourceChars) of
-        {ok, {Char, NewSourceChars, NewPos, NewString}} ->
-            character_token2(Start, NewPos, NewString, NewSourceChars, Char);
-        {error, Reason} ->
-            {error, Reason}
-    end.
-
-character_token2({_, Start}, {Row, Col}, [$' | Rest], SourceChars, Value) ->
-    SourceStr = lists:reverse([$' | SourceChars]),
-    Token = {char_literal, SourceStr, Value, Row, Start, Col},
-    {ok, {Token, {Row, Col + 1}, Rest}};
-character_token2({_, Start}, {Row, Col}, _, SourceChars, _) ->
-    SourceStr = lists:reverse(SourceChars),
-    {error, {unclosed_character_literal, SourceStr, Start, Row, Col - 1}}.
-
-parse_char(Start, {Row, Col}, "\\x{" ++ String, SourceChars) ->
-    escape_long_hex_code(Start, {Row, Col + 3}, String, "{x\\" ++ SourceChars, 0);
-parse_char(_, {Row, Col}, [$\\, $x, A, B | String], SourceChars) when ?IS_HEX(A), ?IS_HEX(B) ->
+escape_hex_code(Start, {Row, Col}, "{" ++ String, SourceChars) ->
+    escape_long_hex_code(Start, {Row, Col + 1}, String, "{" ++ SourceChars, 0);
+escape_hex_code(_, {Row, Col}, [A, B | String], SourceChars) when ?IS_HEX(A), ?IS_HEX(B) ->
+    % As of writing this, the Sophia compiler will convert this byte from
+    % extended ASCII to unicode... But it really shouldn't. The literal parser
+    % does what the compiler should do.
    Byte = convert_digit(A) * 16 + convert_digit(B),
-    {ok, {Byte, [B, A, $x, $\\ | SourceChars], {Row, Col + 4}, String}};
-parse_char({Row, Start}, {Row, Col}, [$\\, C | Rest], SourceChars) ->
-    case escape_char(C) of
-        {ok, ByteVal} ->
-            {ok, {ByteVal, [C, $\ | SourceChars], {Row, Col + 2}, Rest}};
-        error ->
-            {error, {invalid_escape_code, [$\\, C], Row, Start, Col + 1}}
-    end;
-parse_char(_, {Row, Col}, [C | Rest], SourceChars) ->
-    {ok, {C, [C | SourceChars], {Row, Col + 1}, Rest}}.
+    {ok, {Byte, [B, A | SourceChars], {Row, Col + 2}, String}};
+escape_hex_code({Row1, Col1}, _, _, _) ->
+    {error, {invalid_escape_code, "\\x", Row1, Col1}}.

 escape_long_hex_code(_, {Row, Col}, "}" ++ String, SourceChars, Value) ->
    {ok, {Value, "}" ++ SourceChars, {Row, Col + 1}, String}};
@ -202,10 +171,7 @@ escape_char($n)  -> {ok, $\n};
 escape_char($r)  -> {ok, $\r};
 escape_char($t)  -> {ok, $\t};
 escape_char($v)  -> {ok, $\v};
-% Technically \" and \' are only valid inside their own quote characters, not
-% each other, but whatever, we will just be permissive here.
 escape_char($")  -> {ok, $\"};
-escape_char($')  -> {ok, $\'};
 escape_char($\\) -> {ok, $\\};
 escape_char(_)   -> error.

@ -236,13 +202,13 @@ parse_expression(Type, Pos, String) ->
    end.

 parse_expression2(Type, Pos, String, {integer, _, Value, Row, Start, End}) ->
-    typecheck_integer(Type, Pos, String, Value, Row, Start, End);
-parse_expression2(Type, Pos, String, {character, "-", _, _, _, _}) ->
-    case next_token(Pos, String) of
-        {ok, {{integer, _, Value, Row, Start, End}, NewPos, NewString}} ->
-            typecheck_integer(Type, NewPos, NewString, -Value, Row, Start, End);
-        {error, Reason} ->
-            {error, Reason}
+    case Type of
+        {_, _, integer} ->
+            {ok, {Value, Pos, String}};
+        {_, _, unknown_type} ->
+            {ok, {Value, Pos, String}};
+        {O, N, _} ->
+            {error, {wrong_type, O, N, integer, Row, Start, End}}
    end;
 parse_expression2(Type, Pos, String, {bytes, _, Value, Row, Start, End}) ->
    Len = byte_size(Value),
@ -254,10 +220,6 @@ parse_expression2(Type, Pos, String, {bytes, _, Value, Row, Start, End}) ->
            {ok, {Result, Pos, String}};
        {_, _, {bytes, [ExpectedLen]}} ->
            {error, {bytes_wrong_size, ExpectedLen, Len, Row, Start, End}};
-        {_, _, bits} ->
-            Size = bit_size(Value),
-            <<IntValue:Size>> = Value,
-            {ok, {{bits, IntValue}, Pos, String}};
        {_, _, unknown_type} ->
            {ok, {Result, Pos, String}};
        {O, N, _} ->
@ -272,15 +234,6 @@ parse_expression2(Type, Pos, String, {string, _, Value, Row, Start, End}) ->
        {O, N, _} ->
            {error, {wrong_type, O, N, string, Row, Start, End}}
    end;
-parse_expression2(Type, Pos, String, {char_literal, _, Value, Row, Start, End}) ->
-    case Type of
-        {_, _, char} ->
-            {ok, {Value, Pos, String}};
-        {_, _, unknown_type} ->
-            {ok, {Value, Pos, String}};
-        {O, N, _} ->
-            {error, {wrong_type, O, N, char, Row, Start, End}}
-    end;
 parse_expression2(Type, Pos, String, {character, "[", _, Row, Start, _}) ->
    parse_list(Type, Pos, String, Row, Start);
 parse_expression2(Type, Pos, String, {character, "(", _, _, _, _}) ->
@ -323,14 +276,6 @@ unexpected_token({_, S, _, Row, Start, End}) ->

 %%% Ambiguous Chain Object vs Identifier Parsing

-parse_alphanum(Type, Pos, String, ["true"], Row, Start, End) ->
-    typecheck_bool(Type, Pos, String, true, Row, Start, End);
-parse_alphanum(Type, Pos, String, ["false"], Row, Start, End) ->
-    typecheck_bool(Type, Pos, String, false, Row, Start, End);
-parse_alphanum(Type, Pos, String, ["Bits", "all"], Row, Start, End) ->
-    typecheck_bits(Type, Pos, String, -1, Row, Start, End);
-parse_alphanum(Type, Pos, String, ["Bits", "none"], Row, Start, End) ->
-    typecheck_bits(Type, Pos, String, 0, Row, Start, End);
 parse_alphanum(Type, Pos, String, [[C | _] = S], Row, Start, End) when ?IS_LATIN_LOWER(C) ->
    % From a programming perspective, we are trying to parse a constant, so
    % an alphanum token can really only be a constructor, or a chain object.
@ -358,29 +303,6 @@ parse_alphanum(Type, Pos, String, Path, Row, Start, End) ->
    % must be a variant constructor, or invalid.
    parse_variant(Type, Pos, String, Path, Row, Start, End).

-typecheck_integer({_, _, integer}, Pos, String, Value, _, _, _) ->
-    {ok, {Value, Pos, String}};
-typecheck_integer({_, _, unknown_type}, Pos, String, Value, _, _, _) ->
-    {ok, {Value, Pos, String}};
-typecheck_integer({_, _, bits}, Pos, String, Value, _, _, _) ->
-    {ok, {{bits, Value}, Pos, String}};
-typecheck_integer({O, N, _}, _, _, _, Row, Start, End) ->
-    {error, {wrong_type, O, N, integer, Row, Start, End}}.
-
-typecheck_bool({_, _, unknown_type}, Pos, String, Value, _, _, _) ->
-    {ok, {Value, Pos, String}};
-typecheck_bool({_, _, boolean}, Pos, String, Value, _, _, _) ->
-    {ok, {Value, Pos, String}};
-typecheck_bool({O, N, _}, _, _, _, Row, Start, End) ->
-    {error, {wrong_type, O, N, boolean, Row, Start, End}}.
-
-typecheck_bits({_, _, unknown_type}, Pos, String, Value, _, _, _) ->
-    {ok, {{bits, Value}, Pos, String}};
-typecheck_bits({_, _, bits}, Pos, String, Value, _, _, _) ->
-    {ok, {{bits, Value}, Pos, String}};
-typecheck_bits({O, N, _}, _, _, _, Row, Start, End) ->
-    {error, {wrong_type, O, N, bits, Row, Start, End}}.
-
 typecheck_address({_, _, address}, Pos, String, Data, _, _, _) ->
    {ok, {{address, Data}, Pos, String}};
 typecheck_address({_, _, contract}, Pos, String, Data, _, _, _) ->
@ -963,24 +885,11 @@ anon_types_test() ->
    % Integers.
    check_parser("123"),
    check_parser("1_2_3"),
-    check_parser("-123"),
-    % Booleans.
-    check_parser("true"),
-    check_parser("false"),
-    check_parser("[true, false]"),
    % Bytes.
    check_parser("#DEAD000BEEF"),
    check_parser("#DE_AD0_00B_EEF"),
    % Strings.
    check_parser("\"hello world\""),
-    % The Sophia compiler doesn't handle this right, but we should still.
-    %check_parser("\"ÿ\""),
-    %check_parser("\"♣\""),
-    % Characters.
-    check_parser("'A'"),
-    check_parser("['a', ' ', '[']"),
-    %check_parser("'ÿ'"),
-    %check_parser("'♣'"),
    % List of integers.
    check_parser("[1, 2, 3]"),
    % List of lists.
@ -996,13 +905,6 @@ string_escape_codes_test() ->
    check_parser("\"  \\b\\e\\f\\n\\r\\t\\v\\\"\\\\  \""),
    check_parser("\"\\x00\\x11\\x77\\x4a\\x4A\""),
    check_parser("\"\\x{0}\\x{7}\\x{7F}\\x{07F}\\x{007F}\\x{0007F}\\x{0000007F}\""),
-    check_parser("\"'\""),
-
-    check_parser("['\\b', '\\e', '\\f', '\\n', '\\r', '\\t', '\\v', '\"', '\\'', '\\\\']"),
-    check_parser("['\\x00', '\\x11', '\\x77', '\\x4a', '\\x4A']"),
-    check_parser("['\\x{0}', '\\x{7}', '\\x{7F}', '\\x{07F}', '\\x{007F}', '\\x{0007F}', '\\x{0000007F}']"),
-    check_parser("'\"'"),
-
    ok.

 records_test() ->
@ -1063,15 +965,6 @@ chain_objects_test() ->

    ok.

-bits_test() ->
-    check_parser("Bits.all"),
-    check_parser("Bits.none"),
-    {_, Type} = compile_entrypoint_value_and_type("contract C = entrypoint f() = Bits.all", "f"),
-    check_sophia_to_fate(Type, "5", {bits, 5}),
-    check_sophia_to_fate(Type, "-5", {bits, -5}),
-    check_sophia_to_fate(Type, "#123", {bits, 256 + 32 + 3}),
-    ok.
-
 singleton_records_test() ->
    TypeDef = "record singleton('a) = {it: 'a}",
    check_parser_with_typedef(TypeDef, "{it = 123}"),