From 9bc0ffafd1fb2d25b60288ee0a07f968c85425e5 Mon Sep 17 00:00:00 2001
From: Jarvis Carroll <jarviscarroll@qpq.swiss>
Date: Fri, 13 Feb 2026 05:52:27 +0000
Subject: [PATCH] bool/char literals

Character literals were the main complexity here, but I threw booleans in as well, since that covers all the major literals.
---
 src/hz_sophia.erl | 131 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 102 insertions(+), 29 deletions(-)

diff --git a/src/hz_sophia.erl b/src/hz_sophia.erl
index 13ba1e7..ef6a55e 100644
--- a/src/hz_sophia.erl
+++ b/src/hz_sophia.erl
@@ -65,6 +65,8 @@ next_token({Row, Col}, [$#, C | Rest]) when ?IS_HEX(C) ->
     bytes_token({Row, Col}, {Row, Col + 1}, [C | Rest], "#", []);
 next_token({Row, Col}, "\"" ++ Rest) ->
     string_token({Row, Col}, {Row, Col + 1}, Rest, "\"", <<>>);
+next_token({Row, Col}, "'" ++ Rest) ->
+    character_token({Row, Col}, {Row, Col + 1}, Rest, "'");
 next_token({Row, Col}, [Char | Rest]) ->
     Token = {character, [Char], Char, Row, Col, Col},
     {ok, {Token, {Row, Col + 1}, Rest}}.
@@ -115,41 +117,70 @@ reverse_combine_nibbles([D1], Acc) ->
 reverse_combine_nibbles([], Acc) ->
     Acc.
 
-string_token(Start, {Row, Col}, "\\x" ++ String, SourceChars, Value) ->
-    case escape_hex_code({Row, Col}, {Row, Col + 2}, String, "x\\" ++ SourceChars) of
-        {ok, {Codepoint, NewSourceChars, NewPos, NewString}} ->
-            NewValue = <<Value/binary, Codepoint/utf8>>,
-            string_token(Start, NewPos, NewString, NewSourceChars, NewValue);
-        {error, Reason} ->
-            {error, Reason}
-    end;
-string_token(Start, {Row, Col}, [$\\, C | Rest], SourceChars, Value) ->
-    case escape_char(C) of
-        {ok, ByteVal} ->
-            string_token(Start, {Row, Col + 2}, Rest, [C, $\ | SourceChars], <<Value/binary, ByteVal>>);
-        error ->
-            {error, {invalid_escape_code, [C], Row, Col}}
-    end;
 string_token({_, Start}, {Row, Col}, [$" | Rest], SourceChars, Value) ->
     SourceStr = lists:reverse([$" | SourceChars]),
     Token = {string, SourceStr, Value, Row, Start, Col},
     {ok, {Token, {Row, Col + 1}, Rest}};
-string_token(Start, {Row, Col}, [C | Rest], SourceChars, Value) ->
-    % TODO: ERTS probably had to convert this FROM utf8 at some point, so why
-    % bother, if we need to convert it back? I guess we could accept iolists if
-    % we really wanted to waste time on this point...
-    string_token(Start, {Row, Col + 1}, Rest, [C | SourceChars], <<Value/binary, C/utf8>>).
+string_token({_, Start}, {Row, Col}, [], SourceChars, _) ->
+    SourceStr = lists:reverse(SourceChars),
+    {error, {unclosed_string_literal, SourceStr, Start, Row, Col - 1}};
+string_token({_, Start}, {Row, Col}, [$\r | _], SourceChars, _) ->
+    SourceStr = lists:reverse(SourceChars),
+    {error, {unclosed_string_literal, SourceStr, Start, Row, Col - 1}};
+string_token({_, Start}, {Row, Col}, [$\n | _], SourceChars, _) ->
+    SourceStr = lists:reverse(SourceChars),
+    {error, {unclosed_string_literal, SourceStr, Start, Row, Col - 1}};
+string_token(Start, Pos, String, SourceChars, Value) ->
+    case parse_char(Start, Pos, String, SourceChars) of
+        {ok, {Char, NewSourceChars, NewPos, NewString}} ->
+            % TODO: ERTS probably had to convert this FROM utf8 at some point,
+            % so why bother, if we need to convert it back? I guess we could
+            % accept iolists if we really wanted to waste time on this point...
+            NewValue = <<Value/binary, Char/utf8>>,
+            string_token(Start, NewPos, NewString, NewSourceChars, NewValue);
+        {error, Reason} ->
+            {error, Reason}
+    end.
 
-escape_hex_code(Start, {Row, Col}, "{" ++ String, SourceChars) ->
-    escape_long_hex_code(Start, {Row, Col + 1}, String, "{" ++ SourceChars, 0);
-escape_hex_code(_, {Row, Col}, [A, B | String], SourceChars) when ?IS_HEX(A), ?IS_HEX(B) ->
-    % As of writing this, the Sophia compiler will convert this byte from
-    % extended ASCII to unicode... But it really shouldn't. The literal parser
-    % does what the compiler should do.
+character_token({_, Start}, {Row, Col}, [], SourceChars) ->
+    SourceStr = lists:reverse(SourceChars),
+    {error, {unclosed_character_literal, SourceStr, Start, Row, Col - 1}};
+character_token({_, Start}, {Row, Col}, [$\r | _], SourceChars) ->
+    SourceStr = lists:reverse(SourceChars),
+    {error, {unclosed_character_literal, SourceStr, Start, Row, Col - 1}};
+character_token({_, Start}, {Row, Col}, [$\n | _], SourceChars) ->
+    SourceStr = lists:reverse(SourceChars),
+    {error, {unclosed_character_literal, SourceStr, Start, Row, Col - 1}};
+character_token(Start, Pos, String, SourceChars) ->
+    case parse_char(Start, Pos, String, SourceChars) of
+        {ok, {Char, NewSourceChars, NewPos, NewString}} ->
+            character_token2(Start, NewPos, NewString, NewSourceChars, Char);
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+character_token2({_, Start}, {Row, Col}, [$' | Rest], SourceChars, Value) ->
+    SourceStr = lists:reverse([$' | SourceChars]),
+    Token = {char_literal, SourceStr, Value, Row, Start, Col},
+    {ok, {Token, {Row, Col + 1}, Rest}};
+character_token2({_, Start}, {Row, Col}, _, SourceChars, _) ->
+    SourceStr = lists:reverse(SourceChars),
+    {error, {unclosed_character_literal, SourceStr, Start, Row, Col - 1}}.
+
+parse_char(Start, {Row, Col}, "\\x{" ++ String, SourceChars) ->
+    escape_long_hex_code(Start, {Row, Col + 3}, String, "{x\\" ++ SourceChars, 0);
+parse_char(_, {Row, Col}, [$\\, $x, A, B | String], SourceChars) when ?IS_HEX(A), ?IS_HEX(B) ->
     Byte = convert_digit(A) * 16 + convert_digit(B),
-    {ok, {Byte, [B, A | SourceChars], {Row, Col + 2}, String}};
-escape_hex_code({Row1, Col1}, _, _, _) ->
-    {error, {invalid_escape_code, "\\x", Row1, Col1}}.
+    {ok, {Byte, [B, A, $x, $\\ | SourceChars], {Row, Col + 4}, String}};
+parse_char({Row, Start}, {Row, Col}, [$\\, C | Rest], SourceChars) ->
+    case escape_char(C) of
+        {ok, ByteVal} -> % SourceChars is kept reversed: escaped char first, then the backslash.
+            {ok, {ByteVal, [C, $\\ | SourceChars], {Row, Col + 2}, Rest}};
+        error ->
+            {error, {invalid_escape_code, [$\\, C], Row, Start, Col + 1}}
+    end;
+parse_char(_, {Row, Col}, [C | Rest], SourceChars) ->
+    {ok, {C, [C | SourceChars], {Row, Col + 1}, Rest}}.
 
 escape_long_hex_code(_, {Row, Col}, "}" ++ String, SourceChars, Value) ->
     {ok, {Value, "}" ++ SourceChars, {Row, Col + 1}, String}};
@@ -171,7 +202,10 @@ escape_char($n)  -> {ok, $\n};
 escape_char($r)  -> {ok, $\r};
 escape_char($t)  -> {ok, $\t};
 escape_char($v)  -> {ok, $\v};
+% Technically \" and \' are only valid inside their own quote characters, not
+% each other, but whatever, we will just be permissive here.
 escape_char($")  -> {ok, $\"};
+escape_char($')  -> {ok, $\'};
 escape_char($\\) -> {ok, $\\};
 escape_char(_)   -> error.
 
@@ -234,6 +268,15 @@ parse_expression2(Type, Pos, String, {string, _, Value, Row, Start, End}) ->
         {O, N, _} ->
             {error, {wrong_type, O, N, string, Row, Start, End}}
     end;
+parse_expression2(Type, Pos, String, {char_literal, _, Value, Row, Start, End}) ->
+    case Type of
+        {_, _, char} ->
+            {ok, {Value, Pos, String}};
+        {_, _, unknown_type} ->
+            {ok, {Value, Pos, String}};
+        {O, N, _} ->
+            {error, {wrong_type, O, N, char, Row, Start, End}}
+    end;
 parse_expression2(Type, Pos, String, {character, "[", _, Row, Start, _}) ->
     parse_list(Type, Pos, String, Row, Start);
 parse_expression2(Type, Pos, String, {character, "(", _, _, _, _}) ->
@@ -276,6 +319,10 @@ unexpected_token({_, S, _, Row, Start, End}) ->
 
 %%% Ambiguous Chain Object vs Identifier Parsing
 
+parse_alphanum(Type, Pos, String, ["true"], Row, Start, End) ->
+    typecheck_bool(Type, Pos, String, true, Row, Start, End);
+parse_alphanum(Type, Pos, String, ["false"], Row, Start, End) ->
+    typecheck_bool(Type, Pos, String, false, Row, Start, End);
 parse_alphanum(Type, Pos, String, [[C | _] = S], Row, Start, End) when ?IS_LATIN_LOWER(C) ->
     % From a programming perspective, we are trying to parse a constant, so
     % an alphanum token can really only be a constructor, or a chain object.
@@ -303,6 +350,13 @@ parse_alphanum(Type, Pos, String, Path, Row, Start, End) ->
     % must be a variant constructor, or invalid.
     parse_variant(Type, Pos, String, Path, Row, Start, End).
 
+typecheck_bool({_, _, unknown_type}, Pos, String, Value, _, _, _) ->
+    {ok, {Value, Pos, String}};
+typecheck_bool({_, _, boolean}, Pos, String, Value, _, _, _) ->
+    {ok, {Value, Pos, String}};
+typecheck_bool({O, N, _}, _, _, _, Row, Start, End) ->
+    {error, {wrong_type, O, N, boolean, Row, Start, End}}.
+
 typecheck_address({_, _, address}, Pos, String, Data, _, _, _) ->
     {ok, {{address, Data}, Pos, String}};
 typecheck_address({_, _, contract}, Pos, String, Data, _, _, _) ->
@@ -885,11 +939,23 @@ anon_types_test() ->
     % Integers.
     check_parser("123"),
     check_parser("1_2_3"),
+    % Booleans.
+    check_parser("true"),
+    check_parser("false"),
+    check_parser("[true, false]"),
     % Bytes.
     check_parser("#DEAD000BEEF"),
     check_parser("#DE_AD0_00B_EEF"),
     % Strings.
     check_parser("\"hello world\""),
+    % The Sophia compiler doesn't handle this right, but we should still.
+    %check_parser("\"ÿ\""),
+    %check_parser("\"♣\""),
+    % Characters.
+    check_parser("'A'"),
+    check_parser("['a', ' ', '[']"),
+    %check_parser("'ÿ'"),
+    %check_parser("'♣'"),
     % List of integers.
     check_parser("[1, 2, 3]"),
     % List of lists.
@@ -905,6 +971,13 @@ string_escape_codes_test() ->
     check_parser("\"  \\b\\e\\f\\n\\r\\t\\v\\\"\\\\  \""),
     check_parser("\"\\x00\\x11\\x77\\x4a\\x4A\""),
     check_parser("\"\\x{0}\\x{7}\\x{7F}\\x{07F}\\x{007F}\\x{0007F}\\x{0000007F}\""),
+    check_parser("\"'\""),
+
+    check_parser("['\\b', '\\e', '\\f', '\\n', '\\r', '\\t', '\\v', '\"', '\\'', '\\\\']"),
+    check_parser("['\\x00', '\\x11', '\\x77', '\\x4a', '\\x4A']"),
+    check_parser("['\\x{0}', '\\x{7}', '\\x{7F}', '\\x{07F}', '\\x{007F}', '\\x{0007F}', '\\x{0000007F}']"),
+    check_parser("'\"'"),
+
     ok.
 
 records_test() ->