Parse address literals.

Also signatures.
Fix lexer row/column calculations.
2026-02-03 06:00:40 +00:00 · 2026-02-03 01:42:17 +00:00 · 2026-02-03 00:41:00 +00:00
1 changed file with 183 additions and 56 deletions
@@ -4,10 +4,13 @@
 -copyright("Jarvis Carroll <spiveehere@gmail.com>").
 -license("GPL-3.0-or-later").
-export([check_parser/1]).
+-export([parse_literal/1, parse_literal/2, check_parser/1]).
 -include_lib("eunit/include/eunit.hrl").
 % Parse a literal when no type is known up front: delegates to
 % parse_literal/2, passing whatever unknown_type() returns as the type.
 parse_literal(String) ->
    parse_literal(unknown_type(), String).
 parse_literal(Type, String) ->
    case parse_expression(Type, {tk, 1, 1}, String) of
        {ok, {Result, NewTk, NewString}} ->
@@ -29,7 +32,9 @@ parse_literal2(Result, Tk, String) ->
 %%% Tokenizer
-define(IS_ALPHA(C), ((((C) >= $A) and ((C) =< $Z)) or (((C) >= $a) and ((C) =< $z)) or ((C) == $_))).
+-define(IS_LATIN_UPPER(C), (((C) >= $A) and ((C) =< $Z))).
 -define(IS_LATIN_LOWER(C), (((C) >= $a) and ((C) =< $z))).
 -define(IS_ALPHA(C), (?IS_LATIN_UPPER(C) or ?IS_LATIN_LOWER(C) or ((C) == $_))).
 -define(IS_NUM(C), (((C) >= $0) and ((C) =< $9))).
 -define(IS_ALPHANUM(C), (?IS_ALPHA(C) or ?IS_NUM(C))).
 -define(IS_HEX(C), (?IS_NUM(C) or (((C) >= $A) and ((C) =< $F)) or (((C) >= $a) and ((C) =< $f)))).
@@ -37,55 +42,55 @@ parse_literal2(Result, Tk, String) ->
 next_token({tk, Row, Col}, []) ->
    {ok, {{eof, "", Row, Col, Col}, {tk, Row, Col}, []}};
 next_token({tk, Row, Col}, " " ++ Rest) ->
-    next_token({tk, Row + 1, Col}, Rest);
+    next_token({tk, Row, Col + 1}, Rest);
 next_token({tk, Row, Col}, "\t" ++ Rest) ->
-    next_token({tk, Row + 1, Col}, Rest);
+    next_token({tk, Row, Col + 1}, Rest);
-next_token({tk, _, Col}, "\r\n" ++ Rest) ->
+next_token({tk, Row, _}, "\r\n" ++ Rest) ->
-    next_token({tk, 1, Col + 1}, Rest);
+    next_token({tk, Row + 1, 1}, Rest);
-next_token({tk, _, Col}, "\r" ++ Rest) ->
+next_token({tk, Row, _}, "\r" ++ Rest) ->
-    next_token({tk, 1, Col + 1}, Rest);
+    next_token({tk, Row + 1, 1}, Rest);
-next_token({tk, _, Col}, "\n" ++ Rest) ->
+next_token({tk, Row, _}, "\n" ++ Rest) ->
-    next_token({tk, 1, Col + 1}, Rest);
+    next_token({tk, Row + 1, 1}, Rest);
 next_token(Tk, [C | _] = String) when ?IS_ALPHA(C) ->
    alphanum_token(Tk, Tk, String, []);
 next_token(Tk, [C | _] = String) when ?IS_NUM(C) ->
    num_token(Tk, Tk, String, [], 0);
 next_token({tk, Row, Col}, [$#, C | Rest]) when ?IS_HEX(C) ->
-    bytes_token({tk, Row, Col}, {tk, Row + 1, Col}, [C | Rest], "#", []);
+    bytes_token({tk, Row, Col}, {tk, Row, Col + 1}, [C | Rest], "#", []);
 next_token({tk, Row, Col}, "\"" ++ Rest) ->
-    string_token({tk, Row, Col}, {tk, Row + 1, Col}, Rest, "\"", <<>>);
+    string_token({tk, Row, Col}, {tk, Row, Col + 1}, Rest, "\"", <<>>);
 next_token({tk, Row, Col}, [Char | Rest]) ->
    Token = {character, [Char], Char, Row, Col, Col},
-    {ok, {Token, {tk, Row + 1, Col}, Rest}}.
+    {ok, {Token, {tk, Row, Col + 1}, Rest}}.
 alphanum_token(Start, {tk, Row, Col}, [C | Rest], Acc) when ?IS_ALPHANUM(C) ->
-    alphanum_token(Start, {tk, Row, Col}, Rest, [C | Acc]);
+    alphanum_token(Start, {tk, Row, Col + 1}, Rest, [C | Acc]);
 alphanum_token({tk, _, Start}, {tk, Row, End}, String, Acc) ->
    AlphaString = lists:reverse(Acc),
-    Token = {alphanum, AlphaString, AlphaString, Row, Start, End},
+    Token = {alphanum, AlphaString, AlphaString, Row, Start, End - 1},
    {ok, {Token, {tk, Row, End}, String}}.
 num_token(Start, {tk, Row, Col}, [C | Rest], Chars, Value) when ?IS_NUM(C) ->
    NewValue = Value * 10 + (C - $0),
-    num_token(Start, {tk, Row + 1, Col}, Rest, [C | Chars], NewValue);
+    num_token(Start, {tk, Row, Col + 1}, Rest, [C | Chars], NewValue);
 num_token(Start, {tk, Row, Col}, [$_, C | Rest], Chars, Value) when ?IS_NUM(C) ->
    NewValue = Value * 10 + (C - $0),
-    num_token(Start, {tk, Row + 2, Col}, Rest, [C, $_ | Chars], NewValue);
+    num_token(Start, {tk, Row, Col + 2}, Rest, [C, $_ | Chars], NewValue);
 num_token({tk, _, Start}, {tk, Row, End}, String, Chars, Value) ->
    NumString = lists:reverse(Chars),
-    Token = {integer, NumString, Value, Row, Start, End},
+    Token = {integer, NumString, Value, Row, Start, End - 1},
    {ok, {Token, {tk, Row, End}, String}}.
 % Accumulate the hex digits of a `#...` bytes literal.
 %   Start  - position of the first character of the token.
 %   {tk, Row, Col} - running cursor; Col advances once per consumed character.
 %   Chars  - consumed source characters, most recent first.
 %   Digits - nibble values from convert_digit/1, most recent first.
 % Once no further hex digit (optionally preceded by `_`) follows, returns
 % {ok, {Token, Cursor, Remaining}}; the token's end column is the column of
 % its last character.
 bytes_token(Start, {tk, Row, Col}, [C | Rest], Chars, Digits) when ?IS_HEX(C) ->
    Digit = convert_digit(C),
-    bytes_token(Start, {tk, Row + 1, Col}, Rest, [C | Chars], [Digit | Digits]);
+    bytes_token(Start, {tk, Row, Col + 1}, Rest, [C | Chars], [Digit | Digits]);
 bytes_token(Start, {tk, Row, Col}, [$_, C | Rest], Chars, Digits) when ?IS_HEX(C) ->
    Digit = convert_digit(C),
    % This clause consumes two characters (the `_` separator and the digit),
    % so the column must advance by two, exactly as num_token does.
-    bytes_token(Start, {tk, Row + 1, Col}, Rest, [C, $_ | Chars], [Digit | Digits]);
+    bytes_token(Start, {tk, Row, Col + 2}, Rest, [C, $_ | Chars], [Digit | Digits]);
 bytes_token({tk, _, Start}, {tk, Row, End}, String, Chars, Digits) ->
    BytesString = lists:reverse(Chars),
    Value = reverse_combine_nibbles(Digits, <<>>),
-    Token = {bytes, BytesString, Value, Row, Start, End},
+    Token = {bytes, BytesString, Value, Row, Start, End - 1},
    {ok, {Token, {tk, Row, End}, String}}.
 convert_digit(C) when C >= $0, C =< $9 ->
@@ -103,35 +108,54 @@ reverse_combine_nibbles([D1], Acc) ->
 reverse_combine_nibbles([], Acc) ->
    Acc.
-string_token(Start, {tk, Row, Col}, [$\\, $x, A, B | Rest], SourceChars, Value) ->
+string_token(Start, {tk, Row, Col}, "\\x" ++ String, SourceChars, Value) ->
-    case escape_hex_code(A, B) of
+    case escape_hex_code({tk, Row, Col}, {tk, Row, Col + 2}, String, "x\\" ++ SourceChars) of
-        {ok, ByteVal} ->
+        {ok, {Codepoint, NewSourceChars, NewTk, NewString}} ->
-            string_token(Start, {tk, Row + 4, Col}, Rest, [B, A, $x, $\ | SourceChars], <<Value/binary, ByteVal>>);
+            NewValue = <<Value/binary, Codepoint/utf8>>,
-        error ->
+            string_token(Start, NewTk, NewString, NewSourceChars, NewValue);
-            {error, {invalid_escape_code, [$\\, $x, A, B], Row, Col}}
+        {error, Reason} ->
            {error, Reason}
    end;
 string_token(Start, {tk, Row, Col}, [$\\, C | Rest], SourceChars, Value) ->
    case escape_char(C) of
        {ok, ByteVal} ->
-            string_token(Start, {tk, Row + 2, Col}, Rest, [C, $\ | SourceChars], <<Value/binary, ByteVal>>);
+            string_token(Start, {tk, Row, Col + 2}, Rest, [C, $\ | SourceChars], <<Value/binary, ByteVal>>);
        error ->
            {error, {invalid_escape_code, [C], Row, Col}}
    end;
-string_token({tk, _, Start}, {tk, Row, End}, [$" | Rest], SourceChars, Value) ->
+string_token({tk, _, Start}, {tk, Row, Col}, [$" | Rest], SourceChars, Value) ->
    SourceStr = lists:reverse([$" | SourceChars]),
-    Token = {string, SourceStr, Value, Row, Start, End},
+    Token = {string, SourceStr, Value, Row, Start, Col},
-    {ok, {Token, {tk, Row, End}, Rest}};
+    {ok, {Token, {tk, Row, Col + 1}, Rest}};
 string_token(Start, {tk, Row, Col}, [C | Rest], SourceChars, Value) ->
-    string_token(Start, {tk, Row + 1, Col}, Rest, [C | SourceChars], <<Value/binary, C>>).
+    % TODO: ERTS probably had to convert this FROM utf8 at some point, so why
    % bother, if we need to convert it back? I guess we could accept iolists if
    % we really wanted to waste time on this point...
    string_token(Start, {tk, Row, Col + 1}, Rest, [C | SourceChars], <<Value/binary, C/utf8>>).
-escape_hex_code(A, B) when ?IS_HEX(A), ?IS_HEX(B) ->
+escape_hex_code(Start, {tk, Row, Col}, "{" ++ String, SourceChars) ->
    escape_long_hex_code(Start, {tk, Row, Col + 1}, String, "{" ++ SourceChars, 0);
 escape_hex_code(_, {tk, Row, Col}, [A, B | String], SourceChars) when ?IS_HEX(A), ?IS_HEX(B) ->
    % As of writing this, the Sophia compiler will convert this byte from
    % extended ASCII to unicode... But it really shouldn't. The literal parser
    % does what the compiler should do.
    Byte = convert_digit(A) * 16 + convert_digit(B),
-    {ok, Byte};
+    {ok, {Byte, [B, A | SourceChars], {tk, Row, Col + 2}, String}};
-escape_hex_code(_, _) ->
+escape_hex_code({tk, Row1, Col1}, _, _, _) ->
-    error.
+    {error, {invalid_escape_code, "\\x", Row1, Col1}}.
 % Parse the inside of a `\x{...}` escape, after the opening brace.
 %   Start       - position where the escape began; used for error reporting.
 %   {tk, Row, Col} - running cursor, advanced one column per character.
 %   SourceChars - consumed source characters, most recent first.
 %   Value       - code point accumulated so far from the hex digits.
 % Returns {ok, {Value, SourceChars, Cursor, Remaining}} or {error, Reason}.
 escape_long_hex_code(_, {tk, Row, Col}, "}" ++ String, SourceChars, Value)
        when Value =< 16#D7FF; Value >= 16#E000, Value =< 16#10FFFF ->
    {ok, {Value, "}" ++ SourceChars, {tk, Row, Col + 1}, String}};
 escape_long_hex_code({tk, Row1, Col1}, _, "}" ++ _, _, _) ->
    % The caller encodes the value as a utf8 binary segment, which raises
    % badarg for surrogates and for anything above 16#10FFFF. Report those as
    % an invalid escape instead of letting the parser crash on user input.
    {error, {invalid_escape_code, "\\x", Row1, Col1}};
 escape_long_hex_code(Start, {tk, Row, Col}, [C | String], SourceChars, Value) when ?IS_HEX(C) ->
    NewSourceChars = [C | SourceChars],
    NewValue = 16 * Value + convert_digit(C),
    escape_long_hex_code(Start, {tk, Row, Col + 1}, String, NewSourceChars, NewValue);
 escape_long_hex_code(_, {tk, Row, Col}, [C | _], _, _) ->
    % Anything other than a hex digit or the closing brace is an error at the
    % offending character's own position.
    {error, {invalid_hexadecimal, [C], Row, Col}};
 escape_long_hex_code(_, Tk, [], SourceChars, Value) ->
    % Just return as if the escape code were closed, and let the string parser
    % produce an unclosed string error instead.
    % NOTE(review): the value is not range-checked on this path, and no
    % end-of-input clause of string_token is visible here - confirm that an
    % unterminated string really yields an error rather than a crash.
    {ok, {Value, SourceChars, Tk, []}}.
 escape_char($b)  -> {ok, $\b};
 escape_char($e)  -> {ok, $\e};
@@ -209,8 +233,8 @@ parse_expression2(Type, Tk, String, {character, "(", _, Row, Start, _}) ->
    parse_tuple(Type, Tk, String, Row, Start);
 parse_expression2(Type, Tk, String, {character, "{", _, Row, Start, _}) ->
    parse_record_or_map(Type, Tk, String, Row, Start);
-parse_expression2(Type, Tk, String, {alphanum, Ident, _, Row, Start, End}) ->
+parse_expression2(Type, Tk, String, {alphanum, S, _, Row, Start, End}) ->
-    parse_variant(Type, Tk, String, Ident, Row, Start, End);
+    parse_alphanum(Type, Tk, String, S, Row, Start, End);
 parse_expression2(_, _, _, {_, S, _, Row, Start, End}) ->
    {error, {unexpected_token, S, Row, Start, End}}.
@@ -227,6 +251,69 @@ expect_tokens([Str | Rest], Tk, String) ->
            {error, {unexpected_token, Actual, Row, Start, End}}
    end.
 %%% Ambiguous Chain Object vs Identifier Parsing
 % Decide what an alphanumeric token stands for in a literal.
 %   Type            - the expected type, handed on unchanged to the callee.
 %   Tk, String      - tokenizer position and remaining input after the token.
 %   S               - the token text.
 %   Row, Start, End - the token's position, used in error tuples.
 % Returns whatever parse_variant or the typecheck_* helper returns, or
 % {error, {unexpected_identifier, S, Row, Start, End}}.
 parse_alphanum(Type, Tk, String, [C | _] = S, Row, Start, End) when ?IS_LATIN_UPPER(C) ->
    % From a programming perspective, we are trying to parse a constant, so
    % an alphanum token can really only be a constructor, or a chain object.
    % Chain objects start with lowercase prefixes, like ak_, so clearly this is
    % a variant constructor.
    parse_variant(Type, Tk, String, S, Row, Start, End);
 parse_alphanum(Type, Tk, String, S, Row, Start, End) ->
    % Inversely, variant constructors are always uppercase, so now that we have
    % handled that case, only chain objects are left.
    %
    % Only the decode call is protected by the try: a token that is not a
    % well-formed chain object makes it fail, and that is reported as an
    % unexpected identifier. The typecheck_* calls sit in the `of` section, so
    % a failure inside them is not silently rewritten into that error.
    try gmser_api_encoder:decode(unicode:characters_to_binary(S)) of
        {account_pubkey, Data} ->
            typecheck_address(Type, Tk, String, Data, Row, Start, End);
        {contract_pubkey, Data} ->
            typecheck_contract(Type, Tk, String, Data, Row, Start, End);
        {signature, Data} ->
            typecheck_signature(Type, Tk, String, Data, Row, Start, End);
        _ ->
            % Only a few chain objects are recognized by Sophia. The rest
            % are interpreted as identifiers, so we might as well give the
            % same sort of error that the compiler would give.
            {error, {unexpected_identifier, S, Row, Start, End}}
    catch
        _:_ -> {error, {unexpected_identifier, S, Row, Start, End}}
    end.
 % Check a decoded account pubkey against the expected type, which is the
 % third element of the type triple.
 %   address, unknown_type -> {ok, {{address, Data}, Tk, String}}
 %   contract              -> {ok, {{contract, Data}, Tk, String}}
 %   anything else         -> {error, {wrong_type, ...}} carrying the first two
 %                            elements of the triple and the token position.
 typecheck_address({_, _, Expected}, Tk, String, Data, _, _, _)
        when Expected =:= address; Expected =:= unknown_type ->
    {ok, {{address, Data}, Tk, String}};
 typecheck_address({_, _, contract}, Tk, String, Data, _, _, _) ->
    % The compiler would reject this, but the literal parser is deliberately
    % lenient and retags the data as a contract.
    {ok, {{contract, Data}, Tk, String}};
 typecheck_address({First, Second, _}, _, _, _, Row, Start, End) ->
    {error, {wrong_type, First, Second, address, Row, Start, End}}.
 % Check a decoded contract pubkey against the expected type, which is the
 % third element of the type triple.
 %   contract, unknown_type -> {ok, {{contract, Data}, Tk, String}}
 %   address                -> {ok, {{address, Data}, Tk, String}}
 %   anything else          -> {error, {wrong_type, ...}} carrying the first two
 %                             elements of the triple and the token position.
 typecheck_contract({_, _, Expected}, Tk, String, Data, _, _, _)
        when Expected =:= contract; Expected =:= unknown_type ->
    {ok, {{contract, Data}, Tk, String}};
 typecheck_contract({_, _, address}, Tk, String, Data, _, _, _) ->
    % The compiler would reject this, but the literal parser is deliberately
    % lenient and retags the data as an address.
    {ok, {{address, Data}, Tk, String}};
 typecheck_contract({First, Second, _}, _, _, _, Row, Start, End) ->
    {error, {wrong_type, First, Second, contract, Row, Start, End}}.
 % Check a decoded signature against the expected type, which is the third
 % element of the type triple. A signature is always returned as
 % {bytes, Data}. Besides `signature` and `unknown_type`, the byte-array types
 % {bytes, [64]} and {bytes, [any]} are accepted too; the compiler would
 % probably type-error on those, but the parser is lenient. Any other type
 % yields {error, {wrong_type, ...}} carrying the first two elements of the
 % triple and the token position.
 typecheck_signature({_, _, Expected}, Tk, String, Data, _, _, _)
        when Expected =:= signature;
             Expected =:= {bytes, [64]};
             Expected =:= {bytes, [any]};
             Expected =:= unknown_type ->
    {ok, {{bytes, Data}, Tk, String}};
 typecheck_signature({First, Second, _}, _, _, _, Row, Start, End) ->
    {error, {wrong_type, First, Second, signature, Row, Start, End}}.
 %%% List Parsing
 parse_list({_, _, {list, [Inner]}}, Tk, String, Row, Start) ->
@@ -675,7 +762,7 @@ check_sophia_to_fate(Type, Sophia, Fate) ->
            erlang:error({to_fate_failed, Sophia, Fate, {error, Reason}})
    end.
-compile_entrypoint_code_and_type(Source, Entrypoint) ->
+compile_entrypoint_value_and_type(Source, Entrypoint) ->
    {ok, #{fate_code := FateCode, aci := ACI}} = so_compiler:from_string(Source, [{aci, json}]),
    % Find the fcode for the correct entrypoint.
@@ -684,12 +771,13 @@ compile_entrypoint_code_and_type(Source, Entrypoint) ->
    Name = unicode:characters_to_binary(Entrypoint),
    {Hash, Name} = lists:keyfind(Name, 2, Names),
    {_, _, Code} = maps:get(Hash, Bodies),
    FATE = extract_return_value(Code),
    % Generate the AACI, and get the AACI type info for the correct entrypoint.
    AACI = hz_aaci:prepare_aaci(ACI),
    {ok, {_, Type}} = hz_aaci:get_function_signature(AACI, "f"),
-    {Code, Type}.
+    {FATE, Type}.
 extract_return_value(#{0 := [{'RETURNR', {immediate, FATE}}]}) ->
    FATE;
@@ -700,11 +788,10 @@ check_parser(Sophia) ->
    % Compile the literal using the compiler, to check that it is valid Sophia
    % syntax, and to get an AACI object to pass to the parser.
    Source = "contract C = entrypoint f() = " ++ Sophia,
-    {Code, Type} = compile_entrypoint_code_and_type(Source, "f"),
+    {Fate, Type} = compile_entrypoint_value_and_type(Source, "f"),
    % Check that when we parse the term we get the same value as the Sophia
    % compiler.
    Fate = extract_return_value(Code),
    check_sophia_to_fate(unknown_type(), Sophia, Fate),
    % Then, once we know that the term is correct, make sure that it is still
@@ -714,11 +801,7 @@ check_parser(Sophia) ->
 check_parser_with_typedef(Typedef, Sophia) ->
    % Compile the type definitions alongside the usual literal expression.
    Source = "contract C =\n  " ++ Typedef ++ "\n  entrypoint f() = " ++ Sophia,
-    {Code, Type} = compile_entrypoint_code_and_type(Source, "f"),
+    {Fate, Type} = compile_entrypoint_value_and_type(Source, "f"),
    Fate = extract_return_value(Code),
    % Check the FATE term as usual.
    gmb_fate_encoding:serialize(Fate),
    % Do a typed parse, as usual, but there are probably record/variant
    % definitions in the AACI, so untyped parses probably don't work.
@@ -747,7 +830,7 @@ anon_types_test() ->
 string_escape_codes_test() ->
    check_parser("\"  \\b\\e\\f\\n\\r\\t\\v\\\"\\\\  \""),
    check_parser("\"\\x00\\x11\\x77\\x4a\\x4A\""),
-    check_parser("\"\\x{7F}\\x{07F}\\x{007F}\\x{0007F}\""),
+    check_parser("\"\\x{0}\\x{7}\\x{7F}\\x{07F}\\x{007F}\\x{0007F}\\x{0000007F}\""),
    ok.
 records_test() ->
@@ -758,6 +841,38 @@ records_test() ->
    % will error, though.
    {error, {unresolved_record, _, _, _}} = parse_literal(unknown_type(), Sophia).
 % Variant literals round-trip when the datatype definition is supplied, and
 % are reported as unresolved when parsed without any type information.
 variant_test() ->
    Decl = "datatype multi('a) = Zero | One('a) | Two('a, 'a)",
    Literals = ["Zero", "One(0)", "Two(0, 1)", "Two([], [1, 2, 3])"],
    lists:foreach(fun(Literal) -> check_parser_with_typedef(Decl, Literal) end, Literals),
    % Without the typedef the constructor cannot be resolved.
    {error, {unresolved_variant, _, _, _}} = parse_literal(unknown_type(), "Zero"),
    ok.
 % Chain object literals: account addresses, signatures and contract addresses.
 chain_objects_test() ->
    % A plain account address.
    AccountLit = "ak_2FTnrGfV8qsfHpaSEHpBrziioCpwwzLqSevHqfxQY3PaAAdARx",
    check_parser(AccountLit),
    % A list holding a signature in both of its spellings: the sg_ chain
    % object and a bytes literal.
    SignaturesLit = "[sg_XDyF8LJC4tpMyAySvpaG1f5V9F2XxAbRx9iuVjvvdNMwVracLhzAuXhRM5kXAFtpwW1DCHuz5jGehUayCah4jub32Ti2n, #00112233445566778899AABBCCDDEEFF_00112233445566778899AABBCCDDEEFF_00112233445566778899AABBCCDDEEFF_00112233445566778899AABBCCDDEEFF]",
    check_parser(SignaturesLit),
    % Contract addresses cannot go through check_parser: the compiler insists
    % on type checking them by the rules of "contract oriented programming"
    % (covariance and so on), which does not sit well with ML style type
    % inference. So the entrypoint is written out by hand with an explicit
    % return type, and the resulting value and AACI type are checked directly.
    CtLiteral = "ct_2FTnrGfV8qsfHpaSEHpBrziioCpwwzLqSevHqfxQY3PaAAdARx",
    Src = "contract C = entrypoint f(): C = " ++ CtLiteral,
    {Expected, CtType} = compile_entrypoint_value_and_type(Src, "f"),
    check_sophia_to_fate(CtType, CtLiteral, Expected),
    check_sophia_to_fate(unknown_type(), CtLiteral, Expected),
    ok.
 singleton_records_test() ->
    TypeDef = "record singleton('a) = {it: 'a}",
    check_parser_with_typedef(TypeDef, "{it = 123}"),
@@ -795,16 +910,28 @@ excess_parens_test() ->
    ok.
-variant_test() ->
+lexer_offset_test() ->
-    TypeDef = "datatype multi('a) = Zero | One('a) | Two('a, 'a)",
+    % Test that various tokens report their position correctly.
    {error, {unexpected_token, "456", 1, 5, 7}} = parse_literal("123 456"),
    {error, {unexpected_token, "[", 1, 5, 5}} = parse_literal("123 [0]"),
    {error, {unexpected_token, "ABC", 1, 5, 7}} = parse_literal("123 ABC"),
    {error, {unexpected_token, "#AA", 1, 5, 7}} = parse_literal("123 #AA"),
    {error, {unexpected_token, "\"x\"", 1, 5, 7}} = parse_literal("123 \"x\""),
    {error, {unexpected_token, "\"\\x{123}\"", 1, 5, 13}} = parse_literal("123 \"\\x{123}\""),
-    check_parser_with_typedef(TypeDef, "Zero"),
+    % Check that the tokenizer knows its position correctly *after* various
-    check_parser_with_typedef(TypeDef, "One(0)"),
+    % tokens.
-    check_parser_with_typedef(TypeDef, "Two(0, 1)"),
+    {error, {unexpected_token, "123", 1, 5, 7}} = parse_literal("[0] 123"),
-    check_parser_with_typedef(TypeDef, "Two([], [1, 2, 3])"),
+    ABCType = {"mytype", already_normalized, {variant, [{"ABC", []}]}},
    {error, {unexpected_token, "123", 1, 5, 7}} = parse_literal(ABCType, "ABC 123"),
    {error, {unexpected_token, "123", 1, 5, 7}} = parse_literal("#AA 123"),
    {error, {unexpected_token, "123", 1, 5, 7}} = parse_literal("\"x\" 123"),
    {error, {unexpected_token, "123", 1, 11, 13}} = parse_literal("\"\\x{123}\" 123"),
-    {error, {unresolved_variant, _, _, _}} = parse_literal(unknown_type(), "Zero"),
+    % Check that the tokenizer accounts for various line separators correctly.
    {error, {unexpected_token, "ABC", 2, 1, 3}} = parse_literal("123\nABC"),
    {error, {unexpected_token, "ABC", 2, 1, 3}} = parse_literal("123\r\nABC"),
    {error, {unexpected_token, "ABC", 2, 1, 3}} = parse_literal("123\rABC"),
    ok.
Author	SHA1	Message	Date
Jarvis Carroll	a695c21fc9	Parse address literals. Also signatures.	2026-02-03 06:00:40 +00:00
Jarvis Carroll	493bdb990c	Fix lexer row/column calculations.	2026-02-03 01:42:17 +00:00
Jarvis Carroll	17f635af61	Parse long hex escape codes This doesn't work super consistently in the compiler, for codepoints above 127, but it should work fine for us, so, oh well!	2026-02-03 00:41:00 +00:00