Parse qualified names.

This seemed like it was going to be insanely insanely complex, but then it turns out the compiler doesn't accept spaces in qualified names, so I can just dump periods in the lexer and hit it with string:split/3. Easy.
Handle token/parse errors more carefully
2026-02-05 07:13:25 +00:00 · 2026-02-04 07:00:39 +00:00 · 2026-02-03 06:08:54 +00:00 · 2026-02-03 06:00:40 +00:00 · 2026-02-03 01:42:17 +00:00 · 2026-02-03 00:41:00 +00:00
3 changed files with 2184 additions and 1085 deletions
@@ -0,0 +1,993 @@
+-module(hz_sophia).
+-vsn("0.8.2").
+-author("Jarvis Carroll <spiveehere@gmail.com>").
+-copyright("Jarvis Carroll <spiveehere@gmail.com>").
+-license("GPL-3.0-or-later").
+
+-export([parse_literal/1, parse_literal/2, check_parser/1]).
+
+-include_lib("eunit/include/eunit.hrl").
+
+parse_literal(String) ->
+    parse_literal(unknown_type(), String).
+
+parse_literal(Type, String) ->
+    case parse_expression(Type, {1, 1}, String) of
+        {ok, {Result, NewPos, NewString}} ->
+            parse_literal2(Result, NewPos, NewString);
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+parse_literal2(Result, Pos, String) ->
+    % We have parsed a valid expression. Now check that the string ends.
+    case next_token(Pos, String) of
+        {ok, {{eof, _, _, _, _}, _, _}} ->
+            {ok, Result};
+        {ok, {Token, _, _}} ->
+            unexpected_token(Token);
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+%%% Tokenizer
+
+-define(IS_LATIN_UPPER(C), (((C) >= $A) and ((C) =< $Z))).
+-define(IS_LATIN_LOWER(C), (((C) >= $a) and ((C) =< $z))).
+-define(IS_ALPHA(C), (?IS_LATIN_UPPER(C) or ?IS_LATIN_LOWER(C) or ((C) == $_))).
+-define(IS_NUM(C), (((C) >= $0) and ((C) =< $9))).
+-define(IS_ALPHANUM(C), (?IS_ALPHA(C) or ?IS_NUM(C) or ((C) == $.))).
+-define(IS_HEX(C), (?IS_NUM(C) or (((C) >= $A) and ((C) =< $F)) or (((C) >= $a) and ((C) =< $f)))).
+
+next_token({Row, Col}, []) ->
+    {ok, {{eof, "", Row, Col, Col}, {Row, Col}, []}};
+next_token({Row, Col}, " " ++ Rest) ->
+    next_token({Row, Col + 1}, Rest);
+next_token({Row, Col}, "\t" ++ Rest) ->
+    next_token({Row, Col + 1}, Rest);
+next_token({Row, _}, "\r\n" ++ Rest) ->
+    next_token({Row + 1, 1}, Rest);
+next_token({Row, _}, "\r" ++ Rest) ->
+    next_token({Row + 1, 1}, Rest);
+next_token({Row, _}, "\n" ++ Rest) ->
+    next_token({Row + 1, 1}, Rest);
+next_token(Pos, [C | _] = String) when ?IS_ALPHA(C) ->
+    alphanum_token(Pos, Pos, String, []);
+next_token(Pos, [C | _] = String) when ?IS_NUM(C) ->
+    num_token(Pos, Pos, String, [], 0);
+next_token({Row, Col}, [$#, C | Rest]) when ?IS_HEX(C) ->
+    bytes_token({Row, Col}, {Row, Col + 1}, [C | Rest], "#", []);
+next_token({Row, Col}, "\"" ++ Rest) ->
+    string_token({Row, Col}, {Row, Col + 1}, Rest, "\"", <<>>);
+next_token({Row, Col}, [Char | Rest]) ->
+    Token = {character, [Char], Char, Row, Col, Col},
+    {ok, {Token, {Row, Col + 1}, Rest}}.
+
+alphanum_token(Start, {Row, Col}, [C | Rest], Acc) when ?IS_ALPHANUM(C) ->
+    alphanum_token(Start, {Row, Col + 1}, Rest, [C | Acc]);
+alphanum_token({_, Start}, {Row, End}, String, Acc) ->
+    AlphaString = lists:reverse(Acc),
+    Path = string:split(AlphaString, ".", all),
+    Token = {alphanum, AlphaString, Path, Row, Start, End - 1},
+    {ok, {Token, {Row, End}, String}}.
+
+num_token(Start, {Row, Col}, [C | Rest], Chars, Value) when ?IS_NUM(C) ->
+    NewValue = Value * 10 + (C - $0),
+    num_token(Start, {Row, Col + 1}, Rest, [C | Chars], NewValue);
+num_token(Start, {Row, Col}, [$_, C | Rest], Chars, Value) when ?IS_NUM(C) ->
+    NewValue = Value * 10 + (C - $0),
+    num_token(Start, {Row, Col + 2}, Rest, [C, $_ | Chars], NewValue);
+num_token({_, Start}, {Row, End}, String, Chars, Value) ->
+    NumString = lists:reverse(Chars),
+    Token = {integer, NumString, Value, Row, Start, End - 1},
+    {ok, {Token, {Row, End}, String}}.
+
+bytes_token(Start, {Row, Col}, [C | Rest], Chars, Digits) when ?IS_HEX(C) ->
+    Digit = convert_digit(C),
+    bytes_token(Start, {Row, Col + 1}, Rest, [C | Chars], [Digit | Digits]);
+bytes_token(Start, {Row, Col}, [$_, C | Rest], Chars, Digits) when ?IS_HEX(C) ->
+    Digit = convert_digit(C),
+    bytes_token(Start, {Row, Col + 1}, Rest, [C, $_ | Chars], [Digit | Digits]);
+bytes_token({_, Start}, {Row, End}, String, Chars, Digits) ->
+    BytesString = lists:reverse(Chars),
+    Value = reverse_combine_nibbles(Digits, <<>>),
+    Token = {bytes, BytesString, Value, Row, Start, End - 1},
+    {ok, {Token, {Row, End}, String}}.
+
+convert_digit(C) when C >= $0, C =< $9 ->
+    C - $0;
+convert_digit(C) when C >= $A, C =< $Z ->
+    C - $A + 10;
+convert_digit(C) when C >= $a, C =< $z ->
+    C - $a + 10.
+
+reverse_combine_nibbles([D1, D2 | Rest], Acc) ->
+    NewAcc = <<D2:4, D1:4, Acc/binary>>,
+    reverse_combine_nibbles(Rest, NewAcc);
+reverse_combine_nibbles([D1], Acc) ->
+    <<0:4, D1:4, Acc/binary>>;
+reverse_combine_nibbles([], Acc) ->
+    Acc.
+
+string_token(Start, {Row, Col}, "\\x" ++ String, SourceChars, Value) ->
+    case escape_hex_code({Row, Col}, {Row, Col + 2}, String, "x\\" ++ SourceChars) of
+        {ok, {Codepoint, NewSourceChars, NewPos, NewString}} ->
+            NewValue = <<Value/binary, Codepoint/utf8>>,
+            string_token(Start, NewPos, NewString, NewSourceChars, NewValue);
+        {error, Reason} ->
+            {error, Reason}
+    end;
+string_token(Start, {Row, Col}, [$\\, C | Rest], SourceChars, Value) ->
+    case escape_char(C) of
+        {ok, ByteVal} ->
+            string_token(Start, {Row, Col + 2}, Rest, [C, $\ | SourceChars], <<Value/binary, ByteVal>>);
+        error ->
+            {error, {invalid_escape_code, [C], Row, Col}}
+    end;
+string_token({_, Start}, {Row, Col}, [$" | Rest], SourceChars, Value) ->
+    SourceStr = lists:reverse([$" | SourceChars]),
+    Token = {string, SourceStr, Value, Row, Start, Col},
+    {ok, {Token, {Row, Col + 1}, Rest}};
+string_token(Start, {Row, Col}, [C | Rest], SourceChars, Value) ->
+    % TODO: ERTS probably had to convert this FROM utf8 at some point, so why
+    % bother, if we need to convert it back? I guess we could accept iolists if
+    % we really wanted to waste time on this point...
+    string_token(Start, {Row, Col + 1}, Rest, [C | SourceChars], <<Value/binary, C/utf8>>).
+
+escape_hex_code(Start, {Row, Col}, "{" ++ String, SourceChars) ->
+    escape_long_hex_code(Start, {Row, Col + 1}, String, "{" ++ SourceChars, 0);
+escape_hex_code(_, {Row, Col}, [A, B | String], SourceChars) when ?IS_HEX(A), ?IS_HEX(B) ->
+    % As of writing this, the Sophia compiler will convert this byte from
+    % extended ASCII to unicode... But it really shouldn't. The literal parser
+    % does what the compiler should do.
+    Byte = convert_digit(A) * 16 + convert_digit(B),
+    {ok, {Byte, [B, A | SourceChars], {Row, Col + 2}, String}};
+escape_hex_code({Row1, Col1}, _, _, _) ->
+    {error, {invalid_escape_code, "\\x", Row1, Col1}}.
+
+escape_long_hex_code(_, {Row, Col}, "}" ++ String, SourceChars, Value) ->
+    {ok, {Value, "}" ++ SourceChars, {Row, Col + 1}, String}};
+escape_long_hex_code(Start, {Row, Col}, [C | String], SourceChars, Value) when ?IS_HEX(C) ->
+    NewSourceChars = [C | SourceChars],
+    NewValue = 16 * Value + convert_digit(C),
+    escape_long_hex_code(Start, {Row, Col + 1}, String, NewSourceChars, NewValue);
+escape_long_hex_code(_, {Row, Col}, [C | _], _, _) ->
+    {error, {invalid_hexadecimal, [C], Row, Col}};
+escape_long_hex_code(_, Pos, [], SourceChars, Value) ->
+    % Just return as if the escape code were closed, and let the string parser
+    % produce an unclosed string error instead.
+    {ok, {Value, SourceChars, Pos, []}}.
+
+escape_char($b)  -> {ok, $\b};
+escape_char($e)  -> {ok, $\e};
+escape_char($f)  -> {ok, $\f};
+escape_char($n)  -> {ok, $\n};
+escape_char($r)  -> {ok, $\r};
+escape_char($t)  -> {ok, $\t};
+escape_char($v)  -> {ok, $\v};
+escape_char($")  -> {ok, $\"};
+escape_char($\\) -> {ok, $\\};
+escape_char(_)   -> error.
+
+%%% Sophia Literal Parser
+
+%%% This parser is a simple recursive descent parser, written explicitly in
+%%% erlang.
+%%%
+%%% There are no infix operators in the subset we want to parse, so recursive
+%%% descent is fine with no special tricks, no shunting yard algorithm, no
+%%% parser generators, etc.
+%%%
+%%% If we were writing this in C then we might want to work iteratively with an
+%%% array of finite state machines, i.e. with a pushdown automaton, instead of
+%%% using recursion. This is a tried and true method of making fast parsers.
+%%% Recall, however, that the BEAM *is* a stack machine, written in C, so
+%%% rather than writing confusing iterative code in Erlang, to simulate a
+%%% pushdown automaton inside another simulated stack machine... we should just
+%%% write the recursive code, thus programming the BEAM to implement the
+%%% pushdown automaton that we want.
+
+parse_expression(Type, Pos, String) ->
+    case next_token(Pos, String) of
+        {ok, {Token, NewPos, NewString}} ->
+            parse_expression2(Type, NewPos, NewString, Token);
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+parse_expression2(Type, Pos, String, {integer, _, Value, Row, Start, End}) ->
+    case Type of
+        {_, _, integer} ->
+            {ok, {Value, Pos, String}};
+        {_, _, unknown_type} ->
+            {ok, {Value, Pos, String}};
+        {O, N, _} ->
+            {error, {wrong_type, O, N, integer, Row, Start, End}}
+    end;
+parse_expression2(Type, Pos, String, {bytes, _, Value, Row, Start, End}) ->
+    Len = byte_size(Value),
+    Result = {bytes, Value},
+    case Type of
+        {_, _, {bytes, [any]}} ->
+            {ok, {Result, Pos, String}};
+        {_, _, {bytes, [Len]}} ->
+            {ok, {Result, Pos, String}};
+        {_, _, {bytes, [ExpectedLen]}} ->
+            {error, {bytes_wrong_size, ExpectedLen, Len, Row, Start, End}};
+        {_, _, unknown_type} ->
+            {ok, {Result, Pos, String}};
+        {O, N, _} ->
+            {error, {wrong_type, O, N, {bytes, [Len]}, Row, Start, End}}
+    end;
+parse_expression2(Type, Pos, String, {string, _, Value, Row, Start, End}) ->
+    case Type of
+        {_, _, string} ->
+            {ok, {Value, Pos, String}};
+        {_, _, unknown_type} ->
+            {ok, {Value, Pos, String}};
+        {O, N, _} ->
+            {error, {wrong_type, O, N, string, Row, Start, End}}
+    end;
+parse_expression2(Type, Pos, String, {character, "[", _, Row, Start, _}) ->
+    parse_list(Type, Pos, String, Row, Start);
+parse_expression2(Type, Pos, String, {character, "(", _, Row, Start, _}) ->
+    parse_tuple(Type, Pos, String, Row, Start);
+parse_expression2(Type, Pos, String, {character, "{", _, Row, Start, _}) ->
+    parse_record_or_map(Type, Pos, String, Row, Start);
+parse_expression2(Type, Pos, String, {alphanum, _, Path, Row, Start, End}) ->
+    parse_alphanum(Type, Pos, String, Path, Row, Start, End);
+parse_expression2(_, _, _, {eof, _, _, _, _, _}) ->
+    {error, unexpected_end_of_file};
+parse_expression2(_, _, _, Token) ->
+    unexpected_token(Token).
+
+unknown_type() ->
+    {unknown_type, already_normalized, unknown_type}.
+
+expect_tokens([], Pos, String) ->
+    {ok, {Pos, String}};
+expect_tokens([Str | Rest], Pos, String) ->
+    case next_token(Pos, String) of
+        {ok, {{_, Str, _, _, _, _}, NewPos, NewString}} ->
+            expect_tokens(Rest, NewPos, NewString);
+        {ok, {Token, _, _}} ->
+            unexpected_token(Token, Str);
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+unexpected_token(Token, _Expected) ->
+    % I don't know if this is a good idea, but sometimes there are only one or
+    % two tokens that could have worked, which might make for simple
+    % non-technical error messages. I don't know how to format that yet,
+    % though.
+    unexpected_token(Token).
+
+unexpected_token({eof, _, _, _, _, _}) ->
+    {error, expression_incomplete};
+unexpected_token({_, S, _, Row, Start, End}) ->
+    {error, {unexpected_token, S, Row, Start, End}}.
+
+%%% Ambiguous Chain Object vs Identifier Parsing
+
+parse_alphanum(Type, Pos, String, [[C | _] = S], Row, Start, End) when ?IS_LATIN_LOWER(C) ->
+    % From a programming perspective, we are trying to parse a constant, so
+    % an alphanum token can really only be a constructor, or a chain object.
+    % Constructors start with uppercase characters, so lowercase can only be a
+    % chain object.
+    try
+        case gmser_api_encoder:decode(unicode:characters_to_binary(S)) of
+            {account_pubkey, Data} ->
+                typecheck_address(Type, Pos, String, Data, Row, Start, End);
+            {contract_pubkey, Data} ->
+                typecheck_contract(Type, Pos, String, Data, Row, Start, End);
+            {signature, Data} ->
+                typecheck_signature(Type, Pos, String, Data, Row, Start, End);
+            {_, _} ->
+                % Only a few chain objects are recognized by Sophia. The rest
+                % are interpreted as identifiers, so we might as well give the
+                % same sort of error that the compiler would give.
+                {error, {unexpected_identifier, S, Row, Start, End}}
+        end
+    catch
+        _:_ -> {error, {unexpected_identifier, S, Row, Start, End}}
+    end;
+parse_alphanum(Type, Pos, String, Path, Row, Start, End) ->
+    % Inversely, chain object prefixes are always lowercase, so any other path
+    % must be a variant constructor, or invalid.
+    parse_variant(Type, Pos, String, Path, Row, Start, End).
+
+typecheck_address({_, _, address}, Pos, String, Data, _, _, _) ->
+    {ok, {{address, Data}, Pos, String}};
+typecheck_address({_, _, contract}, Pos, String, Data, _, _, _) ->
+    % The compiler would type error, but we should be lenient here.
+    {ok, {{contract, Data}, Pos, String}};
+typecheck_address({_, _, unknown_type}, Pos, String, Data, _, _, _) ->
+    {ok, {{address, Data}, Pos, String}};
+typecheck_address({O, N, _}, _, _, _, Row, Start, End) ->
+    {error, {wrong_type, O, N, address, Row, Start, End}}.
+
+typecheck_contract({_, _, contract}, Pos, String, Data, _, _, _) ->
+    {ok, {{contract, Data}, Pos, String}};
+typecheck_contract({_, _, address}, Pos, String, Data, _, _, _) ->
+    % The compiler would type error, but we should be lenient here.
+    {ok, {{address, Data}, Pos, String}};
+typecheck_contract({_, _, unknown_type}, Pos, String, Data, _, _, _) ->
+    {ok, {{contract, Data}, Pos, String}};
+typecheck_contract({O, N, _}, _, _, _, Row, Start, End) ->
+    {error, {wrong_type, O, N, contract, Row, Start, End}}.
+
+typecheck_signature({_, _, signature}, Pos, String, Data, _, _, _) ->
+    {ok, {{bytes, Data}, Pos, String}};
+typecheck_signature({_, _, {bytes, [64]}}, Pos, String, Data, _, _, _) ->
+    % The compiler would probably type-error, but whatever.
+    {ok, {{bytes, Data}, Pos, String}};
+typecheck_signature({_, _, {bytes, [any]}}, Pos, String, Data, _, _, _) ->
+    % The compiler would probably type-error, but whatever.
+    {ok, {{bytes, Data}, Pos, String}};
+typecheck_signature({_, _, unknown_type}, Pos, String, Data, _, _, _) ->
+    {ok, {{bytes, Data}, Pos, String}};
+typecheck_signature({O, N, _}, _, _, _, Row, Start, End) ->
+    {error, {wrong_type, O, N, signature, Row, Start, End}}.
+
+
+%%% List Parsing
+
+parse_list({_, _, {list, [Inner]}}, Pos, String, Row, Start) ->
+    parse_list_loop(Inner, Pos, String, "]", Row, Start, []);
+parse_list({_, _, unknown_type}, Pos, String, Row, Start) ->
+    parse_list_loop(unknown_type(), Pos, String, "]", Row, Start, []);
+parse_list({O, N, _}, _, _, Row, Start) ->
+    {error, {wrong_type, O, N, list, Row, Start, Start}}.
+
+parse_list_loop(Inner, Pos, String, CloseChar, Row, Start, Acc) ->
+    case next_token(Pos, String) of
+        {ok, {{character, CloseChar, _, _, _, _}, NewPos, NewString}} ->
+            {ok, {lists:reverse(Acc), NewPos, NewString}};
+        {ok, {Token, NewPos, NewString}} ->
+            parse_list_loop2(Inner, NewPos, NewString, CloseChar, Row, Start, Acc, Token);
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+parse_list_loop2(Inner, Pos, String, CloseChar, Row, Start, Acc, Token) ->
+    case parse_expression2(Inner, Pos, String, Token) of
+        {ok, {Value, NewPos, NewString}} ->
+            parse_list_loop3(Inner, NewPos, NewString, CloseChar, Row, Start, [Value | Acc]);
+        {error, Reason} ->
+            Wrapper = choose_list_error_wrapper(CloseChar),
+            % TODO: Are tuple indices off by one from list indices?
+            Wrapped = wrap_error(Reason, {Wrapper, length(Acc)}),
+            {error, Wrapped}
+    end.
+
+parse_list_loop3(Inner, Pos, String, CloseChar, Row, Start, Acc) ->
+    case next_token(Pos, String) of
+        {ok, {{character, CloseChar, _, _, _, _}, NewPos, NewString}} ->
+            {ok, {lists:reverse(Acc), NewPos, NewString}};
+        {ok, {{character, ",", _, _, _, _}, NewPos, NewString}} ->
+            parse_list_loop(Inner, NewPos, NewString, CloseChar, Row, Start, Acc);
+        {ok, {Token, _, _}} ->
+            unexpected_token(Token, CloseChar);
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+choose_list_error_wrapper("]") -> list_element;
+choose_list_error_wrapper(")") -> tuple_element.
+
+%%% Ambiguous Parenthesis Parsing
+
+parse_tuple({_, _, unknown_type}, Pos, String, Row, Start) ->
+    % An untyped tuple is a list of untyped terms, and weirdly our list parser
+    % works perfectly for that, as long as we change the closing character to
+    % be ")" instead of "]".
+    case parse_list_loop(unknown_type(), Pos, String, ")", Row, Start, []) of
+        {ok, {[Inner], NewPos, NewString}} ->
+            % In Sophia, singleton tuples are unwrapped, and given the inner
+            % type.
+            {ok, {Inner, NewPos, NewString}};
+        {ok, {TermList, NewPos, NewString}} ->
+            Result = {tuple, list_to_tuple(TermList)},
+            {ok, {Result, NewPos, NewString}};
+        {error, Reason} ->
+            {error, Reason}
+    end;
+parse_tuple({O, N, T}, Pos, String, _, _) ->
+    % Typed tuple parsing is quite complex, because we also want to support
+    % normal parentheses for grouping. It's not strictly necessary for
+    % inputting data, since we don't have any infix operators in simple
+    % data/term notation, but the alternatives are to generate singleton tuples
+    % naively, (which are impossible to generate from Sophia,) or to hard error
+    % on singleton tuples! Being faithful to Sophia is clearly nice!
+
+    % Count how many ambiguous parens there are, including the one we already
+    % saw.
+    case count_open_parens(Pos, String, 1) of
+        {ok, {Count, Token, NewPos, NewString}} ->
+            % Compare that to the amount of nesting tuple connectives are in
+            % the type we are expected to produce.
+            {ExcessCount, HeadType, Tails} = extract_tuple_type_info(Count, {O, N, T}, []),
+            % Now work out what to do with all this information.
+            parse_tuple2(O, N, ExcessCount, HeadType, Tails, NewPos, NewString, Token);
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+count_open_parens(Pos, String, Count) ->
+    case next_token(Pos, String) of
+        {ok, {{character, "(", _, _, _, _}, NewPos, NewString}} ->
+            count_open_parens(NewPos, NewString, Count + 1);
+        {ok, {Token, NewPos, NewString}} ->
+            {ok, {Count, Token, NewPos, NewString}};
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+extract_tuple_type_info(ParenCount, {_, _, {tuple, [Head | Rest]}}, Tails) when ParenCount > 0 ->
+    % Have an open paren, and a tuple type. We need to go deeper!
+    extract_tuple_type_info(ParenCount - 1, Head, [Rest | Tails]);
+extract_tuple_type_info(ParenCount, HeadType, Tails) ->
+    % No parens, or no more (non-empty) tuples. Stop!
+    {ParenCount, HeadType, Tails}.
+
+parse_tuple2(_, _, _, {_, _, unknown_type}, [_ | _], _, _, _) ->
+    {error, "Parsing of tuples with known lengths but unknown contents is not yet implemented."};
+parse_tuple2(O, N, ExcessCount, HeadType, Tails, Pos, String, {character, ")", _, Row, Col, _}) ->
+    parse_empty_tuple(O, N, ExcessCount, HeadType, Tails, Pos, String, Row, Col);
+parse_tuple2(O, N, ExcessCount, HeadType, Tails, Pos, String, Token) ->
+    % Finished with parentheses for now, try and parse an expression out, to
+    % get our head term.
+    case parse_expression2(HeadType, Pos, String, Token) of
+        {ok, {Result, NewPos, NewString}} ->
+            % Got a head term. Now try to build all the other tuple layers.
+            parse_tuple_tails(O, N, ExcessCount, Result, Tails, NewPos, NewString);
+        {error, Reason} ->
+            % TODO: Wrap errors here too.
+            {error, Reason}
+    end.
+
+parse_empty_tuple(_, _, 0, _, Tails, _, _, Row, Col) ->
+    % There are zero excess parens, meaning all our parens are tuples. Get the
+    % top one.
+    [Tail | _] = Tails,
+    % We expected some nonzero number of elements before the close paren, but
+    % got zero.
+    ExpectCount = 1 + length(Tail),
+    {error, {not_enough_elements, ExpectCount, 0, Row, Col}};
+parse_empty_tuple(O, N, ExcessCount, {_, _, {tuple, []}}, Tails, Pos, String, _, _) ->
+    % If we have some ambiguous parentheses left, we now know one of them is
+    % this empty tuple.
+    HeadTerm = {tuple, {}},
+    NewExcessCount = ExcessCount - 1,
+    % Now continue the loop as if it were an integer or something, in the head
+    % position.
+    parse_tuple_tails(O, N, NewExcessCount, HeadTerm, Tails, Pos, String);
+parse_empty_tuple(_, _, _, {HeadO, HeadN, _}, _, _, _, Row, Col) ->
+    % We were expecting a head term of a different type!
+    {error, {wrong_type, HeadO, HeadN, unit, Row, Col, Col}}.
+
+parse_tuple_tails(O, N, 0, HeadTerm, [TailTypes | ParentTails], Pos, String) ->
+    % Tuples left to build, but no extra open parens to deal with, so we can
+    % just parse multivalues naively, starting from the "we have a term,
+    % waiting for a comma" stage of the loop.
+    case parse_multivalue3(TailTypes, Pos, String, -1, -1, [HeadTerm]) of
+        {ok, {Terms, NewPos, NewString}} ->
+            NewHead = {tuple, list_to_tuple(Terms)},
+            parse_tuple_tails(O, N, 0, NewHead, ParentTails, NewPos, NewString);
+        {error, Reason} ->
+            % TODO: More error wrapping?
+            {error, Reason}
+    end;
+parse_tuple_tails(_, _, 0, HeadTerm, [], Pos, String) ->
+    % No open parens left, no tuples left to build, we are done!
+    {ok, {HeadTerm, Pos, String}};
+parse_tuple_tails(O, N, ExcessCount, HeadTerm, Tails, Pos, String) ->
+    % The ambiguous case, where we have a mix of tuple parens, and grouping
+    % parens. We want to peek at the next token, to see if it closes a grouping
+    % paren.
+    case next_token(Pos, String) of
+        {ok, {{character, ")", _, _, _, _}, NewPos, NewString}} ->
+            % It is grouping! Close one excess paren, and continue.
+            parse_tuple_tails(O, N, ExcessCount - 1, HeadTerm, Tails, NewPos, NewString);
+        {ok, {{character, ",", _, _, _, _}, NewPos, NewString}} ->
+            % It is a real tuple! Try the normal logic, then.
+            parse_tuple_tails2(O, N, ExcessCount, HeadTerm, Tails, NewPos, NewString);
+        {ok, {Token, _, _}} ->
+            % Anything else is just a boring parse error we can complain about.
+            unexpected_token(Token, ")");
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+parse_tuple_tails2(O, N, ExcessCount, HeadTerm, [TailTypes | ParentTails], Pos, String) ->
+    case parse_multivalue(TailTypes, Pos, String, -1, -1, [HeadTerm]) of
+        {ok, {Terms, NewPos, NewString}} ->
+            NewHead = {tuple, list_to_tuple(Terms)},
+            parse_tuple_tails(O, N, ExcessCount, NewHead, ParentTails, NewPos, NewString);
+        {error, Reason} ->
+            % TODO: wrap errors?
+            {error, Reason}
+    end;
+parse_tuple_tails2(O, N, _, _, [], _, _) ->
+    % This case is created when, for example, we want int * int, but instead we
+    % get a term like ((1, 2), 3), of type (int * int) * int. The trouble is,
+    % ((1, 2)) would have been valid, so it's actually the second comma that
+    % tips us off to the error, not the first one.
+    %
+    % For simpler cases, like (1, 2) when int was expected, this error message
+    % is fine:
+    Err = {error, {wrong_type, O, N, tuple, -1, -1, -1}},
+    % TODO: Row/col
+    % TODO: Generate better error messages in the cases where N *is* a tuple,
+    %       but the first thing inside that tuple is the problem.
+    Err.
+
+%%% Unambiguous Tuple/Variant Parsing
+
+parse_multivalue(ElemTypes, Pos, String, Row, Start, Acc) ->
+    case next_token(Pos, String) of
+        {ok, {{character, ")", _, Row2, Start2, _}, NewPos, NewString}} ->
+            check_multivalue_long_enough(ElemTypes, NewPos, NewString, Row2, Start2, Acc);
+        {ok, {Token, NewPos, NewString}} ->
+            parse_multivalue2(ElemTypes, NewPos, NewString, Row, Start, Acc, Token);
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+parse_multivalue2([Next | Rest], Pos, String, Row, Start, Acc, Token) ->
+    case parse_expression2(Next, Pos, String, Token) of
+        {ok, {Value, NewPos, NewString}} ->
+            parse_multivalue3(Rest, NewPos, NewString, Row, Start, [Value | Acc]);
+        {error, Reason} ->
+            Wrapper = choose_list_error_wrapper(")"),
+            % TODO: Are tuple indices off by one from list indices?
+            Wrapped = wrap_error(Reason, {Wrapper, length(Acc)}),
+            {error, Wrapped}
+    end;
+parse_multivalue2([], Pos, String, _, _, Acc, {character, ")", _, _, _, _}) ->
+    {ok, {lists:reverse(Acc), Pos, String}};
+parse_multivalue2([], _, _, _, _, _, Token) ->
+    unexpected_token(Token, ")").
+
+parse_multivalue3(ElemTypes, Pos, String, Row, Start, Acc) ->
+    case next_token(Pos, String) of
+        {ok, {{character, ")", _, Row2, Start2, _}, NewPos, NewString}} ->
+            check_multivalue_long_enough(ElemTypes, NewPos, NewString, Row2, Start2, Acc);
+        {ok, {{character, ",", _, _, _, _}, NewPos, NewString}} ->
+            parse_multivalue(ElemTypes, NewPos, NewString, Row, Start, Acc);
+        {ok, {Token, _, _}} ->
+            unexpected_token(Token, ")");
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+check_multivalue_long_enough([], Pos, String, _, _, Acc) ->
+    {ok, {lists:reverse(Acc), Pos, String}};
+check_multivalue_long_enough(Remaining, _, _, Row, Col, Got) ->
+    GotCount = length(Got),
+    ExpectCount = length(Remaining) + GotCount,
+    {error, {not_enough_elements, ExpectCount, GotCount, Row, Col}}.
+
+%%% Variant parsing
+
+parse_variant({O, N, {variant, Variants}}, Pos, String, [Ident], Row, Start, End) ->
+    parse_variant2(O, N, Variants, Pos, String, "", Ident, Row, Start, End);
+parse_variant({O, N, {variant, Variants}}, Pos, String, [Namespace, Constructor], Row, Start, End) ->
+    case get_typename(O, N) of
+        [Namespace, _] ->
+            parse_variant2(O, N, Variants, Pos, String, Namespace ++ ".", Constructor, Row, Start, End);
+        _ ->
+            {error, {invalid_constructor, O, N, Namespace ++ "." ++ Constructor, Row, Start, End}}
+    end;
+parse_variant({_, _, unknown_type}, _, _, _, Row, Start, End) ->
+    {error, {unresolved_variant, Row, Start, End}};
+parse_variant({O, N, _}, _, _, _, Row, Start, End) ->
+    % In normal code, identifiers can have many meanings, which can result in
+    % lots of different errors. In constant/immediate/normalized Sophia terms
+    % we know identifiers are always variants, so we can type error if any
+    % other type was expected.
+    {error, {wrong_type, O, N, variant, Row, Start, End}}.
+
+get_typename(O, already_normalized) ->
+    get_typename(O);
+get_typename(_, N) ->
+    get_typename(N).
+
+get_typename({Name, _}) ->
+    string:split(Name, ".", all);
+get_typename(Name) ->
+    string:split(Name, ".", all).
+
+parse_variant2(O, N, Variants, Pos, String, Prefix, Constructor, Row, Start, End) ->
+    case lookup_variant(Constructor, Variants, 0) of
+        {ok, {Tag, ElemTypes}} ->
+            GetArity = fun({_, OtherElemTypes}) -> length(OtherElemTypes) end,
+            Arities = lists:map(GetArity, Variants),
+            parse_variant3(Arities, Tag, ElemTypes, Pos, String);
+        error ->
+            {error, {invalid_constructor, O, N, Prefix ++ Constructor, Row, Start, End}}
+    end.
+
+parse_variant3(Arities, Tag, [], Pos, String) ->
+    % Parsing of 0-arity variants is different.
+    Result = {variant, Arities, Tag, {}},
+    {ok, {Result, Pos, String}};
+parse_variant3(Arities, Tag, ElemTypes, Pos, String) ->
+    case next_token(Pos, String) of
+        {ok, {{character, "(", _, Row, Start, _}, NewPos, NewString}} ->
+            parse_variant4(Arities, Tag, ElemTypes, NewPos, NewString, Row, Start);
+        {ok, {Token, _, _}} ->
+            unexpected_token(Token, "(");
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+parse_variant4(Arities, Tag, ElemTypes, Pos, String, Row, Start) ->
+    case parse_multivalue(ElemTypes, Pos, String, Row, Start, []) of
+        {ok, {Terms, NewPos, NewString}} ->
+            Result = {variant, Arities, Tag, list_to_tuple(Terms)},
+            {ok, {Result, NewPos, NewString}};
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+lookup_variant(_, [], _) ->
+    error;
+lookup_variant(Ident, [{Ident, ElemTypes} | _], Tag) ->
+    {ok, {Tag, ElemTypes}};
+lookup_variant(Ident, [_ | Rest], Tag) ->
+    lookup_variant(Ident, Rest, Tag + 1).
+
+%%% Record parsing
+
+parse_record_or_map({_, _, {map, [KeyType, ValueType]}}, Pos, String, _, _) ->
+    parse_map(KeyType, ValueType, Pos, String, #{});
+parse_record_or_map({_, _, {record, Fields}}, Pos, String, _, _) ->
+    parse_record(Fields, Pos, String, #{});
+parse_record_or_map({_, _, unknown_type}, Pos, String, _, _) ->
+    case next_token(Pos, String) of
+        {ok, {{character, "}", _, _, _, _}, NewPos, NewString}} ->
+            {ok, {#{}, NewPos, NewString}};
+        {ok, {{character, "[", _, _, _, _}, NewPos, NewString}} ->
+            parse_map2(unknown_type(), unknown_type(), NewPos, NewString, #{});
+        {ok, {{alphanum, _, _, Row, Start, End}, _, _}} ->
+            {error, {unresolved_record, Row, Start, End}};
+        {ok, {Token, _, _}} ->
+            unexpected_token(Token, "}");
+        {error, Reason} ->
+            {error, Reason}
+    end;
+parse_record_or_map({O, N, _}, _, _, Row, Start) ->
+    {error, {wrong_type, O, N, map, Row, Start, Start}}.
+
+parse_record(Fields, Pos, String, Acc) ->
+    case next_token(Pos, String) of
+        {ok, {{alphanum, Ident, _, Row, Start, End}, NewPos, NewString}} ->
+            parse_record2(Fields, NewPos, NewString, Acc, Ident, Row, Start, End);
+        {ok, {{character, "}", _, Row, Start, End}, NewPos, NewString}} ->
+            parse_record_end(Fields, NewPos, NewString, Acc, Row, Start, End);
+        {ok, {Token, _, _}} ->
+            unexpected_token(Token, "}");
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+parse_record2(Fields, Pos, String, Acc, Ident, Row, Start, End) ->
+    case lists:keyfind(Ident, 1, Fields) of
+        {_, Type} ->
+            parse_record3(Fields, Pos, String, Acc, Ident, Row, Start, End, Type);
+        false ->
+            {error, {invalid_field, Ident, Row, Start, End}}
+    end.
+
+parse_record3(Fields, Pos, String, Acc, Ident, Row, Start, End, Type) ->
+    case maps:is_key(Ident, Acc) of
+        false ->
+            parse_record4(Fields, Pos, String, Acc, Ident, Type);
+        true ->
+            {error, {field_already_present, Ident, Row, Start, End}}
+    end.
+
+parse_record4(Fields, Pos, String, Acc, Ident, Type) ->
+    case expect_tokens(["="], Pos, String) of
+        {ok, {NewPos, NewString}} ->
+            parse_record5(Fields, NewPos, NewString, Acc, Ident, Type);
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+parse_record5(Fields, Pos, String, Acc, Ident, Type) ->
+    case parse_expression(Type, Pos, String) of
+        {ok, {Result, NewPos, NewString}} ->
+            NewAcc = maps:put(Ident, Result, Acc),
+            parse_record6(Fields, NewPos, NewString, NewAcc);
+        {error, Reason} ->
+            wrap_error(Reason, {record_field, Ident})
+    end.
+
+parse_record6(Fields, Pos, String, Acc) ->
+    case next_token(Pos, String) of
+        {ok, {{character, ",", _, _, _, _}, NewPos, NewString}} ->
+            parse_record(Fields, NewPos, NewString, Acc);
+        {ok, {{character, "}", _, Row, Start, End}, NewPos, NewString}} ->
+            parse_record_end(Fields, NewPos, NewString, Acc, Row, Start, End);
+        {ok, {Token, _, _}} ->
+            unexpected_token(Token, "}");
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+parse_record_end(Fields, Pos, String, FieldValues, Row, Start, End) ->
+    case parse_record_final_loop(Fields, FieldValues, []) of
+        {ok, Result} ->
+            {ok, {Result, Pos, String}};
+        {error, {missing_field, Name}} ->
+            {error, {missing_field, Name, Row, Start, End}}
+    end.
+
+parse_record_final_loop([{Name, _} | Rest], FieldValues, Acc) ->
+    case maps:find(Name, FieldValues) of
+        {ok, Value} ->
+            parse_record_final_loop(Rest, FieldValues, [Value | Acc]);
+        error ->
+            {error, {missing_field, Name}}
+    end;
+parse_record_final_loop([], _, [Field]) ->
+    % Singleton records are type-checked in Sophia, but unwrapped in the
+    % resulting FATE.
+    {ok, Field};
+parse_record_final_loop([], _, FieldsReverse) ->
+    Fields = lists:reverse(FieldsReverse),
+    Tuple = list_to_tuple(Fields),
+    {ok, {tuple, Tuple}}.
+
+
+%%% Map Parsing
+
+parse_map(KeyType, ValueType, Pos, String, Acc) ->
+    case next_token(Pos, String) of
+        {ok, {{character, "[", _, _, _, _}, NewPos, NewString}} ->
+            parse_map2(KeyType, ValueType, NewPos, NewString, Acc);
+        {ok, {{character, "}", _, _, _, _}, NewPos, NewString}} ->
+            {ok, {Acc, NewPos, NewString}};
+        {ok, {Token, _, _}} ->
+            unexpected_token(Token, "}");
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+parse_map2(KeyType, ValueType, Pos, String, Acc) ->
+    case parse_expression(KeyType, Pos, String) of
+        {ok, {Result, NewPos, NewString}} ->
+            parse_map3(KeyType, ValueType, NewPos, NewString, Acc, Result);
+        {error, Reason} ->
+            wrap_error(Reason, {map_key, maps:size(Acc)})
+    end.
+
+parse_map3(KeyType, ValueType, Pos, String, Acc, Key) ->
+    case expect_tokens(["]", "="], Pos, String) of
+        {ok, {NewPos, NewString}} ->
+            parse_map4(KeyType, ValueType, NewPos, NewString, Acc, Key);
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+parse_map4(KeyType, ValueType, Pos, String, Acc, Key) ->
+    case parse_expression(ValueType, Pos, String) of
+        {ok, {Result, NewPos, NewString}} ->
+            NewAcc = maps:put(Key, Result, Acc),
+            parse_map5(KeyType, ValueType, NewPos, NewString, NewAcc);
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+parse_map5(KeyType, ValueType, Pos, String, Acc) ->
+    case next_token(Pos, String) of
+        {ok, {{character, ",", _, _, _, _}, NewPos, NewString}} ->
+            parse_map(KeyType, ValueType, NewPos, NewString, Acc);
+        {ok, {{character, "}", _, _, _, _}, NewPos, NewString}} ->
+            {ok, {Acc, NewPos, NewString}};
+        {ok, {Token, _, _}} ->
+            unexpected_token(Token, "}");
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+% TODO
+wrap_error(Reason, _) -> Reason.
+
+%%% Tests
+
+check_sophia_to_fate(Type, Sophia, Fate) ->
+    case parse_literal(Type, Sophia) of
+        {ok, Fate} ->
+            ok;
+        {ok, FateActual} ->
+            erlang:error({to_fate_failed, Sophia, Fate, {ok, FateActual}});
+        {error, Reason} ->
+            erlang:error({to_fate_failed, Sophia, Fate, {error, Reason}})
+    end.
+
+compile_entrypoint_value_and_type(Source, Entrypoint) ->
+    {ok, #{fate_code := FateCode, aci := ACI}} = so_compiler:from_string(Source, [{aci, json}]),
+
+    % Find the fcode for the correct entrypoint.
+    {fcode, Bodies, NamesMap, _} = FateCode,
+    Names = maps:to_list(NamesMap),
+    Name = unicode:characters_to_binary(Entrypoint),
+    {Hash, Name} = lists:keyfind(Name, 2, Names),
+    {_, _, Code} = maps:get(Hash, Bodies),
+    FATE = extract_return_value(Code),
+
+    % Generate the AACI, and get the AACI type info for the correct entrypoint.
+    AACI = hz_aaci:prepare_aaci(ACI),
+    {ok, {_, Type}} = hz_aaci:get_function_signature(AACI, "f"),
+
+    {FATE, Type}.
+
+extract_return_value(#{0 := [{'RETURNR', {immediate, FATE}}]}) ->
+    FATE;
+extract_return_value(Code) ->
+    erlang:exit({invalid_literal_fcode, Code}).
+
+check_parser(Sophia) ->
+    % Compile the literal using the compiler, to check that it is valid Sophia
+    % syntax, and to get an AACI object to pass to the parser.
+    Source = "contract C = entrypoint f() = " ++ Sophia,
+    {Fate, Type} = compile_entrypoint_value_and_type(Source, "f"),
+
+    % Check that when we parse the term we get the same value as the Sophia
+    % compiler.
+    check_sophia_to_fate(unknown_type(), Sophia, Fate),
+
+    % Then, once we know that the term is correct, make sure that it is still
+    % accepted *with* type info.
+    check_sophia_to_fate(Type, Sophia, Fate).
+
+check_parser_with_typedef(Typedef, Sophia) ->
+    % Compile the type definitions alongside the usual literal expression.
+    Source = "contract C =\n  " ++ Typedef ++ "\n  entrypoint f() = " ++ Sophia,
+    {Fate, Type} = compile_entrypoint_value_and_type(Source, "f"),
+
+    % Do a typed parse, as usual, but there are probably record/variant
+    % definitions in the AACI, so untyped parses probably don't work.
+    check_sophia_to_fate(Type, Sophia, Fate).
+
+anon_types_test() ->
+    % Integers.
+    check_parser("123"),
+    check_parser("1_2_3"),
+    % Bytes.
+    check_parser("#DEAD000BEEF"),
+    check_parser("#DE_AD0_00B_EEF"),
+    % Strings.
+    check_parser("\"hello world\""),
+    % List of integers.
+    check_parser("[1, 2, 3]"),
+    % List of lists.
+    check_parser("[[], [1], [2, 3]]"),
+    % Tuple.
+    check_parser("(1, [2, 3], (4, 5))"),
+    % Map.
+    check_parser("{[1] = 2, [3] = 4}"),
+
+    ok.
+
+string_escape_codes_test() ->
+    check_parser("\"  \\b\\e\\f\\n\\r\\t\\v\\\"\\\\  \""),
+    check_parser("\"\\x00\\x11\\x77\\x4a\\x4A\""),
+    check_parser("\"\\x{0}\\x{7}\\x{7F}\\x{07F}\\x{007F}\\x{0007F}\\x{0000007F}\""),
+    ok.
+
+records_test() ->
+    TypeDef = "record pair = {x: int, y: int}",
+    Sophia = "{x = 1, y = 2}",
+    check_parser_with_typedef(TypeDef, Sophia),
+    % The above won't run an untyped parse on the expression, but we can. It
+    % will error, though.
+    {error, {unresolved_record, _, _, _}} = parse_literal(unknown_type(), Sophia).
+
+variant_test() ->
+    TypeDef = "datatype multi('a) = Zero | One('a) | Two('a, 'a)",
+
+    check_parser_with_typedef(TypeDef, "Zero"),
+    check_parser_with_typedef(TypeDef, "One(0)"),
+    check_parser_with_typedef(TypeDef, "Two(0, 1)"),
+    check_parser_with_typedef(TypeDef, "Two([], [1, 2, 3])"),
+    check_parser_with_typedef(TypeDef, "C.Zero"),
+
+    {error, {unresolved_variant, _, _, _}} = parse_literal(unknown_type(), "Zero"),
+
+    ok.
+
+namespace_variant_test() ->
+    Term = "[N.A, N.B]",
+    Source = "namespace N = datatype mytype = A | B\ncontract C = entrypoint f() = " ++ Term,
+    {Fate, VariantType} = compile_entrypoint_value_and_type(Source, "f"),
+    check_sophia_to_fate(VariantType, Term, Fate),
+
+    ok.
+
+chain_objects_test() ->
+    % Address,
+    check_parser("ak_2FTnrGfV8qsfHpaSEHpBrziioCpwwzLqSevHqfxQY3PaAAdARx"),
+    % Two different forms of signature,
+    check_parser("[sg_XDyF8LJC4tpMyAySvpaG1f5V9F2XxAbRx9iuVjvvdNMwVracLhzAuXhRM5kXAFtpwW1DCHuz5jGehUayCah4jub32Ti2n, #00112233445566778899AABBCCDDEEFF_00112233445566778899AABBCCDDEEFF_00112233445566778899AABBCCDDEEFF_00112233445566778899AABBCCDDEEFF]"),
+
+    % We have to build a totally custom contract example in order to get an
+    % AACI and return value for parsing contract addresses. This is because the
+    % compiler demands that contract addresses be type checked according to the
+    % logic of "contract oriented programming", including covariance, etc. and
+    % "contract oriented programming" is not very compatible with ML style type
+    % inference.
+    Contract = "ct_2FTnrGfV8qsfHpaSEHpBrziioCpwwzLqSevHqfxQY3PaAAdARx",
+    Source = "contract C = entrypoint f(): C = " ++ Contract,
+    {Fate, ContractType} = compile_entrypoint_value_and_type(Source, "f"),
+    check_sophia_to_fate(ContractType, Contract, Fate),
+    check_sophia_to_fate(unknown_type(), Contract, Fate),
+
+    ok.
+
+singleton_records_test() ->
+    TypeDef = "record singleton('a) = {it: 'a}",
+    check_parser_with_typedef(TypeDef, "{it = 123}"),
+    check_parser_with_typedef(TypeDef, "{it = {it = {it = 5}}}"),
+    check_parser_with_typedef(TypeDef, "[{it = 1}, {it = 2}, {it = 3}]"),
+
+    ok.
+
+singleton_variants_test() ->
+    % Similar tests to the singleton records, but this time there isn't
+    % actually a special case; singleton variants are in fact wrapped in the
+    % FATE too.
+    TypeDef = "datatype wrapped('a) = Wrap('a)",
+    check_parser_with_typedef(TypeDef, "Wrap(123)"),
+    check_parser_with_typedef(TypeDef, "Wrap(Wrap(123))"),
+    check_parser_with_typedef(TypeDef, "[Wrap(1), Wrap(2), Wrap(3)]"),
+
+    ok.
+
+excess_parens_test() ->
+    % 'singleton' parens are another special case, but unlike singleton
+    % records, which exist in the type system, singleton parens aren't tuples
+    % at all! They are just grouping, for arithmetic. For example.
+    check_parser("(123)"),
+    check_parser("[1, (2), ((3))]"),
+    % Where this gets tricky, though, is when grouping parens are mixed with
+    % tuple parens. E.g. this list of three tuples should all parse to the same
+    % result.
+    check_parser("[((1, 2)), ((1), 2), (((1), 2))]"),
+    % Including multiple nestings of tuples and grouping, interleaved.
+    check_parser("((((1), ((2, 3)))), 4)"),
+    % Also empty tuples exist!
+    check_parser("()"),
+    check_parser("(((((), ())), ()))"),
+
+    ok.
+
+lexer_offset_test() ->
+    % Test that various tokens report their position correctly.
+    {error, {unexpected_token, "456", 1, 5, 7}} = parse_literal("123 456"),
+    {error, {unexpected_token, "[", 1, 5, 5}} = parse_literal("123 [0]"),
+    {error, {unexpected_token, "ABC", 1, 5, 7}} = parse_literal("123 ABC"),
+    {error, {unexpected_token, "#AA", 1, 5, 7}} = parse_literal("123 #AA"),
+    {error, {unexpected_token, "\"x\"", 1, 5, 7}} = parse_literal("123 \"x\""),
+    {error, {unexpected_token, "\"\\x{123}\"", 1, 5, 13}} = parse_literal("123 \"\\x{123}\""),
+
+    % Check that the tokenizer knows its position correctly *after* various
+    % tokens.
+    {error, {unexpected_token, "123", 1, 5, 7}} = parse_literal("[0] 123"),
+    ABCType = {"mytype", already_normalized, {variant, [{"ABC", []}]}},
+    {error, {unexpected_token, "123", 1, 5, 7}} = parse_literal(ABCType, "ABC 123"),
+    {error, {unexpected_token, "123", 1, 5, 7}} = parse_literal("#AA 123"),
+    {error, {unexpected_token, "123", 1, 5, 7}} = parse_literal("\"x\" 123"),
+    {error, {unexpected_token, "123", 1, 11, 13}} = parse_literal("\"\\x{123}\" 123"),
+
+    % Check that the tokenizer accounts for various line separators correctly.
+    {error, {unexpected_token, "ABC", 2, 1, 3}} = parse_literal("123\nABC"),
+    {error, {unexpected_token, "ABC", 2, 1, 3}} = parse_literal("123\r\nABC"),
+    {error, {unexpected_token, "ABC", 2, 1, 3}} = parse_literal("123\rABC"),
+
+    ok.
+
Author	SHA1	Message	Date
Jarvis Carroll	3838a7e3c5	Parse qualified names. This seemed like it was going to be insanely insanely complex, but then it turns out the compiler doesn't accept spaces in qualified names, so I can just dump periods in the lexer and hit it with string:split/3. Easy.	2026-02-05 07:13:25 +00:00
Jarvis Carroll	d014ae0982	Handle token/parse errors more carefully	2026-02-04 07:00:39 +00:00
Jarvis Carroll	bb4bcbb7de	remove 'tk' atom from file positions	2026-02-03 06:08:54 +00:00
Jarvis Carroll	a695c21fc9	Parse address literals. Also signatures.	2026-02-03 06:00:40 +00:00
Jarvis Carroll	493bdb990c	Fix lexer row/column calculations.	2026-02-03 01:42:17 +00:00
Jarvis Carroll	17f635af61	Parse long hex escape codes This doesn't work super consistently in the compiler, for codepoints above 127, but it should work fine for us, so, oh well!	2026-02-03 00:41:00 +00:00
Jarvis Carroll	272ed01fdc	Singleton record/tuple parsing. Records are a simple case to detect and handle correctly. Tuples took an entire rewrite of the little tuple parsing bit of the code.	2026-01-30 08:12:32 +00:00
Jarvis Carroll	49cd8b6687	Parse strings	2026-01-29 06:18:06 +00:00
Jarvis Carroll	966b4b2748	Calculate scalar values during lexing This saves some effort and probably some performance for things like integers, but I'm mainly doing this in anticipation of string literals, because it would just be ridiculous to read code that lexes string literals twice.	2026-01-29 04:06:19 +00:00
Jarvis Carroll	fe182a5233	Handle underscores in integers/bytes This forces us to test for alpha/num/hex enough times that it's now worth making macros for these things.	2026-01-29 03:03:11 +00:00
Jarvis Carroll	f1696e2b9e	Bytes lexing I don't handle underscores in bytes correctly... Nor in integers, for that matter.	2026-01-29 02:01:16 +00:00
Jarvis Carroll	2bf384ca82	Infer correct values for tests automatically Now tests compare the literal parser against the output of the compiler. The little example contracts we are compiling for the AACI already had the FATE value in them, in the form of the instruction {'RETURNR', {immediate, FateValue}} so we just extract that and use it for the tests.	2026-01-27 06:42:55 +00:00
Jarvis Carroll	4f2a3c6c6f	Variant parsing	2026-01-23 06:18:39 +00:00
Jarvis Carroll	7df04a81be	Tuple parsing	2026-01-23 02:45:23 +00:00
Jarvis Carroll	6f02d4c4e6	Record parsing	2026-01-23 00:48:06 +00:00
Jarvis Carroll	56e63051bc	Map parsing	2026-01-16 05:46:27 +00:00
Jarvis Carroll	3f1c9bd626	List parsing Slowly chipping away at cases...	2026-01-15 09:38:04 +00:00
Jarvis Carroll	97e32574c4	set up parsing structure We tokenize, and then do the simplest possible recursive descent. We don't want to evaluate anything, so infix operators are out, meaning no shunting yard or tree rearranging or LR(1) shenanigans are necessary, just write the code. If we want to 'peek', just take the next token, and pass it around from that point on, until it can actually be consumed.	2026-01-15 01:52:30 +00:00
Jarvis Carroll	6f5525afcf	Rename get_function_signature hz_aaci:aaci_get_function_signature is a bit redundant.	2026-01-15 01:50:50 +00:00
Jarvis Carroll	4f1958b210	use lists:unzip/1 Just a little thing I noticed could be improved.	2026-01-13 01:19:29 +00:00
Jarvis Carroll	3da9bd570b	split coerce/3 into two functions Also renamed coerce_bindings to erlang_args_to_fate, to match.	2026-01-09 04:39:58 +00:00
Jarvis Carroll	d2163c1ff8	split AACI out of hz.erl So far the interface to hz.erl is mostly unchanged, apart from prepare_aaci/1 Maybe prepare_aaci should be re-exported, but using it is exactly in line with the 'inconvenient but more flexible primitives' that hz_aaci.erl is meant to represent, so, maybe that is a fine place to have to go for it, dunno.	2026-01-07 09:40:55 +00:00