%%% hz_sophia: parse Sophia literal expressions ("object notation") into the
%%% corresponding FATE terms, optionally checking them against an expected
%%% (normalized) type.
-module(hz_sophia).
-vsn("0.8.2").
-author("Jarvis Carroll ").
-copyright("Jarvis Carroll ").
-license("GPL-3.0-or-later").

-export([parse_literal/1, parse_literal/2, check_parser/1]).

-include_lib("eunit/include/eunit.hrl").

%% Parse a literal with no expected type information.
parse_literal(String) ->
    parse_literal(unknown_type(), String).

%% Parse a literal against the given type.
%% Returns {ok, FateTerm} | {error, Reason}.
parse_literal(Type, String) ->
    case parse_expression(Type, {tk, 1, 1}, String) of
        {ok, {Result, NewTk, NewString}} ->
            parse_literal2(Result, NewTk, NewString);
        {error, Reason} ->
            {error, Reason}
    end.

% We have parsed a valid expression. Now check that the string ends.
parse_literal2(Result, Tk, String) ->
    case next_token(Tk, String) of
        {ok, {{eof, _, _, _, _}, _, _}} ->
            {ok, Result};
        {ok, {{_, S, _, Row, Start, End}, _, _}} ->
            {error, {unexpected_token, S, Row, Start, End}};
        {error, Reason} ->
            {error, Reason}
    end.

%%% Tokenizer

-define(IS_ALPHA(C),
        ((((C) >= $A) and ((C) =< $Z)) or
         (((C) >= $a) and ((C) =< $z)) or
         ((C) == $_))).
-define(IS_NUM(C), (((C) >= $0) and ((C) =< $9))).
-define(IS_ALPHANUM(C), (?IS_ALPHA(C) or ?IS_NUM(C))).
-define(IS_HEX(C),
        (?IS_NUM(C) or
         (((C) >= $A) and ((C) =< $F)) or
         (((C) >= $a) and ((C) =< $f)))).
%% next_token(Tk, String) -> {ok, {Token, NewTk, Rest}} | {error, Reason}.
%%
%% Tk is a {tk, Row, Col} cursor (both 1-based). Ordinary tokens are
%% six-tuples {Kind, SourceText, Value, Row, StartCol, EndCol}; the
%% end-of-input token is the five-tuple {eof, "", Row, Col, Col}.
next_token({tk, Row, Col}, []) ->
    {ok, {{eof, "", Row, Col, Col}, {tk, Row, Col}, []}};
next_token({tk, Row, Col}, " " ++ Rest) ->
    next_token({tk, Row, Col + 1}, Rest);
next_token({tk, Row, Col}, "\t" ++ Rest) ->
    next_token({tk, Row, Col + 1}, Rest);
next_token({tk, Row, _}, "\r\n" ++ Rest) ->
    next_token({tk, Row + 1, 1}, Rest);
next_token({tk, Row, _}, "\r" ++ Rest) ->
    next_token({tk, Row + 1, 1}, Rest);
next_token({tk, Row, _}, "\n" ++ Rest) ->
    next_token({tk, Row + 1, 1}, Rest);
next_token(Tk, [C | _] = String) when ?IS_ALPHA(C) ->
    alphanum_token(Tk, Tk, String, []);
next_token(Tk, [C | _] = String) when ?IS_NUM(C) ->
    num_token(Tk, Tk, String, [], 0);
next_token({tk, Row, Col}, [$#, C | Rest]) when ?IS_HEX(C) ->
    bytes_token({tk, Row, Col}, {tk, Row, Col + 1}, [C | Rest], "#", []);
next_token({tk, Row, Col}, "\"" ++ Rest) ->
    string_token({tk, Row, Col}, {tk, Row, Col + 1}, Rest, "\"", <<>>);
next_token({tk, Row, Col}, [Char | Rest]) ->
    %% Any other single character (punctuation, brackets, ...) is its own
    %% token.
    Token = {character, [Char], Char, Row, Col, Col},
    {ok, {Token, {tk, Row, Col + 1}, Rest}}.

%% Consume a run of [A-Za-z0-9_] characters into an alphanum token.
%% The characters are accumulated in reverse, as is usual.
alphanum_token(Start, {tk, Row, Col}, [C | Rest], RevChars) when ?IS_ALPHANUM(C) ->
    alphanum_token(Start, {tk, Row, Col + 1}, Rest, [C | RevChars]);
alphanum_token({tk, _, Start}, {tk, Row, End}, String, RevChars) ->
    AlphaString = lists:reverse(RevChars),
    Token = {alphanum, AlphaString, AlphaString, Row, Start, End - 1},
    {ok, {Token, {tk, Row, End}, String}}.

%% Consume a decimal integer token, allowing single underscores between
%% digits (Sophia's digit-group separator, e.g. 1_000).
num_token(Start, {tk, Row, Col}, [C | Rest], RevChars, Value) when ?IS_NUM(C) ->
    num_token(Start, {tk, Row, Col + 1}, Rest, [C | RevChars],
              Value * 10 + (C - $0));
num_token(Start, {tk, Row, Col}, [$_, C | Rest], RevChars, Value) when ?IS_NUM(C) ->
    num_token(Start, {tk, Row, Col + 2}, Rest, [C, $_ | RevChars],
              Value * 10 + (C - $0));
num_token({tk, _, Start}, {tk, Row, End}, String, RevChars, Value) ->
    Token = {integer, lists:reverse(RevChars), Value, Row, Start, End - 1},
    {ok, {Token, {tk, Row, End}, String}}.
%% Consume hex digits (single underscores allowed between digits) into a
%% bytes token, e.g. #DEAD_BEEF. The leading "#" was already consumed.
bytes_token(Start, {tk, Row, Col}, [C | Rest], Chars, Digits) when ?IS_HEX(C) ->
    Digit = convert_digit(C),
    bytes_token(Start, {tk, Row, Col + 1}, Rest, [C | Chars], [Digit | Digits]);
bytes_token(Start, {tk, Row, Col}, [$_, C | Rest], Chars, Digits) when ?IS_HEX(C) ->
    Digit = convert_digit(C),
    % Two characters are consumed here, so advance the column by 2. (This
    % previously advanced by 1, leaving error positions off by one per
    % underscore; compare the underscore clause of num_token.)
    bytes_token(Start, {tk, Row, Col + 2}, Rest, [C, $_ | Chars], [Digit | Digits]);
bytes_token({tk, _, Start}, {tk, Row, End}, String, Chars, Digits) ->
    BytesString = lists:reverse(Chars),
    Value = reverse_combine_nibbles(Digits, <<>>),
    Token = {bytes, BytesString, Value, Row, Start, End - 1},
    {ok, {Token, {tk, Row, End}, String}}.

%% Map a hex digit character to its numeric value, 0..15.
convert_digit(C) when C >= $0, C =< $9 -> C - $0;
convert_digit(C) when C >= $A, C =< $Z -> C - $A + 10;
convert_digit(C) when C >= $a, C =< $z -> C - $a + 10.

%% The digit list arrives most-recent-first (reversed), so pair nibbles up
%% from the back of the number, prepending each completed byte. A leftover
%% odd digit gets a zero high nibble, so #ABC =:= #0ABC.
reverse_combine_nibbles([D1, D2 | Rest], Acc) ->
    NewAcc = <<D2:4, D1:4, Acc/binary>>,
    reverse_combine_nibbles(Rest, NewAcc);
reverse_combine_nibbles([D1], Acc) ->
    <<0:4, D1:4, Acc/binary>>;
reverse_combine_nibbles([], Acc) ->
    Acc.

%% Consume the remainder of a string literal (the opening quote was already
%% consumed), building the value as a UTF-8 binary. SourceChars accumulates
%% the raw source text in reverse, for error reporting.
string_token(Start, {tk, Row, Col}, "\\x" ++ String, SourceChars, Value) ->
    case escape_hex_code({tk, Row, Col}, {tk, Row, Col + 2}, String, "x\\" ++ SourceChars) of
        {ok, {Codepoint, NewSourceChars, NewTk, NewString}} ->
            % NOTE(review): the escape value is re-encoded as UTF-8 here; for
            % the two-digit \xAB form this widens bytes >= 0x80 into two
            % bytes -- confirm against the intended compiler behaviour.
            NewValue = <<Value/binary, Codepoint/utf8>>,
            string_token(Start, NewTk, NewString, NewSourceChars, NewValue);
        {error, Reason} ->
            {error, Reason}
    end;
string_token(Start, {tk, Row, Col}, [$\\, C | Rest], SourceChars, Value) ->
    case escape_char(C) of
        {ok, ByteVal} ->
            string_token(Start, {tk, Row, Col + 2}, Rest, [C, $\\ | SourceChars],
                         <<Value/binary, ByteVal>>);
        error ->
            {error, {invalid_escape_code, [C], Row, Col}}
    end;
string_token({tk, _, Start}, {tk, Row, Col}, [$" | Rest], SourceChars, Value) ->
    SourceStr = lists:reverse([$" | SourceChars]),
    Token = {string, SourceStr, Value, Row, Start, Col},
    {ok, {Token, {tk, Row, Col + 1}, Rest}};
string_token(Start, {tk, Row, Col}, [C | Rest], SourceChars, Value) ->
    % TODO: ERTS probably had to convert this FROM utf8 at some point, so why
    % bother, if we need to convert it back? I guess we could accept iolists if
    % we really wanted to waste time on this point...
    string_token(Start, {tk, Row, Col + 1}, Rest, [C | SourceChars],
                 <<Value/binary, C/utf8>>);
string_token({tk, _, StartCol}, {tk, Row, Col}, [], _SourceChars, _Value) ->
    % Input ended before the closing quote. This is the "unclosed string
    % error" that escape_long_hex_code's comment refers to; previously there
    % was no clause for [], so unterminated strings crashed.
    {error, {unclosed_string, Row, StartCol, Col}}.

%% Parse the body of a \x escape: either \x{HEX...} or exactly two hex
%% digits. Returns {ok, {Codepoint, NewSourceChars, NewTk, Rest}}.
escape_hex_code(Start, {tk, Row, Col}, "{" ++ String, SourceChars) ->
    escape_long_hex_code(Start, {tk, Row, Col + 1}, String, "{" ++ SourceChars, 0);
escape_hex_code(_, {tk, Row, Col}, [A, B | String], SourceChars) when ?IS_HEX(A), ?IS_HEX(B) ->
    % As of writing this, the Sophia compiler will convert this byte from
    % extended ASCII to unicode... But it really shouldn't. The literal parser
    % does what the compiler should do.
    Byte = convert_digit(A) * 16 + convert_digit(B),
    {ok, {Byte, [B, A | SourceChars], {tk, Row, Col + 2}, String}};
escape_hex_code({tk, Row1, Col1}, _, _, _) ->
    {error, {invalid_escape_code, "\\x", Row1, Col1}}.

%% Parse the digits of a \x{HEX...} escape up to the closing brace.
escape_long_hex_code(_, {tk, Row, Col}, "}" ++ String, SourceChars, Value) ->
    {ok, {Value, "}" ++ SourceChars, {tk, Row, Col + 1}, String}};
escape_long_hex_code(Start, {tk, Row, Col}, [C | String], SourceChars, Value) when ?IS_HEX(C) ->
    NewSourceChars = [C | SourceChars],
    NewValue = 16 * Value + convert_digit(C),
    escape_long_hex_code(Start, {tk, Row, Col + 1}, String, NewSourceChars, NewValue);
escape_long_hex_code(_, {tk, Row, Col}, [C | _], _, _) ->
    {error, {invalid_hexadecimal, [C], Row, Col}};
escape_long_hex_code(_, Tk, [], SourceChars, Value) ->
    % Just return as if the escape code were closed, and let the string parser
    % produce an unclosed string error instead.
    {ok, {Value, SourceChars, Tk, []}}.

%% Single-character string escapes; {ok, Byte} or error for unknown codes.
escape_char($b) -> {ok, $\b};
escape_char($e) -> {ok, $\e};
escape_char($f) -> {ok, $\f};
escape_char($n) -> {ok, $\n};
escape_char($r) -> {ok, $\r};
escape_char($t) -> {ok, $\t};
escape_char($v) -> {ok, $\v};
escape_char($") -> {ok, $\"};
escape_char($\\) -> {ok, $\\};
escape_char(_) -> error.

%%% Sophia Literal Parser
%%% This parser is a simple recursive descent parser, written explicitly in
%%% erlang.
%%%
%%% There are no infix operators in the subset we want to parse, so recursive
%%% descent is fine with no special tricks, no shunting yard algorithm, no
%%% parser generators, etc.
%%%
%%% If we were writing this in C then we might want to work iteratively with an
%%% array of finite state machines, i.e. with a pushdown automaton, instead of
%%% using recursion. This is a tried and true method of making fast parsers.
%%% Recall, however, that the BEAM *is* a stack machine, written in C, so
%%% rather than writing confusing iterative code in Erlang, to simulate a
%%% pushdown automaton inside another simulated stack machine... we should just
%%% write the recursive code, thus programming the BEAM to implement the
%%% pushdown automaton that we want.

%% Parse one expression of the given (normalized) type, starting at Tk.
parse_expression(Type, Tk, String) ->
    case next_token(Tk, String) of
        {ok, {Token, NewTk, NewString}} ->
            parse_expression2(Type, NewTk, NewString, Token);
        {error, Reason} ->
            {error, Reason}
    end.

%% Dispatch on the first token of the expression. Types are three-tuples of
%% the shape {OrigName, NormalizedName, NormalizedType}.
parse_expression2(Type, Tk, String, {integer, _, Value, Row, Start, End}) ->
    case Type of
        {_, _, integer} -> {ok, {Value, Tk, String}};
        {_, _, unknown_type} -> {ok, {Value, Tk, String}};
        {O, N, _} -> {error, {wrong_type, O, N, integer, Row, Start, End}}
    end;
parse_expression2(Type, Tk, String, {bytes, _, Value, Row, Start, End}) ->
    Len = byte_size(Value),
    Result = {bytes, Value},
    case Type of
        {_, _, {bytes, [any]}} -> {ok, {Result, Tk, String}};
        {_, _, {bytes, [Len]}} -> {ok, {Result, Tk, String}};
        {_, _, {bytes, [ExpectedLen]}} ->
            {error, {bytes_wrong_size, ExpectedLen, Len, Row, Start, End}};
        {_, _, unknown_type} -> {ok, {Result, Tk, String}};
        {O, N, _} -> {error, {wrong_type, O, N, {bytes, [Len]}, Row, Start, End}}
    end;
parse_expression2(Type, Tk, String, {string, _, Value, Row, Start, End}) ->
    case Type of
        {_, _, string} -> {ok, {Value, Tk, String}};
        {_, _, unknown_type} -> {ok, {Value, Tk, String}};
        {O, N, _} -> {error, {wrong_type, O, N, string, Row, Start, End}}
    end;
parse_expression2(Type, Tk, String, {character, "[", _, Row, Start, _}) ->
    parse_list(Type, Tk, String, Row, Start);
parse_expression2(Type, Tk, String, {character, "(", _, Row, Start, _}) ->
    parse_tuple(Type, Tk, String, Row, Start);
parse_expression2(Type, Tk, String, {character, "{", _, Row, Start, _}) ->
    parse_record_or_map(Type, Tk, String, Row, Start);
parse_expression2(Type, Tk, String, {alphanum, Ident, _, Row, Start, End}) ->
    parse_variant(Type, Tk, String, Ident, Row, Start, End);
parse_expression2(_, _, _, Token) ->
    % Anything else, including the eof token (which previously crashed here
    % because it has no Value element), is a parse error.
    unexpected_token_error(Token).

%% Build a standard unexpected_token error from any token, including the
%% 5-element eof token, which does not match the generic 6-element pattern.
unexpected_token_error({eof, S, Row, Start, End}) ->
    {error, {unexpected_token, S, Row, Start, End}};
unexpected_token_error({_, S, _, Row, Start, End}) ->
    {error, {unexpected_token, S, Row, Start, End}}.

%% The type value used when no type information is available.
unknown_type() -> {unknown_type, already_normalized, unknown_type}.

%% Consume exactly the given sequence of token source-texts, e.g. ["]", "="].
expect_tokens([], Tk, String) ->
    {ok, {Tk, String}};
expect_tokens([Str | Rest], Tk, String) ->
    case next_token(Tk, String) of
        {ok, {{_, Str, _, _, _, _}, NewTk, NewString}} ->
            expect_tokens(Rest, NewTk, NewString);
        {ok, {Token, _, _}} ->
            unexpected_token_error(Token);
        {error, Reason} ->
            {error, Reason}
    end.

%%% List Parsing

parse_list({_, _, {list, [Inner]}}, Tk, String, Row, Start) ->
    parse_list_loop(Inner, Tk, String, "]", Row, Start, []);
parse_list({_, _, unknown_type}, Tk, String, Row, Start) ->
    parse_list_loop(unknown_type(), Tk, String, "]", Row, Start, []);
parse_list({O, N, _}, _, _, Row, Start) ->
    {error, {wrong_type, O, N, list, Row, Start, Start}}.

%% Element-or-close stage of the loop. CloseChar is "]" for lists and ")"
%% for untyped tuples, which reuse this loop.
parse_list_loop(Inner, Tk, String, CloseChar, Row, Start, Acc) ->
    case next_token(Tk, String) of
        {ok, {{character, CloseChar, _, _, _, _}, NewTk, NewString}} ->
            {ok, {lists:reverse(Acc), NewTk, NewString}};
        {ok, {Token, NewTk, NewString}} ->
            parse_list_loop2(Inner, NewTk, NewString, CloseChar, Row, Start, Acc, Token);
        {error, Reason} ->
            {error, Reason}
    end.

%% Parse one element, annotating any error with the element's index.
parse_list_loop2(Inner, Tk, String, CloseChar, Row, Start, Acc, Token) ->
    case parse_expression2(Inner, Tk, String, Token) of
        {ok, {Value, NewTk, NewString}} ->
            parse_list_loop3(Inner, NewTk, NewString, CloseChar, Row, Start, [Value | Acc]);
        {error, Reason} ->
            Wrapper = choose_list_error_wrapper(CloseChar),
            % TODO: Are tuple indices off by one from list indices?
            Wrapped = wrap_error(Reason, {Wrapper, length(Acc)}),
            {error, Wrapped}
    end.

%% Comma-or-close stage of the loop.
parse_list_loop3(Inner, Tk, String, CloseChar, Row, Start, Acc) ->
    case next_token(Tk, String) of
        {ok, {{character, CloseChar, _, _, _, _}, NewTk, NewString}} ->
            {ok, {lists:reverse(Acc), NewTk, NewString}};
        {ok, {{character, ",", _, _, _, _}, NewTk, NewString}} ->
            parse_list_loop(Inner, NewTk, NewString, CloseChar, Row, Start, Acc);
        {ok, {Token, _, _}} ->
            % Previously there was no clause for other tokens, so inputs like
            % "[1 2]" crashed with case_clause instead of erroring.
            unexpected_token_error(Token);
        {error, Reason} ->
            {error, Reason}
    end.

%% Pick the error-wrapper tag matching the closing character in use.
choose_list_error_wrapper("]") -> list_element;
choose_list_error_wrapper(")") -> tuple_element.

%%% Ambiguous Parenthesis Parsing

parse_tuple({_, _, unknown_type}, Tk, String, Row, Start) ->
    % An untyped tuple is a list of untyped terms, and weirdly our list parser
    % works perfectly for that, as long as we change the closing character to
    % be ")" instead of "]".
    case parse_list_loop(unknown_type(), Tk, String, ")", Row, Start, []) of
        {ok, {[Inner], NewTk, NewString}} ->
            % In Sophia, singleton tuples are unwrapped, and given the inner
            % type.
            {ok, {Inner, NewTk, NewString}};
        {ok, {TermList, NewTk, NewString}} ->
            Result = {tuple, list_to_tuple(TermList)},
            {ok, {Result, NewTk, NewString}};
        {error, Reason} ->
            {error, Reason}
    end;
parse_tuple({O, N, T}, Tk, String, _, _) ->
    % Typed tuple parsing is quite complex, because we also want to support
    % normal parentheses for grouping. It's not strictly necessary for
    % inputting data, since we don't have any infix operators in simple
    % data/term notation, but the alternatives are to generate singleton tuples
    % naively, (which are impossible to generate from Sophia,) or to hard error
    % on singleton tuples! Being faithful to Sophia is clearly nice!
    %
    % Count how many ambiguous parens there are, including the one we already
    % saw.
    case count_open_parens(Tk, String, 1) of
        {ok, {Count, Token, NewTk, NewString}} ->
            % Compare that to the amount of nesting tuple connectives are in
            % the type we are expected to produce.
            {ExcessCount, HeadType, Tails} = extract_tuple_type_info(Count, {O, N, T}, []),
            % Now work out what to do with all this information.
            parse_tuple2(O, N, ExcessCount, HeadType, Tails, NewTk, NewString, Token);
        {error, Reason} ->
            {error, Reason}
    end.

%% Count consecutive open parens, returning the first non-paren token too.
count_open_parens(Tk, String, Count) ->
    case next_token(Tk, String) of
        {ok, {{character, "(", _, _, _, _}, NewTk, NewString}} ->
            count_open_parens(NewTk, NewString, Count + 1);
        {ok, {Token, NewTk, NewString}} ->
            {ok, {Count, Token, NewTk, NewString}};
        {error, Reason} ->
            {error, Reason}
    end.

%% Peel one tuple type connective per consumed open paren, collecting the
%% sibling ("tail") element types of each layer.
extract_tuple_type_info(ParenCount, {_, _, {tuple, [Head | Rest]}}, Tails) when ParenCount > 0 ->
    % Have an open paren, and a tuple type. We need to go deeper!
    extract_tuple_type_info(ParenCount - 1, Head, [Rest | Tails]);
extract_tuple_type_info(ParenCount, HeadType, Tails) ->
    % No parens, or no more (non-empty) tuples. Stop!
    {ParenCount, HeadType, Tails}.

parse_tuple2(_, _, _, {_, _, unknown_type}, [_ | _], _, _, _) ->
    {error, "Parsing of tuples with known lengths but unknown contents is not yet implemented."};
parse_tuple2(O, N, ExcessCount, HeadType, Tails, Tk, String, {character, ")", _, Row, Col, _}) ->
    parse_empty_tuple(O, N, ExcessCount, HeadType, Tails, Tk, String, Row, Col);
parse_tuple2(O, N, ExcessCount, HeadType, Tails, Tk, String, Token) ->
    % Finished with parentheses for now, try and parse an expression out, to
    % get our head term.
    case parse_expression2(HeadType, Tk, String, Token) of
        {ok, {Result, NewTk, NewString}} ->
            % Got a head term. Now try to build all the other tuple layers.
            parse_tuple_tails(O, N, ExcessCount, Result, Tails, NewTk, NewString);
        {error, Reason} ->
            % TODO: Wrap errors here too.
            {error, Reason}
    end.

parse_empty_tuple(_, _, 0, _, Tails, _, _, Row, Col) ->
    % There are zero excess parens, meaning all our parens are tuples. Get the
    % top one.
    [Tail | _] = Tails,
    % We expected some nonzero number of elements before the close paren, but
    % got zero.
    ExpectCount = 1 + length(Tail),
    {error, {not_enough_elements, ExpectCount, 0, Row, Col}};
parse_empty_tuple(O, N, ExcessCount, {_, _, {tuple, []}}, Tails, Tk, String, _, _) ->
    % If we have some ambiguous parentheses left, we now know one of them is
    % this empty tuple.
    HeadTerm = {tuple, {}},
    NewExcessCount = ExcessCount - 1,
    % Now continue the loop as if it were an integer or something, in the head
    % position.
    parse_tuple_tails(O, N, NewExcessCount, HeadTerm, Tails, Tk, String);
parse_empty_tuple(_, _, _, {HeadO, HeadN, _}, _, _, _, Row, Col) ->
    % We were expecting a head term of a different type!
    {error, {wrong_type, HeadO, HeadN, unit, Row, Col, Col}}.

parse_tuple_tails(O, N, 0, HeadTerm, [TailTypes | ParentTails], Tk, String) ->
    % Tuples left to build, but no extra open parens to deal with, so we can
    % just parse multivalues naively, starting from the "we have a term,
    % waiting for a comma" stage of the loop.
    case parse_multivalue3(TailTypes, Tk, String, -1, -1, [HeadTerm]) of
        {ok, {Terms, NewTk, NewString}} ->
            NewHead = {tuple, list_to_tuple(Terms)},
            parse_tuple_tails(O, N, 0, NewHead, ParentTails, NewTk, NewString);
        {error, Reason} ->
            % TODO: More error wrapping?
            {error, Reason}
    end;
parse_tuple_tails(_, _, 0, HeadTerm, [], Tk, String) ->
    % No open parens left, no tuples left to build, we are done!
    {ok, {HeadTerm, Tk, String}};
parse_tuple_tails(O, N, ExcessCount, HeadTerm, Tails, Tk, String) ->
    % The ambiguous case, where we have a mix of tuple parens, and grouping
    % parens. We want to peek at the next token, to see if it closes a grouping
    % paren.
    case next_token(Tk, String) of
        {ok, {{character, ")", _, _, _, _}, NewTk, NewString}} ->
            % It is grouping! Close one excess paren, and continue.
            parse_tuple_tails(O, N, ExcessCount - 1, HeadTerm, Tails, NewTk, NewString);
        {ok, {{character, ",", _, _, _, _}, NewTk, NewString}} ->
            % It is a real tuple! Try the normal logic, then.
            parse_tuple_tails2(O, N, ExcessCount, HeadTerm, Tails, NewTk, NewString);
        {ok, {Token, _, _}} ->
            % Anything else is just a boring parse error we can complain about.
            unexpected_token_error(Token);
        {error, Reason} ->
            {error, Reason}
    end.

parse_tuple_tails2(O, N, ExcessCount, HeadTerm, [TailTypes | ParentTails], Tk, String) ->
    case parse_multivalue(TailTypes, Tk, String, -1, -1, [HeadTerm]) of
        {ok, {Terms, NewTk, NewString}} ->
            NewHead = {tuple, list_to_tuple(Terms)},
            parse_tuple_tails(O, N, ExcessCount, NewHead, ParentTails, NewTk, NewString);
        {error, Reason} ->
            % TODO: wrap errors?
            {error, Reason}
    end;
parse_tuple_tails2(O, N, _, _, [], _, _) ->
    % This case is created when, for example, we want int * int, but instead we
    % get a term like ((1, 2), 3), of type (int * int) * int. The trouble is,
    % ((1, 2)) would have been valid, so it's actually the second comma that
    % tips us off to the error, not the first one.
    %
    % For simpler cases, like (1, 2) when int was expected, this error message
    % is fine:
    Err = {error, {wrong_type, O, N, tuple, -1, -1, -1}}, % TODO: Row/col
    % TODO: Generate better error messages in the cases where N *is* a tuple,
    % but the first thing inside that tuple is the problem.
    Err.

%%% Unambiguous Tuple/Variant Parsing

%% Element-or-close stage: parse comma-separated expressions of the given
%% element types up to a closing ")".
parse_multivalue(ElemTypes, Tk, String, Row, Start, Acc) ->
    case next_token(Tk, String) of
        {ok, {{character, ")", _, Row2, Start2, _}, NewTk, NewString}} ->
            check_multivalue_long_enough(ElemTypes, NewTk, NewString, Row2, Start2, Acc);
        {ok, {Token, NewTk, NewString}} ->
            parse_multivalue2(ElemTypes, NewTk, NewString, Row, Start, Acc, Token);
        {error, Reason} ->
            {error, Reason}
    end.

parse_multivalue2([Next | Rest], Tk, String, Row, Start, Acc, Token) ->
    case parse_expression2(Next, Tk, String, Token) of
        {ok, {Value, NewTk, NewString}} ->
            parse_multivalue3(Rest, NewTk, NewString, Row, Start, [Value | Acc]);
        {error, Reason} ->
            Wrapper = choose_list_error_wrapper(")"),
            % TODO: Are tuple indices off by one from list indices?
            Wrapped = wrap_error(Reason, {Wrapper, length(Acc)}),
            {error, Wrapped}
    end;
parse_multivalue2([], Tk, String, _, _, Acc, {character, ")", _, _, _, _}) ->
    {ok, {lists:reverse(Acc), Tk, String}};
parse_multivalue2([], _, _, _, _, _, Token) ->
    unexpected_token_error(Token).

%% Comma-or-close stage.
parse_multivalue3(ElemTypes, Tk, String, Row, Start, Acc) ->
    case next_token(Tk, String) of
        {ok, {{character, ")", _, Row2, Start2, _}, NewTk, NewString}} ->
            check_multivalue_long_enough(ElemTypes, NewTk, NewString, Row2, Start2, Acc);
        {ok, {{character, ",", _, _, _, _}, NewTk, NewString}} ->
            parse_multivalue(ElemTypes, NewTk, NewString, Row, Start, Acc);
        {ok, {Token, _, _}} ->
            % BUG FIX: this clause previously re-used the already-bound Row
            % and Start variables in its pattern, so any unexpected token at a
            % different position failed to match and crashed with case_clause
            % instead of producing an unexpected_token error.
            unexpected_token_error(Token);
        {error, Reason} ->
            {error, Reason}
    end.

%% A close paren was seen; make sure no expected element types remain.
check_multivalue_long_enough([], Tk, String, _, _, Acc) ->
    {ok, {lists:reverse(Acc), Tk, String}};
check_multivalue_long_enough(Remaining, _, _, Row, Col, Got) ->
    GotCount = length(Got),
    ExpectCount = length(Remaining) + GotCount,
    {error, {not_enough_elements, ExpectCount, GotCount, Row, Col}}.

%%% Variant parsing

parse_variant({_, _, {variant, Variants}}, Tk, String, Ident, Row, Start, End) ->
    parse_variant2(Variants, Tk, String, Ident, Row, Start, End);
parse_variant({_, _, unknown_type}, _, _, _, Row, Start, End) ->
    {error, {unresolved_variant, Row, Start, End}};
parse_variant({O, N, _}, _, _, _, Row, Start, End) ->
    % In normal code, identifiers can have many meanings, which can result in
    % lots of different errors. In this Sophia 'object notation', identifiers
    % can only ever be variant constructors, (sort of like the Sophia version
    % of atoms,) and so immediately lead to a type error if we aren't expecting
    % a variant.
    {error, {wrong_type, O, N, variant, Row, Start, End}}.
%% Look the constructor up in the variant type's constructor list, then
%% parse its arguments (if any).
parse_variant2(Variants, Tk, String, Ident, Row, Start, End) ->
    case lookup_variant(Ident, Variants, 0) of
        {ok, {Tag, ElemTypes}} ->
            % FATE variant values carry the arity of every constructor.
            GetArity = fun({_, OtherElemTypes}) -> length(OtherElemTypes) end,
            Arities = lists:map(GetArity, Variants),
            parse_variant3(Arities, Tag, ElemTypes, Tk, String);
        error ->
            {error, {invalid_constructor, Ident, Row, Start, End}}
    end.

parse_variant3(Arities, Tag, [], Tk, String) ->
    % Parsing of 0-arity variants is different.
    Result = {variant, Arities, Tag, {}},
    {ok, {Result, Tk, String}};
parse_variant3(Arities, Tag, ElemTypes, Tk, String) ->
    case next_token(Tk, String) of
        {ok, {{character, "(", _, Row, Start, _}, NewTk, NewString}} ->
            parse_variant4(Arities, Tag, ElemTypes, NewTk, NewString, Row, Start);
        {ok, {{_, Actual, _, Row, Start, End}, _, _}} ->
            {error, {unexpected_token, Actual, Row, Start, End}};
        {ok, {{eof, S, Row, Start, End}, _, _}} ->
            % The eof token has no Value element, so it does not match the
            % generic 6-tuple pattern above; handle it explicitly so a
            % truncated input errors instead of crashing with case_clause.
            {error, {unexpected_token, S, Row, Start, End}};
        {error, Reason} ->
            {error, Reason}
    end.

%% Parse the parenthesised constructor arguments.
parse_variant4(Arities, Tag, ElemTypes, Tk, String, Row, Start) ->
    case parse_multivalue(ElemTypes, Tk, String, Row, Start, []) of
        {ok, {Terms, NewTk, NewString}} ->
            Result = {variant, Arities, Tag, list_to_tuple(Terms)},
            {ok, {Result, NewTk, NewString}};
        {error, Reason} ->
            {error, Reason}
    end.

%% Find a constructor by name, returning its zero-based tag and element
%% types.
lookup_variant(_, [], _) ->
    error;
lookup_variant(Ident, [{Ident, ElemTypes} | _], Tag) ->
    {ok, {Tag, ElemTypes}};
lookup_variant(Ident, [_ | Rest], Tag) ->
    lookup_variant(Ident, Rest, Tag + 1).
%%% Record parsing

parse_record_or_map({_, _, {map, [KeyType, ValueType]}}, Tk, String, _, _) ->
    parse_map(KeyType, ValueType, Tk, String, #{});
parse_record_or_map({_, _, {record, Fields}}, Tk, String, _, _) ->
    parse_record(Fields, Tk, String, #{});
parse_record_or_map({_, _, unknown_type}, Tk, String, _, _) ->
    % Without type info, peek one token to distinguish "{}" (empty map),
    % "{[" (map entry), and "{ident" (a record, which we can't resolve).
    case next_token(Tk, String) of
        {ok, {{character, "}", _, _, _, _}, NewTk, NewString}} ->
            {ok, {#{}, NewTk, NewString}};
        {ok, {{character, "[", _, _, _, _}, NewTk, NewString}} ->
            parse_map2(unknown_type(), unknown_type(), NewTk, NewString, #{});
        {ok, {{alphanum, _, _, Row, Start, End}, _, _}} ->
            {error, {unresolved_record, Row, Start, End}};
        {ok, {{_, S, _, Row, Start, End}, _, _}} ->
            {error, {unexpected_token, S, Row, Start, End}};
        {ok, {{eof, S, Row, Start, End}, _, _}} ->
            % The 5-element eof token does not match the generic pattern
            % above; handle it so "{" alone errors instead of crashing.
            {error, {unexpected_token, S, Row, Start, End}};
        {error, Reason} ->
            {error, Reason}
    end;
parse_record_or_map({O, N, _}, _, _, Row, Start) ->
    {error, {wrong_type, O, N, map, Row, Start, Start}}.

%% Field-name-or-close stage of the record loop.
parse_record(Fields, Tk, String, Acc) ->
    case next_token(Tk, String) of
        {ok, {{alphanum, Ident, _, Row, Start, End}, NewTk, NewString}} ->
            parse_record2(Fields, NewTk, NewString, Acc, Ident, Row, Start, End);
        {ok, {{character, "}", _, Row, Start, End}, NewTk, NewString}} ->
            parse_record_end(Fields, NewTk, NewString, Acc, Row, Start, End);
        {ok, {{_, S, _, Row, Start, End}, _, _}} ->
            {error, {unexpected_token, S, Row, Start, End}};
        {ok, {{eof, S, Row, Start, End}, _, _}} ->
            % See parse_record_or_map: eof needs its own clause.
            {error, {unexpected_token, S, Row, Start, End}};
        {error, Reason} ->
            {error, Reason}
    end.

%% Check the field name exists in the record type.
parse_record2(Fields, Tk, String, Acc, Ident, Row, Start, End) ->
    case lists:keyfind(Ident, 1, Fields) of
        {_, Type} ->
            parse_record3(Fields, Tk, String, Acc, Ident, Row, Start, End, Type);
        false ->
            {error, {invalid_field, Ident, Row, Start, End}}
    end.

%% Check the field has not already been given a value.
parse_record3(Fields, Tk, String, Acc, Ident, Row, Start, End, Type) ->
    case maps:is_key(Ident, Acc) of
        false ->
            parse_record4(Fields, Tk, String, Acc, Ident, Type);
        true ->
            {error, {field_already_present, Ident, Row, Start, End}}
    end.
%% Expect the "=" between field name and value.
parse_record4(Fields, Tk, String, Acc, Ident, Type) ->
    case expect_tokens(["="], Tk, String) of
        {ok, {NewTk, NewString}} ->
            parse_record5(Fields, NewTk, NewString, Acc, Ident, Type);
        {error, Reason} ->
            {error, Reason}
    end.

%% Parse the field's value expression.
parse_record5(Fields, Tk, String, Acc, Ident, Type) ->
    case parse_expression(Type, Tk, String) of
        {ok, {Result, NewTk, NewString}} ->
            NewAcc = maps:put(Ident, Result, Acc),
            parse_record6(Fields, NewTk, NewString, NewAcc);
        {error, Reason} ->
            % BUG FIX: the wrapped reason must itself be re-wrapped in an
            % {error, _} tuple; previously the bare Reason was returned,
            % breaking the module's {ok, _} | {error, _} convention.
            {error, wrap_error(Reason, {record_field, Ident})}
    end.

%% Comma-or-close stage of the record loop.
parse_record6(Fields, Tk, String, Acc) ->
    case next_token(Tk, String) of
        {ok, {{character, ",", _, _, _, _}, NewTk, NewString}} ->
            parse_record(Fields, NewTk, NewString, Acc);
        {ok, {{character, "}", _, Row, Start, End}, NewTk, NewString}} ->
            parse_record_end(Fields, NewTk, NewString, Acc, Row, Start, End);
        {ok, {{_, S, _, Row, Start, End}, _, _}} ->
            {error, {unexpected_token, S, Row, Start, End}};
        {ok, {{eof, S, Row, Start, End}, _, _}} ->
            % The 5-element eof token does not match the generic pattern
            % above; handle it so truncated input errors instead of crashing.
            {error, {unexpected_token, S, Row, Start, End}};
        {error, Reason} ->
            {error, Reason}
    end.

%% The record closed; assemble the fields in type order, erroring on any
%% field that was never given a value.
parse_record_end(Fields, Tk, String, FieldValues, Row, Start, End) ->
    case parse_record_final_loop(Fields, FieldValues, []) of
        {ok, Result} ->
            {ok, {Result, Tk, String}};
        {error, {missing_field, Name}} ->
            {error, {missing_field, Name, Row, Start, End}}
    end.

parse_record_final_loop([{Name, _} | Rest], FieldValues, Acc) ->
    case maps:find(Name, FieldValues) of
        {ok, Value} ->
            parse_record_final_loop(Rest, FieldValues, [Value | Acc]);
        error ->
            {error, {missing_field, Name}}
    end;
parse_record_final_loop([], _, [Field]) ->
    % Singleton records are type-checked in Sophia, but unwrapped in the
    % resulting FATE.
    {ok, Field};
parse_record_final_loop([], _, FieldsReverse) ->
    Fields = lists:reverse(FieldsReverse),
    Tuple = list_to_tuple(Fields),
    {ok, {tuple, Tuple}}.
%%% Map Parsing

%% Entry-or-close stage: a map literal is "{" ("[" Key "]" "=" Value ","?)* "}".
parse_map(KeyType, ValueType, Tk, String, Acc) ->
    case next_token(Tk, String) of
        {ok, {{character, "[", _, _, _, _}, NewTk, NewString}} ->
            parse_map2(KeyType, ValueType, NewTk, NewString, Acc);
        {ok, {{character, "}", _, _, _, _}, NewTk, NewString}} ->
            {ok, {Acc, NewTk, NewString}};
        {ok, {{_, S, _, Row, Start, End}, _, _}} ->
            {error, {unexpected_token, S, Row, Start, End}};
        {ok, {{eof, S, Row, Start, End}, _, _}} ->
            % The 5-element eof token does not match the generic pattern
            % above; handle it so truncated input errors instead of crashing.
            {error, {unexpected_token, S, Row, Start, End}};
        {error, Reason} ->
            {error, Reason}
    end.

%% Parse the key expression (the "[" was already consumed).
parse_map2(KeyType, ValueType, Tk, String, Acc) ->
    case parse_expression(KeyType, Tk, String) of
        {ok, {Result, NewTk, NewString}} ->
            parse_map3(KeyType, ValueType, NewTk, NewString, Acc, Result);
        {error, Reason} ->
            % BUG FIX: re-wrap in {error, _}; previously the bare Reason was
            % returned, breaking the {ok, _} | {error, _} convention.
            {error, wrap_error(Reason, {map_key, maps:size(Acc)})}
    end.

%% Expect the "] =" between key and value.
parse_map3(KeyType, ValueType, Tk, String, Acc, Key) ->
    case expect_tokens(["]", "="], Tk, String) of
        {ok, {NewTk, NewString}} ->
            parse_map4(KeyType, ValueType, NewTk, NewString, Acc, Key);
        {error, Reason} ->
            {error, Reason}
    end.

%% Parse the value expression and store the entry.
parse_map4(KeyType, ValueType, Tk, String, Acc, Key) ->
    case parse_expression(ValueType, Tk, String) of
        {ok, {Result, NewTk, NewString}} ->
            NewAcc = maps:put(Key, Result, Acc),
            parse_map5(KeyType, ValueType, NewTk, NewString, NewAcc);
        {error, Reason} ->
            % TODO: wrap as a map_value error, like map_key above?
            {error, Reason}
    end.

%% Comma-or-close stage of the map loop.
parse_map5(KeyType, ValueType, Tk, String, Acc) ->
    case next_token(Tk, String) of
        {ok, {{character, ",", _, _, _, _}, NewTk, NewString}} ->
            parse_map(KeyType, ValueType, NewTk, NewString, Acc);
        {ok, {{character, "}", _, _, _, _}, NewTk, NewString}} ->
            {ok, {Acc, NewTk, NewString}};
        {ok, {{_, S, _, Row, Start, End}, _, _}} ->
            {error, {unexpected_token, S, Row, Start, End}};
        {ok, {{eof, S, Row, Start, End}, _, _}} ->
            % See parse_map: eof needs its own clause.
            {error, {unexpected_token, S, Row, Start, End}};
        {error, Reason} ->
            {error, Reason}
    end.

% TODO: annotate Reason with the given context (currently a no-op stub; the
% call sites already pass the intended wrapper tags).
wrap_error(Reason, _) -> Reason.

%%% Tests

%% Assert that parsing Sophia under Type yields exactly the Fate term.
check_sophia_to_fate(Type, Sophia, Fate) ->
    case parse_literal(Type, Sophia) of
        {ok, Fate} ->
            ok;
        {ok, FateActual} ->
            erlang:error({to_fate_failed, Sophia, Fate, {ok, FateActual}});
        {error, Reason} ->
            erlang:error({to_fate_failed, Sophia, Fate, {error, Reason}})
    end.
%% Compile Source with the Sophia compiler and return the FATE basic blocks
%% and the AACI type signature for the named entrypoint.
compile_entrypoint_code_and_type(Source, Entrypoint) ->
    {ok, #{fate_code := FateCode, aci := ACI}} =
        so_compiler:from_string(Source, [{aci, json}]),
    % Find the fcode for the correct entrypoint.
    {fcode, Bodies, NamesMap, _} = FateCode,
    Names = maps:to_list(NamesMap),
    Name = unicode:characters_to_binary(Entrypoint),
    {Hash, Name} = lists:keyfind(Name, 2, Names),
    {_, _, Code} = maps:get(Hash, Bodies),
    % Generate the AACI, and get the AACI type info for the correct entrypoint.
    AACI = hz_aaci:prepare_aaci(ACI),
    % BUG FIX: look up the requested entrypoint, not the hardcoded "f". Both
    % current callers pass "f", so behaviour is unchanged for them.
    {ok, {_, Type}} = hz_aaci:get_function_signature(AACI, Entrypoint),
    {Code, Type}.

%% A literal entrypoint compiles to a single RETURNR of an immediate value;
%% anything else means the input wasn't a plain literal.
extract_return_value(#{0 := [{'RETURNR', {immediate, FATE}}]}) ->
    FATE;
extract_return_value(Code) ->
    erlang:exit({invalid_literal_fcode, Code}).

check_parser(Sophia) ->
    % Compile the literal using the compiler, to check that it is valid Sophia
    % syntax, and to get an AACI object to pass to the parser.
    Source = "contract C = entrypoint f() = " ++ Sophia,
    {Code, Type} = compile_entrypoint_code_and_type(Source, "f"),
    % Check that when we parse the term we get the same value as the Sophia
    % compiler.
    Fate = extract_return_value(Code),
    check_sophia_to_fate(unknown_type(), Sophia, Fate),
    % Then, once we know that the term is correct, make sure that it is still
    % accepted *with* type info.
    check_sophia_to_fate(Type, Sophia, Fate).

check_parser_with_typedef(Typedef, Sophia) ->
    % Compile the type definitions alongside the usual literal expression.
    Source = "contract C =\n " ++ Typedef ++ "\n entrypoint f() = " ++ Sophia,
    {Code, Type} = compile_entrypoint_code_and_type(Source, "f"),
    Fate = extract_return_value(Code),
    % Check the FATE term is at least serializable.
    gmb_fate_encoding:serialize(Fate),
    % Do a typed parse, as usual, but there are probably record/variant
    % definitions in the AACI, so untyped parses probably don't work.
    check_sophia_to_fate(Type, Sophia, Fate).

anon_types_test() ->
    % Integers.
    check_parser("123"),
    check_parser("1_2_3"),
    % Bytes.
    check_parser("#DEAD000BEEF"),
    check_parser("#DE_AD0_00B_EEF"),
    % Strings.
    check_parser("\"hello world\""),
    % List of integers.
    check_parser("[1, 2, 3]"),
    % List of lists.
    check_parser("[[], [1], [2, 3]]"),
    % Tuple.
    check_parser("(1, [2, 3], (4, 5))"),
    % Map.
    check_parser("{[1] = 2, [3] = 4}"),
    ok.

string_escape_codes_test() ->
    check_parser("\" \\b\\e\\f\\n\\r\\t\\v\\\"\\\\ \""),
    check_parser("\"\\x00\\x11\\x77\\x4a\\x4A\""),
    check_parser("\"\\x{0}\\x{7}\\x{7F}\\x{07F}\\x{007F}\\x{0007F}\\x{0000007F}\""),
    ok.

records_test() ->
    TypeDef = "record pair = {x: int, y: int}",
    Sophia = "{x = 1, y = 2}",
    check_parser_with_typedef(TypeDef, Sophia),
    % The above won't run an untyped parse on the expression, but we can. It
    % will error, though.
    {error, {unresolved_record, _, _, _}} = parse_literal(unknown_type(), Sophia).

singleton_records_test() ->
    TypeDef = "record singleton('a) = {it: 'a}",
    check_parser_with_typedef(TypeDef, "{it = 123}"),
    check_parser_with_typedef(TypeDef, "{it = {it = {it = 5}}}"),
    check_parser_with_typedef(TypeDef, "[{it = 1}, {it = 2}, {it = 3}]"),
    ok.

singleton_variants_test() ->
    % Similar tests to the singleton records, but this time there isn't
    % actually a special case; singleton variants are in fact wrapped in the
    % FATE too.
    TypeDef = "datatype wrapped('a) = Wrap('a)",
    check_parser_with_typedef(TypeDef, "Wrap(123)"),
    check_parser_with_typedef(TypeDef, "Wrap(Wrap(123))"),
    check_parser_with_typedef(TypeDef, "[Wrap(1), Wrap(2), Wrap(3)]"),
    ok.

excess_parens_test() ->
    % 'singleton' parens are another special case, but unlike singleton
    % records, which exist in the type system, singleton parens aren't tuples
    % at all! They are just grouping, for arithmetic. For example.
    check_parser("(123)"),
    check_parser("[1, (2), ((3))]"),
    % Where this gets tricky, though, is when grouping parens are mixed with
    % tuple parens. E.g. this list of three tuples should all parse to the same
    % result.
    check_parser("[((1, 2)), ((1), 2), (((1), 2))]"),
    % Including multiple nestings of tuples and grouping, interleaved.
    check_parser("((((1), ((2, 3)))), 4)"),
    % Also empty tuples exist!
    check_parser("()"),
    check_parser("(((((), ())), ()))"),
    ok.

variant_test() ->
    TypeDef = "datatype multi('a) = Zero | One('a) | Two('a, 'a)",
    check_parser_with_typedef(TypeDef, "Zero"),
    check_parser_with_typedef(TypeDef, "One(0)"),
    check_parser_with_typedef(TypeDef, "Two(0, 1)"),
    check_parser_with_typedef(TypeDef, "Two([], [1, 2, 3])"),
    {error, {unresolved_variant, _, _, _}} = parse_literal(unknown_type(), "Zero"),
    ok.

lexer_offset_test() ->
    % Test that various tokens report their position correctly.
    {error, {unexpected_token, "456", 1, 5, 7}} = parse_literal("123 456"),
    {error, {unexpected_token, "[", 1, 5, 5}} = parse_literal("123 [0]"),
    {error, {unexpected_token, "abc", 1, 5, 7}} = parse_literal("123 abc"),
    {error, {unexpected_token, "#AA", 1, 5, 7}} = parse_literal("123 #AA"),
    {error, {unexpected_token, "\"x\"", 1, 5, 7}} = parse_literal("123 \"x\""),
    {error, {unexpected_token, "\"\\x{123}\"", 1, 5, 13}} =
        parse_literal("123 \"\\x{123}\""),
    % Check that the tokenizer knows its position correctly *after* various
    % tokens.
    {error, {unexpected_token, "123", 1, 5, 7}} = parse_literal("[0] 123"),
    ABCType = {"mytype", already_normalized, {variant, [{"abc", []}]}},
    {error, {unexpected_token, "123", 1, 5, 7}} = parse_literal(ABCType, "abc 123"),
    {error, {unexpected_token, "123", 1, 5, 7}} = parse_literal("#AA 123"),
    {error, {unexpected_token, "123", 1, 5, 7}} = parse_literal("\"x\" 123"),
    {error, {unexpected_token, "123", 1, 11, 13}} = parse_literal("\"\\x{123}\" 123"),
    % Check that the tokenizer accounts for various line separators correctly.
    {error, {unexpected_token, "abc", 2, 1, 3}} = parse_literal("123\nabc"),
    {error, {unexpected_token, "abc", 2, 1, 3}} = parse_literal("123\r\nabc"),
    {error, {unexpected_token, "abc", 2, 1, 3}} = parse_literal("123\rabc"),
    ok.