Singleton record/tuple parsing.

Records are a simple case to detect and handle correctly. Tuples required a complete rewrite of the small tuple-parsing section of the code.
2026-01-30 08:12:32 +00:00 · 2026-01-30 08:12:32 +00:00 · 272ed01fdc
commit 272ed01fdc
parent 49cd8b6687
1 changed files with 198 additions and 26 deletions
--- a/src/hz_sophia.erl
+++ b/src/hz_sophia.erl
@ -223,7 +223,7 @@ expect_tokens([Str | Rest], Tk, String) ->
    case next_token(Tk, String) of
        {ok, {{_, Str, _, _, _, _}, NewTk, NewString}} ->
            expect_tokens(Rest, NewTk, NewString);
-        {ok, {{_, Actual, _, Row, Start, End}}} ->
+        {ok, {{_, Actual, _, Row, Start, End}, _, _}} ->
            {error, {unexpected_token, Actual, Row, Start, End}}
    end.
@ -268,33 +268,158 @@ parse_list_loop3(Inner, Tk, String, CloseChar, Row, Start, Acc) ->
 choose_list_error_wrapper("]") -> list_element;
 choose_list_error_wrapper(")") -> tuple_element.
-%%% Tuple Parsing
+%%% Ambiguous Parenthesis Parsing
 parse_tuple({_, _, {tuple, Types}}, Tk, String, Row, Start) ->
    case parse_multivalue(Types, Tk, String, Row, Start, []) of
        {ok, {TermList, NewTk, NewString}} ->
            Result = {tuple, list_to_tuple(TermList)},
            {ok, {Result, NewTk, NewString}};
        {error, Reason} ->
            {error, Reason}
    end;
 parse_tuple({_, _, unknown_type}, Tk, String, Row, Start) ->
    % An untyped tuple is a list of untyped terms, and weirdly our list parser
    % works perfectly for that, as long as we change the closing character to
    % be ")" instead of "]".
    case parse_list_loop(unknown_type(), Tk, String, ")", Row, Start, []) of
        {ok, {[Inner], NewTk, NewString}} ->
            % In Sophia, singleton tuples are unwrapped, and given the inner
            % type.
            {ok, {Inner, NewTk, NewString}};
        {ok, {TermList, NewTk, NewString}} ->
            Result = {tuple, list_to_tuple(TermList)},
            {ok, {Result, NewTk, NewString}};
        {error, Reason} ->
            {error, Reason}
    end;
-parse_tuple({O, N, _}, _, _, Row, Start) ->
+parse_tuple({O, N, T}, Tk, String, _, _) ->
-    {error, {wrong_type, O, N, tuple, Row, Start, Start}}.
+    % Typed tuple parsing is quite complex, because we also want to support
    % normal parentheses for grouping. It's not strictly necessary for
    % inputting data, since we don't have any infix operators in simple
    % data/term notation, but the alternatives are to generate singleton tuples
    % naively, (which are impossible to generate from Sophia,) or to hard error
    % on singleton tuples! Being faithful to Sophia is clearly nice!
    % Count how many ambiguous parens there are, including the one we already
    % saw.
    case count_open_parens(Tk, String, 1) of
        {ok, {Count, Token, NewTk, NewString}} ->
            % Compare that to the amount of nesting tuple connectives are in
            % the type we are expected to produce.
            {ExcessCount, HeadType, Tails} = extract_tuple_type_info(Count, {O, N, T}, []),
            % Now work out what to do with all this information.
            parse_tuple2(O, N, ExcessCount, HeadType, Tails, NewTk, NewString, Token);
        {error, Reason} ->
            {error, Reason}
    end.
 count_open_parens(Tk, String, Count) ->
    % Consume consecutive "(" tokens, adding one to Count for each of them.
    % Stops at the first token that is not "(" and returns
    % {ok, {Count, Token, Tk, String}}, where Token is that stopping token and
    % Tk/String are whatever next_token/2 returned alongside it. An error from
    % next_token/2 is passed through unchanged.
    case next_token(Tk, String) of
        {error, _} = Failure ->
            Failure;
        {ok, {{character, "(", _, _, _, _}, AfterTk, Remaining}} ->
            % Another open paren: count it and keep looking.
            count_open_parens(AfterTk, Remaining, Count + 1);
        {ok, {Stopper, AfterTk, Remaining}} ->
            {ok, {Count, Stopper, AfterTk, Remaining}}
    end.
 extract_tuple_type_info(OpenParens, {_, _, {tuple, [Inner | Siblings]}}, Pending) when OpenParens > 0 ->
    % One open paren is matched against one non-empty tuple type: descend into
    % the first element type, and remember the remaining element types of this
    % layer on top of the Pending stack (innermost layer first).
    extract_tuple_type_info(OpenParens - 1, Inner, [Siblings | Pending]);
 extract_tuple_type_info(OpenParens, Type, Pending) ->
    % Either the parens ran out, or Type is not a non-empty tuple type.
    % Returns {LeftoverParens, HeadType, Tails}: the parens that were not
    % matched against a tuple layer, the type reached, and the stacked sibling
    % types collected on the way down.
    {OpenParens, Type, Pending}.
 parse_tuple2(_, _, _, {_, _, unknown_type}, [_ | _], _, _, _) ->
    % At least one tuple layer was matched, but the head type reached inside
    % it is unknown. This combination is rejected outright.
    {error, "Parsing of tuples with known lengths but unknown contents is not yet implemented."};
 parse_tuple2(O, N, ExcessCount, HeadType, Tails, Tk, String, {character, ")", _, Row, Col, _}) ->
    % The token right after the run of open parens is a close paren, so the
    % innermost pair of parens is empty. Row/Col come from that ")" token.
    parse_empty_tuple(O, N, ExcessCount, HeadType, Tails, Tk, String, Row, Col);
 parse_tuple2(O, N, ExcessCount, HeadType, Tails, Tk, String, FirstToken) ->
    % Any other token starts the head term. Parse it against HeadType, then
    % hand over to parse_tuple_tails/7 to close parens and build the
    % surrounding tuple layers.
    case parse_expression2(HeadType, Tk, String, FirstToken) of
        {error, _} = Failure ->
            % TODO: errors are returned unwrapped here; wrap them too.
            Failure;
        {ok, {HeadTerm, AfterTk, Remaining}} ->
            parse_tuple_tails(O, N, ExcessCount, HeadTerm, Tails, AfterTk, Remaining)
    end.
 parse_empty_tuple(_, _, 0, _, Tails, _, _, Row, Col) ->
    % Handles "()" found as the innermost parens. With zero excess parens,
    % every open paren was matched against a tuple layer, so the empty parens
    % stand for the innermost layer, whose sibling types are on top of Tails.
    [Siblings | _] = Tails,
    % That layer needs its head element plus all of its siblings, but zero
    % elements were supplied.
    {error, {not_enough_elements, length(Siblings) + 1, 0, Row, Col}};
 parse_empty_tuple(O, N, ExcessCount, {_, _, {tuple, []}}, Tails, Tk, String, _, _) ->
    % There are unmatched parens and the head type is the empty tuple type, so
    % one of those parens is this empty tuple. Use {tuple, {}} as the head
    % term, spend one excess paren on it, and carry on as for any other head.
    parse_tuple_tails(O, N, ExcessCount - 1, {tuple, {}}, Tails, Tk, String);
 parse_empty_tuple(_, _, _, {HeadO, HeadN, _}, _, _, _, Row, Col) ->
    % The head type is something other than the empty tuple type, so "()" is
    % the wrong kind of term here.
    {error, {wrong_type, HeadO, HeadN, unit, Row, Col, Col}}.
 parse_tuple_tails(_, _, 0, HeadTerm, [], Tk, String) ->
    % No excess parens left to close and no tuple layers left to build:
    % HeadTerm is the finished term.
    {ok, {HeadTerm, Tk, String}};
 parse_tuple_tails(O, N, 0, HeadTerm, [TailTypes | ParentTails], Tk, String) ->
    % Tuple layers remain but no excess parens do, so nothing is ambiguous.
    % Continue the ordinary multivalue loop from the point where one element
    % (HeadTerm) has been read and a "," or ")" is expected next.
    %
    % -1/-1 are passed for the Row/Start arguments; presumably placeholders
    % because the position of the opening paren is not tracked here -- TODO
    % confirm.
    % NOTE(review): the visible unexpected-token clause of parse_multivalue3
    % reuses the names Row and Start in its token pattern. If those are that
    % function's already-bound parameters, the clause can only match a token
    % positioned at exactly the values passed here -- verify.
    case parse_multivalue3(TailTypes, Tk, String, -1, -1, [HeadTerm]) of
        {error, _} = Failure ->
            % TODO: more error wrapping?
            Failure;
        {ok, {Elements, AfterTk, Remaining}} ->
            % The completed layer becomes the head term of its parent layer.
            Layer = {tuple, list_to_tuple(Elements)},
            parse_tuple_tails(O, N, 0, Layer, ParentTails, AfterTk, Remaining)
    end;
 parse_tuple_tails(O, N, ExcessCount, HeadTerm, Tails, Tk, String) ->
    % Ambiguous case: some open parens have not been attributed to a tuple
    % layer. Look at the next token to find out what the innermost one is.
    case next_token(Tk, String) of
        {error, _} = Failure ->
            Failure;
        {ok, {{character, ")", _, _, _, _}, AfterTk, Remaining}} ->
            % Closed straight after a single term: it was a grouping paren.
            parse_tuple_tails(O, N, ExcessCount - 1, HeadTerm, Tails, AfterTk, Remaining);
        {ok, {{character, ",", _, _, _, _}, AfterTk, Remaining}} ->
            % A comma means this paren opened a real tuple.
            parse_tuple_tails2(O, N, ExcessCount, HeadTerm, Tails, AfterTk, Remaining);
        {ok, {{_, Actual, _, Row, Start, End}, _, _}} ->
            % Neither ")" nor ",": report the offending token.
            {error, {unexpected_token, Actual, Row, Start, End}}
    end.
 parse_tuple_tails2(O, N, _, _, [], _, _) ->
    % A comma was seen, but the expected type has no tuple layer left to put
    % it in. Example: int * int is expected and the input is ((1, 2), 3),
    % which has type (int * int) * int. Since ((1, 2)) alone would have been
    % valid, it is the second comma, not the first, that exposes the mistake.
    % For simpler inputs, such as (1, 2) where int is expected, a plain
    % wrong_type error describes the problem well enough.
    %
    % TODO: Row/col (the three -1 values are placeholders).
    % TODO: Generate better error messages in the cases where N *is* a tuple,
    %       but the first thing inside that tuple is the problem.
    {error, {wrong_type, O, N, tuple, -1, -1, -1}};
 parse_tuple_tails2(O, N, ExcessCount, HeadTerm, [TailTypes | ParentTails], Tk, String) ->
    % The comma after HeadTerm has already been consumed. Parse the remaining
    % elements of this layer, with HeadTerm as the element collected so far.
    % The excess paren count is passed on unchanged.
    case parse_multivalue(TailTypes, Tk, String, -1, -1, [HeadTerm]) of
        {error, _} = Failure ->
            % TODO: wrap errors?
            Failure;
        {ok, {Elements, AfterTk, Remaining}} ->
            Layer = {tuple, list_to_tuple(Elements)},
            parse_tuple_tails(O, N, ExcessCount, Layer, ParentTails, AfterTk, Remaining)
    end.
 %%% Unambiguous Tuple/Variant Parsing
 parse_multivalue(ElemTypes, Tk, String, Row, Start, Acc) ->
    case next_token(Tk, String) of
-        {ok, {{character, ")", Row2, Start2, _}, NewTk, NewString}} ->
+        {ok, {{character, ")", _, Row2, Start2, _}, NewTk, NewString}} ->
            check_multivalue_long_enough(ElemTypes, NewTk, NewString, Row2, Start2, Acc);
        {ok, {Token, NewTk, NewString}} ->
            parse_multivalue2(ElemTypes, NewTk, NewString, Row, Start, Acc, Token)
@ -310,7 +435,7 @@ parse_multivalue2([Next | Rest], Tk, String, Row, Start, Acc, Token) ->
            Wrapped = wrap_error(Reason, {Wrapper, length(Acc)}),
            {error, Wrapped}
    end;
-parse_multivalue2([], Tk, String, _, _, Acc, {character, ")", _, _, _}) ->
+parse_multivalue2([], Tk, String, _, _, Acc, {character, ")", _, _, _, _}) ->
    {ok, {lists:reverse(Acc), Tk, String}};
 parse_multivalue2([], _, _, _, _, _, {_, S, _, Row, Start, End}) ->
    {error, {unexpected_token, S, Row, Start, End}}.
@ -321,6 +446,8 @@ parse_multivalue3(ElemTypes, Tk, String, Row, Start, Acc) ->
            check_multivalue_long_enough(ElemTypes, NewTk, NewString, Row2, Start2, Acc);
        {ok, {{character, ",", _, _, _, _}, NewTk, NewString}} ->
            parse_multivalue(ElemTypes, NewTk, NewString, Row, Start, Acc);
        {ok, {{_, Actual, _, Row, Start, End}, _, _}} ->
            {error, {unexpected_token, Actual, Row, Start, End}};
        {error, Reason} ->
            {error, Reason}
    end.
@ -364,7 +491,7 @@ parse_variant3(Arities, Tag, ElemTypes, Tk, String) ->
    case next_token(Tk, String) of
        {ok, {{character, "(", _, Row, Start, _}, NewTk, NewString}} ->
            parse_variant4(Arities, Tag, ElemTypes, NewTk, NewString, Row, Start);
-        {ok, {{_, Actual, _, Row, Start, End}}} ->
+        {ok, {{_, Actual, _, Row, Start, End}, _, _}} ->
            {error, {unexpected_token, Actual, Row, Start, End}}
    end.
@ -476,6 +603,10 @@ parse_record_final_loop([{Name, _} | Rest], FieldValues, Acc) ->
        error ->
            {error, {missing_field, Name}}
    end;
 parse_record_final_loop([], _, [Field]) ->
    % Singleton records are type-checked in Sophia, but unwrapped in the
    % resulting FATE.
    {ok, Field};
 parse_record_final_loop([], _, FieldsReverse) ->
    Fields = lists:reverse(FieldsReverse),
    Tuple = list_to_tuple(Fields),
@ -490,7 +621,7 @@ parse_map(KeyType, ValueType, Tk, String, Acc) ->
            parse_map2(KeyType, ValueType, NewTk, NewString, Acc);
        {ok, {{character, "}", _, _, _, _}, NewTk, NewString}} ->
            {ok, {Acc, NewTk, NewString}};
-        {ok, {{_, S, _, Row, Start, End}}} ->
+        {ok, {{_, S, _, Row, Start, End}, _, _}} ->
            {error, {unexpected_token, S, Row, Start, End}}
    end.
@ -525,7 +656,7 @@ parse_map5(KeyType, ValueType, Tk, String, Acc) ->
            parse_map(KeyType, ValueType, NewTk, NewString, Acc);
        {ok, {{character, "}", _, _, _, _}, NewTk, NewString}} ->
            {ok, {Acc, NewTk, NewString}};
-        {ok, {{_, S, _, Row, Start, End}}} ->
+        {ok, {{_, S, _, Row, Start, End}, _, _}} ->
            {error, {unexpected_token, S, Row, Start, End}}
    end.
@ -570,15 +701,15 @@ check_parser(Sophia) ->
    % syntax, and to get an AACI object to pass to the parser.
    Source = "contract C = entrypoint f() = " ++ Sophia,
    {Code, Type} = compile_entrypoint_code_and_type(Source, "f"),
    % Check that when we parse the term we get the same value as the Sophia
    % compiler.
    Fate = extract_return_value(Code),
    check_sophia_to_fate(unknown_type(), Sophia, Fate),
-    % Also check that the FATE term is valid, by running it through gmb.
+    % Then, once we know that the term is correct, make sure that it is still
-    gmb_fate_encoding:serialize(Fate),
+    % accepted *with* type info.
-
+    check_sophia_to_fate(Type, Sophia, Fate).
    % Now check that our parser produces that output.
    check_sophia_to_fate(Type, Sophia, Fate),
    % Also check that it can be parsed without type information.
    check_sophia_to_fate(unknown_type(), Sophia, Fate).
 check_parser_with_typedef(Typedef, Sophia) ->
    % Compile the type definitions alongside the usual literal expression.
@ -602,8 +733,6 @@ anon_types_test() ->
    check_parser("#DE_AD0_00B_EEF"),
    % Strings.
    check_parser("\"hello world\""),
    check_parser("\"  \\b\\e\\f\\n\\r\\t\\v\\\"\\\\  \""),
    check_parser("\"\\x00\\x11\\x77\""),
    % List of integers.
    check_parser("[1, 2, 3]"),
    % List of lists.
@ -615,6 +744,12 @@ anon_types_test() ->
    ok.
 string_escape_codes_test() ->
    % Run check_parser/1 over string literals that use escape sequences:
    % single-character escapes, two-digit \x escapes (both hex digit cases),
    % and braced \x{...} escapes with varying amounts of zero padding.
    Literals = [
        "\"  \\b\\e\\f\\n\\r\\t\\v\\\"\\\\  \"",
        "\"\\x00\\x11\\x77\\x4a\\x4A\"",
        "\"\\x{7F}\\x{07F}\\x{007F}\\x{0007F}\""
    ],
    lists:foreach(fun check_parser/1, Literals).
 records_test() ->
    TypeDef = "record pair = {x: int, y: int}",
    Sophia = "{x = 1, y = 2}",
@ -623,6 +758,43 @@ records_test() ->
    % will error, though.
    {error, {unresolved_record, _, _, _}} = parse_literal(unknown_type(), Sophia).
 singleton_records_test() ->
    % Records with exactly one field: on its own, nested inside itself, and
    % as the elements of a list.
    TypeDef = "record singleton('a) = {it: 'a}",
    Literals = [
        "{it = 123}",
        "{it = {it = {it = 5}}}",
        "[{it = 1}, {it = 2}, {it = 3}]"
    ],
    lists:foreach(
        fun(Sophia) -> check_parser_with_typedef(TypeDef, Sophia) end,
        Literals
    ).
 singleton_variants_test() ->
    % The same shapes as the singleton record tests, but for a variant with a
    % single one-argument constructor. No special case is involved this time:
    % singleton variants stay wrapped in the FATE as well.
    TypeDef = "datatype wrapped('a) = Wrap('a)",
    Literals = [
        "Wrap(123)",
        "Wrap(Wrap(123))",
        "[Wrap(1), Wrap(2), Wrap(3)]"
    ],
    lists:foreach(
        fun(Sophia) -> check_parser_with_typedef(TypeDef, Sophia) end,
        Literals
    ).
 excess_parens_test() ->
    % Parentheses around a single term are another special case. Unlike
    % singleton records, which exist in the type system, they are not tuples
    % at all: they are plain grouping, as used in arithmetic.
    Literals = [
        % Grouping only.
        "(123)",
        "[1, (2), ((3))]",
        % The tricky part is grouping parens mixed with tuple parens. The
        % three elements of this list should all parse to the same result.
        "[((1, 2)), ((1), 2), (((1), 2))]",
        % Several levels of tuples and grouping, interleaved.
        "((((1), ((2, 3)))), 4)",
        % Empty tuples exist too, alone and nested.
        "()",
        "(((((), ())), ()))"
    ],
    lists:foreach(fun check_parser/1, Literals).
 variant_test() ->
    TypeDef = "datatype multi('a) = Zero | One('a) | Two('a, 'a)",