Singleton record/tuple parsing.

Singleton records are a simple case to detect and handle correctly. Singleton tuples, on the other hand, required a complete rewrite of the small tuple-parsing section of the code.
2026-01-30 08:12:32 +00:00
parent 49cd8b6687
commit 272ed01fdc
1 changed files with 198 additions and 26 deletions
@@ -223,7 +223,7 @@ expect_tokens([Str | Rest], Tk, String) ->
    case next_token(Tk, String) of
        {ok, {{_, Str, _, _, _, _}, NewTk, NewString}} ->
            expect_tokens(Rest, NewTk, NewString);
-        {ok, {{_, Actual, _, Row, Start, End}}} ->
+        {ok, {{_, Actual, _, Row, Start, End}, _, _}} ->
            {error, {unexpected_token, Actual, Row, Start, End}}
    end.

@@ -268,33 +268,158 @@ parse_list_loop3(Inner, Tk, String, CloseChar, Row, Start, Acc) ->
 choose_list_error_wrapper("]") -> list_element;
 choose_list_error_wrapper(")") -> tuple_element.

-%%% Tuple Parsing
+%%% Ambiguous Parenthesis Parsing

-parse_tuple({_, _, {tuple, Types}}, Tk, String, Row, Start) ->
-    case parse_multivalue(Types, Tk, String, Row, Start, []) of
-        {ok, {TermList, NewTk, NewString}} ->
-            Result = {tuple, list_to_tuple(TermList)},
-            {ok, {Result, NewTk, NewString}};
-        {error, Reason} ->
-            {error, Reason}
-    end;
 parse_tuple({_, _, unknown_type}, Tk, String, Row, Start) ->
    % An untyped tuple is a list of untyped terms, and weirdly our list parser
    % works perfectly for that, as long as we change the closing character to
    % be ")" instead of "]".
    case parse_list_loop(unknown_type(), Tk, String, ")", Row, Start, []) of
+        {ok, {[Inner], NewTk, NewString}} ->
+            % In Sophia, singleton tuples are unwrapped, and given the inner
+            % type.
+            {ok, {Inner, NewTk, NewString}};
        {ok, {TermList, NewTk, NewString}} ->
            Result = {tuple, list_to_tuple(TermList)},
            {ok, {Result, NewTk, NewString}};
        {error, Reason} ->
            {error, Reason}
    end;
-parse_tuple({O, N, _}, _, _, Row, Start) ->
-    {error, {wrong_type, O, N, tuple, Row, Start, Start}}.
+parse_tuple({O, N, T}, Tk, String, _, _) ->
+    % Typed tuple parsing is quite complex, because we also want to support
+    % normal parentheses for grouping. It's not strictly necessary for
+    % inputting data, since we don't have any infix operators in simple
+    % data/term notation, but the alternatives are to generate singleton tuples
+    % naively (which are impossible to generate from Sophia), or to hard error
+    % on singleton tuples! Being faithful to Sophia is clearly nice!
+
+    % Count how many ambiguous parens there are, including the one we already
+    % saw.
+    case count_open_parens(Tk, String, 1) of
+        {ok, {Count, Token, NewTk, NewString}} ->
+            % Compare that to the number of nested tuple connectives in the
+            % type we are expected to produce.
+            {ExcessCount, HeadType, Tails} = extract_tuple_type_info(Count, {O, N, T}, []),
+            % Now work out what to do with all this information.
+            parse_tuple2(O, N, ExcessCount, HeadType, Tails, NewTk, NewString, Token);
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+count_open_parens(Tk, String, Count) ->
+    case next_token(Tk, String) of
+        {ok, {{character, "(", _, _, _, _}, NewTk, NewString}} ->
+            count_open_parens(NewTk, NewString, Count + 1);
+        {ok, {Token, NewTk, NewString}} ->
+            {ok, {Count, Token, NewTk, NewString}};
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+extract_tuple_type_info(ParenCount, {_, _, {tuple, [Head | Rest]}}, Tails) when ParenCount > 0 ->
+    % Have an open paren, and a tuple type. We need to go deeper!
+    extract_tuple_type_info(ParenCount - 1, Head, [Rest | Tails]);
+extract_tuple_type_info(ParenCount, HeadType, Tails) ->
+    % No parens, or no more (non-empty) tuples. Stop!
+    {ParenCount, HeadType, Tails}.
+
+parse_tuple2(_, _, _, {_, _, unknown_type}, [_ | _], _, _, _) ->
+    {error, "Parsing of tuples with known lengths but unknown contents is not yet implemented."};
+parse_tuple2(O, N, ExcessCount, HeadType, Tails, Tk, String, {character, ")", _, Row, Col, _}) ->
+    parse_empty_tuple(O, N, ExcessCount, HeadType, Tails, Tk, String, Row, Col);
+parse_tuple2(O, N, ExcessCount, HeadType, Tails, Tk, String, Token) ->
+    % Finished with parentheses for now, try and parse an expression out, to
+    % get our head term.
+    case parse_expression2(HeadType, Tk, String, Token) of
+        {ok, {Result, NewTk, NewString}} ->
+            % Got a head term. Now try to build all the other tuple layers.
+            parse_tuple_tails(O, N, ExcessCount, Result, Tails, NewTk, NewString);
+        {error, Reason} ->
+            % TODO: Wrap errors here too.
+            {error, Reason}
+    end.
+
+parse_empty_tuple(_, _, 0, _, Tails, _, _, Row, Col) ->
+    % There are zero excess parens, meaning all our parens are tuples. Get the
+    % top one.
+    [Tail | _] = Tails,
+    % We expected some nonzero number of elements before the close paren, but
+    % got zero.
+    ExpectCount = 1 + length(Tail),
+    {error, {not_enough_elements, ExpectCount, 0, Row, Col}};
+parse_empty_tuple(O, N, ExcessCount, {_, _, {tuple, []}}, Tails, Tk, String, _, _) ->
+    % If we have some ambiguous parentheses left, we now know one of them is
+    % this empty tuple.
+    HeadTerm = {tuple, {}},
+    NewExcessCount = ExcessCount - 1,
+    % Now continue the loop as if it were an integer or something, in the head
+    % position.
+    parse_tuple_tails(O, N, NewExcessCount, HeadTerm, Tails, Tk, String);
+parse_empty_tuple(_, _, _, {HeadO, HeadN, _}, _, _, _, Row, Col) ->
+    % We were expecting a head term of a different type!
+    {error, {wrong_type, HeadO, HeadN, unit, Row, Col, Col}}.
+
+parse_tuple_tails(O, N, 0, HeadTerm, [TailTypes | ParentTails], Tk, String) ->
+    % Tuples left to build, but no extra open parens to deal with, so we can
+    % just parse multivalues naively, starting from the "we have a term,
+    % waiting for a comma" stage of the loop.
+    case parse_multivalue3(TailTypes, Tk, String, -1, -1, [HeadTerm]) of
+        {ok, {Terms, NewTk, NewString}} ->
+            NewHead = {tuple, list_to_tuple(Terms)},
+            parse_tuple_tails(O, N, 0, NewHead, ParentTails, NewTk, NewString);
+        {error, Reason} ->
+            % TODO: More error wrapping?
+            {error, Reason}
+    end;
+parse_tuple_tails(_, _, 0, HeadTerm, [], Tk, String) ->
+    % No open parens left, no tuples left to build, we are done!
+    {ok, {HeadTerm, Tk, String}};
+parse_tuple_tails(O, N, ExcessCount, HeadTerm, Tails, Tk, String) ->
+    % The ambiguous case, where we have a mix of tuple parens, and grouping
+    % parens. We want to peek at the next token, to see if it closes a grouping
+    % paren.
+    case next_token(Tk, String) of
+        {ok, {{character, ")", _, _, _, _}, NewTk, NewString}} ->
+            % It is grouping! Close one excess paren, and continue.
+            parse_tuple_tails(O, N, ExcessCount - 1, HeadTerm, Tails, NewTk, NewString);
+        {ok, {{character, ",", _, _, _, _}, NewTk, NewString}} ->
+            % It is a real tuple! Try the normal logic, then.
+            parse_tuple_tails2(O, N, ExcessCount, HeadTerm, Tails, NewTk, NewString);
+        {ok, {{_, Actual, _, Row, Start, End}, _, _}} ->
+            % Anything else is just a boring parse error we can complain about.
+            {error, {unexpected_token, Actual, Row, Start, End}};
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+parse_tuple_tails2(O, N, ExcessCount, HeadTerm, [TailTypes | ParentTails], Tk, String) ->
+    case parse_multivalue(TailTypes, Tk, String, -1, -1, [HeadTerm]) of
+        {ok, {Terms, NewTk, NewString}} ->
+            NewHead = {tuple, list_to_tuple(Terms)},
+            parse_tuple_tails(O, N, ExcessCount, NewHead, ParentTails, NewTk, NewString);
+        {error, Reason} ->
+            % TODO: wrap errors?
+            {error, Reason}
+    end;
+parse_tuple_tails2(O, N, _, _, [], _, _) ->
+    % This case is created when, for example, we want int * int, but instead we
+    % get a term like ((1, 2), 3), of type (int * int) * int. The trouble is,
+    % ((1, 2)) would have been valid, so it's actually the second comma that
+    % tips us off to the error, not the first one.
+    %
+    % For simpler cases, like (1, 2) when int was expected, this error message
+    % is fine:
+    Err = {error, {wrong_type, O, N, tuple, -1, -1, -1}},
+    % TODO: Row/col
+    % TODO: Generate better error messages in the cases where N *is* a tuple,
+    %       but the first thing inside that tuple is the problem.
+    Err.
+
+%%% Unambiguous Tuple/Variant Parsing

 parse_multivalue(ElemTypes, Tk, String, Row, Start, Acc) ->
    case next_token(Tk, String) of
-        {ok, {{character, ")", Row2, Start2, _}, NewTk, NewString}} ->
+        {ok, {{character, ")", _, Row2, Start2, _}, NewTk, NewString}} ->
            check_multivalue_long_enough(ElemTypes, NewTk, NewString, Row2, Start2, Acc);
        {ok, {Token, NewTk, NewString}} ->
            parse_multivalue2(ElemTypes, NewTk, NewString, Row, Start, Acc, Token)
@@ -310,7 +435,7 @@ parse_multivalue2([Next | Rest], Tk, String, Row, Start, Acc, Token) ->
            Wrapped = wrap_error(Reason, {Wrapper, length(Acc)}),
            {error, Wrapped}
    end;
-parse_multivalue2([], Tk, String, _, _, Acc, {character, ")", _, _, _}) ->
+parse_multivalue2([], Tk, String, _, _, Acc, {character, ")", _, _, _, _}) ->
    {ok, {lists:reverse(Acc), Tk, String}};
 parse_multivalue2([], _, _, _, _, _, {_, S, _, Row, Start, End}) ->
    {error, {unexpected_token, S, Row, Start, End}}.
@@ -321,6 +446,8 @@ parse_multivalue3(ElemTypes, Tk, String, Row, Start, Acc) ->
            check_multivalue_long_enough(ElemTypes, NewTk, NewString, Row2, Start2, Acc);
        {ok, {{character, ",", _, _, _, _}, NewTk, NewString}} ->
            parse_multivalue(ElemTypes, NewTk, NewString, Row, Start, Acc);
+        {ok, {{_, Actual, _, Row, Start, End}, _, _}} ->
+            {error, {unexpected_token, Actual, Row, Start, End}};
        {error, Reason} ->
            {error, Reason}
    end.
@@ -364,7 +491,7 @@ parse_variant3(Arities, Tag, ElemTypes, Tk, String) ->
    case next_token(Tk, String) of
        {ok, {{character, "(", _, Row, Start, _}, NewTk, NewString}} ->
            parse_variant4(Arities, Tag, ElemTypes, NewTk, NewString, Row, Start);
-        {ok, {{_, Actual, _, Row, Start, End}}} ->
+        {ok, {{_, Actual, _, Row, Start, End}, _, _}} ->
            {error, {unexpected_token, Actual, Row, Start, End}}
    end.

@@ -476,6 +603,10 @@ parse_record_final_loop([{Name, _} | Rest], FieldValues, Acc) ->
        error ->
            {error, {missing_field, Name}}
    end;
+parse_record_final_loop([], _, [Field]) ->
+    % Singleton records are type-checked in Sophia, but unwrapped in the
+    % resulting FATE.
+    {ok, Field};
 parse_record_final_loop([], _, FieldsReverse) ->
    Fields = lists:reverse(FieldsReverse),
    Tuple = list_to_tuple(Fields),
@@ -490,7 +621,7 @@ parse_map(KeyType, ValueType, Tk, String, Acc) ->
            parse_map2(KeyType, ValueType, NewTk, NewString, Acc);
        {ok, {{character, "}", _, _, _, _}, NewTk, NewString}} ->
            {ok, {Acc, NewTk, NewString}};
-        {ok, {{_, S, _, Row, Start, End}}} ->
+        {ok, {{_, S, _, Row, Start, End}, _, _}} ->
            {error, {unexpected_token, S, Row, Start, End}}
    end.

@@ -525,7 +656,7 @@ parse_map5(KeyType, ValueType, Tk, String, Acc) ->
            parse_map(KeyType, ValueType, NewTk, NewString, Acc);
        {ok, {{character, "}", _, _, _, _}, NewTk, NewString}} ->
            {ok, {Acc, NewTk, NewString}};
-        {ok, {{_, S, _, Row, Start, End}}} ->
+        {ok, {{_, S, _, Row, Start, End}, _, _}} ->
            {error, {unexpected_token, S, Row, Start, End}}
    end.

@@ -570,15 +701,15 @@ check_parser(Sophia) ->
    % syntax, and to get an AACI object to pass to the parser.
    Source = "contract C = entrypoint f() = " ++ Sophia,
    {Code, Type} = compile_entrypoint_code_and_type(Source, "f"),
+
+    % Check that when we parse the term we get the same value as the Sophia
+    % compiler.
    Fate = extract_return_value(Code),
+    check_sophia_to_fate(unknown_type(), Sophia, Fate),

-    % Also check that the FATE term is valid, by running it through gmb.
-    gmb_fate_encoding:serialize(Fate),
-
-    % Now check that our parser produces that output.
-    check_sophia_to_fate(Type, Sophia, Fate),
-    % Also check that it can be parsed without type information.
-    check_sophia_to_fate(unknown_type(), Sophia, Fate).
+    % Then, once we know that the term is correct, make sure that it is still
+    % accepted *with* type info.
+    check_sophia_to_fate(Type, Sophia, Fate).

 check_parser_with_typedef(Typedef, Sophia) ->
    % Compile the type definitions alongside the usual literal expression.
@@ -602,8 +733,6 @@ anon_types_test() ->
    check_parser("#DE_AD0_00B_EEF"),
    % Strings.
    check_parser("\"hello world\""),
-    check_parser("\"  \\b\\e\\f\\n\\r\\t\\v\\\"\\\\  \""),
-    check_parser("\"\\x00\\x11\\x77\""),
    % List of integers.
    check_parser("[1, 2, 3]"),
    % List of lists.
@@ -615,6 +744,12 @@ anon_types_test() ->

    ok.

+string_escape_codes_test() ->
+    check_parser("\"  \\b\\e\\f\\n\\r\\t\\v\\\"\\\\  \""),
+    check_parser("\"\\x00\\x11\\x77\\x4a\\x4A\""),
+    check_parser("\"\\x{7F}\\x{07F}\\x{007F}\\x{0007F}\""),
+    ok.
+
 records_test() ->
    TypeDef = "record pair = {x: int, y: int}",
    Sophia = "{x = 1, y = 2}",
@@ -623,6 +758,43 @@ records_test() ->
    % will error, though.
    {error, {unresolved_record, _, _, _}} = parse_literal(unknown_type(), Sophia).

+singleton_records_test() ->
+    TypeDef = "record singleton('a) = {it: 'a}",
+    check_parser_with_typedef(TypeDef, "{it = 123}"),
+    check_parser_with_typedef(TypeDef, "{it = {it = {it = 5}}}"),
+    check_parser_with_typedef(TypeDef, "[{it = 1}, {it = 2}, {it = 3}]"),
+
+    ok.
+
+singleton_variants_test() ->
+    % Similar tests to the singleton records, but this time there isn't
+    % actually a special case; singleton variants are in fact wrapped in the
+    % FATE too.
+    TypeDef = "datatype wrapped('a) = Wrap('a)",
+    check_parser_with_typedef(TypeDef, "Wrap(123)"),
+    check_parser_with_typedef(TypeDef, "Wrap(Wrap(123))"),
+    check_parser_with_typedef(TypeDef, "[Wrap(1), Wrap(2), Wrap(3)]"),
+
+    ok.
+
+excess_parens_test() ->
+    % 'singleton' parens are another special case, but unlike singleton
+    % records, which exist in the type system, singleton parens aren't tuples
+    % at all! They are just grouping, as used in arithmetic. For example:
+    check_parser("(123)"),
+    check_parser("[1, (2), ((3))]"),
+    % Where this gets tricky, though, is when grouping parens are mixed with
+    % tuple parens. E.g. the three tuples in this list should all parse to the
+    % same result.
+    check_parser("[((1, 2)), ((1), 2), (((1), 2))]"),
+    % Including multiple nestings of tuples and grouping, interleaved.
+    check_parser("((((1), ((2, 3)))), 4)"),
+    % Also empty tuples exist!
+    check_parser("()"),
+    check_parser("(((((), ())), ()))"),
+
+    ok.
+
 variant_test() ->
    TypeDef = "datatype multi('a) = Zero | One('a) | Two('a, 'a)",