Singleton record/tuple parsing.

Records are a simple case to detect and handle correctly.

Tuples, by contrast, required a complete rewrite of the small tuple-parsing section of the code.
This commit is contained in:
Jarvis Carroll 2026-01-30 08:12:32 +00:00
parent 49cd8b6687
commit 272ed01fdc

View File

@ -223,7 +223,7 @@ expect_tokens([Str | Rest], Tk, String) ->
case next_token(Tk, String) of case next_token(Tk, String) of
{ok, {{_, Str, _, _, _, _}, NewTk, NewString}} -> {ok, {{_, Str, _, _, _, _}, NewTk, NewString}} ->
expect_tokens(Rest, NewTk, NewString); expect_tokens(Rest, NewTk, NewString);
{ok, {{_, Actual, _, Row, Start, End}}} -> {ok, {{_, Actual, _, Row, Start, End}, _, _}} ->
{error, {unexpected_token, Actual, Row, Start, End}} {error, {unexpected_token, Actual, Row, Start, End}}
end. end.
@ -268,33 +268,158 @@ parse_list_loop3(Inner, Tk, String, CloseChar, Row, Start, Acc) ->
choose_list_error_wrapper("]") -> list_element; choose_list_error_wrapper("]") -> list_element;
choose_list_error_wrapper(")") -> tuple_element. choose_list_error_wrapper(")") -> tuple_element.
%%% Tuple Parsing %%% Ambiguous Parenthesis Parsing
parse_tuple({_, _, {tuple, Types}}, Tk, String, Row, Start) ->
case parse_multivalue(Types, Tk, String, Row, Start, []) of
{ok, {TermList, NewTk, NewString}} ->
Result = {tuple, list_to_tuple(TermList)},
{ok, {Result, NewTk, NewString}};
{error, Reason} ->
{error, Reason}
end;
parse_tuple({_, _, unknown_type}, Tk, String, Row, Start) -> parse_tuple({_, _, unknown_type}, Tk, String, Row, Start) ->
% An untyped tuple is a list of untyped terms, and weirdly our list parser % An untyped tuple is a list of untyped terms, and weirdly our list parser
% works perfectly for that, as long as we change the closing character to % works perfectly for that, as long as we change the closing character to
% be ")" instead of "]". % be ")" instead of "]".
case parse_list_loop(unknown_type(), Tk, String, ")", Row, Start, []) of case parse_list_loop(unknown_type(), Tk, String, ")", Row, Start, []) of
{ok, {[Inner], NewTk, NewString}} ->
% In Sophia, singleton tuples are unwrapped, and given the inner
% type.
{ok, {Inner, NewTk, NewString}};
{ok, {TermList, NewTk, NewString}} -> {ok, {TermList, NewTk, NewString}} ->
Result = {tuple, list_to_tuple(TermList)}, Result = {tuple, list_to_tuple(TermList)},
{ok, {Result, NewTk, NewString}}; {ok, {Result, NewTk, NewString}};
{error, Reason} -> {error, Reason} ->
{error, Reason} {error, Reason}
end; end;
parse_tuple({O, N, _}, _, _, Row, Start) -> parse_tuple({O, N, T}, Tk, String, _, _) ->
{error, {wrong_type, O, N, tuple, Row, Start, Start}}. % Typed tuple parsing is quite complex, because we also want to support
% normal parentheses for grouping. It's not strictly necessary for
% inputting data, since we don't have any infix operators in simple
% data/term notation, but the alternatives are to generate singleton tuples
% naively, (which are impossible to generate from Sophia,) or to hard error
% on singleton tuples! Being faithful to Sophia is clearly nice!
% Count how many ambiguous parens there are, including the one we already
% saw.
case count_open_parens(Tk, String, 1) of
{ok, {Count, Token, NewTk, NewString}} ->
% Compare that to the amount of nesting tuple connectives are in
% the type we are expected to produce.
{ExcessCount, HeadType, Tails} = extract_tuple_type_info(Count, {O, N, T}, []),
% Now work out what to do with all this information.
parse_tuple2(O, N, ExcessCount, HeadType, Tails, NewTk, NewString, Token);
{error, Reason} ->
{error, Reason}
end.
%% Consume consecutive "(" tokens, adding one to Count for each.
%%
%% Returns {ok, {Total, Token, NewTk, NewString}}, where Token is the first
%% token that is not "(" and NewTk/NewString are what next_token/2 returned
%% alongside it. An {error, _} from next_token/2 is returned unchanged.
count_open_parens(Tk, String, Count) ->
    case next_token(Tk, String) of
        {ok, {{character, "(", _, _, _, _}, Tk1, Rest}} ->
            % One more open paren: keep counting.
            count_open_parens(Tk1, Rest, Count + 1);
        {ok, {FirstOther, Tk1, Rest}} ->
            % Anything else ends the run of parens; hand the token back so
            % the caller can decide what to do with it.
            {ok, {Count, FirstOther, Tk1, Rest}};
        {error, _} = Error ->
            Error
    end.
%% Descend into the first element of nested non-empty tuple types, spending
%% one open paren per level.
%%
%% Returns {RemainingParens, HeadType, Tails}: the number of parens left
%% unspent, the type at which the descent stopped, and the list of "rest of
%% the tuple" element-type lists, innermost level first.
extract_tuple_type_info(ParenCount, Type, Tails) ->
    case Type of
        {_, _, {tuple, [First | Others]}} when ParenCount > 0 ->
            % An open paren is available and the type is a non-empty tuple,
            % so go one level deeper, remembering the sibling types.
            extract_tuple_type_info(ParenCount - 1, First, [Others | Tails]);
        _ ->
            % Out of parens, or not a (non-empty) tuple type: stop here.
            {ParenCount, Type, Tails}
    end.
%% Decide how to parse the head position, given the first token that is not
%% an open paren.
%%
%% O and N are threaded through to the later stages unchanged; ExcessCount,
%% HeadType and Tails are the triple produced by extract_tuple_type_info/3.
parse_tuple2(_, _, _, {_, _, unknown_type}, [_ | _], _, _, _) ->
    % At least one tuple layer is known, but the head element is untyped.
    {error, "Parsing of tuples with known lengths but unknown contents is not yet implemented."};
parse_tuple2(O, N, ExcessCount, HeadType, Tails, Tk, String, {character, ")", _, Row, Col, _}) ->
    % A close paren directly after the open parens: "()" in head position.
    parse_empty_tuple(O, N, ExcessCount, HeadType, Tails, Tk, String, Row, Col);
parse_tuple2(O, N, ExcessCount, HeadType, Tails, Tk, String, Token) ->
    % No more leading parens, so the token must start the head expression.
    case parse_expression2(HeadType, Tk, String, Token) of
        {ok, {HeadTerm, Tk1, Rest}} ->
            % With the head term in hand, build the surrounding tuple layers.
            parse_tuple_tails(O, N, ExcessCount, HeadTerm, Tails, Tk1, Rest);
        {error, _} = Error ->
            % TODO: Wrap errors here too.
            Error
    end.
%% Handle a ")" that immediately follows the run of open parens, i.e. "()"
%% sitting in the head position. Row and Col locate that ")" token.
parse_empty_tuple(_, _, 0, _, Tails, _, _, Row, Col) ->
    % No excess parens: every paren was matched to a tuple type, so the
    % innermost one is a tuple that needs its head plus the elements listed
    % in the first entry of Tails. We found none at all.
    [InnermostTail | _] = Tails,
    {error, {not_enough_elements, 1 + length(InnermostTail), 0, Row, Col}};
parse_empty_tuple(O, N, ExcessCount, {_, _, {tuple, []}}, Tails, Tk, String, _, _) ->
    % An empty tuple is expected here, so one of the ambiguous parens was
    % really this "()". Spend it, and carry on as though any other head term
    % had just been parsed.
    parse_tuple_tails(O, N, ExcessCount - 1, {tuple, {}}, Tails, Tk, String);
parse_empty_tuple(_, _, _, {HeadO, HeadN, _}, _, _, _, Row, Col) ->
    % Some other type was expected in head position, but we got unit.
    {error, {wrong_type, HeadO, HeadN, unit, Row, Col, Col}}.
%% Wrap an already-parsed head term in the tuple layers still owed by Tails,
%% closing any remaining grouping parens along the way.
%%
%% ExcessCount is the number of open parens not yet attributed to a tuple
%% type. Returns {ok, {Term, Tk, String}} or {error, Reason}.
parse_tuple_tails(_, _, 0, HeadTerm, [], Tk, String) ->
    % Nothing left to close and nothing left to build: finished.
    {ok, {HeadTerm, Tk, String}};
parse_tuple_tails(O, N, 0, HeadTerm, [TailTypes | ParentTails], Tk, String) ->
    % Only genuine tuple parens remain, so the rest of this layer can be
    % parsed as a plain multivalue, entering the loop at the "have a term,
    % waiting for a comma" stage. The -1 arguments fill the Row/Start slots
    % (presumably placeholders, as no position is available here).
    case parse_multivalue3(TailTypes, Tk, String, -1, -1, [HeadTerm]) of
        {ok, {Elems, Tk1, Rest}} ->
            Layer = {tuple, list_to_tuple(Elems)},
            parse_tuple_tails(O, N, 0, Layer, ParentTails, Tk1, Rest);
        {error, _} = Error ->
            % TODO: More error wrapping?
            Error
    end;
parse_tuple_tails(O, N, ExcessCount, HeadTerm, Tails, Tk, String) ->
    % Ambiguous: some of the open parens may be grouping and some may be
    % tuples. The token after the head term tells us which the innermost is.
    case next_token(Tk, String) of
        {ok, {{character, ")", _, _, _, _}, Tk1, Rest}} ->
            % Closed straight away, so it was grouping. One fewer to resolve.
            parse_tuple_tails(O, N, ExcessCount - 1, HeadTerm, Tails, Tk1, Rest);
        {ok, {{character, ",", _, _, _, _}, Tk1, Rest}} ->
            % A comma means this paren really is a tuple.
            parse_tuple_tails2(O, N, ExcessCount, HeadTerm, Tails, Tk1, Rest);
        {ok, {{_, Actual, _, Row, Start, End}, _, _}} ->
            % Neither of the above: an ordinary parse error.
            {error, {unexpected_token, Actual, Row, Start, End}};
        {error, _} = Error ->
            Error
    end.
%% Continue after a comma was seen while excess parens remain: the innermost
%% open paren must be a tuple, so consume the next tuple layer from Tails.
parse_tuple_tails2(O, N, ExcessCount, HeadTerm, [TailTypes | ParentTails], Tk, String) ->
    case parse_multivalue(TailTypes, Tk, String, -1, -1, [HeadTerm]) of
        {ok, {Elems, Tk1, Rest}} ->
            Layer = {tuple, list_to_tuple(Elems)},
            parse_tuple_tails(O, N, ExcessCount, Layer, ParentTails, Tk1, Rest);
        {error, _} = Error ->
            % TODO: wrap errors?
            Error
    end;
parse_tuple_tails2(O, N, _, _, [], _, _) ->
    % We reach this when, for example, int * int is wanted but the input is
    % ((1, 2), 3), which has type (int * int) * int. Since ((1, 2)) alone
    % would have been fine, it is the second comma, not the first, that
    % exposes the mistake.
    %
    % In simpler situations, such as (1, 2) where int was expected, the
    % error below is adequate.
    %
    % TODO: Row/col
    % TODO: Generate better error messages in the cases where N *is* a tuple,
    % but the first thing inside that tuple is the problem.
    {error, {wrong_type, O, N, tuple, -1, -1, -1}}.
%%% Unambiguous Tuple/Variant Parsing
parse_multivalue(ElemTypes, Tk, String, Row, Start, Acc) -> parse_multivalue(ElemTypes, Tk, String, Row, Start, Acc) ->
case next_token(Tk, String) of case next_token(Tk, String) of
{ok, {{character, ")", Row2, Start2, _}, NewTk, NewString}} -> {ok, {{character, ")", _, Row2, Start2, _}, NewTk, NewString}} ->
check_multivalue_long_enough(ElemTypes, NewTk, NewString, Row2, Start2, Acc); check_multivalue_long_enough(ElemTypes, NewTk, NewString, Row2, Start2, Acc);
{ok, {Token, NewTk, NewString}} -> {ok, {Token, NewTk, NewString}} ->
parse_multivalue2(ElemTypes, NewTk, NewString, Row, Start, Acc, Token) parse_multivalue2(ElemTypes, NewTk, NewString, Row, Start, Acc, Token)
@ -310,7 +435,7 @@ parse_multivalue2([Next | Rest], Tk, String, Row, Start, Acc, Token) ->
Wrapped = wrap_error(Reason, {Wrapper, length(Acc)}), Wrapped = wrap_error(Reason, {Wrapper, length(Acc)}),
{error, Wrapped} {error, Wrapped}
end; end;
parse_multivalue2([], Tk, String, _, _, Acc, {character, ")", _, _, _}) -> parse_multivalue2([], Tk, String, _, _, Acc, {character, ")", _, _, _, _}) ->
{ok, {lists:reverse(Acc), Tk, String}}; {ok, {lists:reverse(Acc), Tk, String}};
parse_multivalue2([], _, _, _, _, _, {_, S, _, Row, Start, End}) -> parse_multivalue2([], _, _, _, _, _, {_, S, _, Row, Start, End}) ->
{error, {unexpected_token, S, Row, Start, End}}. {error, {unexpected_token, S, Row, Start, End}}.
@ -321,6 +446,8 @@ parse_multivalue3(ElemTypes, Tk, String, Row, Start, Acc) ->
check_multivalue_long_enough(ElemTypes, NewTk, NewString, Row2, Start2, Acc); check_multivalue_long_enough(ElemTypes, NewTk, NewString, Row2, Start2, Acc);
{ok, {{character, ",", _, _, _, _}, NewTk, NewString}} -> {ok, {{character, ",", _, _, _, _}, NewTk, NewString}} ->
parse_multivalue(ElemTypes, NewTk, NewString, Row, Start, Acc); parse_multivalue(ElemTypes, NewTk, NewString, Row, Start, Acc);
{ok, {{_, Actual, _, Row, Start, End}, _, _}} ->
{error, {unexpected_token, Actual, Row, Start, End}};
{error, Reason} -> {error, Reason} ->
{error, Reason} {error, Reason}
end. end.
@ -364,7 +491,7 @@ parse_variant3(Arities, Tag, ElemTypes, Tk, String) ->
case next_token(Tk, String) of case next_token(Tk, String) of
{ok, {{character, "(", _, Row, Start, _}, NewTk, NewString}} -> {ok, {{character, "(", _, Row, Start, _}, NewTk, NewString}} ->
parse_variant4(Arities, Tag, ElemTypes, NewTk, NewString, Row, Start); parse_variant4(Arities, Tag, ElemTypes, NewTk, NewString, Row, Start);
{ok, {{_, Actual, _, Row, Start, End}}} -> {ok, {{_, Actual, _, Row, Start, End}, _, _}} ->
{error, {unexpected_token, Actual, Row, Start, End}} {error, {unexpected_token, Actual, Row, Start, End}}
end. end.
@ -476,6 +603,10 @@ parse_record_final_loop([{Name, _} | Rest], FieldValues, Acc) ->
error -> error ->
{error, {missing_field, Name}} {error, {missing_field, Name}}
end; end;
parse_record_final_loop([], _, [Field]) ->
% Singleton records are type-checked in Sophia, but unwrapped in the
% resulting FATE.
{ok, Field};
parse_record_final_loop([], _, FieldsReverse) -> parse_record_final_loop([], _, FieldsReverse) ->
Fields = lists:reverse(FieldsReverse), Fields = lists:reverse(FieldsReverse),
Tuple = list_to_tuple(Fields), Tuple = list_to_tuple(Fields),
@ -490,7 +621,7 @@ parse_map(KeyType, ValueType, Tk, String, Acc) ->
parse_map2(KeyType, ValueType, NewTk, NewString, Acc); parse_map2(KeyType, ValueType, NewTk, NewString, Acc);
{ok, {{character, "}", _, _, _, _}, NewTk, NewString}} -> {ok, {{character, "}", _, _, _, _}, NewTk, NewString}} ->
{ok, {Acc, NewTk, NewString}}; {ok, {Acc, NewTk, NewString}};
{ok, {{_, S, _, Row, Start, End}}} -> {ok, {{_, S, _, Row, Start, End}, _, _}} ->
{error, {unexpected_token, S, Row, Start, End}} {error, {unexpected_token, S, Row, Start, End}}
end. end.
@ -525,7 +656,7 @@ parse_map5(KeyType, ValueType, Tk, String, Acc) ->
parse_map(KeyType, ValueType, NewTk, NewString, Acc); parse_map(KeyType, ValueType, NewTk, NewString, Acc);
{ok, {{character, "}", _, _, _, _}, NewTk, NewString}} -> {ok, {{character, "}", _, _, _, _}, NewTk, NewString}} ->
{ok, {Acc, NewTk, NewString}}; {ok, {Acc, NewTk, NewString}};
{ok, {{_, S, _, Row, Start, End}}} -> {ok, {{_, S, _, Row, Start, End}, _, _}} ->
{error, {unexpected_token, S, Row, Start, End}} {error, {unexpected_token, S, Row, Start, End}}
end. end.
@ -570,15 +701,15 @@ check_parser(Sophia) ->
% syntax, and to get an AACI object to pass to the parser. % syntax, and to get an AACI object to pass to the parser.
Source = "contract C = entrypoint f() = " ++ Sophia, Source = "contract C = entrypoint f() = " ++ Sophia,
{Code, Type} = compile_entrypoint_code_and_type(Source, "f"), {Code, Type} = compile_entrypoint_code_and_type(Source, "f"),
% Check that when we parse the term we get the same value as the Sophia
% compiler.
Fate = extract_return_value(Code), Fate = extract_return_value(Code),
check_sophia_to_fate(unknown_type(), Sophia, Fate),
% Also check that the FATE term is valid, by running it through gmb. % Then, once we know that the term is correct, make sure that it is still
gmb_fate_encoding:serialize(Fate), % accepted *with* type info.
check_sophia_to_fate(Type, Sophia, Fate).
% Now check that our parser produces that output.
check_sophia_to_fate(Type, Sophia, Fate),
% Also check that it can be parsed without type information.
check_sophia_to_fate(unknown_type(), Sophia, Fate).
check_parser_with_typedef(Typedef, Sophia) -> check_parser_with_typedef(Typedef, Sophia) ->
% Compile the type definitions alongside the usual literal expression. % Compile the type definitions alongside the usual literal expression.
@ -602,8 +733,6 @@ anon_types_test() ->
check_parser("#DE_AD0_00B_EEF"), check_parser("#DE_AD0_00B_EEF"),
% Strings. % Strings.
check_parser("\"hello world\""), check_parser("\"hello world\""),
check_parser("\" \\b\\e\\f\\n\\r\\t\\v\\\"\\\\ \""),
check_parser("\"\\x00\\x11\\x77\""),
% List of integers. % List of integers.
check_parser("[1, 2, 3]"), check_parser("[1, 2, 3]"),
% List of lists. % List of lists.
@ -615,6 +744,12 @@ anon_types_test() ->
ok. ok.
%% Run string literals containing escape sequences through check_parser/1.
string_escape_codes_test() ->
    Literals = [
        % Single-character escapes.
        "\" \\b\\e\\f\\n\\r\\t\\v\\\"\\\\ \"",
        % Two-digit hex escapes, in both lower and upper case.
        "\"\\x00\\x11\\x77\\x4a\\x4A\"",
        % Braced hex escapes with varying amounts of zero padding.
        "\"\\x{7F}\\x{07F}\\x{007F}\\x{0007F}\""
    ],
    lists:foreach(fun check_parser/1, Literals),
    ok.
records_test() -> records_test() ->
TypeDef = "record pair = {x: int, y: int}", TypeDef = "record pair = {x: int, y: int}",
Sophia = "{x = 1, y = 2}", Sophia = "{x = 1, y = 2}",
@ -623,6 +758,43 @@ records_test() ->
% will error, though. % will error, though.
{error, {unresolved_record, _, _, _}} = parse_literal(unknown_type(), Sophia). {error, {unresolved_record, _, _, _}} = parse_literal(unknown_type(), Sophia).
%% Run literals of a one-field record type through
%% check_parser_with_typedef/2: bare, nested, and inside a list.
singleton_records_test() ->
    TypeDef = "record singleton('a) = {it: 'a}",
    Check = fun(Sophia) -> check_parser_with_typedef(TypeDef, Sophia) end,
    lists:foreach(Check, [
        "{it = 123}",
        "{it = {it = {it = 5}}}",
        "[{it = 1}, {it = 2}, {it = 3}]"
    ]),
    ok.
%% Counterpart to the singleton record tests. No special case applies here:
%% single-argument variants do stay wrapped in the FATE.
singleton_variants_test() ->
    TypeDef = "datatype wrapped('a) = Wrap('a)",
    Check = fun(Sophia) -> check_parser_with_typedef(TypeDef, Sophia) end,
    lists:foreach(Check, [
        "Wrap(123)",
        "Wrap(Wrap(123))",
        "[Wrap(1), Wrap(2), Wrap(3)]"
    ]),
    ok.
%% Parentheses around a single term are a special case too. Unlike singleton
%% records, which do exist in the type system, they are not tuples at all,
%% only grouping (as used for arithmetic).
excess_parens_test() ->
    Literals = [
        % Plain grouping.
        "(123)",
        "[1, (2), ((3))]",
        % The tricky part is grouping parens mixed with tuple parens. All
        % three tuples in this list should parse to the same result.
        "[((1, 2)), ((1), 2), (((1), 2))]",
        % Several levels of tuples and grouping, interleaved.
        "((((1), ((2, 3)))), 4)",
        % Empty tuples exist as well!
        "()",
        "(((((), ())), ()))"
    ],
    lists:foreach(fun check_parser/1, Literals),
    ok.
variant_test() -> variant_test() ->
TypeDef = "datatype multi('a) = Zero | One('a) | Two('a, 'a)", TypeDef = "datatype multi('a) = Zero | One('a) | Two('a, 'a)",