From 60985130cb039eff53877c7e55415a7a9bce77c3 Mon Sep 17 00:00:00 2001 From: Jarvis Carroll Date: Fri, 13 Feb 2026 04:08:58 +0000 Subject: [PATCH] Refine tuple parsing errors There are four major fixes here: 1. some eof tokens were being pattern matched with the wrong arity 2. tuples that are too long actually speculatively parse as an untyped tuple, and then complain that there were too many elements, 3. singleton tuples with a trailing comma are now handled differently to grouping parentheses, consistently between typed and untyped logic 4. the extra return values used to detect untyped singleton tuples are also used to pass the close paren position, so that too_many_elements can report the correct file position too. Point 4. also completely removes the need for tracking open paren positions that I was doing, and that I thought I would need to do even more of in the ambiguous-open-paren-stack case. --- src/hz_sophia.erl | 254 ++++++++++++++++++++++++++++++---------------- 1 file changed, 165 insertions(+), 89 deletions(-) diff --git a/src/hz_sophia.erl b/src/hz_sophia.erl index a2e00ee..13ba1e7 100644 --- a/src/hz_sophia.erl +++ b/src/hz_sophia.erl @@ -28,7 +28,7 @@ parse_literal(Type, String) -> parse_literal2(Result, Pos, String) -> % We have parsed a valid expression. Now check that the string ends. case next_token(Pos, String) of - {ok, {{eof, _, _, _, _}, _, _}} -> + {ok, {{eof, _, _, _, _, _}, _, _}} -> {ok, Result}; {ok, {Token, _, _}} -> unexpected_token(Token); @@ -46,7 +46,7 @@ parse_literal2(Result, Pos, String) -> -define(IS_HEX(C), (?IS_NUM(C) or (((C) >= $A) and ((C) =< $F)) or (((C) >= $a) and ((C) =< $f)))). next_token({Row, Col}, []) -> - {ok, {{eof, "", Row, Col, Col}, {Row, Col}, []}}; + {ok, {{eof, "", [], Row, Col, Col}, {Row, Col}, []}}; next_token({Row, Col}, " " ++ Rest) -> next_token({Row, Col + 1}, Rest); next_token({Row, Col}, "\t" ++ Rest) -> @@ -236,8 +236,8 @@ parse_expression2(Type, Pos, String, {string, _, Value, Row, Start, End}) -> end; parse_expression2(Type, Pos, String, {character, "[", _, Row, Start, _}) -> parse_list(Type, Pos, String, Row, Start); -parse_expression2(Type, Pos, String, {character, "(", _, Row, Start, _}) -> - parse_tuple(Type, Pos, String, Row, Start); +parse_expression2(Type, Pos, String, {character, "(", _, _, _, _}) -> + parse_tuple(Type, Pos, String); parse_expression2(Type, Pos, String, {character, "{", _, Row, Start, _}) -> parse_record_or_map(Type, Pos, String, Row, Start); parse_expression2(Type, Pos, String, {alphanum, _, Path, Row, Start, End}) -> @@ -339,27 +339,35 @@ typecheck_signature({O, N, _}, _, _, _, Row, Start, End) -> %%% List Parsing -parse_list({_, _, {list, [Inner]}}, Pos, String, Row, Start) -> - parse_list_loop(Inner, Pos, String, "]", Row, Start, []); -parse_list({_, _, unknown_type}, Pos, String, Row, Start) -> - parse_list_loop(unknown_type(), Pos, String, "]", Row, Start, []); +parse_list({_, _, {list, [Inner]}}, Pos, String, _, _) -> + parse_list2(Inner, Pos, String); +parse_list({_, _, unknown_type}, Pos, String, _, _) -> + parse_list2(unknown_type(), Pos, String); parse_list({O, N, _}, _, _, Row, Start) -> {error, {wrong_type, O, N, list, Row, Start, Start}}. -parse_list_loop(Inner, Pos, String, CloseChar, Row, Start, Acc) -> - case next_token(Pos, String) of - {ok, {{character, CloseChar, _, _, _, _}, NewPos, NewString}} -> - {ok, {lists:reverse(Acc), NewPos, NewString}}; - {ok, {Token, NewPos, NewString}} -> - parse_list_loop2(Inner, NewPos, NewString, CloseChar, Row, Start, Acc, Token); +parse_list2(Inner, Pos, String) -> + case parse_list_loop(Inner, Pos, String, "]", []) of + {ok, {Result, _, _, NewPos, NewString}} -> + {ok, {Result, NewPos, NewString}}; {error, Reason} -> {error, Reason} end. -parse_list_loop2(Inner, Pos, String, CloseChar, Row, Start, Acc, Token) -> +parse_list_loop(Inner, Pos, String, CloseChar, Acc) -> + case next_token(Pos, String) of + {ok, {{character, CloseChar, _, Row, Col, _}, NewPos, NewString}} -> + {ok, {lists:reverse(Acc), true, {Row, Col}, NewPos, NewString}}; + {ok, {Token, NewPos, NewString}} -> + parse_list_loop2(Inner, NewPos, NewString, CloseChar, Acc, Token); + {error, Reason} -> + {error, Reason} + end. + +parse_list_loop2(Inner, Pos, String, CloseChar, Acc, Token) -> case parse_expression2(Inner, Pos, String, Token) of {ok, {Value, NewPos, NewString}} -> - parse_list_loop3(Inner, NewPos, NewString, CloseChar, Row, Start, [Value | Acc]); + parse_list_loop3(Inner, NewPos, NewString, CloseChar, [Value | Acc]); {error, Reason} -> Wrapper = choose_list_error_wrapper(CloseChar), % TODO: Are tuple indices off by one from list indices? @@ -367,12 +375,12 @@ parse_list_loop2(Inner, Pos, String, CloseChar, Row, Start, Acc, Token) -> {error, Wrapped} end. -parse_list_loop3(Inner, Pos, String, CloseChar, Row, Start, Acc) -> +parse_list_loop3(Inner, Pos, String, CloseChar, Acc) -> case next_token(Pos, String) of - {ok, {{character, CloseChar, _, _, _, _}, NewPos, NewString}} -> - {ok, {lists:reverse(Acc), NewPos, NewString}}; + {ok, {{character, CloseChar, _, Row, Col, _}, NewPos, NewString}} -> + {ok, {lists:reverse(Acc), false, {Row, Col}, NewPos, NewString}}; {ok, {{character, ",", _, _, _, _}, NewPos, NewString}} -> - parse_list_loop(Inner, NewPos, NewString, CloseChar, Row, Start, Acc); + parse_list_loop(Inner, NewPos, NewString, CloseChar, Acc); {ok, {Token, _, _}} -> unexpected_token(Token, CloseChar); {error, Reason} -> @@ -384,22 +392,22 @@ choose_list_error_wrapper(")") -> tuple_element. %%% Ambiguous Parenthesis Parsing -parse_tuple({_, _, unknown_type}, Pos, String, Row, Start) -> +parse_tuple({_, _, unknown_type}, Pos, String) -> % An untyped tuple is a list of untyped terms, and weirdly our list parser % works perfectly for that, as long as we change the closing character to % be ")" instead of "]". - case parse_list_loop(unknown_type(), Pos, String, ")", Row, Start, []) of - {ok, {[Inner], NewPos, NewString}} -> - % In Sophia, singleton tuples are unwrapped, and given the inner - % type. + case parse_list_loop(unknown_type(), Pos, String, ")", []) of + {ok, {[Inner], false, _, NewPos, NewString}} -> + % In Sophia, trailing commas are invalid, and so all singleton + % tuples are unwrapped, and translated into the inner type. {ok, {Inner, NewPos, NewString}}; - {ok, {TermList, NewPos, NewString}} -> + {ok, {TermList, _, _, NewPos, NewString}} -> Result = {tuple, list_to_tuple(TermList)}, {ok, {Result, NewPos, NewString}}; {error, Reason} -> {error, Reason} end; -parse_tuple({O, N, T}, Pos, String, _, _) -> +parse_tuple(Type, Pos, String) -> % Typed tuple parsing is quite complex, because we also want to support % normal parentheses for grouping. It's not strictly necessary for % inputting data, since we don't have any infix operators in simple @@ -413,9 +421,9 @@ parse_tuple({O, N, T}, Pos, String, _, _) -> {ok, {Count, Token, NewPos, NewString}} -> % Compare that to the amount of nesting tuple connectives are in % the type we are expected to produce. - {ExcessCount, HeadType, Tails} = extract_tuple_type_info(Count, {O, N, T}, []), + {ExcessCount, HeadType, Tails} = extract_tuple_type_info(Count, Type, []), % Now work out what to do with all this information. - parse_tuple2(O, N, ExcessCount, HeadType, Tails, NewPos, NewString, Token); + parse_tuple2(ExcessCount, HeadType, Tails, NewPos, NewString, Token); {error, Reason} -> {error, Reason} end. @@ -437,23 +445,23 @@ extract_tuple_type_info(ParenCount, HeadType, Tails) -> % No parens, or no more (non-empty) tuples. Stop! {ParenCount, HeadType, Tails}. -parse_tuple2(_, _, _, {_, _, unknown_type}, [_ | _], _, _, _) -> +parse_tuple2(_, {_, _, unknown_type}, [_ | _], _, _, _) -> {error, "Parsing of tuples with known lengths but unknown contents is not yet implemented."}; -parse_tuple2(O, N, ExcessCount, HeadType, Tails, Pos, String, {character, ")", _, Row, Col, _}) -> - parse_empty_tuple(O, N, ExcessCount, HeadType, Tails, Pos, String, Row, Col); -parse_tuple2(O, N, ExcessCount, HeadType, Tails, Pos, String, Token) -> +parse_tuple2(ExcessCount, HeadType, Tails, Pos, String, {character, ")", _, Row, Col, _}) -> + parse_empty_tuple(ExcessCount, HeadType, Tails, Pos, String, Row, Col); +parse_tuple2(ExcessCount, HeadType, Tails, Pos, String, Token) -> % Finished with parentheses for now, try and parse an expression out, to % get our head term. case parse_expression2(HeadType, Pos, String, Token) of {ok, {Result, NewPos, NewString}} -> % Got a head term. Now try to build all the other tuple layers. - parse_tuple_tails(O, N, ExcessCount, Result, Tails, NewPos, NewString); + parse_tuple_tails(ExcessCount, Result, Tails, NewPos, NewString); {error, Reason} -> % TODO: Wrap errors here too. {error, Reason} end. -parse_empty_tuple(_, _, 0, _, Tails, _, _, Row, Col) -> +parse_empty_tuple(0, _, Tails, _, _, Row, Col) -> % There are zero excess parens, meaning all our parens are tuples. Get the % top one. [Tail | _] = Tails, @@ -461,44 +469,32 @@ parse_empty_tuple(_, _, 0, _, Tails, _, _, Row, Col) -> % got zero. ExpectCount = 1 + length(Tail), {error, {not_enough_elements, ExpectCount, 0, Row, Col}}; -parse_empty_tuple(O, N, ExcessCount, {_, _, {tuple, []}}, Tails, Pos, String, _, _) -> +parse_empty_tuple(ExcessCount, {_, _, {tuple, []}}, Tails, Pos, String, _, _) -> % If we have some ambiguous parentheses left, we now know one of them is % this empty tuple. HeadTerm = {tuple, {}}, NewExcessCount = ExcessCount - 1, % Now continue the loop as if it were an integer or something, in the head % position. - parse_tuple_tails(O, N, NewExcessCount, HeadTerm, Tails, Pos, String); -parse_empty_tuple(_, _, _, {HeadO, HeadN, _}, _, _, _, Row, Col) -> + parse_tuple_tails(NewExcessCount, HeadTerm, Tails, Pos, String); +parse_empty_tuple(_, {HeadO, HeadN, _}, _, _, _, Row, Col) -> % We were expecting a head term of a different type! {error, {wrong_type, HeadO, HeadN, unit, Row, Col, Col}}. -parse_tuple_tails(O, N, 0, HeadTerm, [TailTypes | ParentTails], Pos, String) -> - % Tuples left to build, but no extra open parens to deal with, so we can - % just parse multivalues naively, starting from the "we have a term, - % waiting for a comma" stage of the loop. - case parse_multivalue3(TailTypes, Pos, String, -1, -1, [HeadTerm]) of - {ok, {Terms, NewPos, NewString}} -> - NewHead = {tuple, list_to_tuple(Terms)}, - parse_tuple_tails(O, N, 0, NewHead, ParentTails, NewPos, NewString); - {error, Reason} -> - % TODO: More error wrapping? - {error, Reason} - end; -parse_tuple_tails(_, _, 0, HeadTerm, [], Pos, String) -> +parse_tuple_tails(0, HeadTerm, [], Pos, String) -> % No open parens left, no tuples left to build, we are done! {ok, {HeadTerm, Pos, String}}; -parse_tuple_tails(O, N, ExcessCount, HeadTerm, Tails, Pos, String) -> +parse_tuple_tails(ExcessCount, HeadTerm, Tails, Pos, String) -> % The ambiguous case, where we have a mix of tuple parens, and grouping % parens. We want to peek at the next token, to see if it closes a grouping % paren. case next_token(Pos, String) of - {ok, {{character, ")", _, _, _, _}, NewPos, NewString}} -> - % It is grouping! Close one excess paren, and continue. - parse_tuple_tails(O, N, ExcessCount - 1, HeadTerm, Tails, NewPos, NewString); - {ok, {{character, ",", _, _, _, _}, NewPos, NewString}} -> - % It is a real tuple! Try the normal logic, then. - parse_tuple_tails2(O, N, ExcessCount, HeadTerm, Tails, NewPos, NewString); + {ok, {{character, ")", _, Row, Col, _}, NewPos, NewString}} -> + % It is grouping! Try closing a grouping paren. + parse_tuple_tails_paren(ExcessCount, HeadTerm, Tails, NewPos, NewString, Row, Col); + {ok, {{character, ",", _, Row, Col, _}, NewPos, NewString}} -> + % It is a real tuple! Try parsing a tuple. + parse_tuple_tails_comma(ExcessCount, HeadTerm, Tails, NewPos, NewString, Row, Col); {ok, {Token, _, _}} -> % Anything else is just a boring parse error we can complain about. unexpected_token(Token, ")"); @@ -506,68 +502,93 @@ parse_tuple_tails(O, N, ExcessCount, HeadTerm, Tails, Pos, String) -> {error, Reason} end. -parse_tuple_tails2(O, N, ExcessCount, HeadTerm, [TailTypes | ParentTails], Pos, String) -> - case parse_multivalue(TailTypes, Pos, String, -1, -1, [HeadTerm]) of +parse_tuple_tails_paren(0, _, [[] | _], _, _, Row, Col) -> + % A singleton tuple was expected, but a grouping paren was given. In theory + % we could be permissive here, but we were asked to do type checking, and + % this is a type error. The type error itself is a bit hard to reproduce, + % but we do know exactly what the fix is, so let's report that instead. + {error, {expected_trailing_comma, Row, Col}}; +parse_tuple_tails_paren(0, _, [Tail | _], _, _, Row, Col) -> + % A tuple (of more than one elements) was expected, but a grouping paren + % was given. Again, the type error is hard to produce, but the actual + % solution is simple; add more elements. + ExpectCount = length(Tail) + 1, + GotCount = 1, + {error, {not_enough_elements, ExpectCount, GotCount, Row, Col}}; +parse_tuple_tails_paren(ExcessCount, HeadTerm, Tails, Pos, String, _, _) -> + % We were expecting some grouping parens, and now we know that one of them + % was in fact grouping. Good. + parse_tuple_tails(ExcessCount - 1, HeadTerm, Tails, Pos, String). + +parse_tuple_tails_comma(_, _, [], _, _, Row, Col) -> + % No more tuples, so commas are invalid. It's hard to describe the type + % error that a comma would actually produce, so instead let's just give + % the user the actual solution to their problems, which is to remove the + % comma. + {error, {expected_close_paren, Row, Col}}; +parse_tuple_tails_comma(ExcessCount, HeadTerm, Tails, Pos, String, _, _) -> + % If there are no tails then we would have exited into the "grouping parens + % only" case, so we know this works: + [TailTypes | ParentTails] = Tails, + % Now we can parse this tuple as a tuple. + case parse_multivalue(TailTypes, Pos, String, [HeadTerm]) of {ok, {Terms, NewPos, NewString}} -> NewHead = {tuple, list_to_tuple(Terms)}, - parse_tuple_tails(O, N, ExcessCount, NewHead, ParentTails, NewPos, NewString); + % Then continue the loop, with whatever parent tuple types this + % tuple is meant to be a part of. + parse_tuple_tails(ExcessCount, NewHead, ParentTails, NewPos, NewString); {error, Reason} -> % TODO: wrap errors? {error, Reason} - end; -parse_tuple_tails2(O, N, _, _, [], _, _) -> - % This case is created when, for example, we want int * int, but instead we - % get a term like ((1, 2), 3), of type (int * int) * int. The trouble is, - % ((1, 2)) would have been valid, so it's actually the second comma that - % tips us off to the error, not the first one. - % - % For simpler cases, like (1, 2) when int was expected, this error message - % is fine: - Err = {error, {wrong_type, O, N, tuple, -1, -1, -1}}, - % TODO: Row/col - % TODO: Generate better error messages in the cases where N *is* a tuple, - % but the first thing inside that tuple is the problem. - Err. + end. %%% Unambiguous Tuple/Variant Parsing -parse_multivalue(ElemTypes, Pos, String, Row, Start, Acc) -> +parse_multivalue(ElemTypes, Pos, String, Acc) -> case next_token(Pos, String) of {ok, {{character, ")", _, Row2, Start2, _}, NewPos, NewString}} -> check_multivalue_long_enough(ElemTypes, NewPos, NewString, Row2, Start2, Acc); {ok, {Token, NewPos, NewString}} -> - parse_multivalue2(ElemTypes, NewPos, NewString, Row, Start, Acc, Token); + parse_multivalue2(ElemTypes, NewPos, NewString, Acc, Token); {error, Reason} -> {error, Reason} end. -parse_multivalue2([Next | Rest], Pos, String, Row, Start, Acc, Token) -> +parse_multivalue2([Next | Rest], Pos, String, Acc, Token) -> case parse_expression2(Next, Pos, String, Token) of {ok, {Value, NewPos, NewString}} -> - parse_multivalue3(Rest, NewPos, NewString, Row, Start, [Value | Acc]); + parse_multivalue3(Rest, NewPos, NewString, [Value | Acc]); {error, Reason} -> Wrapper = choose_list_error_wrapper(")"), % TODO: Are tuple indices off by one from list indices? Wrapped = wrap_error(Reason, {Wrapper, length(Acc)}), {error, Wrapped} end; -parse_multivalue2([], Pos, String, _, _, Acc, {character, ")", _, _, _, _}) -> - {ok, {lists:reverse(Acc), Pos, String}}; -parse_multivalue2([], _, _, _, _, _, Token) -> - unexpected_token(Token, ")"). +parse_multivalue2([], Pos, String, Acc, Token) -> + count_multivalue_excess(Pos, String, Acc, Token). -parse_multivalue3(ElemTypes, Pos, String, Row, Start, Acc) -> +parse_multivalue3(ElemTypes, Pos, String, Acc) -> case next_token(Pos, String) of {ok, {{character, ")", _, Row2, Start2, _}, NewPos, NewString}} -> check_multivalue_long_enough(ElemTypes, NewPos, NewString, Row2, Start2, Acc); {ok, {{character, ",", _, _, _, _}, NewPos, NewString}} -> - parse_multivalue(ElemTypes, NewPos, NewString, Row, Start, Acc); + parse_multivalue(ElemTypes, NewPos, NewString, Acc); {ok, {Token, _, _}} -> unexpected_token(Token, ")"); {error, Reason} -> {error, Reason} end. +count_multivalue_excess(Pos, String, TypedAcc, Token) -> + ExpectedLen = length(TypedAcc), + case parse_list_loop2(unknown_type(), Pos, String, ")", TypedAcc, Token) of + {ok, {TermList, _, {Row, Col}, _, _}} -> + ActualLen = length(TermList), + {error, {too_many_elements, ExpectedLen, ActualLen, Row, Col}}; + {error, Reason} -> + {error, Reason} + end. + check_multivalue_long_enough([], Pos, String, _, _, Acc) -> {ok, {lists:reverse(Acc), Pos, String}}; check_multivalue_long_enough(Remaining, _, _, Row, Col, Got) -> @@ -621,16 +642,16 @@ parse_variant3(Arities, Tag, [], Pos, String) -> {ok, {Result, Pos, String}}; parse_variant3(Arities, Tag, ElemTypes, Pos, String) -> case next_token(Pos, String) of - {ok, {{character, "(", _, Row, Start, _}, NewPos, NewString}} -> - parse_variant4(Arities, Tag, ElemTypes, NewPos, NewString, Row, Start); + {ok, {{character, "(", _, _, _, _}, NewPos, NewString}} -> + parse_variant4(Arities, Tag, ElemTypes, NewPos, NewString); {ok, {Token, _, _}} -> unexpected_token(Token, "("); {error, Reason} -> {error, Reason} end. -parse_variant4(Arities, Tag, ElemTypes, Pos, String, Row, Start) -> - case parse_multivalue(ElemTypes, Pos, String, Row, Start, []) of +parse_variant4(Arities, Tag, ElemTypes, Pos, String) -> + case parse_multivalue(ElemTypes, Pos, String, []) of {ok, {Terms, NewPos, NewString}} -> Result = {variant, Arities, Tag, list_to_tuple(Terms)}, {ok, {Result, NewPos, NewString}}; @@ -907,6 +928,15 @@ variant_test() -> ok. +ambiguous_variant_test() -> + TypeDef = "datatype mytype = C | D", + check_parser_with_typedef(TypeDef, "C"), + check_parser_with_typedef(TypeDef, "D"), + check_parser_with_typedef(TypeDef, "C.C"), + check_parser_with_typedef(TypeDef, "C.D"), + + ok. + namespace_variant_test() -> Term = "[N.A, N.B]", Source = "namespace N = datatype mytype = A | B\ncontract C = entrypoint f() = " ++ Term, @@ -997,3 +1027,49 @@ lexer_offset_test() -> ok. +parser_offset_test() -> + {_, Type} = compile_entrypoint_value_and_type("contract C = entrypoint f() = ((1, 2), (3, 4))", "f"), + + {error, {not_enough_elements, 2, 1, 1, 8}} = parse_literal(Type, "((1, 2))"), + {error, {not_enough_elements, 2, 1, 1, 10}} = parse_literal(Type, "(((1, 2)))"), + {error, {too_many_elements, 2, 3, 1, 24}} = parse_literal(Type, "((1, 2), (3, 4), (5, 6))"), + {error, {too_many_elements, 2, 3, 1, 10}} = parse_literal(Type, "((1, 2, 3), (4, 5))"), + + ok. + +singleton_test() -> + % The Sophia compiler would never generate this, but it is a valid type + % within the FATE virtual machine, and it is possible to represent within + % the ACI itself. + SingletonACI = #{tuple => [<<"int">>]}, + + % Build an AACI around this, and run it through the AACI machinery. + Function = #{name => <<"f">>, + arguments => [], + stateful => false, + payable => false, + returns => SingletonACI}, + ACI = [#{contract => #{functions => [Function], + name => <<"C">>, + kind => contract_main, + payable => false, + typedefs => []}}], + {aaci, "C", #{"f" := {[], SingletonType}}, _} = hz_aaci:prepare(ACI), + + % Now let's do some testing with this weird type, to see if we handle it + % correctly. + {ok, {tuple, {1}}} = parse_literal(SingletonType, "(1,)"), + % Some ambiguous nesting parens, for fun. + {ok, {tuple, {1}}} = parse_literal(SingletonType, "(((1),))"), + % No trailing comma should give an error. + {error, {expected_trailing_comma, 1, 3}} = parse_literal(SingletonType, "(1)"), + % All of the above should behave the same in untyped contexts: + {ok, {tuple, {1}}} = parse_literal(unknown_type(), "(1,)"), + {ok, {tuple, {1}}} = parse_literal(unknown_type(), "(((1),))"), + {ok, 1} = parse_literal(unknown_type(), "(1)"), + + % Also if we wanted an integer, the singleton is NOT dropped, so is also an + % error. + {error, {expected_close_paren, 1, 3}} = parse_literal({integer, alread_normalized, integer}, "(1,)"), + + ok.