Merge branch 'parser' of ssh://git.qpq.swiss:21203/QPQ-AG/hakuzaru into parser

2026-02-13 13:50:57 +09:00
parent efe0a64056 60985130cb
commit a1fc5f19fa
1 changed files with 165 additions and 89 deletions
@@ -28,7 +28,7 @@ parse_literal(Type, String) ->
 parse_literal2(Result, Pos, String) ->
    % We have parsed a valid expression. Now check that the string ends.
    case next_token(Pos, String) of
-        {ok, {{eof, _, _, _, _}, _, _}} ->
+        {ok, {{eof, _, _, _, _, _}, _, _}} ->
            {ok, Result};
        {ok, {Token, _, _}} ->
            unexpected_token(Token);
@@ -46,7 +46,7 @@ parse_literal2(Result, Pos, String) ->
 -define(IS_HEX(C), (?IS_NUM(C) or (((C) >= $A) and ((C) =< $F)) or (((C) >= $a) and ((C) =< $f)))).
 next_token({Row, Col}, []) ->
-    {ok, {{eof, "", Row, Col, Col}, {Row, Col}, []}};
+    {ok, {{eof, "", [], Row, Col, Col}, {Row, Col}, []}};
 next_token({Row, Col}, " " ++ Rest) ->
    next_token({Row, Col + 1}, Rest);
 next_token({Row, Col}, "\t" ++ Rest) ->
@@ -236,8 +236,8 @@ parse_expression2(Type, Pos, String, {string, _, Value, Row, Start, End}) ->
    end;
 parse_expression2(Type, Pos, String, {character, "[", _, Row, Start, _}) ->
    parse_list(Type, Pos, String, Row, Start);
-parse_expression2(Type, Pos, String, {character, "(", _, Row, Start, _}) ->
+parse_expression2(Type, Pos, String, {character, "(", _, _, _, _}) ->
-    parse_tuple(Type, Pos, String, Row, Start);
+    parse_tuple(Type, Pos, String);
 parse_expression2(Type, Pos, String, {character, "{", _, Row, Start, _}) ->
    parse_record_or_map(Type, Pos, String, Row, Start);
 parse_expression2(Type, Pos, String, {alphanum, _, Path, Row, Start, End}) ->
@@ -339,27 +339,35 @@ typecheck_signature({O, N, _}, _, _, _, Row, Start, End) ->
 %%% List Parsing
-parse_list({_, _, {list, [Inner]}}, Pos, String, Row, Start) ->
+parse_list({_, _, {list, [Inner]}}, Pos, String, _, _) ->
-    parse_list_loop(Inner, Pos, String, "]", Row, Start, []);
+    parse_list2(Inner, Pos, String);
-parse_list({_, _, unknown_type}, Pos, String, Row, Start) ->
+parse_list({_, _, unknown_type}, Pos, String, _, _) ->
-    parse_list_loop(unknown_type(), Pos, String, "]", Row, Start, []);
+    parse_list2(unknown_type(), Pos, String);
 parse_list({O, N, _}, _, _, Row, Start) ->
    {error, {wrong_type, O, N, list, Row, Start, Start}}.
-parse_list_loop(Inner, Pos, String, CloseChar, Row, Start, Acc) ->
+parse_list2(Inner, Pos, String) ->
-    case next_token(Pos, String) of
+    case parse_list_loop(Inner, Pos, String, "]", []) of
-        {ok, {{character, CloseChar, _, _, _, _}, NewPos, NewString}} ->
+        {ok, {Result, _, _, NewPos, NewString}} ->
-            {ok, {lists:reverse(Acc), NewPos, NewString}};
+            {ok, {Result, NewPos, NewString}};
        {ok, {Token, NewPos, NewString}} ->
            parse_list_loop2(Inner, NewPos, NewString, CloseChar, Row, Start, Acc, Token);
        {error, Reason} ->
            {error, Reason}
    end.
-parse_list_loop2(Inner, Pos, String, CloseChar, Row, Start, Acc, Token) ->
+parse_list_loop(Inner, Pos, String, CloseChar, Acc) ->
    case next_token(Pos, String) of
        {ok, {{character, CloseChar, _, Row, Col, _}, NewPos, NewString}} ->
            {ok, {lists:reverse(Acc), true, {Row, Col}, NewPos, NewString}};
        {ok, {Token, NewPos, NewString}} ->
            parse_list_loop2(Inner, NewPos, NewString, CloseChar, Acc, Token);
        {error, Reason} ->
            {error, Reason}
    end.
 parse_list_loop2(Inner, Pos, String, CloseChar, Acc, Token) ->
    case parse_expression2(Inner, Pos, String, Token) of
        {ok, {Value, NewPos, NewString}} ->
-            parse_list_loop3(Inner, NewPos, NewString, CloseChar, Row, Start, [Value | Acc]);
+            parse_list_loop3(Inner, NewPos, NewString, CloseChar, [Value | Acc]);
        {error, Reason} ->
            Wrapper = choose_list_error_wrapper(CloseChar),
            % TODO: Are tuple indices off by one from list indices?
@@ -367,12 +375,12 @@ parse_list_loop2(Inner, Pos, String, CloseChar, Row, Start, Acc, Token) ->
            {error, Wrapped}
    end.
-parse_list_loop3(Inner, Pos, String, CloseChar, Row, Start, Acc) ->
+parse_list_loop3(Inner, Pos, String, CloseChar, Acc) ->
    case next_token(Pos, String) of
-        {ok, {{character, CloseChar, _, _, _, _}, NewPos, NewString}} ->
+        {ok, {{character, CloseChar, _, Row, Col, _}, NewPos, NewString}} ->
-            {ok, {lists:reverse(Acc), NewPos, NewString}};
+            {ok, {lists:reverse(Acc), false, {Row, Col}, NewPos, NewString}};
        {ok, {{character, ",", _, _, _, _}, NewPos, NewString}} ->
-            parse_list_loop(Inner, NewPos, NewString, CloseChar, Row, Start, Acc);
+            parse_list_loop(Inner, NewPos, NewString, CloseChar, Acc);
        {ok, {Token, _, _}} ->
            unexpected_token(Token, CloseChar);
        {error, Reason} ->
@@ -384,22 +392,22 @@ choose_list_error_wrapper(")") -> tuple_element.
 %%% Ambiguous Parenthesis Parsing
-parse_tuple({_, _, unknown_type}, Pos, String, Row, Start) ->
+parse_tuple({_, _, unknown_type}, Pos, String) ->
    % An untyped tuple is a list of untyped terms, and weirdly our list parser
    % works perfectly for that, as long as we change the closing character to
    % be ")" instead of "]".
-    case parse_list_loop(unknown_type(), Pos, String, ")", Row, Start, []) of
+    case parse_list_loop(unknown_type(), Pos, String, ")", []) of
-        {ok, {[Inner], NewPos, NewString}} ->
+        {ok, {[Inner], false, _, NewPos, NewString}} ->
-            % In Sophia, singleton tuples are unwrapped, and given the inner
+            % In Sophia, trailing commas are invalid, and so all singleton
-            % type.
+            % tuples are unwrapped, and translated into the inner type.
            {ok, {Inner, NewPos, NewString}};
-        {ok, {TermList, NewPos, NewString}} ->
+        {ok, {TermList, _, _, NewPos, NewString}} ->
            Result = {tuple, list_to_tuple(TermList)},
            {ok, {Result, NewPos, NewString}};
        {error, Reason} ->
            {error, Reason}
    end;
-parse_tuple({O, N, T}, Pos, String, _, _) ->
+parse_tuple(Type, Pos, String) ->
    % Typed tuple parsing is quite complex, because we also want to support
    % normal parentheses for grouping. It's not strictly necessary for
    % inputting data, since we don't have any infix operators in simple
@@ -413,9 +421,9 @@ parse_tuple({O, N, T}, Pos, String, _, _) ->
        {ok, {Count, Token, NewPos, NewString}} ->
            % Compare that to the amount of nesting tuple connectives are in
            % the type we are expected to produce.
-            {ExcessCount, HeadType, Tails} = extract_tuple_type_info(Count, {O, N, T}, []),
+            {ExcessCount, HeadType, Tails} = extract_tuple_type_info(Count, Type, []),
            % Now work out what to do with all this information.
-            parse_tuple2(O, N, ExcessCount, HeadType, Tails, NewPos, NewString, Token);
+            parse_tuple2(ExcessCount, HeadType, Tails, NewPos, NewString, Token);
        {error, Reason} ->
            {error, Reason}
    end.
@@ -437,23 +445,23 @@ extract_tuple_type_info(ParenCount, HeadType, Tails) ->
    % No parens, or no more (non-empty) tuples. Stop!
    {ParenCount, HeadType, Tails}.
-parse_tuple2(_, _, _, {_, _, unknown_type}, [_ | _], _, _, _) ->
+parse_tuple2(_, {_, _, unknown_type}, [_ | _], _, _, _) ->
    {error, "Parsing of tuples with known lengths but unknown contents is not yet implemented."};
-parse_tuple2(O, N, ExcessCount, HeadType, Tails, Pos, String, {character, ")", _, Row, Col, _}) ->
+parse_tuple2(ExcessCount, HeadType, Tails, Pos, String, {character, ")", _, Row, Col, _}) ->
-    parse_empty_tuple(O, N, ExcessCount, HeadType, Tails, Pos, String, Row, Col);
+    parse_empty_tuple(ExcessCount, HeadType, Tails, Pos, String, Row, Col);
-parse_tuple2(O, N, ExcessCount, HeadType, Tails, Pos, String, Token) ->
+parse_tuple2(ExcessCount, HeadType, Tails, Pos, String, Token) ->
    % Finished with parentheses for now, try and parse an expression out, to
    % get our head term.
    case parse_expression2(HeadType, Pos, String, Token) of
        {ok, {Result, NewPos, NewString}} ->
            % Got a head term. Now try to build all the other tuple layers.
-            parse_tuple_tails(O, N, ExcessCount, Result, Tails, NewPos, NewString);
+            parse_tuple_tails(ExcessCount, Result, Tails, NewPos, NewString);
        {error, Reason} ->
            % TODO: Wrap errors here too.
            {error, Reason}
    end.
-parse_empty_tuple(_, _, 0, _, Tails, _, _, Row, Col) ->
+parse_empty_tuple(0, _, Tails, _, _, Row, Col) ->
    % There are zero excess parens, meaning all our parens are tuples. Get the
    % top one.
    [Tail | _] = Tails,
@@ -461,44 +469,32 @@ parse_empty_tuple(_, _, 0, _, Tails, _, _, Row, Col) ->
    % got zero.
    ExpectCount = 1 + length(Tail),
    {error, {not_enough_elements, ExpectCount, 0, Row, Col}};
-parse_empty_tuple(O, N, ExcessCount, {_, _, {tuple, []}}, Tails, Pos, String, _, _) ->
+parse_empty_tuple(ExcessCount, {_, _, {tuple, []}}, Tails, Pos, String, _, _) ->
    % If we have some ambiguous parentheses left, we now know one of them is
    % this empty tuple.
    HeadTerm = {tuple, {}},
    NewExcessCount = ExcessCount - 1,
    % Now continue the loop as if it were an integer or something, in the head
    % position.
-    parse_tuple_tails(O, N, NewExcessCount, HeadTerm, Tails, Pos, String);
+    parse_tuple_tails(NewExcessCount, HeadTerm, Tails, Pos, String);
-parse_empty_tuple(_, _, _, {HeadO, HeadN, _}, _, _, _, Row, Col) ->
+parse_empty_tuple(_, {HeadO, HeadN, _}, _, _, _, Row, Col) ->
    % We were expecting a head term of a different type!
    {error, {wrong_type, HeadO, HeadN, unit, Row, Col, Col}}.
-parse_tuple_tails(O, N, 0, HeadTerm, [TailTypes | ParentTails], Pos, String) ->
+parse_tuple_tails(0, HeadTerm, [], Pos, String) ->
    % Tuples left to build, but no extra open parens to deal with, so we can
    % just parse multivalues naively, starting from the "we have a term,
    % waiting for a comma" stage of the loop.
    case parse_multivalue3(TailTypes, Pos, String, -1, -1, [HeadTerm]) of
        {ok, {Terms, NewPos, NewString}} ->
            NewHead = {tuple, list_to_tuple(Terms)},
            parse_tuple_tails(O, N, 0, NewHead, ParentTails, NewPos, NewString);
        {error, Reason} ->
            % TODO: More error wrapping?
            {error, Reason}
    end;
 parse_tuple_tails(_, _, 0, HeadTerm, [], Pos, String) ->
    % No open parens left, no tuples left to build, we are done!
    {ok, {HeadTerm, Pos, String}};
-parse_tuple_tails(O, N, ExcessCount, HeadTerm, Tails, Pos, String) ->
+parse_tuple_tails(ExcessCount, HeadTerm, Tails, Pos, String) ->
    % The ambiguous case, where we have a mix of tuple parens, and grouping
    % parens. We want to peek at the next token, to see if it closes a grouping
    % paren.
    case next_token(Pos, String) of
-        {ok, {{character, ")", _, _, _, _}, NewPos, NewString}} ->
+        {ok, {{character, ")", _, Row, Col, _}, NewPos, NewString}} ->
-            % It is grouping! Close one excess paren, and continue.
+            % It is grouping! Try closing a grouping paren.
-            parse_tuple_tails(O, N, ExcessCount - 1, HeadTerm, Tails, NewPos, NewString);
+            parse_tuple_tails_paren(ExcessCount, HeadTerm, Tails, NewPos, NewString, Row, Col);
-        {ok, {{character, ",", _, _, _, _}, NewPos, NewString}} ->
+        {ok, {{character, ",", _, Row, Col, _}, NewPos, NewString}} ->
-            % It is a real tuple! Try the normal logic, then.
+            % It is a real tuple! Try parsing a tuple.
-            parse_tuple_tails2(O, N, ExcessCount, HeadTerm, Tails, NewPos, NewString);
+            parse_tuple_tails_comma(ExcessCount, HeadTerm, Tails, NewPos, NewString, Row, Col);
        {ok, {Token, _, _}} ->
            % Anything else is just a boring parse error we can complain about.
            unexpected_token(Token, ")");
@@ -506,68 +502,93 @@ parse_tuple_tails(O, N, ExcessCount, HeadTerm, Tails, Pos, String) ->
            {error, Reason}
    end.
-parse_tuple_tails2(O, N, ExcessCount, HeadTerm, [TailTypes | ParentTails], Pos, String) ->
+parse_tuple_tails_paren(0, _, [[] | _], _, _, Row, Col) ->
-    case parse_multivalue(TailTypes, Pos, String, -1, -1, [HeadTerm]) of
+    % A singleton tuple was expected, but a grouping paren was given. In theory
    % we could be permissive here, but we were asked to do type checking, and
    % this is a type error. The type error itself is a bit hard to reproduce,
    % but we do know exactly what the fix is, so let's report that instead.
    {error, {expected_trailing_comma, Row, Col}};
 parse_tuple_tails_paren(0, _, [Tail | _], _, _, Row, Col) ->
    % A tuple (of more than one elements) was expected, but a grouping paren
    % was given. Again, the type error is hard to produce, but the actual
    % solution is simple; add more elements.
    ExpectCount = length(Tail) + 1,
    GotCount = 1,
    {error, {not_enough_elements, ExpectCount, GotCount, Row, Col}};
 parse_tuple_tails_paren(ExcessCount, HeadTerm, Tails, Pos, String, _, _) ->
    % We were expecting some grouping parens, and now we know that one of them
    % was in fact grouping. Good.
    parse_tuple_tails(ExcessCount - 1, HeadTerm, Tails, Pos, String).
 parse_tuple_tails_comma(_, _, [], _, _, Row, Col) ->
    % No more tuples, so commas are invalid. It's hard to describe the type
    % error that a comma would actually produce, so instead let's just give
    % the user the actual solution to their problems, which is to remove the
    % comma.
    {error, {expected_close_paren, Row, Col}};
 parse_tuple_tails_comma(ExcessCount, HeadTerm, Tails, Pos, String, _, _) ->
    % If there are no tails then we would have exited into the "grouping parens
    % only" case, so we know this works:
    [TailTypes | ParentTails] = Tails,
    % Now we can parse this tuple as a tuple.
    case parse_multivalue(TailTypes, Pos, String, [HeadTerm]) of
        {ok, {Terms, NewPos, NewString}} ->
            NewHead = {tuple, list_to_tuple(Terms)},
-            parse_tuple_tails(O, N, ExcessCount, NewHead, ParentTails, NewPos, NewString);
+            % Then continue the loop, with whatever parent tuple types this
            % tuple is meant to be a part of.
            parse_tuple_tails(ExcessCount, NewHead, ParentTails, NewPos, NewString);
        {error, Reason} ->
            % TODO: wrap errors?
            {error, Reason}
-    end;
+    end.
 parse_tuple_tails2(O, N, _, _, [], _, _) ->
    % This case is created when, for example, we want int * int, but instead we
    % get a term like ((1, 2), 3), of type (int * int) * int. The trouble is,
    % ((1, 2)) would have been valid, so it's actually the second comma that
    % tips us off to the error, not the first one.
    %
    % For simpler cases, like (1, 2) when int was expected, this error message
    % is fine:
    Err = {error, {wrong_type, O, N, tuple, -1, -1, -1}},
    % TODO: Row/col
    % TODO: Generate better error messages in the cases where N *is* a tuple,
    %       but the first thing inside that tuple is the problem.
    Err.
 %%% Unambiguous Tuple/Variant Parsing
-parse_multivalue(ElemTypes, Pos, String, Row, Start, Acc) ->
+parse_multivalue(ElemTypes, Pos, String, Acc) ->
    case next_token(Pos, String) of
        {ok, {{character, ")", _, Row2, Start2, _}, NewPos, NewString}} ->
            check_multivalue_long_enough(ElemTypes, NewPos, NewString, Row2, Start2, Acc);
        {ok, {Token, NewPos, NewString}} ->
-            parse_multivalue2(ElemTypes, NewPos, NewString, Row, Start, Acc, Token);
+            parse_multivalue2(ElemTypes, NewPos, NewString, Acc, Token);
        {error, Reason} ->
            {error, Reason}
    end.
-parse_multivalue2([Next | Rest], Pos, String, Row, Start, Acc, Token) ->
+parse_multivalue2([Next | Rest], Pos, String, Acc, Token) ->
    case parse_expression2(Next, Pos, String, Token) of
        {ok, {Value, NewPos, NewString}} ->
-            parse_multivalue3(Rest, NewPos, NewString, Row, Start, [Value | Acc]);
+            parse_multivalue3(Rest, NewPos, NewString, [Value | Acc]);
        {error, Reason} ->
            Wrapper = choose_list_error_wrapper(")"),
            % TODO: Are tuple indices off by one from list indices?
            Wrapped = wrap_error(Reason, {Wrapper, length(Acc)}),
            {error, Wrapped}
    end;
-parse_multivalue2([], Pos, String, _, _, Acc, {character, ")", _, _, _, _}) ->
+parse_multivalue2([], Pos, String, Acc, Token) ->
-    {ok, {lists:reverse(Acc), Pos, String}};
+    count_multivalue_excess(Pos, String, Acc, Token).
 parse_multivalue2([], _, _, _, _, _, Token) ->
    unexpected_token(Token, ")").
-parse_multivalue3(ElemTypes, Pos, String, Row, Start, Acc) ->
+parse_multivalue3(ElemTypes, Pos, String, Acc) ->
    case next_token(Pos, String) of
        {ok, {{character, ")", _, Row2, Start2, _}, NewPos, NewString}} ->
            check_multivalue_long_enough(ElemTypes, NewPos, NewString, Row2, Start2, Acc);
        {ok, {{character, ",", _, _, _, _}, NewPos, NewString}} ->
-            parse_multivalue(ElemTypes, NewPos, NewString, Row, Start, Acc);
+            parse_multivalue(ElemTypes, NewPos, NewString, Acc);
        {ok, {Token, _, _}} ->
            unexpected_token(Token, ")");
        {error, Reason} ->
            {error, Reason}
    end.
 count_multivalue_excess(Pos, String, TypedAcc, Token) ->
    ExpectedLen = length(TypedAcc),
    case parse_list_loop2(unknown_type(), Pos, String, ")", TypedAcc, Token) of
        {ok, {TermList, _, {Row, Col}, _, _}} ->
            ActualLen = length(TermList),
            {error, {too_many_elements, ExpectedLen, ActualLen, Row, Col}};
        {error, Reason} ->
            {error, Reason}
    end.
 check_multivalue_long_enough([], Pos, String, _, _, Acc) ->
    {ok, {lists:reverse(Acc), Pos, String}};
 check_multivalue_long_enough(Remaining, _, _, Row, Col, Got) ->
@@ -621,16 +642,16 @@ parse_variant3(Arities, Tag, [], Pos, String) ->
    {ok, {Result, Pos, String}};
 parse_variant3(Arities, Tag, ElemTypes, Pos, String) ->
    case next_token(Pos, String) of
-        {ok, {{character, "(", _, Row, Start, _}, NewPos, NewString}} ->
+        {ok, {{character, "(", _, _, _, _}, NewPos, NewString}} ->
-            parse_variant4(Arities, Tag, ElemTypes, NewPos, NewString, Row, Start);
+            parse_variant4(Arities, Tag, ElemTypes, NewPos, NewString);
        {ok, {Token, _, _}} ->
            unexpected_token(Token, "(");
        {error, Reason} ->
            {error, Reason}
    end.
-parse_variant4(Arities, Tag, ElemTypes, Pos, String, Row, Start) ->
+parse_variant4(Arities, Tag, ElemTypes, Pos, String) ->
-    case parse_multivalue(ElemTypes, Pos, String, Row, Start, []) of
+    case parse_multivalue(ElemTypes, Pos, String, []) of
        {ok, {Terms, NewPos, NewString}} ->
            Result = {variant, Arities, Tag, list_to_tuple(Terms)},
            {ok, {Result, NewPos, NewString}};
@@ -907,6 +928,15 @@ variant_test() ->
    ok.
 ambiguous_variant_test() ->
    TypeDef = "datatype mytype = C | D",
    check_parser_with_typedef(TypeDef, "C"),
    check_parser_with_typedef(TypeDef, "D"),
    check_parser_with_typedef(TypeDef, "C.C"),
    check_parser_with_typedef(TypeDef, "C.D"),
    ok.
 namespace_variant_test() ->
    Term = "[N.A, N.B]",
    Source = "namespace N = datatype mytype = A | B\ncontract C = entrypoint f() = " ++ Term,
@@ -997,3 +1027,49 @@ lexer_offset_test() ->
    ok.
 parser_offset_test() ->
    {_, Type} = compile_entrypoint_value_and_type("contract C = entrypoint f() = ((1, 2), (3, 4))", "f"),
    {error, {not_enough_elements, 2, 1, 1, 8}} = parse_literal(Type, "((1, 2))"),
    {error, {not_enough_elements, 2, 1, 1, 10}} = parse_literal(Type, "(((1, 2)))"),
    {error, {too_many_elements, 2, 3, 1, 24}} = parse_literal(Type, "((1, 2), (3, 4), (5, 6))"),
    {error, {too_many_elements, 2, 3, 1, 10}} = parse_literal(Type, "((1, 2, 3), (4, 5))"),
    ok.
 singleton_test() ->
    % The Sophia compiler would never generate this, but it is a valid type
    % within the FATE virtual machine, and it is possible to represent within
    % the ACI itself.
    SingletonACI = #{tuple => [<<"int">>]},
    % Build an AACI around this, and run it through the AACI machinery.
    Function = #{name => <<"f">>,
                 arguments => [],
                 stateful => false,
                 payable => false,
                 returns => SingletonACI},
    ACI = [#{contract => #{functions => [Function],
                           name => <<"C">>,
                           kind => contract_main,
                           payable => false,
                           typedefs => []}}],
    {aaci, "C", #{"f" := {[], SingletonType}}, _} = hz_aaci:prepare(ACI),
    % Now let's do some testing with this weird type, to see if we handle it
    % correctly.
    {ok, {tuple, {1}}} = parse_literal(SingletonType, "(1,)"),
    % Some ambiguous nesting parens, for fun.
    {ok, {tuple, {1}}} = parse_literal(SingletonType, "(((1),))"),
    % No trailing comma should give an error.
    {error, {expected_trailing_comma, 1, 3}} = parse_literal(SingletonType, "(1)"),
    % All of the above should behave the same in untyped contexts:
    {ok, {tuple, {1}}} = parse_literal(unknown_type(), "(1,)"),
    {ok, {tuple, {1}}} = parse_literal(unknown_type(), "(((1),))"),
    {ok, 1} = parse_literal(unknown_type(), "(1)"),
    % Also if we wanted an integer, the singleton is NOT dropped, so is also an
    % error.
    {error, {expected_close_paren, 1, 3}} = parse_literal({integer, alread_normalized, integer}, "(1,)"),
    ok.