diff --git a/src/hz_sophia.erl b/src/hz_sophia.erl index b73a4c0..5017a92 100644 --- a/src/hz_sophia.erl +++ b/src/hz_sophia.erl @@ -223,7 +223,7 @@ expect_tokens([Str | Rest], Tk, String) -> case next_token(Tk, String) of {ok, {{_, Str, _, _, _, _}, NewTk, NewString}} -> expect_tokens(Rest, NewTk, NewString); - {ok, {{_, Actual, _, Row, Start, End}}} -> + {ok, {{_, Actual, _, Row, Start, End}, _, _}} -> {error, {unexpected_token, Actual, Row, Start, End}} end. @@ -268,33 +268,158 @@ parse_list_loop3(Inner, Tk, String, CloseChar, Row, Start, Acc) -> choose_list_error_wrapper("]") -> list_element; choose_list_error_wrapper(")") -> tuple_element. -%%% Tuple Parsing +%%% Ambiguous Parenthesis Parsing -parse_tuple({_, _, {tuple, Types}}, Tk, String, Row, Start) -> - case parse_multivalue(Types, Tk, String, Row, Start, []) of - {ok, {TermList, NewTk, NewString}} -> - Result = {tuple, list_to_tuple(TermList)}, - {ok, {Result, NewTk, NewString}}; - {error, Reason} -> - {error, Reason} - end; parse_tuple({_, _, unknown_type}, Tk, String, Row, Start) -> % An untyped tuple is a list of untyped terms, and weirdly our list parser % works perfectly for that, as long as we change the closing character to % be ")" instead of "]". case parse_list_loop(unknown_type(), Tk, String, ")", Row, Start, []) of + {ok, {[Inner], NewTk, NewString}} -> + % In Sophia, singleton tuples are unwrapped, and given the inner + % type. + {ok, {Inner, NewTk, NewString}}; {ok, {TermList, NewTk, NewString}} -> Result = {tuple, list_to_tuple(TermList)}, {ok, {Result, NewTk, NewString}}; {error, Reason} -> {error, Reason} end; -parse_tuple({O, N, _}, _, _, Row, Start) -> - {error, {wrong_type, O, N, tuple, Row, Start, Start}}. +parse_tuple({O, N, T}, Tk, String, _, _) -> + % Typed tuple parsing is quite complex, because we also want to support + % normal parentheses for grouping. 
It's not strictly necessary for + % inputting data, since we don't have any infix operators in simple + % data/term notation, but the alternatives are to generate singleton tuples + % naively, (which are impossible to generate from Sophia,) or to hard error + % on singleton tuples! Being faithful to Sophia is clearly nice! + + % Count how many ambiguous parens there are, including the one we already + % saw. + case count_open_parens(Tk, String, 1) of + {ok, {Count, Token, NewTk, NewString}} -> + % Compare that to the amount of nesting tuple connectives are in + % the type we are expected to produce. + {ExcessCount, HeadType, Tails} = extract_tuple_type_info(Count, {O, N, T}, []), + % Now work out what to do with all this information. + parse_tuple2(O, N, ExcessCount, HeadType, Tails, NewTk, NewString, Token); + {error, Reason} -> + {error, Reason} + end. + +count_open_parens(Tk, String, Count) -> + case next_token(Tk, String) of + {ok, {{character, "(", _, _, _, _}, NewTk, NewString}} -> + count_open_parens(NewTk, NewString, Count + 1); + {ok, {Token, NewTk, NewString}} -> + {ok, {Count, Token, NewTk, NewString}}; + {error, Reason} -> + {error, Reason} + end. + +extract_tuple_type_info(ParenCount, {_, _, {tuple, [Head | Rest]}}, Tails) when ParenCount > 0 -> + % Have an open paren, and a tuple type. We need to go deeper! + extract_tuple_type_info(ParenCount - 1, Head, [Rest | Tails]); +extract_tuple_type_info(ParenCount, HeadType, Tails) -> + % No parens, or no more (non-empty) tuples. Stop! + {ParenCount, HeadType, Tails}. 
+ +parse_tuple2(_, _, _, {_, _, unknown_type}, [_ | _], _, _, _) -> + {error, "Parsing of tuples with known lengths but unknown contents is not yet implemented."}; +parse_tuple2(O, N, ExcessCount, HeadType, Tails, Tk, String, {character, ")", _, Row, Col, _}) -> + parse_empty_tuple(O, N, ExcessCount, HeadType, Tails, Tk, String, Row, Col); +parse_tuple2(O, N, ExcessCount, HeadType, Tails, Tk, String, Token) -> + % Finished with parentheses for now, try and parse an expression out, to + % get our head term. + case parse_expression2(HeadType, Tk, String, Token) of + {ok, {Result, NewTk, NewString}} -> + % Got a head term. Now try to build all the other tuple layers. + parse_tuple_tails(O, N, ExcessCount, Result, Tails, NewTk, NewString); + {error, Reason} -> + % TODO: Wrap errors here too. + {error, Reason} + end. + +parse_empty_tuple(_, _, 0, _, Tails, _, _, Row, Col) -> + % There are zero excess parens, meaning all our parens are tuples. Get the + % top one. + [Tail | _] = Tails, + % We expected some nonzero number of elements before the close paren, but + % got zero. + ExpectCount = 1 + length(Tail), + {error, {not_enough_elements, ExpectCount, 0, Row, Col}}; +parse_empty_tuple(O, N, ExcessCount, {_, _, {tuple, []}}, Tails, Tk, String, _, _) -> + % If we have some ambiguous parentheses left, we now know one of them is + % this empty tuple. + HeadTerm = {tuple, {}}, + NewExcessCount = ExcessCount - 1, + % Now continue the loop as if it were an integer or something, in the head + % position. + parse_tuple_tails(O, N, NewExcessCount, HeadTerm, Tails, Tk, String); +parse_empty_tuple(_, _, _, {HeadO, HeadN, _}, _, _, _, Row, Col) -> + % We were expecting a head term of a different type! + {error, {wrong_type, HeadO, HeadN, unit, Row, Col, Col}}. 
+ +parse_tuple_tails(O, N, 0, HeadTerm, [TailTypes | ParentTails], Tk, String) -> + % Tuples left to build, but no extra open parens to deal with, so we can + % just parse multivalues naively, starting from the "we have a term, + % waiting for a comma" stage of the loop. + case parse_multivalue3(TailTypes, Tk, String, -1, -1, [HeadTerm]) of + {ok, {Terms, NewTk, NewString}} -> + NewHead = {tuple, list_to_tuple(Terms)}, + parse_tuple_tails(O, N, 0, NewHead, ParentTails, NewTk, NewString); + {error, Reason} -> + % TODO: More error wrapping? + {error, Reason} + end; +parse_tuple_tails(_, _, 0, HeadTerm, [], Tk, String) -> + % No open parens left, no tuples left to build, we are done! + {ok, {HeadTerm, Tk, String}}; +parse_tuple_tails(O, N, ExcessCount, HeadTerm, Tails, Tk, String) -> + % The ambiguous case, where we have a mix of tuple parens, and grouping + % parens. We want to peek at the next token, to see if it closes a grouping + % paren. + case next_token(Tk, String) of + {ok, {{character, ")", _, _, _, _}, NewTk, NewString}} -> + % It is grouping! Close one excess paren, and continue. + parse_tuple_tails(O, N, ExcessCount - 1, HeadTerm, Tails, NewTk, NewString); + {ok, {{character, ",", _, _, _, _}, NewTk, NewString}} -> + % It is a real tuple! Try the normal logic, then. + parse_tuple_tails2(O, N, ExcessCount, HeadTerm, Tails, NewTk, NewString); + {ok, {{_, Actual, _, Row, Start, End}, _, _}} -> + % Anything else is just a boring parse error we can complain about. + {error, {unexpected_token, Actual, Row, Start, End}}; + {error, Reason} -> + {error, Reason} + end. + +parse_tuple_tails2(O, N, ExcessCount, HeadTerm, [TailTypes | ParentTails], Tk, String) -> + case parse_multivalue(TailTypes, Tk, String, -1, -1, [HeadTerm]) of + {ok, {Terms, NewTk, NewString}} -> + NewHead = {tuple, list_to_tuple(Terms)}, + parse_tuple_tails(O, N, ExcessCount, NewHead, ParentTails, NewTk, NewString); + {error, Reason} -> + % TODO: wrap errors? 
+ {error, Reason} + end; +parse_tuple_tails2(O, N, _, _, [], _, _) -> + % This case is created when, for example, we want int * int, but instead we + % get a term like ((1, 2), 3), of type (int * int) * int. The trouble is, + % ((1, 2)) would have been valid, so it's actually the second comma that + % tips us off to the error, not the first one. + % + % For simpler cases, like (1, 2) when int was expected, this error message + % is fine: + Err = {error, {wrong_type, O, N, tuple, -1, -1, -1}}, + % TODO: Row/col + % TODO: Generate better error messages in the cases where N *is* a tuple, + % but the first thing inside that tuple is the problem. + Err. + +%%% Unambiguous Tuple/Variant Parsing parse_multivalue(ElemTypes, Tk, String, Row, Start, Acc) -> case next_token(Tk, String) of - {ok, {{character, ")", Row2, Start2, _}, NewTk, NewString}} -> + {ok, {{character, ")", _, Row2, Start2, _}, NewTk, NewString}} -> check_multivalue_long_enough(ElemTypes, NewTk, NewString, Row2, Start2, Acc); {ok, {Token, NewTk, NewString}} -> parse_multivalue2(ElemTypes, NewTk, NewString, Row, Start, Acc, Token) @@ -310,7 +435,7 @@ parse_multivalue2([Next | Rest], Tk, String, Row, Start, Acc, Token) -> Wrapped = wrap_error(Reason, {Wrapper, length(Acc)}), {error, Wrapped} end; -parse_multivalue2([], Tk, String, _, _, Acc, {character, ")", _, _, _}) -> +parse_multivalue2([], Tk, String, _, _, Acc, {character, ")", _, _, _, _}) -> {ok, {lists:reverse(Acc), Tk, String}}; parse_multivalue2([], _, _, _, _, _, {_, S, _, Row, Start, End}) -> {error, {unexpected_token, S, Row, Start, End}}. 
@@ -321,6 +446,8 @@ parse_multivalue3(ElemTypes, Tk, String, Row, Start, Acc) ->
             check_multivalue_long_enough(ElemTypes, NewTk, NewString, Row2, Start2, Acc);
         {ok, {{character, ",", _, _, _, _}, NewTk, NewString}} ->
             parse_multivalue(ElemTypes, NewTk, NewString, Row, Start, Acc);
+        {ok, {{_, Actual, _, Row2, Start2, End}, _, _}} -> % fresh names: Row/Start are already bound by the function head
+            {error, {unexpected_token, Actual, Row2, Start2, End}};
         {error, Reason} ->
             {error, Reason}
     end.
@@ -364,7 +491,7 @@ parse_variant3(Arities, Tag, ElemTypes, Tk, String) ->
     case next_token(Tk, String) of
         {ok, {{character, "(", _, Row, Start, _}, NewTk, NewString}} ->
             parse_variant4(Arities, Tag, ElemTypes, NewTk, NewString, Row, Start);
-        {ok, {{_, Actual, _, Row, Start, End}}} ->
+        {ok, {{_, Actual, _, Row, Start, End}, _, _}} ->
             {error, {unexpected_token, Actual, Row, Start, End}}
     end.
 
@@ -476,6 +603,10 @@ parse_record_final_loop([{Name, _} | Rest], FieldValues, Acc) ->
         error ->
             {error, {missing_field, Name}}
     end;
+parse_record_final_loop([], _, [Field]) ->
+    % Singleton records are type-checked in Sophia, but unwrapped in the
+    % resulting FATE.
+    {ok, Field};
 parse_record_final_loop([], _, FieldsReverse) ->
     Fields = lists:reverse(FieldsReverse),
     Tuple = list_to_tuple(Fields),
@@ -490,7 +621,7 @@ parse_map(KeyType, ValueType, Tk, String, Acc) ->
             parse_map2(KeyType, ValueType, NewTk, NewString, Acc);
         {ok, {{character, "}", _, _, _, _}, NewTk, NewString}} ->
             {ok, {Acc, NewTk, NewString}};
-        {ok, {{_, S, _, Row, Start, End}}} ->
+        {ok, {{_, S, _, Row, Start, End}, _, _}} ->
             {error, {unexpected_token, S, Row, Start, End}}
     end.
 
@@ -525,7 +656,7 @@ parse_map5(KeyType, ValueType, Tk, String, Acc) ->
             parse_map(KeyType, ValueType, NewTk, NewString, Acc);
         {ok, {{character, "}", _, _, _, _}, NewTk, NewString}} ->
             {ok, {Acc, NewTk, NewString}};
-        {ok, {{_, S, _, Row, Start, End}}} ->
+        {ok, {{_, S, _, Row, Start, End}, _, _}} ->
             {error, {unexpected_token, S, Row, Start, End}}
     end.
 
@@ -570,15 +701,15 @@ check_parser(Sophia) -> % syntax, and to get an AACI object to pass to the parser. Source = "contract C = entrypoint f() = " ++ Sophia, {Code, Type} = compile_entrypoint_code_and_type(Source, "f"), + + % Check that when we parse the term we get the same value as the Sophia + % compiler. Fate = extract_return_value(Code), + check_sophia_to_fate(unknown_type(), Sophia, Fate), - % Also check that the FATE term is valid, by running it through gmb. - gmb_fate_encoding:serialize(Fate), - - % Now check that our parser produces that output. - check_sophia_to_fate(Type, Sophia, Fate), - % Also check that it can be parsed without type information. - check_sophia_to_fate(unknown_type(), Sophia, Fate). + % Then, once we know that the term is correct, make sure that it is still + % accepted *with* type info. + check_sophia_to_fate(Type, Sophia, Fate). check_parser_with_typedef(Typedef, Sophia) -> % Compile the type definitions alongside the usual literal expression. @@ -602,8 +733,6 @@ anon_types_test() -> check_parser("#DE_AD0_00B_EEF"), % Strings. check_parser("\"hello world\""), - check_parser("\" \\b\\e\\f\\n\\r\\t\\v\\\"\\\\ \""), - check_parser("\"\\x00\\x11\\x77\""), % List of integers. check_parser("[1, 2, 3]"), % List of lists. @@ -615,6 +744,12 @@ anon_types_test() -> ok. +string_escape_codes_test() -> + check_parser("\" \\b\\e\\f\\n\\r\\t\\v\\\"\\\\ \""), + check_parser("\"\\x00\\x11\\x77\\x4a\\x4A\""), + check_parser("\"\\x{7F}\\x{07F}\\x{007F}\\x{0007F}\""), + ok. + records_test() -> TypeDef = "record pair = {x: int, y: int}", Sophia = "{x = 1, y = 2}", @@ -623,6 +758,43 @@ records_test() -> % will error, though. {error, {unresolved_record, _, _, _}} = parse_literal(unknown_type(), Sophia). 
+singleton_records_test() -> + TypeDef = "record singleton('a) = {it: 'a}", + check_parser_with_typedef(TypeDef, "{it = 123}"), + check_parser_with_typedef(TypeDef, "{it = {it = {it = 5}}}"), + check_parser_with_typedef(TypeDef, "[{it = 1}, {it = 2}, {it = 3}]"), + + ok. + +singleton_variants_test() -> + % Similar tests to the singleton records, but this time there isn't + % actually a special case; singleton variants are in fact wrapped in the + % FATE too. + TypeDef = "datatype wrapped('a) = Wrap('a)", + check_parser_with_typedef(TypeDef, "Wrap(123)"), + check_parser_with_typedef(TypeDef, "Wrap(Wrap(123))"), + check_parser_with_typedef(TypeDef, "[Wrap(1), Wrap(2), Wrap(3)]"), + + ok. + +excess_parens_test() -> + % 'singleton' parens are another special case, but unlike singleton + % records, which exist in the type system, singleton parens aren't tuples + % at all! They are just grouping, for arithmetic. For example. + check_parser("(123)"), + check_parser("[1, (2), ((3))]"), + % Where this gets tricky, though, is when grouping parens are mixed with + % tuple parens. E.g. this list of three tuples should all parse to the same + % result. + check_parser("[((1, 2)), ((1), 2), (((1), 2))]"), + % Including multiple nestings of tuples and grouping, interleaved. + check_parser("((((1), ((2, 3)))), 4)"), + % Also empty tuples exist! + check_parser("()"), + check_parser("(((((), ())), ()))"), + + ok. + variant_test() -> TypeDef = "datatype multi('a) = Zero | One('a) | Two('a, 'a)",