diff --git a/src/hz_sophia.erl b/src/hz_sophia.erl
index b43a48c..f41443d 100644
--- a/src/hz_sophia.erl
+++ b/src/hz_sophia.erl
@@ -32,7 +32,9 @@ parse_literal2(Result, Tk, String) ->
 
 %%% Tokenizer
 
--define(IS_ALPHA(C), ((((C) >= $A) and ((C) =< $Z)) or (((C) >= $a) and ((C) =< $z)) or ((C) == $_))).
+-define(IS_LATIN_UPPER(C), (((C) >= $A) and ((C) =< $Z))).
+-define(IS_LATIN_LOWER(C), (((C) >= $a) and ((C) =< $z))).
+-define(IS_ALPHA(C), (?IS_LATIN_UPPER(C) or ?IS_LATIN_LOWER(C) or ((C) == $_))).
 -define(IS_NUM(C), (((C) >= $0) and ((C) =< $9))).
 -define(IS_ALPHANUM(C), (?IS_ALPHA(C) or ?IS_NUM(C))).
 -define(IS_HEX(C), (?IS_NUM(C) or (((C) >= $A) and ((C) =< $F)) or (((C) >= $a) and ((C) =< $f)))).
@@ -231,8 +233,8 @@ parse_expression2(Type, Tk, String, {character, "(", _, Row, Start, _}) ->
     parse_tuple(Type, Tk, String, Row, Start);
 parse_expression2(Type, Tk, String, {character, "{", _, Row, Start, _}) ->
     parse_record_or_map(Type, Tk, String, Row, Start);
-parse_expression2(Type, Tk, String, {alphanum, Ident, _, Row, Start, End}) ->
-    parse_variant(Type, Tk, String, Ident, Row, Start, End);
+parse_expression2(Type, Tk, String, {alphanum, S, _, Row, Start, End}) ->
+    parse_alphanum(Type, Tk, String, S, Row, Start, End);
 parse_expression2(_, _, _, {_, S, _, Row, Start, End}) ->
     {error, {unexpected_token, S, Row, Start, End}}.
 
@@ -249,6 +251,69 @@ expect_tokens([Str | Rest], Tk, String) ->
             {error, {unexpected_token, Actual, Row, Start, End}}
     end.
 
+%%% Ambiguous Chain Object vs Identifier Parsing
+
+parse_alphanum(Type, Tk, String, [C | _] = S, Row, Start, End) when ?IS_LATIN_UPPER(C) ->
+    % From a programming perspective, we are trying to parse a constant, so
+    % an alphanum token can really only be a constructor, or a chain object.
+    % Chain objects start with lowercase prefixes, like ak_, so clearly this is
+    % a variant constructor.
+    parse_variant(Type, Tk, String, S, Row, Start, End);
+parse_alphanum(Type, Tk, String, S, Row, Start, End) ->
+    % Inversely, variant constructors are always uppercase, so now that we have
+    % handled that case, only chain objects are left.
+    % Only the decode call is protected by the try: a crash inside one of the
+    % typecheck functions is a bug in this module, not a bad identifier.
+    try gmser_api_encoder:decode(unicode:characters_to_binary(S)) of
+        {account_pubkey, Data} ->
+            typecheck_address(Type, Tk, String, Data, Row, Start, End);
+        {contract_pubkey, Data} ->
+            typecheck_contract(Type, Tk, String, Data, Row, Start, End);
+        {signature, Data} ->
+            typecheck_signature(Type, Tk, String, Data, Row, Start, End);
+        {_, _} ->
+            % Only a few chain objects are recognized by Sophia. The rest
+            % are interpreted as identifiers, so we might as well give the
+            % same sort of error that the compiler would give.
+            {error, {unexpected_identifier, S, Row, Start, End}}
+    catch
+        _:_ -> {error, {unexpected_identifier, S, Row, Start, End}}
+    end.
+
+typecheck_address({_, _, address}, Tk, String, Data, _, _, _) ->
+    {ok, {{address, Data}, Tk, String}};
+typecheck_address({_, _, contract}, Tk, String, Data, _, _, _) ->
+    % The compiler would type error, but we should be lenient here.
+    {ok, {{contract, Data}, Tk, String}};
+typecheck_address({_, _, unknown_type}, Tk, String, Data, _, _, _) ->
+    {ok, {{address, Data}, Tk, String}};
+typecheck_address({O, N, _}, _, _, _, Row, Start, End) ->
+    {error, {wrong_type, O, N, address, Row, Start, End}}.
+
+typecheck_contract({_, _, contract}, Tk, String, Data, _, _, _) ->
+    {ok, {{contract, Data}, Tk, String}};
+typecheck_contract({_, _, address}, Tk, String, Data, _, _, _) ->
+    % The compiler would type error, but we should be lenient here.
+    {ok, {{address, Data}, Tk, String}};
+typecheck_contract({_, _, unknown_type}, Tk, String, Data, _, _, _) ->
+    {ok, {{contract, Data}, Tk, String}};
+typecheck_contract({O, N, _}, _, _, _, Row, Start, End) ->
+    {error, {wrong_type, O, N, contract, Row, Start, End}}.
+
+typecheck_signature({_, _, signature}, Tk, String, Data, _, _, _) ->
+    {ok, {{bytes, Data}, Tk, String}};
+typecheck_signature({_, _, {bytes, [64]}}, Tk, String, Data, _, _, _) ->
+    % The compiler would probably type-error, but we are lenient here.
+    {ok, {{bytes, Data}, Tk, String}};
+typecheck_signature({_, _, {bytes, [any]}}, Tk, String, Data, _, _, _) ->
+    % The compiler would probably type-error, but we are lenient here too.
+    {ok, {{bytes, Data}, Tk, String}};
+typecheck_signature({_, _, unknown_type}, Tk, String, Data, _, _, _) ->
+    {ok, {{bytes, Data}, Tk, String}};
+typecheck_signature({O, N, _}, _, _, _, Row, Start, End) ->
+    {error, {wrong_type, O, N, signature, Row, Start, End}}.
+
+
 %%% List Parsing
 
 parse_list({_, _, {list, [Inner]}}, Tk, String, Row, Start) ->
@@ -697,7 +762,7 @@ check_sophia_to_fate(Type, Sophia, Fate) ->
             erlang:error({to_fate_failed, Sophia, Fate, {error, Reason}})
     end.
 
-compile_entrypoint_code_and_type(Source, Entrypoint) ->
+compile_entrypoint_value_and_type(Source, Entrypoint) ->
     {ok, #{fate_code := FateCode, aci := ACI}} = so_compiler:from_string(Source, [{aci, json}]),
 
     % Find the fcode for the correct entrypoint.
@@ -706,12 +771,13 @@ compile_entrypoint_code_and_type(Source, Entrypoint) ->
     Name = unicode:characters_to_binary(Entrypoint),
     {Hash, Name} = lists:keyfind(Name, 2, Names),
     {_, _, Code} = maps:get(Hash, Bodies),
+    FATE = extract_return_value(Code),
 
     % Generate the AACI, and get the AACI type info for the correct entrypoint.
     AACI = hz_aaci:prepare_aaci(ACI),
-    {ok, {_, Type}} = hz_aaci:get_function_signature(AACI, "f"),
+    {ok, {_, Type}} = hz_aaci:get_function_signature(AACI, Entrypoint),
 
-    {Code, Type}.
+    {FATE, Type}.
 
 extract_return_value(#{0 := [{'RETURNR', {immediate, FATE}}]}) ->
     FATE;
@@ -722,11 +788,10 @@ check_parser(Sophia) ->
     % Compile the literal using the compiler, to check that it is valid Sophia
     % syntax, and to get an AACI object to pass to the parser.
     Source = "contract C = entrypoint f() = " ++ Sophia,
-    {Code, Type} = compile_entrypoint_code_and_type(Source, "f"),
+    {Fate, Type} = compile_entrypoint_value_and_type(Source, "f"),
 
     % Check that when we parse the term we get the same value as the Sophia
     % compiler.
-    Fate = extract_return_value(Code),
     check_sophia_to_fate(unknown_type(), Sophia, Fate),
 
     % Then, once we know that the term is correct, make sure that it is still
@@ -736,11 +801,7 @@ check_parser(Sophia) ->
 check_parser_with_typedef(Typedef, Sophia) ->
     % Compile the type definitions alongside the usual literal expression.
     Source = "contract C =\n " ++ Typedef ++ "\n entrypoint f() = " ++ Sophia,
-    {Code, Type} = compile_entrypoint_code_and_type(Source, "f"),
-    Fate = extract_return_value(Code),
-
-    % Check the FATE term as usual.
-    gmb_fate_encoding:serialize(Fate),
+    {Fate, Type} = compile_entrypoint_value_and_type(Source, "f"),
 
     % Do a typed parse, as usual, but there are probably record/variant
     % definitions in the AACI, so untyped parses probably don't work.
@@ -780,6 +841,38 @@ records_test() ->
     % will error, though.
     {error, {unresolved_record, _, _, _}} = parse_literal(unknown_type(), Sophia).
 
+variant_test() ->
+    TypeDef = "datatype multi('a) = Zero | One('a) | Two('a, 'a)",
+
+    check_parser_with_typedef(TypeDef, "Zero"),
+    check_parser_with_typedef(TypeDef, "One(0)"),
+    check_parser_with_typedef(TypeDef, "Two(0, 1)"),
+    check_parser_with_typedef(TypeDef, "Two([], [1, 2, 3])"),
+
+    {error, {unresolved_variant, _, _, _}} = parse_literal(unknown_type(), "Zero"),
+
+    ok.
+
+chain_objects_test() ->
+    % An account address, written as an ak_ chain object.
+    check_parser("ak_2FTnrGfV8qsfHpaSEHpBrziioCpwwzLqSevHqfxQY3PaAAdARx"),
+    % Two different forms of signature in one list: an sg_ chain object and a # hex literal.
+    check_parser("[sg_XDyF8LJC4tpMyAySvpaG1f5V9F2XxAbRx9iuVjvvdNMwVracLhzAuXhRM5kXAFtpwW1DCHuz5jGehUayCah4jub32Ti2n, #00112233445566778899AABBCCDDEEFF_00112233445566778899AABBCCDDEEFF_00112233445566778899AABBCCDDEEFF_00112233445566778899AABBCCDDEEFF]"),
+
+    % We have to build a totally custom contract example in order to get an
+    % AACI and return value for parsing contract addresses. This is because the
+    % compiler demands that contract addresses be type checked according to the
+    % logic of "contract oriented programming", including covariance, etc. and
+    % "contract oriented programming" is not very compatible with ML style type
+    % inference, so f() is given an explicit return type of C below.
+    Contract = "ct_2FTnrGfV8qsfHpaSEHpBrziioCpwwzLqSevHqfxQY3PaAAdARx",
+    Source = "contract C = entrypoint f(): C = " ++ Contract,
+    {Fate, ContractType} = compile_entrypoint_value_and_type(Source, "f"),
+    check_sophia_to_fate(ContractType, Contract, Fate), % typed parse, using the compiled return type
+    check_sophia_to_fate(unknown_type(), Contract, Fate), % untyped parse of the same literal
+
+    ok.
+
 singleton_records_test() ->
     TypeDef = "record singleton('a) = {it: 'a}",
     check_parser_with_typedef(TypeDef, "{it = 123}"),
@@ -817,23 +910,11 @@ excess_parens_test() ->
 
     ok.
 
-variant_test() ->
-    TypeDef = "datatype multi('a) = Zero | One('a) | Two('a, 'a)",
-
-    check_parser_with_typedef(TypeDef, "Zero"),
-    check_parser_with_typedef(TypeDef, "One(0)"),
-    check_parser_with_typedef(TypeDef, "Two(0, 1)"),
-    check_parser_with_typedef(TypeDef, "Two([], [1, 2, 3])"),
-
-    {error, {unresolved_variant, _, _, _}} = parse_literal(unknown_type(), "Zero"),
-
-    ok.
-
 lexer_offset_test() ->
     % Test that various tokens report their position correctly.
     {error, {unexpected_token, "456", 1, 5, 7}} = parse_literal("123 456"),
     {error, {unexpected_token, "[", 1, 5, 5}} = parse_literal("123 [0]"),
-    {error, {unexpected_token, "abc", 1, 5, 7}} = parse_literal("123 abc"),
+    {error, {unexpected_token, "ABC", 1, 5, 7}} = parse_literal("123 ABC"),
     {error, {unexpected_token, "#AA", 1, 5, 7}} = parse_literal("123 #AA"),
     {error, {unexpected_token, "\"x\"", 1, 5, 7}} = parse_literal("123 \"x\""),
     {error, {unexpected_token, "\"\\x{123}\"", 1, 5, 13}} = parse_literal("123 \"\\x{123}\""),
@@ -841,16 +922,16 @@ lexer_offset_test() ->
     % Check that the tokenizer knows its position correctly *after* various
     % tokens.
     {error, {unexpected_token, "123", 1, 5, 7}} = parse_literal("[0] 123"),
-    ABCType = {"mytype", already_normalized, {variant, [{"abc", []}]}},
-    {error, {unexpected_token, "123", 1, 5, 7}} = parse_literal(ABCType, "abc 123"),
+    ABCType = {"mytype", already_normalized, {variant, [{"ABC", []}]}},
+    {error, {unexpected_token, "123", 1, 5, 7}} = parse_literal(ABCType, "ABC 123"),
     {error, {unexpected_token, "123", 1, 5, 7}} = parse_literal("#AA 123"),
     {error, {unexpected_token, "123", 1, 5, 7}} = parse_literal("\"x\" 123"),
     {error, {unexpected_token, "123", 1, 11, 13}} = parse_literal("\"\\x{123}\" 123"),
 
     % Check that the tokenizer accounts for various line separators correctly.
-    {error, {unexpected_token, "abc", 2, 1, 3}} = parse_literal("123\nabc"),
-    {error, {unexpected_token, "abc", 2, 1, 3}} = parse_literal("123\r\nabc"),
-    {error, {unexpected_token, "abc", 2, 1, 3}} = parse_literal("123\rabc"),
+    {error, {unexpected_token, "ABC", 2, 1, 3}} = parse_literal("123\nABC"),
+    {error, {unexpected_token, "ABC", 2, 1, 3}} = parse_literal("123\r\nABC"),
+    {error, {unexpected_token, "ABC", 2, 1, 3}} = parse_literal("123\rABC"),
 
     ok.
 