Compare commits

..

No commits in common. "a695c21fc926bfdedc3f611627b504e2de29be58" and "272ed01fdc7e647d930310ccbd5fef41de435f25" have entirely different histories.

View File

@ -4,13 +4,10 @@
-copyright("Jarvis Carroll <spiveehere@gmail.com>").
-license("GPL-3.0-or-later").
-export([parse_literal/1, parse_literal/2, check_parser/1]).
-export([check_parser/1]).
-include_lib("eunit/include/eunit.hrl").
%% Parse a Sophia literal when no expected type is available: delegates to
%% parse_literal/2 with the placeholder produced by unknown_type/0.
parse_literal(Source) ->
    UntypedHint = unknown_type(),
    parse_literal(UntypedHint, Source).
parse_literal(Type, String) ->
case parse_expression(Type, {tk, 1, 1}, String) of
{ok, {Result, NewTk, NewString}} ->
@ -32,9 +29,7 @@ parse_literal2(Result, Tk, String) ->
%%% Tokenizer
%% Character-class predicates used by the tokenizer. Each macro expands to a
%% parenthesised boolean expression made only of comparisons, so it can be used
%% inside a guard.
-define(IS_LATIN_UPPER(C), (((C) >= $A) and ((C) =< $Z))).
-define(IS_LATIN_LOWER(C), (((C) >= $a) and ((C) =< $z))).
%% Identifier characters: Latin letters and underscore. This must be defined
%% exactly once; a second -define of the same macro is rejected by the
%% preprocessor.
-define(IS_ALPHA(C), (?IS_LATIN_UPPER(C) or ?IS_LATIN_LOWER(C) or ((C) == $_))).
-define(IS_NUM(C), (((C) >= $0) and ((C) =< $9))).
-define(IS_ALPHANUM(C), (?IS_ALPHA(C) or ?IS_NUM(C))).
%% Hexadecimal digits in either letter case.
-define(IS_HEX(C), (?IS_NUM(C) or (((C) >= $A) and ((C) =< $F)) or (((C) >= $a) and ((C) =< $f)))).
@ -42,55 +37,55 @@ parse_literal2(Result, Tk, String) ->
%% next_token(Position, Input) -> {ok, {Token, NewPosition, Rest}} | {error, Reason}
%%
%% Skips leading whitespace and reads one token from Input.
%%   Position is {tk, Row, Col}, the location of the first character of Input.
%%   Token is {Kind, SourceText, Value, Row, StartCol, EndCol}, except at end
%%   of input, where it is {eof, "", Row, Col, Col}.
%%
%% Position bookkeeping: a character on the same line advances Col only; a line
%% break ("\r\n", "\r" or "\n") advances Row and resets Col to 1.
next_token({tk, Row, Col}, []) ->
    {ok, {{eof, "", Row, Col, Col}, {tk, Row, Col}, []}};
next_token({tk, Row, Col}, " " ++ Rest) ->
    next_token({tk, Row, Col + 1}, Rest);
next_token({tk, Row, Col}, "\t" ++ Rest) ->
    next_token({tk, Row, Col + 1}, Rest);
% "\r\n" must be tried before "\r" so that it counts as a single line break.
next_token({tk, Row, _}, "\r\n" ++ Rest) ->
    next_token({tk, Row + 1, 1}, Rest);
next_token({tk, Row, _}, "\r" ++ Rest) ->
    next_token({tk, Row + 1, 1}, Rest);
next_token({tk, Row, _}, "\n" ++ Rest) ->
    next_token({tk, Row + 1, 1}, Rest);
next_token(Tk, [C | _] = String) when ?IS_ALPHA(C) ->
    alphanum_token(Tk, Tk, String, []);
next_token(Tk, [C | _] = String) when ?IS_NUM(C) ->
    num_token(Tk, Tk, String, [], 0);
% A bytes literal is '#' followed by at least one hex digit; the '#' itself
% occupies one column, so scanning of the digits starts at Col + 1.
next_token({tk, Row, Col}, [$#, C | Rest]) when ?IS_HEX(C) ->
    bytes_token({tk, Row, Col}, {tk, Row, Col + 1}, [C | Rest], "#", []);
% The opening quote occupies one column as well.
next_token({tk, Row, Col}, "\"" ++ Rest) ->
    string_token({tk, Row, Col}, {tk, Row, Col + 1}, Rest, "\"", <<>>);
% Anything else is a single-character token.
next_token({tk, Row, Col}, [Char | Rest]) ->
    Token = {character, [Char], Char, Row, Col, Col},
    {ok, {Token, {tk, Row, Col + 1}, Rest}}.
%% alphanum_token(StartPos, CurrentPos, Input, Acc) -> {ok, {Token, NewPos, Rest}}
%%
%% Consumes a run of identifier characters. Acc holds the characters read so
%% far, in reverse order. Every consumed character advances the column by one.
alphanum_token(Start, {tk, Row, Col}, [C | Rest], Acc) when ?IS_ALPHANUM(C) ->
    alphanum_token(Start, {tk, Row, Col + 1}, Rest, [C | Acc]);
alphanum_token({tk, _, Start}, {tk, Row, End}, String, Acc) ->
    AlphaString = lists:reverse(Acc),
    % End is the column of the first character *after* the token, so the
    % inclusive end column reported in the token is End - 1.
    Token = {alphanum, AlphaString, AlphaString, Row, Start, End - 1},
    {ok, {Token, {tk, Row, End}, String}}.
%% num_token(StartPos, CurrentPos, Input, Chars, Value) -> {ok, {Token, NewPos, Rest}}
%%
%% Consumes a decimal integer literal. Chars holds the source characters read
%% so far (reversed); Value is the number accumulated so far. A single '_'
%% between two digits is accepted as a separator and kept in the source text.
num_token(Start, {tk, Row, Col}, [C | Rest], Chars, Value) when ?IS_NUM(C) ->
    NewValue = Value * 10 + (C - $0),
    num_token(Start, {tk, Row, Col + 1}, Rest, [C | Chars], NewValue);
num_token(Start, {tk, Row, Col}, [$_, C | Rest], Chars, Value) when ?IS_NUM(C) ->
    NewValue = Value * 10 + (C - $0),
    % Two characters ('_' and the digit) were consumed, so move two columns.
    num_token(Start, {tk, Row, Col + 2}, Rest, [C, $_ | Chars], NewValue);
num_token({tk, _, Start}, {tk, Row, End}, String, Chars, Value) ->
    NumString = lists:reverse(Chars),
    % End points one past the last digit; report the inclusive end column.
    Token = {integer, NumString, Value, Row, Start, End - 1},
    {ok, {Token, {tk, Row, End}, String}}.
%% bytes_token(StartPos, CurrentPos, Input, Chars, Digits) -> {ok, {Token, NewPos, Rest}}
%%
%% Consumes the hex digits of a '#...' bytes literal. Chars holds the source
%% characters read so far (reversed, starting with the '#'); Digits holds the
%% converted nibble values (reversed). A single '_' between two hex digits is
%% accepted as a separator and kept in the source text.
bytes_token(Start, {tk, Row, Col}, [C | Rest], Chars, Digits) when ?IS_HEX(C) ->
    Digit = convert_digit(C),
    bytes_token(Start, {tk, Row, Col + 1}, Rest, [C | Chars], [Digit | Digits]);
bytes_token(Start, {tk, Row, Col}, [$_, C | Rest], Chars, Digits) when ?IS_HEX(C) ->
    Digit = convert_digit(C),
    % Two characters ('_' and the digit) were consumed, so move two columns,
    % exactly as num_token does for its separator.
    bytes_token(Start, {tk, Row, Col + 2}, Rest, [C, $_ | Chars], [Digit | Digits]);
bytes_token({tk, _, Start}, {tk, Row, End}, String, Chars, Digits) ->
    BytesString = lists:reverse(Chars),
    Value = reverse_combine_nibbles(Digits, <<>>),
    % End points one past the last digit; report the inclusive end column.
    Token = {bytes, BytesString, Value, Row, Start, End - 1},
    {ok, {Token, {tk, Row, End}, String}}.
convert_digit(C) when C >= $0, C =< $9 ->
@ -108,54 +103,35 @@ reverse_combine_nibbles([D1], Acc) ->
reverse_combine_nibbles([], Acc) ->
Acc.
%% string_token(StartPos, CurrentPos, Input, SourceChars, Value)
%%     -> {ok, {Token, NewPos, Rest}} | {error, Reason}
%%
%% Consumes the body of a string literal, up to and including the closing
%% quote. SourceChars holds the raw source text read so far (reversed, starting
%% with the opening quote); Value is the decoded string as a binary.
%%
%% NOTE(review): no clause handles an empty Input, so an unterminated string
%% literal fails with function_clause instead of returning {error, _}.
string_token(Start, {tk, Row, Col}, "\\x" ++ String, SourceChars, Value) ->
    % "\x" occupies two columns; the digits are scanned by escape_hex_code/4,
    % which also receives the position of the backslash for error reporting.
    case escape_hex_code({tk, Row, Col}, {tk, Row, Col + 2}, String, "x\\" ++ SourceChars) of
        {ok, {Codepoint, NewSourceChars, NewTk, NewString}} ->
            NewValue = <<Value/binary, Codepoint/utf8>>,
            string_token(Start, NewTk, NewString, NewSourceChars, NewValue);
        {error, Reason} ->
            {error, Reason}
    end;
string_token(Start, {tk, Row, Col}, [$\\, C | Rest], SourceChars, Value) ->
    case escape_char(C) of
        {ok, ByteVal} ->
            % Record the backslash itself ($\\) in the source text, followed by
            % the escape letter; two characters consumed, two columns moved.
            string_token(Start, {tk, Row, Col + 2}, Rest, [C, $\\ | SourceChars], <<Value/binary, ByteVal>>);
        error ->
            {error, {invalid_escape_code, [C], Row, Col}}
    end;
string_token({tk, _, Start}, {tk, Row, Col}, [$" | Rest], SourceChars, Value) ->
    SourceStr = lists:reverse([$" | SourceChars]),
    % Col is the column of the closing quote, i.e. the inclusive end column.
    Token = {string, SourceStr, Value, Row, Start, Col},
    {ok, {Token, {tk, Row, Col + 1}, Rest}};
string_token(Start, {tk, Row, Col}, [C | Rest], SourceChars, Value) ->
    % TODO: ERTS probably had to convert this FROM utf8 at some point, so why
    % bother, if we need to convert it back? I guess we could accept iolists if
    % we really wanted to waste time on this point...
    string_token(Start, {tk, Row, Col + 1}, Rest, [C | SourceChars], <<Value/binary, C/utf8>>).
%% escape_hex_code(EscapeStart, CurrentPos, Input, SourceChars)
%%     -> {ok, {Codepoint, NewSourceChars, NewPos, Rest}} | {error, Reason}
%%
%% Reads the digits that follow "\x" inside a string literal. Two forms are
%% accepted: "\x{H...}" (handled by escape_long_hex_code/5) and "\xHH".
%% EscapeStart is the position of the backslash, used when reporting an
%% invalid escape; SourceChars is the reversed raw source text read so far.
escape_hex_code(Start, {tk, Row, Col}, "{" ++ String, SourceChars) ->
    escape_long_hex_code(Start, {tk, Row, Col + 1}, String, "{" ++ SourceChars, 0);
escape_hex_code(_, {tk, Row, Col}, [A, B | String], SourceChars) when ?IS_HEX(A), ?IS_HEX(B) ->
    % As of writing this, the Sophia compiler will convert this byte from
    % extended ASCII to unicode... But it really shouldn't. The literal parser
    % does what the compiler should do.
    Byte = convert_digit(A) * 16 + convert_digit(B),
    {ok, {Byte, [B, A | SourceChars], {tk, Row, Col + 2}, String}};
escape_hex_code({tk, Row1, Col1}, _, _, _) ->
    {error, {invalid_escape_code, "\\x", Row1, Col1}}.

%% escape_long_hex_code(EscapeStart, CurrentPos, Input, SourceChars, Value)
%%     -> {ok, {Codepoint, NewSourceChars, NewPos, Rest}} | {error, Reason}
%%
%% Reads hex digits up to the closing "}" of a "\x{...}" escape, accumulating
%% them into Value. Each consumed character advances the column by one.
escape_long_hex_code(_, {tk, Row, Col}, "}" ++ String, SourceChars, Value) ->
    {ok, {Value, "}" ++ SourceChars, {tk, Row, Col + 1}, String}};
escape_long_hex_code(Start, {tk, Row, Col}, [C | String], SourceChars, Value) when ?IS_HEX(C) ->
    NewSourceChars = [C | SourceChars],
    NewValue = 16 * Value + convert_digit(C),
    escape_long_hex_code(Start, {tk, Row, Col + 1}, String, NewSourceChars, NewValue);
escape_long_hex_code(_, {tk, Row, Col}, [C | _], _, _) ->
    {error, {invalid_hexadecimal, [C], Row, Col}};
escape_long_hex_code(_, Tk, [], SourceChars, Value) ->
    % Just return as if the escape code were closed, and let the string parser
    % produce an unclosed string error instead.
    {ok, {Value, SourceChars, Tk, []}}.
escape_char($b) -> {ok, $\b};
escape_char($e) -> {ok, $\e};
@ -233,8 +209,8 @@ parse_expression2(Type, Tk, String, {character, "(", _, Row, Start, _}) ->
parse_tuple(Type, Tk, String, Row, Start);
parse_expression2(Type, Tk, String, {character, "{", _, Row, Start, _}) ->
parse_record_or_map(Type, Tk, String, Row, Start);
parse_expression2(Type, Tk, String, {alphanum, S, _, Row, Start, End}) ->
parse_alphanum(Type, Tk, String, S, Row, Start, End);
parse_expression2(Type, Tk, String, {alphanum, Ident, _, Row, Start, End}) ->
parse_variant(Type, Tk, String, Ident, Row, Start, End);
parse_expression2(_, _, _, {_, S, _, Row, Start, End}) ->
{error, {unexpected_token, S, Row, Start, End}}.
@ -251,69 +227,6 @@ expect_tokens([Str | Rest], Tk, String) ->
{error, {unexpected_token, Actual, Row, Start, End}}
end.
%%% Ambiguous Chain Object vs Identifier Parsing
%% parse_alphanum(Type, Tk, String, S, Row, Start, End)
%%     -> {ok, {Value, Tk, String}} | {error, Reason}
%%
%% Interprets an alphanumeric token S, located at Row, columns Start..End.
%% Tokens starting with an uppercase Latin letter are handed to
%% parse_variant/7; everything else is decoded as a chain object.
parse_alphanum(Type, Tk, String, [C | _] = S, Row, Start, End) when ?IS_LATIN_UPPER(C) ->
    % From a programming perspective, we are trying to parse a constant, so
    % an alphanum token can really only be a constructor, or a chain object.
    % Chain objects start with lowercase prefixes, like ak_, so clearly this is
    % a variant constructor.
    parse_variant(Type, Tk, String, S, Row, Start, End);
parse_alphanum(Type, Tk, String, S, Row, Start, End) ->
    % Inversely, variant constructors are always uppercase, so now that we have
    % handled that case, only chain objects are left.
    %
    % Only the decode call is protected: a failure to decode means the token
    % is not a chain object. The typecheck_* calls live in the unprotected
    % 'of' section so that a crash inside them is not misreported as an
    % unexpected identifier.
    try gmser_api_encoder:decode(unicode:characters_to_binary(S)) of
        {account_pubkey, Data} ->
            typecheck_address(Type, Tk, String, Data, Row, Start, End);
        {contract_pubkey, Data} ->
            typecheck_contract(Type, Tk, String, Data, Row, Start, End);
        {signature, Data} ->
            typecheck_signature(Type, Tk, String, Data, Row, Start, End);
        _ ->
            % Only a few chain objects are recognized by Sophia. The rest
            % are interpreted as identifiers, so we might as well give the
            % same sort of error that the compiler would give.
            {error, {unexpected_identifier, S, Row, Start, End}}
    catch
        _:_ -> {error, {unexpected_identifier, S, Row, Start, End}}
    end.
%% Check an account address literal against the expected type, given as a
%% 3-tuple whose last element is the type to check against.
%% An expected 'address', or no known type at all, yields {address, Data}.
typecheck_address({_, _, Expected}, Tk, String, Data, _, _, _)
  when Expected =:= address; Expected =:= unknown_type ->
    {ok, {{address, Data}, Tk, String}};
typecheck_address({_, _, contract}, Tk, String, Data, _, _, _) ->
    % Leniency: the compiler would reject an account address where a contract
    % is expected, but here it is accepted and retagged as a contract.
    {ok, {{contract, Data}, Tk, String}};
typecheck_address({Original, Normalized, _}, _, _, _, Row, Start, End) ->
    {error, {wrong_type, Original, Normalized, address, Row, Start, End}}.
%% Check a contract address literal against the expected type, given as a
%% 3-tuple whose last element is the type to check against.
%% An expected 'contract', or no known type at all, yields {contract, Data}.
typecheck_contract({_, _, Expected}, Tk, String, Data, _, _, _)
  when Expected =:= contract; Expected =:= unknown_type ->
    {ok, {{contract, Data}, Tk, String}};
typecheck_contract({_, _, address}, Tk, String, Data, _, _, _) ->
    % Leniency: the compiler would reject a contract address where an account
    % address is expected, but here it is accepted and retagged as an address.
    {ok, {{address, Data}, Tk, String}};
typecheck_contract({Original, Normalized, _}, _, _, _, Row, Start, End) ->
    {error, {wrong_type, Original, Normalized, contract, Row, Start, End}}.
%% Check a signature literal against the expected type, given as a 3-tuple
%% whose last element is the type to check against. The result is always
%% tagged as bytes. Besides 'signature' and an unknown type, the bytes types
%% {bytes, [64]} and {bytes, [any]} are accepted too; the compiler would
%% probably type-error on those, but the parser is lenient.
typecheck_signature({_, _, Expected}, Tk, String, Data, _, _, _)
  when Expected =:= signature;
       Expected =:= {bytes, [64]};
       Expected =:= {bytes, [any]};
       Expected =:= unknown_type ->
    {ok, {{bytes, Data}, Tk, String}};
typecheck_signature({Original, Normalized, _}, _, _, _, Row, Start, End) ->
    {error, {wrong_type, Original, Normalized, signature, Row, Start, End}}.
%%% List Parsing
parse_list({_, _, {list, [Inner]}}, Tk, String, Row, Start) ->
@ -762,7 +675,7 @@ check_sophia_to_fate(Type, Sophia, Fate) ->
erlang:error({to_fate_failed, Sophia, Fate, {error, Reason}})
end.
compile_entrypoint_value_and_type(Source, Entrypoint) ->
compile_entrypoint_code_and_type(Source, Entrypoint) ->
{ok, #{fate_code := FateCode, aci := ACI}} = so_compiler:from_string(Source, [{aci, json}]),
% Find the fcode for the correct entrypoint.
@ -771,13 +684,12 @@ compile_entrypoint_value_and_type(Source, Entrypoint) ->
Name = unicode:characters_to_binary(Entrypoint),
{Hash, Name} = lists:keyfind(Name, 2, Names),
{_, _, Code} = maps:get(Hash, Bodies),
FATE = extract_return_value(Code),
% Generate the AACI, and get the AACI type info for the correct entrypoint.
AACI = hz_aaci:prepare_aaci(ACI),
{ok, {_, Type}} = hz_aaci:get_function_signature(AACI, "f"),
{FATE, Type}.
{Code, Type}.
extract_return_value(#{0 := [{'RETURNR', {immediate, FATE}}]}) ->
FATE;
@ -788,10 +700,11 @@ check_parser(Sophia) ->
% Compile the literal using the compiler, to check that it is valid Sophia
% syntax, and to get an AACI object to pass to the parser.
Source = "contract C = entrypoint f() = " ++ Sophia,
{Fate, Type} = compile_entrypoint_value_and_type(Source, "f"),
{Code, Type} = compile_entrypoint_code_and_type(Source, "f"),
% Check that when we parse the term we get the same value as the Sophia
% compiler.
Fate = extract_return_value(Code),
check_sophia_to_fate(unknown_type(), Sophia, Fate),
% Then, once we know that the term is correct, make sure that it is still
@ -801,7 +714,11 @@ check_parser(Sophia) ->
check_parser_with_typedef(Typedef, Sophia) ->
% Compile the type definitions alongside the usual literal expression.
Source = "contract C =\n " ++ Typedef ++ "\n entrypoint f() = " ++ Sophia,
{Fate, Type} = compile_entrypoint_value_and_type(Source, "f"),
{Code, Type} = compile_entrypoint_code_and_type(Source, "f"),
Fate = extract_return_value(Code),
% Check the FATE term as usual.
gmb_fate_encoding:serialize(Fate),
% Do a typed parse, as usual, but there are probably record/variant
% definitions in the AACI, so untyped parses probably don't work.
@ -830,7 +747,7 @@ anon_types_test() ->
%% Runs check_parser/1 over string literals exercising the single-character
%% escapes, the two-digit \xHH form and the braced \x{...} form.
string_escape_codes_test() ->
    Literals = [
        "\" \\b\\e\\f\\n\\r\\t\\v\\\"\\\\ \"",
        "\"\\x00\\x11\\x77\\x4a\\x4A\"",
        "\"\\x{0}\\x{7}\\x{7F}\\x{07F}\\x{007F}\\x{0007F}\\x{0000007F}\"",
        "\"\\x{7F}\\x{07F}\\x{007F}\\x{0007F}\""
    ],
    lists:foreach(fun(Literal) -> check_parser(Literal) end, Literals),
    ok.
records_test() ->
@ -841,38 +758,6 @@ records_test() ->
% will error, though.
{error, {unresolved_record, _, _, _}} = parse_literal(unknown_type(), Sophia).
%% Variant constructors with zero, one and two arguments must parse when the
%% datatype definition is available, and must be reported as unresolved when
%% parsing without type information.
variant_test() ->
    Datatype = "datatype multi('a) = Zero | One('a) | Two('a, 'a)",
    Literals = ["Zero", "One(0)", "Two(0, 1)", "Two([], [1, 2, 3])"],
    lists:foreach(
        fun(Literal) -> check_parser_with_typedef(Datatype, Literal) end,
        Literals
    ),

    {error, {unresolved_variant, _, _, _}} = parse_literal(unknown_type(), "Zero"),

    ok.
%% Chain object literals: an account address, a signature written both as an
%% sg_ object and as a bytes literal, and a contract address.
chain_objects_test() ->
    AccountLiteral = "ak_2FTnrGfV8qsfHpaSEHpBrziioCpwwzLqSevHqfxQY3PaAAdARx",
    check_parser(AccountLiteral),

    % The same list holds a signature in sg_ form and one in bytes form.
    SignatureLiterals = "[sg_XDyF8LJC4tpMyAySvpaG1f5V9F2XxAbRx9iuVjvvdNMwVracLhzAuXhRM5kXAFtpwW1DCHuz5jGehUayCah4jub32Ti2n, #00112233445566778899AABBCCDDEEFF_00112233445566778899AABBCCDDEEFF_00112233445566778899AABBCCDDEEFF_00112233445566778899AABBCCDDEEFF]",
    check_parser(SignatureLiterals),

    % Contract addresses need a hand-written contract: the compiler type
    % checks them under its "contract oriented programming" rules (covariance
    % and so on), which do not fit ML style type inference, so the entrypoint
    % must declare its return type explicitly to get an AACI and return value.
    ContractLiteral = "ct_2FTnrGfV8qsfHpaSEHpBrziioCpwwzLqSevHqfxQY3PaAAdARx",
    ContractSource = "contract C = entrypoint f(): C = " ++ ContractLiteral,
    {Expected, DeclaredType} = compile_entrypoint_value_and_type(ContractSource, "f"),
    check_sophia_to_fate(DeclaredType, ContractLiteral, Expected),
    check_sophia_to_fate(unknown_type(), ContractLiteral, Expected),

    ok.
singleton_records_test() ->
TypeDef = "record singleton('a) = {it: 'a}",
check_parser_with_typedef(TypeDef, "{it = 123}"),
@ -910,28 +795,16 @@ excess_parens_test() ->
ok.
%% Checks the row/column bookkeeping of the tokenizer through the positions
%% reported in unexpected_token errors: {unexpected_token, Text, Row, StartCol,
%% EndCol}, with both columns inclusive.
lexer_offset_test() ->
    % Test that various tokens report their position correctly.
    {error, {unexpected_token, "456", 1, 5, 7}} = parse_literal("123 456"),
    {error, {unexpected_token, "[", 1, 5, 5}} = parse_literal("123 [0]"),
    {error, {unexpected_token, "ABC", 1, 5, 7}} = parse_literal("123 ABC"),
    {error, {unexpected_token, "#AA", 1, 5, 7}} = parse_literal("123 #AA"),
    {error, {unexpected_token, "\"x\"", 1, 5, 7}} = parse_literal("123 \"x\""),
    {error, {unexpected_token, "\"\\x{123}\"", 1, 5, 13}} = parse_literal("123 \"\\x{123}\""),

    % Check that the tokenizer knows its position correctly *after* various
    % tokens.
    {error, {unexpected_token, "123", 1, 5, 7}} = parse_literal("[0] 123"),
    ABCType = {"mytype", already_normalized, {variant, [{"ABC", []}]}},
    {error, {unexpected_token, "123", 1, 5, 7}} = parse_literal(ABCType, "ABC 123"),
    {error, {unexpected_token, "123", 1, 5, 7}} = parse_literal("#AA 123"),
    {error, {unexpected_token, "123", 1, 5, 7}} = parse_literal("\"x\" 123"),
    {error, {unexpected_token, "123", 1, 11, 13}} = parse_literal("\"\\x{123}\" 123"),

    % Check that the tokenizer accounts for various line separators correctly.
    {error, {unexpected_token, "ABC", 2, 1, 3}} = parse_literal("123\nABC"),
    {error, {unexpected_token, "ABC", 2, 1, 3}} = parse_literal("123\r\nABC"),
    {error, {unexpected_token, "ABC", 2, 1, 3}} = parse_literal("123\rABC"),

    ok.