%%% hz_sophia: parse Sophia literal expressions ("object notation") into the
%%% corresponding FATE terms, optionally checking them against an expected
%%% (normalized) type.
-module(hz_sophia).
-vsn("0.8.2").
-author("Jarvis Carroll ").
-copyright("Jarvis Carroll ").
-license("GPL-3.0-or-later").

-export([parse_literal/1, parse_literal/2, check_parser/1]).

-include_lib("eunit/include/eunit.hrl").

%% Parse a literal with no expected type information.
parse_literal(String) ->
    parse_literal(unknown_type(), String).

%% Parse a literal against the given type.
%% Returns {ok, FateTerm} | {error, Reason}.
parse_literal(Type, String) ->
    case parse_expression(Type, {tk, 1, 1}, String) of
        {ok, {Result, NewTk, NewString}} ->
            parse_literal2(Result, NewTk, NewString);
        {error, Reason} ->
            {error, Reason}
    end.

% We have parsed a valid expression. Now check that the string ends.
parse_literal2(Result, Tk, String) ->
    case next_token(Tk, String) of
        {ok, {{eof, _, _, _, _}, _, _}} ->
            {ok, Result};
        {ok, {{_, S, _, Row, Start, End}, _, _}} ->
            {error, {unexpected_token, S, Row, Start, End}};
        {error, Reason} ->
            {error, Reason}
    end.

%%% Tokenizer

-define(IS_ALPHA(C),
        ((((C) >= $A) and ((C) =< $Z)) or
         (((C) >= $a) and ((C) =< $z)) or
         ((C) == $_))).
-define(IS_NUM(C), (((C) >= $0) and ((C) =< $9))).
-define(IS_ALPHANUM(C), (?IS_ALPHA(C) or ?IS_NUM(C))).
-define(IS_HEX(C),
        (?IS_NUM(C) or
         (((C) >= $A) and ((C) =< $F)) or
         (((C) >= $a) and ((C) =< $f)))).
%% next_token(Tk, String) -> {ok, {Token, NewTk, Rest}} | {error, Reason}.
%%
%% Tk is a {tk, Row, Col} cursor (both 1-based). Ordinary tokens are
%% six-tuples {Kind, SourceText, Value, Row, StartCol, EndCol}; the
%% end-of-input token is the five-tuple {eof, "", Row, Col, Col}.
next_token({tk, Row, Col}, []) ->
    {ok, {{eof, "", Row, Col, Col}, {tk, Row, Col}, []}};
next_token({tk, Row, Col}, " " ++ Rest) ->
    next_token({tk, Row, Col + 1}, Rest);
next_token({tk, Row, Col}, "\t" ++ Rest) ->
    next_token({tk, Row, Col + 1}, Rest);
next_token({tk, Row, _}, "\r\n" ++ Rest) ->
    next_token({tk, Row + 1, 1}, Rest);
next_token({tk, Row, _}, "\r" ++ Rest) ->
    next_token({tk, Row + 1, 1}, Rest);
next_token({tk, Row, _}, "\n" ++ Rest) ->
    next_token({tk, Row + 1, 1}, Rest);
next_token(Tk, [C | _] = String) when ?IS_ALPHA(C) ->
    alphanum_token(Tk, Tk, String, []);
next_token(Tk, [C | _] = String) when ?IS_NUM(C) ->
    num_token(Tk, Tk, String, [], 0);
next_token({tk, Row, Col}, [$#, C | Rest]) when ?IS_HEX(C) ->
    bytes_token({tk, Row, Col}, {tk, Row, Col + 1}, [C | Rest], "#", []);
next_token({tk, Row, Col}, "\"" ++ Rest) ->
    string_token({tk, Row, Col}, {tk, Row, Col + 1}, Rest, "\"", <<>>);
next_token({tk, Row, Col}, [Char | Rest]) ->
    %% Any other single character (punctuation, brackets, ...) is its own
    %% token.
    Token = {character, [Char], Char, Row, Col, Col},
    {ok, {Token, {tk, Row, Col + 1}, Rest}}.

%% Consume a run of [A-Za-z0-9_] characters into an alphanum token.
%% The characters are accumulated in reverse, as is usual.
alphanum_token(Start, {tk, Row, Col}, [C | Rest], RevChars) when ?IS_ALPHANUM(C) ->
    alphanum_token(Start, {tk, Row, Col + 1}, Rest, [C | RevChars]);
alphanum_token({tk, _, Start}, {tk, Row, End}, String, RevChars) ->
    AlphaString = lists:reverse(RevChars),
    Token = {alphanum, AlphaString, AlphaString, Row, Start, End - 1},
    {ok, {Token, {tk, Row, End}, String}}.

%% Consume a decimal integer token, allowing single underscores between
%% digits (Sophia's digit-group separator, e.g. 1_000).
num_token(Start, {tk, Row, Col}, [C | Rest], RevChars, Value) when ?IS_NUM(C) ->
    num_token(Start, {tk, Row, Col + 1}, Rest, [C | RevChars],
              Value * 10 + (C - $0));
num_token(Start, {tk, Row, Col}, [$_, C | Rest], RevChars, Value) when ?IS_NUM(C) ->
    num_token(Start, {tk, Row, Col + 2}, Rest, [C, $_ | RevChars],
              Value * 10 + (C - $0));
num_token({tk, _, Start}, {tk, Row, End}, String, RevChars, Value) ->
    Token = {integer, lists:reverse(RevChars), Value, Row, Start, End - 1},
    {ok, {Token, {tk, Row, End}, String}}.
%% Consume hex digits (single underscores allowed between digits) into a
%% bytes token, e.g. #DEAD_BEEF. The leading "#" was already consumed.
bytes_token(Start, {tk, Row, Col}, [C | Rest], Chars, Digits) when ?IS_HEX(C) ->
    Digit = convert_digit(C),
    bytes_token(Start, {tk, Row, Col + 1}, Rest, [C | Chars], [Digit | Digits]);
bytes_token(Start, {tk, Row, Col}, [$_, C | Rest], Chars, Digits) when ?IS_HEX(C) ->
    Digit = convert_digit(C),
    % Two characters are consumed here, so advance the column by 2. (This
    % previously advanced by 1, leaving error positions off by one per
    % underscore; compare the underscore clause of num_token.)
    bytes_token(Start, {tk, Row, Col + 2}, Rest, [C, $_ | Chars], [Digit | Digits]);
bytes_token({tk, _, Start}, {tk, Row, End}, String, Chars, Digits) ->
    BytesString = lists:reverse(Chars),
    Value = reverse_combine_nibbles(Digits, <<>>),
    Token = {bytes, BytesString, Value, Row, Start, End - 1},
    {ok, {Token, {tk, Row, End}, String}}.

%% Map a hex digit character to its numeric value, 0..15.
convert_digit(C) when C >= $0, C =< $9 -> C - $0;
convert_digit(C) when C >= $A, C =< $Z -> C - $A + 10;
convert_digit(C) when C >= $a, C =< $z -> C - $a + 10.

%% The digit list arrives most-recent-first (reversed), so pair nibbles up
%% from the back of the number, prepending each completed byte. A leftover
%% odd digit gets a zero high nibble, so #ABC =:= #0ABC.
reverse_combine_nibbles([D1, D2 | Rest], Acc) ->
    NewAcc = <<D2:4, D1:4, Acc/binary>>,
    reverse_combine_nibbles(Rest, NewAcc);
reverse_combine_nibbles([D1], Acc) ->
    <<0:4, D1:4, Acc/binary>>;
reverse_combine_nibbles([], Acc) ->
    Acc.

%% Consume the remainder of a string literal (the opening quote was already
%% consumed), building the value as a UTF-8 binary. SourceChars accumulates
%% the raw source text in reverse, for error reporting.
string_token(Start, {tk, Row, Col}, "\\x" ++ String, SourceChars, Value) ->
    case escape_hex_code({tk, Row, Col}, {tk, Row, Col + 2}, String, "x\\" ++ SourceChars) of
        {ok, {Codepoint, NewSourceChars, NewTk, NewString}} ->
            % NOTE(review): the escape value is re-encoded as UTF-8 here; for
            % the two-digit \xAB form this widens bytes >= 0x80 into two
            % bytes -- confirm against the intended compiler behaviour.
            NewValue = <<Value/binary, Codepoint/utf8>>,
            string_token(Start, NewTk, NewString, NewSourceChars, NewValue);
        {error, Reason} ->
            {error, Reason}
    end;
string_token(Start, {tk, Row, Col}, [$\\, C | Rest], SourceChars, Value) ->
    case escape_char(C) of
        {ok, ByteVal} ->
            string_token(Start, {tk, Row, Col + 2}, Rest, [C, $\\ | SourceChars],
                         <<Value/binary, ByteVal>>);
        error ->
            {error, {invalid_escape_code, [C], Row, Col}}
    end;
string_token({tk, _, Start}, {tk, Row, Col}, [$" | Rest], SourceChars, Value) ->
    SourceStr = lists:reverse([$" | SourceChars]),
    Token = {string, SourceStr, Value, Row, Start, Col},
    {ok, {Token, {tk, Row, Col + 1}, Rest}};
string_token(Start, {tk, Row, Col}, [C | Rest], SourceChars, Value) ->
    % TODO: ERTS probably had to convert this FROM utf8 at some point, so why
    % bother, if we need to convert it back? I guess we could accept iolists if
    % we really wanted to waste time on this point...
    string_token(Start, {tk, Row, Col + 1}, Rest, [C | SourceChars],
                 <<Value/binary, C/utf8>>);
string_token({tk, _, StartCol}, {tk, Row, Col}, [], _SourceChars, _Value) ->
    % Input ended before the closing quote. This is the "unclosed string
    % error" that escape_long_hex_code's comment refers to; previously there
    % was no clause for [], so unterminated strings crashed.
    {error, {unclosed_string, Row, StartCol, Col}}.

%% Parse the body of a \x escape: either \x{HEX...} or exactly two hex
%% digits. Returns {ok, {Codepoint, NewSourceChars, NewTk, Rest}}.
escape_hex_code(Start, {tk, Row, Col}, "{" ++ String, SourceChars) ->
    escape_long_hex_code(Start, {tk, Row, Col + 1}, String, "{" ++ SourceChars, 0);
escape_hex_code(_, {tk, Row, Col}, [A, B | String], SourceChars) when ?IS_HEX(A), ?IS_HEX(B) ->
    % As of writing this, the Sophia compiler will convert this byte from
    % extended ASCII to unicode... But it really shouldn't. The literal parser
    % does what the compiler should do.
    Byte = convert_digit(A) * 16 + convert_digit(B),
    {ok, {Byte, [B, A | SourceChars], {tk, Row, Col + 2}, String}};
escape_hex_code({tk, Row1, Col1}, _, _, _) ->
    {error, {invalid_escape_code, "\\x", Row1, Col1}}.

%% Parse the digits of a \x{HEX...} escape up to the closing brace.
escape_long_hex_code(_, {tk, Row, Col}, "}" ++ String, SourceChars, Value) ->
    {ok, {Value, "}" ++ SourceChars, {tk, Row, Col + 1}, String}};
escape_long_hex_code(Start, {tk, Row, Col}, [C | String], SourceChars, Value) when ?IS_HEX(C) ->
    NewSourceChars = [C | SourceChars],
    NewValue = 16 * Value + convert_digit(C),
    escape_long_hex_code(Start, {tk, Row, Col + 1}, String, NewSourceChars, NewValue);
escape_long_hex_code(_, {tk, Row, Col}, [C | _], _, _) ->
    {error, {invalid_hexadecimal, [C], Row, Col}};
escape_long_hex_code(_, Tk, [], SourceChars, Value) ->
    % Just return as if the escape code were closed, and let the string parser
    % produce an unclosed string error instead.
    {ok, {Value, SourceChars, Tk, []}}.

%% Single-character string escapes; {ok, Byte} or error for unknown codes.
escape_char($b) -> {ok, $\b};
escape_char($e) -> {ok, $\e};
escape_char($f) -> {ok, $\f};
escape_char($n) -> {ok, $\n};
escape_char($r) -> {ok, $\r};
escape_char($t) -> {ok, $\t};
escape_char($v) -> {ok, $\v};
escape_char($") -> {ok, $\"};
escape_char($\\) -> {ok, $\\};
escape_char(_) -> error.

%%% Sophia Literal Parser
%%% This parser is a simple recursive descent parser, written explicitly in
%%% erlang.
%%%
%%% There are no infix operators in the subset we want to parse, so recursive
%%% descent is fine with no special tricks, no shunting yard algorithm, no
%%% parser generators, etc.
%%%
%%% If we were writing this in C then we might want to work iteratively with an
%%% array of finite state machines, i.e. with a pushdown automaton, instead of
%%% using recursion. This is a tried and true method of making fast parsers.
%%% Recall, however, that the BEAM *is* a stack machine, written in C, so
%%% rather than writing confusing iterative code in Erlang, to simulate a
%%% pushdown automaton inside another simulated stack machine... we should just
%%% write the recursive code, thus programming the BEAM to implement the
%%% pushdown automaton that we want.

%% Parse one expression of the given (normalized) type, starting at Tk.
parse_expression(Type, Tk, String) ->
    case next_token(Tk, String) of
        {ok, {Token, NewTk, NewString}} ->
            parse_expression2(Type, NewTk, NewString, Token);
        {error, Reason} ->
            {error, Reason}
    end.

%% Dispatch on the first token of the expression. Types are three-tuples of
%% the shape {OrigName, NormalizedName, NormalizedType}.
parse_expression2(Type, Tk, String, {integer, _, Value, Row, Start, End}) ->
    case Type of
        {_, _, integer} -> {ok, {Value, Tk, String}};
        {_, _, unknown_type} -> {ok, {Value, Tk, String}};
        {O, N, _} -> {error, {wrong_type, O, N, integer, Row, Start, End}}
    end;
parse_expression2(Type, Tk, String, {bytes, _, Value, Row, Start, End}) ->
    Len = byte_size(Value),
    Result = {bytes, Value},
    case Type of
        {_, _, {bytes, [any]}} -> {ok, {Result, Tk, String}};
        {_, _, {bytes, [Len]}} -> {ok, {Result, Tk, String}};
        {_, _, {bytes, [ExpectedLen]}} ->
            {error, {bytes_wrong_size, ExpectedLen, Len, Row, Start, End}};
        {_, _, unknown_type} -> {ok, {Result, Tk, String}};
        {O, N, _} -> {error, {wrong_type, O, N, {bytes, [Len]}, Row, Start, End}}
    end;
parse_expression2(Type, Tk, String, {string, _, Value, Row, Start, End}) ->
    case Type of
        {_, _, string} -> {ok, {Value, Tk, String}};
        {_, _, unknown_type} -> {ok, {Value, Tk, String}};
        {O, N, _} -> {error, {wrong_type, O, N, string, Row, Start, End}}
    end;
parse_expression2(Type, Tk, String, {character, "[", _, Row, Start, _}) ->
    parse_list(Type, Tk, String, Row, Start);
parse_expression2(Type, Tk, String, {character, "(", _, Row, Start, _}) ->
    parse_tuple(Type, Tk, String, Row, Start);
parse_expression2(Type, Tk, String, {character, "{", _, Row, Start, _}) ->
    parse_record_or_map(Type, Tk, String, Row, Start);
parse_expression2(Type, Tk, String, {alphanum, Ident, _, Row, Start, End}) ->
    parse_variant(Type, Tk, String, Ident, Row, Start, End);
parse_expression2(_, _, _, Token) ->
    % Anything else, including the eof token (which previously crashed here
    % because it has no Value element), is a parse error.
    unexpected_token_error(Token).

%% Build a standard unexpected_token error from any token, including the
%% 5-element eof token, which does not match the generic 6-element pattern.
unexpected_token_error({eof, S, Row, Start, End}) ->
    {error, {unexpected_token, S, Row, Start, End}};
unexpected_token_error({_, S, _, Row, Start, End}) ->
    {error, {unexpected_token, S, Row, Start, End}}.

%% The type value used when no type information is available.
unknown_type() -> {unknown_type, already_normalized, unknown_type}.

%% Consume exactly the given sequence of token source-texts, e.g. ["]", "="].
expect_tokens([], Tk, String) ->
    {ok, {Tk, String}};
expect_tokens([Str | Rest], Tk, String) ->
    case next_token(Tk, String) of
        {ok, {{_, Str, _, _, _, _}, NewTk, NewString}} ->
            expect_tokens(Rest, NewTk, NewString);
        {ok, {Token, _, _}} ->
            unexpected_token_error(Token);
        {error, Reason} ->
            {error, Reason}
    end.

%%% List Parsing

parse_list({_, _, {list, [Inner]}}, Tk, String, Row, Start) ->
    parse_list_loop(Inner, Tk, String, "]", Row, Start, []);
parse_list({_, _, unknown_type}, Tk, String, Row, Start) ->
    parse_list_loop(unknown_type(), Tk, String, "]", Row, Start, []);
parse_list({O, N, _}, _, _, Row, Start) ->
    {error, {wrong_type, O, N, list, Row, Start, Start}}.

%% Element-or-close stage of the loop. CloseChar is "]" for lists and ")"
%% for untyped tuples, which reuse this loop.
parse_list_loop(Inner, Tk, String, CloseChar, Row, Start, Acc) ->
    case next_token(Tk, String) of
        {ok, {{character, CloseChar, _, _, _, _}, NewTk, NewString}} ->
            {ok, {lists:reverse(Acc), NewTk, NewString}};
        {ok, {Token, NewTk, NewString}} ->
            parse_list_loop2(Inner, NewTk, NewString, CloseChar, Row, Start, Acc, Token);
        {error, Reason} ->
            {error, Reason}
    end.

%% Parse one element, annotating any error with the element's index.
parse_list_loop2(Inner, Tk, String, CloseChar, Row, Start, Acc, Token) ->
    case parse_expression2(Inner, Tk, String, Token) of
        {ok, {Value, NewTk, NewString}} ->
            parse_list_loop3(Inner, NewTk, NewString, CloseChar, Row, Start, [Value | Acc]);
        {error, Reason} ->
            Wrapper = choose_list_error_wrapper(CloseChar),
            % TODO: Are tuple indices off by one from list indices?
            Wrapped = wrap_error(Reason, {Wrapper, length(Acc)}),
            {error, Wrapped}
    end.

%% Comma-or-close stage of the loop.
parse_list_loop3(Inner, Tk, String, CloseChar, Row, Start, Acc) ->
    case next_token(Tk, String) of
        {ok, {{character, CloseChar, _, _, _, _}, NewTk, NewString}} ->
            {ok, {lists:reverse(Acc), NewTk, NewString}};
        {ok, {{character, ",", _, _, _, _}, NewTk, NewString}} ->
            parse_list_loop(Inner, NewTk, NewString, CloseChar, Row, Start, Acc);
        {ok, {Token, _, _}} ->
            % Previously there was no clause for other tokens, so inputs like
            % "[1 2]" crashed with case_clause instead of erroring.
            unexpected_token_error(Token);
        {error, Reason} ->
            {error, Reason}
    end.

%% Pick the error-wrapper tag matching the closing character in use.
choose_list_error_wrapper("]") -> list_element;
choose_list_error_wrapper(")") -> tuple_element.

%%% Ambiguous Parenthesis Parsing

parse_tuple({_, _, unknown_type}, Tk, String, Row, Start) ->
    % An untyped tuple is a list of untyped terms, and weirdly our list parser
    % works perfectly for that, as long as we change the closing character to
    % be ")" instead of "]".
    case parse_list_loop(unknown_type(), Tk, String, ")", Row, Start, []) of
        {ok, {[Inner], NewTk, NewString}} ->
            % In Sophia, singleton tuples are unwrapped, and given the inner
            % type.
            {ok, {Inner, NewTk, NewString}};
        {ok, {TermList, NewTk, NewString}} ->
            Result = {tuple, list_to_tuple(TermList)},
            {ok, {Result, NewTk, NewString}};
        {error, Reason} ->
            {error, Reason}
    end;
parse_tuple({O, N, T}, Tk, String, _, _) ->
    % Typed tuple parsing is quite complex, because we also want to support
    % normal parentheses for grouping. It's not strictly necessary for
    % inputting data, since we don't have any infix operators in simple
    % data/term notation, but the alternatives are to generate singleton tuples
    % naively, (which are impossible to generate from Sophia,) or to hard error
    % on singleton tuples! Being faithful to Sophia is clearly nice!
    %
    % Count how many ambiguous parens there are, including the one we already
    % saw.
    case count_open_parens(Tk, String, 1) of
        {ok, {Count, Token, NewTk, NewString}} ->
            % Compare that to the amount of nesting tuple connectives are in
            % the type we are expected to produce.
            {ExcessCount, HeadType, Tails} = extract_tuple_type_info(Count, {O, N, T}, []),
            % Now work out what to do with all this information.
            parse_tuple2(O, N, ExcessCount, HeadType, Tails, NewTk, NewString, Token);
        {error, Reason} ->
            {error, Reason}
    end.

%% Count consecutive open parens, returning the first non-paren token too.
count_open_parens(Tk, String, Count) ->
    case next_token(Tk, String) of
        {ok, {{character, "(", _, _, _, _}, NewTk, NewString}} ->
            count_open_parens(NewTk, NewString, Count + 1);
        {ok, {Token, NewTk, NewString}} ->
            {ok, {Count, Token, NewTk, NewString}};
        {error, Reason} ->
            {error, Reason}
    end.

%% Peel one tuple type connective per consumed open paren, collecting the
%% sibling ("tail") element types of each layer.
extract_tuple_type_info(ParenCount, {_, _, {tuple, [Head | Rest]}}, Tails) when ParenCount > 0 ->
    % Have an open paren, and a tuple type. We need to go deeper!
    extract_tuple_type_info(ParenCount - 1, Head, [Rest | Tails]);
extract_tuple_type_info(ParenCount, HeadType, Tails) ->
    % No parens, or no more (non-empty) tuples. Stop!
    {ParenCount, HeadType, Tails}.

parse_tuple2(_, _, _, {_, _, unknown_type}, [_ | _], _, _, _) ->
    {error, "Parsing of tuples with known lengths but unknown contents is not yet implemented."};
parse_tuple2(O, N, ExcessCount, HeadType, Tails, Tk, String, {character, ")", _, Row, Col, _}) ->
    parse_empty_tuple(O, N, ExcessCount, HeadType, Tails, Tk, String, Row, Col);
parse_tuple2(O, N, ExcessCount, HeadType, Tails, Tk, String, Token) ->
    % Finished with parentheses for now, try and parse an expression out, to
    % get our head term.
    case parse_expression2(HeadType, Tk, String, Token) of
        {ok, {Result, NewTk, NewString}} ->
            % Got a head term. Now try to build all the other tuple layers.
            parse_tuple_tails(O, N, ExcessCount, Result, Tails, NewTk, NewString);
        {error, Reason} ->
            % TODO: Wrap errors here too.
            {error, Reason}
    end.

parse_empty_tuple(_, _, 0, _, Tails, _, _, Row, Col) ->
    % There are zero excess parens, meaning all our parens are tuples. Get the
    % top one.
    [Tail | _] = Tails,
    % We expected some nonzero number of elements before the close paren, but
    % got zero.
    ExpectCount = 1 + length(Tail),
    {error, {not_enough_elements, ExpectCount, 0, Row, Col}};
parse_empty_tuple(O, N, ExcessCount, {_, _, {tuple, []}}, Tails, Tk, String, _, _) ->
    % If we have some ambiguous parentheses left, we now know one of them is
    % this empty tuple.
    HeadTerm = {tuple, {}},
    NewExcessCount = ExcessCount - 1,
    % Now continue the loop as if it were an integer or something, in the head
    % position.
    parse_tuple_tails(O, N, NewExcessCount, HeadTerm, Tails, Tk, String);
parse_empty_tuple(_, _, _, {HeadO, HeadN, _}, _, _, _, Row, Col) ->
    % We were expecting a head term of a different type!
    {error, {wrong_type, HeadO, HeadN, unit, Row, Col, Col}}.

parse_tuple_tails(O, N, 0, HeadTerm, [TailTypes | ParentTails], Tk, String) ->
    % Tuples left to build, but no extra open parens to deal with, so we can
    % just parse multivalues naively, starting from the "we have a term,
    % waiting for a comma" stage of the loop.
    case parse_multivalue3(TailTypes, Tk, String, -1, -1, [HeadTerm]) of
        {ok, {Terms, NewTk, NewString}} ->
            NewHead = {tuple, list_to_tuple(Terms)},
            parse_tuple_tails(O, N, 0, NewHead, ParentTails, NewTk, NewString);
        {error, Reason} ->
            % TODO: More error wrapping?
            {error, Reason}
    end;
parse_tuple_tails(_, _, 0, HeadTerm, [], Tk, String) ->
    % No open parens left, no tuples left to build, we are done!
    {ok, {HeadTerm, Tk, String}};
parse_tuple_tails(O, N, ExcessCount, HeadTerm, Tails, Tk, String) ->
    % The ambiguous case, where we have a mix of tuple parens, and grouping
    % parens. We want to peek at the next token, to see if it closes a grouping
    % paren.
    case next_token(Tk, String) of
        {ok, {{character, ")", _, _, _, _}, NewTk, NewString}} ->
            % It is grouping! Close one excess paren, and continue.
            parse_tuple_tails(O, N, ExcessCount - 1, HeadTerm, Tails, NewTk, NewString);
        {ok, {{character, ",", _, _, _, _}, NewTk, NewString}} ->
            % It is a real tuple! Try the normal logic, then.
            parse_tuple_tails2(O, N, ExcessCount, HeadTerm, Tails, NewTk, NewString);
        {ok, {Token, _, _}} ->
            % Anything else is just a boring parse error we can complain about.
            unexpected_token_error(Token);
        {error, Reason} ->
            {error, Reason}
    end.

parse_tuple_tails2(O, N, ExcessCount, HeadTerm, [TailTypes | ParentTails], Tk, String) ->
    case parse_multivalue(TailTypes, Tk, String, -1, -1, [HeadTerm]) of
        {ok, {Terms, NewTk, NewString}} ->
            NewHead = {tuple, list_to_tuple(Terms)},
            parse_tuple_tails(O, N, ExcessCount, NewHead, ParentTails, NewTk, NewString);
        {error, Reason} ->
            % TODO: wrap errors?
            {error, Reason}
    end;
parse_tuple_tails2(O, N, _, _, [], _, _) ->
    % This case is created when, for example, we want int * int, but instead we
    % get a term like ((1, 2), 3), of type (int * int) * int. The trouble is,
    % ((1, 2)) would have been valid, so it's actually the second comma that
    % tips us off to the error, not the first one.
    %
    % For simpler cases, like (1, 2) when int was expected, this error message
    % is fine:
    Err = {error, {wrong_type, O, N, tuple, -1, -1, -1}}, % TODO: Row/col
    % TODO: Generate better error messages in the cases where N *is* a tuple,
    % but the first thing inside that tuple is the problem.
    Err.

%%% Unambiguous Tuple/Variant Parsing

%% Element-or-close stage: parse comma-separated expressions of the given
%% element types up to a closing ")".
parse_multivalue(ElemTypes, Tk, String, Row, Start, Acc) ->
    case next_token(Tk, String) of
        {ok, {{character, ")", _, Row2, Start2, _}, NewTk, NewString}} ->
            check_multivalue_long_enough(ElemTypes, NewTk, NewString, Row2, Start2, Acc);
        {ok, {Token, NewTk, NewString}} ->
            parse_multivalue2(ElemTypes, NewTk, NewString, Row, Start, Acc, Token);
        {error, Reason} ->
            {error, Reason}
    end.

parse_multivalue2([Next | Rest], Tk, String, Row, Start, Acc, Token) ->
    case parse_expression2(Next, Tk, String, Token) of
        {ok, {Value, NewTk, NewString}} ->
            parse_multivalue3(Rest, NewTk, NewString, Row, Start, [Value | Acc]);
        {error, Reason} ->
            Wrapper = choose_list_error_wrapper(")"),
            % TODO: Are tuple indices off by one from list indices?
            Wrapped = wrap_error(Reason, {Wrapper, length(Acc)}),
            {error, Wrapped}
    end;
parse_multivalue2([], Tk, String, _, _, Acc, {character, ")", _, _, _, _}) ->
    {ok, {lists:reverse(Acc), Tk, String}};
parse_multivalue2([], _, _, _, _, _, Token) ->
    unexpected_token_error(Token).

%% Comma-or-close stage.
parse_multivalue3(ElemTypes, Tk, String, Row, Start, Acc) ->
    case next_token(Tk, String) of
        {ok, {{character, ")", _, Row2, Start2, _}, NewTk, NewString}} ->
            check_multivalue_long_enough(ElemTypes, NewTk, NewString, Row2, Start2, Acc);
        {ok, {{character, ",", _, _, _, _}, NewTk, NewString}} ->
            parse_multivalue(ElemTypes, NewTk, NewString, Row, Start, Acc);
        {ok, {Token, _, _}} ->
            % BUG FIX: this clause previously re-used the already-bound Row
            % and Start variables in its pattern, so any unexpected token at a
            % different position failed to match and crashed with case_clause
            % instead of producing an unexpected_token error.
            unexpected_token_error(Token);
        {error, Reason} ->
            {error, Reason}
    end.

%% A close paren was seen; make sure no expected element types remain.
check_multivalue_long_enough([], Tk, String, _, _, Acc) ->
    {ok, {lists:reverse(Acc), Tk, String}};
check_multivalue_long_enough(Remaining, _, _, Row, Col, Got) ->
    GotCount = length(Got),
    ExpectCount = length(Remaining) + GotCount,
    {error, {not_enough_elements, ExpectCount, GotCount, Row, Col}}.

%%% Variant parsing

parse_variant({_, _, {variant, Variants}}, Tk, String, Ident, Row, Start, End) ->
    parse_variant2(Variants, Tk, String, Ident, Row, Start, End);
parse_variant({_, _, unknown_type}, _, _, _, Row, Start, End) ->
    {error, {unresolved_variant, Row, Start, End}};
parse_variant({O, N, _}, _, _, _, Row, Start, End) ->
    % In normal code, identifiers can have many meanings, which can result in
    % lots of different errors. In this Sophia 'object notation', identifiers
    % can only ever be variant constructors, (sort of like the Sophia version
    % of atoms,) and so immediately lead to a type error if we aren't expecting
    % a variant.
    {error, {wrong_type, O, N, variant, Row, Start, End}}.
%% Look the constructor up in the variant type's constructor list, then
%% parse its arguments (if any).
parse_variant2(Variants, Tk, String, Ident, Row, Start, End) ->
    case lookup_variant(Ident, Variants, 0) of
        {ok, {Tag, ElemTypes}} ->
            % FATE variant values carry the arity of every constructor.
            GetArity = fun({_, OtherElemTypes}) -> length(OtherElemTypes) end,
            Arities = lists:map(GetArity, Variants),
            parse_variant3(Arities, Tag, ElemTypes, Tk, String);
        error ->
            {error, {invalid_constructor, Ident, Row, Start, End}}
    end.

parse_variant3(Arities, Tag, [], Tk, String) ->
    % Parsing of 0-arity variants is different.
    Result = {variant, Arities, Tag, {}},
    {ok, {Result, Tk, String}};
parse_variant3(Arities, Tag, ElemTypes, Tk, String) ->
    case next_token(Tk, String) of
        {ok, {{character, "(", _, Row, Start, _}, NewTk, NewString}} ->
            parse_variant4(Arities, Tag, ElemTypes, NewTk, NewString, Row, Start);
        {ok, {{_, Actual, _, Row, Start, End}, _, _}} ->
            {error, {unexpected_token, Actual, Row, Start, End}};
        {ok, {{eof, S, Row, Start, End}, _, _}} ->
            % The eof token has no Value element, so it does not match the
            % generic 6-tuple pattern above; handle it explicitly so a
            % truncated input errors instead of crashing with case_clause.
            {error, {unexpected_token, S, Row, Start, End}};
        {error, Reason} ->
            {error, Reason}
    end.

%% Parse the parenthesised constructor arguments.
parse_variant4(Arities, Tag, ElemTypes, Tk, String, Row, Start) ->
    case parse_multivalue(ElemTypes, Tk, String, Row, Start, []) of
        {ok, {Terms, NewTk, NewString}} ->
            Result = {variant, Arities, Tag, list_to_tuple(Terms)},
            {ok, {Result, NewTk, NewString}};
        {error, Reason} ->
            {error, Reason}
    end.

%% Find a constructor by name, returning its zero-based tag and element
%% types.
lookup_variant(_, [], _) ->
    error;
lookup_variant(Ident, [{Ident, ElemTypes} | _], Tag) ->
    {ok, {Tag, ElemTypes}};
lookup_variant(Ident, [_ | Rest], Tag) ->
    lookup_variant(Ident, Rest, Tag + 1).
%%% Record parsing

parse_record_or_map({_, _, {map, [KeyType, ValueType]}}, Tk, String, _, _) ->
    parse_map(KeyType, ValueType, Tk, String, #{});
parse_record_or_map({_, _, {record, Fields}}, Tk, String, _, _) ->
    parse_record(Fields, Tk, String, #{});
parse_record_or_map({_, _, unknown_type}, Tk, String, _, _) ->
    % Without type info, peek one token to distinguish "{}" (empty map),
    % "{[" (map entry), and "{ident" (a record, which we can't resolve).
    case next_token(Tk, String) of
        {ok, {{character, "}", _, _, _, _}, NewTk, NewString}} ->
            {ok, {#{}, NewTk, NewString}};
        {ok, {{character, "[", _, _, _, _}, NewTk, NewString}} ->
            parse_map2(unknown_type(), unknown_type(), NewTk, NewString, #{});
        {ok, {{alphanum, _, _, Row, Start, End}, _, _}} ->
            {error, {unresolved_record, Row, Start, End}};
        {ok, {{_, S, _, Row, Start, End}, _, _}} ->
            {error, {unexpected_token, S, Row, Start, End}};
        {ok, {{eof, S, Row, Start, End}, _, _}} ->
            % The 5-element eof token does not match the generic pattern
            % above; handle it so "{" alone errors instead of crashing.
            {error, {unexpected_token, S, Row, Start, End}};
        {error, Reason} ->
            {error, Reason}
    end;
parse_record_or_map({O, N, _}, _, _, Row, Start) ->
    {error, {wrong_type, O, N, map, Row, Start, Start}}.

%% Field-name-or-close stage of the record loop.
parse_record(Fields, Tk, String, Acc) ->
    case next_token(Tk, String) of
        {ok, {{alphanum, Ident, _, Row, Start, End}, NewTk, NewString}} ->
            parse_record2(Fields, NewTk, NewString, Acc, Ident, Row, Start, End);
        {ok, {{character, "}", _, Row, Start, End}, NewTk, NewString}} ->
            parse_record_end(Fields, NewTk, NewString, Acc, Row, Start, End);
        {ok, {{_, S, _, Row, Start, End}, _, _}} ->
            {error, {unexpected_token, S, Row, Start, End}};
        {ok, {{eof, S, Row, Start, End}, _, _}} ->
            % See parse_record_or_map: eof needs its own clause.
            {error, {unexpected_token, S, Row, Start, End}};
        {error, Reason} ->
            {error, Reason}
    end.

%% Check the field name exists in the record type.
parse_record2(Fields, Tk, String, Acc, Ident, Row, Start, End) ->
    case lists:keyfind(Ident, 1, Fields) of
        {_, Type} ->
            parse_record3(Fields, Tk, String, Acc, Ident, Row, Start, End, Type);
        false ->
            {error, {invalid_field, Ident, Row, Start, End}}
    end.

%% Check the field has not already been given a value.
parse_record3(Fields, Tk, String, Acc, Ident, Row, Start, End, Type) ->
    case maps:is_key(Ident, Acc) of
        false ->
            parse_record4(Fields, Tk, String, Acc, Ident, Type);
        true ->
            {error, {field_already_present, Ident, Row, Start, End}}
    end.
%% Expect the "=" between field name and value.
parse_record4(Fields, Tk, String, Acc, Ident, Type) ->
    case expect_tokens(["="], Tk, String) of
        {ok, {NewTk, NewString}} ->
            parse_record5(Fields, NewTk, NewString, Acc, Ident, Type);
        {error, Reason} ->
            {error, Reason}
    end.

%% Parse the field's value expression.
parse_record5(Fields, Tk, String, Acc, Ident, Type) ->
    case parse_expression(Type, Tk, String) of
        {ok, {Result, NewTk, NewString}} ->
            NewAcc = maps:put(Ident, Result, Acc),
            parse_record6(Fields, NewTk, NewString, NewAcc);
        {error, Reason} ->
            % BUG FIX: the wrapped reason must itself be re-wrapped in an
            % {error, _} tuple; previously the bare Reason was returned,
            % breaking the module's {ok, _} | {error, _} convention.
            {error, wrap_error(Reason, {record_field, Ident})}
    end.

%% Comma-or-close stage of the record loop.
parse_record6(Fields, Tk, String, Acc) ->
    case next_token(Tk, String) of
        {ok, {{character, ",", _, _, _, _}, NewTk, NewString}} ->
            parse_record(Fields, NewTk, NewString, Acc);
        {ok, {{character, "}", _, Row, Start, End}, NewTk, NewString}} ->
            parse_record_end(Fields, NewTk, NewString, Acc, Row, Start, End);
        {ok, {{_, S, _, Row, Start, End}, _, _}} ->
            {error, {unexpected_token, S, Row, Start, End}};
        {ok, {{eof, S, Row, Start, End}, _, _}} ->
            % The 5-element eof token does not match the generic pattern
            % above; handle it so truncated input errors instead of crashing.
            {error, {unexpected_token, S, Row, Start, End}};
        {error, Reason} ->
            {error, Reason}
    end.

%% The record closed; assemble the fields in type order, erroring on any
%% field that was never given a value.
parse_record_end(Fields, Tk, String, FieldValues, Row, Start, End) ->
    case parse_record_final_loop(Fields, FieldValues, []) of
        {ok, Result} ->
            {ok, {Result, Tk, String}};
        {error, {missing_field, Name}} ->
            {error, {missing_field, Name, Row, Start, End}}
    end.

parse_record_final_loop([{Name, _} | Rest], FieldValues, Acc) ->
    case maps:find(Name, FieldValues) of
        {ok, Value} ->
            parse_record_final_loop(Rest, FieldValues, [Value | Acc]);
        error ->
            {error, {missing_field, Name}}
    end;
parse_record_final_loop([], _, [Field]) ->
    % Singleton records are type-checked in Sophia, but unwrapped in the
    % resulting FATE.
    {ok, Field};
parse_record_final_loop([], _, FieldsReverse) ->
    Fields = lists:reverse(FieldsReverse),
    Tuple = list_to_tuple(Fields),
    {ok, {tuple, Tuple}}.
%%% Map Parsing

%% Entry-or-close stage: a map literal is "{" ("[" Key "]" "=" Value ","?)* "}".
parse_map(KeyType, ValueType, Tk, String, Acc) ->
    case next_token(Tk, String) of
        {ok, {{character, "[", _, _, _, _}, NewTk, NewString}} ->
            parse_map2(KeyType, ValueType, NewTk, NewString, Acc);
        {ok, {{character, "}", _, _, _, _}, NewTk, NewString}} ->
            {ok, {Acc, NewTk, NewString}};
        {ok, {{_, S, _, Row, Start, End}, _, _}} ->
            {error, {unexpected_token, S, Row, Start, End}};
        {ok, {{eof, S, Row, Start, End}, _, _}} ->
            % The 5-element eof token does not match the generic pattern
            % above; handle it so truncated input errors instead of crashing.
            {error, {unexpected_token, S, Row, Start, End}};
        {error, Reason} ->
            {error, Reason}
    end.

%% Parse the key expression (the "[" was already consumed).
parse_map2(KeyType, ValueType, Tk, String, Acc) ->
    case parse_expression(KeyType, Tk, String) of
        {ok, {Result, NewTk, NewString}} ->
            parse_map3(KeyType, ValueType, NewTk, NewString, Acc, Result);
        {error, Reason} ->
            % BUG FIX: re-wrap in {error, _}; previously the bare Reason was
            % returned, breaking the {ok, _} | {error, _} convention.
            {error, wrap_error(Reason, {map_key, maps:size(Acc)})}
    end.

%% Expect the "] =" between key and value.
parse_map3(KeyType, ValueType, Tk, String, Acc, Key) ->
    case expect_tokens(["]", "="], Tk, String) of
        {ok, {NewTk, NewString}} ->
            parse_map4(KeyType, ValueType, NewTk, NewString, Acc, Key);
        {error, Reason} ->
            {error, Reason}
    end.

%% Parse the value expression and store the entry.
parse_map4(KeyType, ValueType, Tk, String, Acc, Key) ->
    case parse_expression(ValueType, Tk, String) of
        {ok, {Result, NewTk, NewString}} ->
            NewAcc = maps:put(Key, Result, Acc),
            parse_map5(KeyType, ValueType, NewTk, NewString, NewAcc);
        {error, Reason} ->
            % TODO: wrap as a map_value error, like map_key above?
            {error, Reason}
    end.

%% Comma-or-close stage of the map loop.
parse_map5(KeyType, ValueType, Tk, String, Acc) ->
    case next_token(Tk, String) of
        {ok, {{character, ",", _, _, _, _}, NewTk, NewString}} ->
            parse_map(KeyType, ValueType, NewTk, NewString, Acc);
        {ok, {{character, "}", _, _, _, _}, NewTk, NewString}} ->
            {ok, {Acc, NewTk, NewString}};
        {ok, {{_, S, _, Row, Start, End}, _, _}} ->
            {error, {unexpected_token, S, Row, Start, End}};
        {ok, {{eof, S, Row, Start, End}, _, _}} ->
            % See parse_map: eof needs its own clause.
            {error, {unexpected_token, S, Row, Start, End}};
        {error, Reason} ->
            {error, Reason}
    end.

% TODO: annotate Reason with the given context (currently a no-op stub; the
% call sites already pass the intended wrapper tags).
wrap_error(Reason, _) -> Reason.

%%% Tests

%% Assert that parsing Sophia under Type yields exactly the Fate term.
check_sophia_to_fate(Type, Sophia, Fate) ->
    case parse_literal(Type, Sophia) of
        {ok, Fate} ->
            ok;
        {ok, FateActual} ->
            erlang:error({to_fate_failed, Sophia, Fate, {ok, FateActual}});
        {error, Reason} ->
            erlang:error({to_fate_failed, Sophia, Fate, {error, Reason}})
    end.
%% Compile Source with the Sophia compiler and return the FATE basic blocks
%% and the AACI type signature for the named entrypoint.
compile_entrypoint_code_and_type(Source, Entrypoint) ->
    {ok, #{fate_code := FateCode, aci := ACI}} =
        so_compiler:from_string(Source, [{aci, json}]),
    % Find the fcode for the correct entrypoint.
    {fcode, Bodies, NamesMap, _} = FateCode,
    Names = maps:to_list(NamesMap),
    Name = unicode:characters_to_binary(Entrypoint),
    {Hash, Name} = lists:keyfind(Name, 2, Names),
    {_, _, Code} = maps:get(Hash, Bodies),
    % Generate the AACI, and get the AACI type info for the correct entrypoint.
    AACI = hz_aaci:prepare_aaci(ACI),
    % BUG FIX: look up the requested entrypoint, not the hardcoded "f". Both
    % current callers pass "f", so behaviour is unchanged for them.
    {ok, {_, Type}} = hz_aaci:get_function_signature(AACI, Entrypoint),
    {Code, Type}.

%% A literal entrypoint compiles to a single RETURNR of an immediate value;
%% anything else means the input wasn't a plain literal.
extract_return_value(#{0 := [{'RETURNR', {immediate, FATE}}]}) ->
    FATE;
extract_return_value(Code) ->
    erlang:exit({invalid_literal_fcode, Code}).

check_parser(Sophia) ->
    % Compile the literal using the compiler, to check that it is valid Sophia
    % syntax, and to get an AACI object to pass to the parser.
    Source = "contract C = entrypoint f() = " ++ Sophia,
    {Code, Type} = compile_entrypoint_code_and_type(Source, "f"),
    % Check that when we parse the term we get the same value as the Sophia
    % compiler.
    Fate = extract_return_value(Code),
    check_sophia_to_fate(unknown_type(), Sophia, Fate),
    % Then, once we know that the term is correct, make sure that it is still
    % accepted *with* type info.
    check_sophia_to_fate(Type, Sophia, Fate).

check_parser_with_typedef(Typedef, Sophia) ->
    % Compile the type definitions alongside the usual literal expression.
    Source = "contract C =\n " ++ Typedef ++ "\n entrypoint f() = " ++ Sophia,
    {Code, Type} = compile_entrypoint_code_and_type(Source, "f"),
    Fate = extract_return_value(Code),
    % Check the FATE term is at least serializable.
    gmb_fate_encoding:serialize(Fate),
    % Do a typed parse, as usual, but there are probably record/variant
    % definitions in the AACI, so untyped parses probably don't work.
    check_sophia_to_fate(Type, Sophia, Fate).

anon_types_test() ->
    % Integers.
    check_parser("123"),
    check_parser("1_2_3"),
    % Bytes.
    check_parser("#DEAD000BEEF"),
    check_parser("#DE_AD0_00B_EEF"),
    % Strings.
    check_parser("\"hello world\""),
    % List of integers.
    check_parser("[1, 2, 3]"),
    % List of lists.
    check_parser("[[], [1], [2, 3]]"),
    % Tuple.
    check_parser("(1, [2, 3], (4, 5))"),
    % Map.
    check_parser("{[1] = 2, [3] = 4}"),
    ok.

string_escape_codes_test() ->
    check_parser("\" \\b\\e\\f\\n\\r\\t\\v\\\"\\\\ \""),
    check_parser("\"\\x00\\x11\\x77\\x4a\\x4A\""),
    check_parser("\"\\x{0}\\x{7}\\x{7F}\\x{07F}\\x{007F}\\x{0007F}\\x{0000007F}\""),
    ok.

records_test() ->
    TypeDef = "record pair = {x: int, y: int}",
    Sophia = "{x = 1, y = 2}",
    check_parser_with_typedef(TypeDef, Sophia),
    % The above won't run an untyped parse on the expression, but we can. It
    % will error, though.
    {error, {unresolved_record, _, _, _}} = parse_literal(unknown_type(), Sophia).

singleton_records_test() ->
    TypeDef = "record singleton('a) = {it: 'a}",
    check_parser_with_typedef(TypeDef, "{it = 123}"),
    check_parser_with_typedef(TypeDef, "{it = {it = {it = 5}}}"),
    check_parser_with_typedef(TypeDef, "[{it = 1}, {it = 2}, {it = 3}]"),
    ok.

singleton_variants_test() ->
    % Similar tests to the singleton records, but this time there isn't
    % actually a special case; singleton variants are in fact wrapped in the
    % FATE too.
    TypeDef = "datatype wrapped('a) = Wrap('a)",
    check_parser_with_typedef(TypeDef, "Wrap(123)"),
    check_parser_with_typedef(TypeDef, "Wrap(Wrap(123))"),
    check_parser_with_typedef(TypeDef, "[Wrap(1), Wrap(2), Wrap(3)]"),
    ok.

excess_parens_test() ->
    % 'singleton' parens are another special case, but unlike singleton
    % records, which exist in the type system, singleton parens aren't tuples
    % at all! They are just grouping, for arithmetic. For example.
    check_parser("(123)"),
    check_parser("[1, (2), ((3))]"),
    % Where this gets tricky, though, is when grouping parens are mixed with
    % tuple parens. E.g. this list of three tuples should all parse to the same
    % result.
    check_parser("[((1, 2)), ((1), 2), (((1), 2))]"),
    % Including multiple nestings of tuples and grouping, interleaved.
    check_parser("((((1), ((2, 3)))), 4)"),
    % Also empty tuples exist!
    check_parser("()"),
    check_parser("(((((), ())), ()))"),
    ok.

variant_test() ->
    TypeDef = "datatype multi('a) = Zero | One('a) | Two('a, 'a)",
    check_parser_with_typedef(TypeDef, "Zero"),
    check_parser_with_typedef(TypeDef, "One(0)"),
    check_parser_with_typedef(TypeDef, "Two(0, 1)"),
    check_parser_with_typedef(TypeDef, "Two([], [1, 2, 3])"),
    {error, {unresolved_variant, _, _, _}} = parse_literal(unknown_type(), "Zero"),
    ok.

lexer_offset_test() ->
    % Test that various tokens report their position correctly.
    {error, {unexpected_token, "456", 1, 5, 7}} = parse_literal("123 456"),
    {error, {unexpected_token, "[", 1, 5, 5}} = parse_literal("123 [0]"),
    {error, {unexpected_token, "abc", 1, 5, 7}} = parse_literal("123 abc"),
    {error, {unexpected_token, "#AA", 1, 5, 7}} = parse_literal("123 #AA"),
    {error, {unexpected_token, "\"x\"", 1, 5, 7}} = parse_literal("123 \"x\""),
    {error, {unexpected_token, "\"\\x{123}\"", 1, 5, 13}} =
        parse_literal("123 \"\\x{123}\""),
    % Check that the tokenizer knows its position correctly *after* various
    % tokens.
    {error, {unexpected_token, "123", 1, 5, 7}} = parse_literal("[0] 123"),
    ABCType = {"mytype", already_normalized, {variant, [{"abc", []}]}},
    {error, {unexpected_token, "123", 1, 5, 7}} = parse_literal(ABCType, "abc 123"),
    {error, {unexpected_token, "123", 1, 5, 7}} = parse_literal("#AA 123"),
    {error, {unexpected_token, "123", 1, 5, 7}} = parse_literal("\"x\" 123"),
    {error, {unexpected_token, "123", 1, 11, 13}} = parse_literal("\"\\x{123}\" 123"),
    % Check that the tokenizer accounts for various line separators correctly.
    {error, {unexpected_token, "abc", 2, 1, 3}} = parse_literal("123\nabc"),
    {error, {unexpected_token, "abc", 2, 1, 3}} = parse_literal("123\r\nabc"),
    {error, {unexpected_token, "abc", 2, 1, 3}} = parse_literal("123\rabc"),
    ok.