Parse strings

Calculate scalar values during lexing
This saves some effort and probably some performance for things like integers, but I'm mainly doing this in anticipation of string literals, because it would just be ridiculous to read code that lexes string literals twice.
2026-01-29 06:18:06 +00:00 · 2026-01-29 04:06:19 +00:00 · 2026-01-29 03:03:11 +00:00 · 2026-01-29 02:01:16 +00:00 · 2026-01-27 06:42:55 +00:00 · 2026-01-23 06:18:39 +00:00
4 changed files with 1831 additions and 1134 deletions
@@ -190,48 +190,16 @@ l_to_i(S) ->
    end.


-spec req(Type, Message) -> RequestMap
-    when Type       :: {sign, message | binary | bitcoin}
-                     | tx
-                     | ack
-                     | sign,
-         Message    :: binary(),
-         RequestMap :: map().
-%% @doc
-%% GRIDS maps always contain the following keys:
-%% ```
-%%  #{"grids"      => 1,
-%%    "chain"      => "gajumaru",
-%%    "network_id" => "groot.mainnet.gajumaru.io",
-%%    "type"       => "message" | "binary" | "binary" | "tx" | "ack"
-%%    "public_id"  => term(),
-%%    "payload"    => string()};
-%% '''
-
 req(Type, Message) ->
    req(Type, Message, false).

-req({sign, message}, Message, ID) ->
+req(sign, Message, ID) ->
    #{"grids"      => 1,
      "chain"      => "gajumaru",
      "network_id" => hz:network_id(),
      "type"       => "message",
      "public_id"  => ID,
      "payload"    => Message};
-req({sign, binary}, Binary, ID) ->
-    #{"grids"      => 1,
-      "chain"      => "gajumaru",
-      "network_id" => hz:network_id(),
-      "type"       => "binary",
-      "public_id"  => ID,
-      "payload"    => base64:encode(Binary)};
-req({sign, bitcoin}, Binary, ID) ->
-    #{"grids"      => 1,
-      "chain"      => "gajumaru",
-      "network_id" => hz:network_id(),
-      "type"       => "bitcoin",
-      "public_id"  => ID,
-      "payload"    => base64:encode(Binary)};
 req(tx, Data, ID) ->
    #{"grids"      => 1,
      "chain"      => "gajumaru",
@@ -245,6 +213,4 @@ req(ack, Message, ID) ->
      "network_id" => hz:network_id(),
      "type"       => "ack",
      "public_id"  => ID,
-      "payload"    => Message};
-req(sign, Message, ID) ->
-    req({sign, message}, Message, ID).
+      "payload"    => Message}.
@@ -0,0 +1,638 @@
+-module(hz_sophia).
+-vsn("0.8.2").
+-author("Jarvis Carroll <spiveehere@gmail.com>").
+-copyright("Jarvis Carroll <spiveehere@gmail.com>").
+-license("GPL-3.0-or-later").
+
+-export([check_parser/1]).
+
+-include_lib("eunit/include/eunit.hrl").
+
+parse_literal(Type, String) ->
+    case parse_expression(Type, {tk, 1, 1}, String) of
+        {ok, {Result, NewTk, NewString}} ->
+            parse_literal2(Result, NewTk, NewString);
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+parse_literal2(Result, Tk, String) ->
+    % We have parsed a valid expression. Now check that the string ends.
+    case next_token(Tk, String) of
+        {ok, {{eof, _, _, _, _}, _, _}} ->
+            {ok, Result};
+        {ok, {{_, S, _, Row, Start, End}, _, _}} ->
+            {error, {unexpected_token, S, Row, Start, End}};
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+%%% Tokenizer
+
+-define(IS_ALPHA(C), ((((C) >= $A) and ((C) =< $Z)) or (((C) >= $a) and ((C) =< $z)) or ((C) == $_))).
+-define(IS_NUM(C), (((C) >= $0) and ((C) =< $9))).
+-define(IS_ALPHANUM(C), (?IS_ALPHA(C) or ?IS_NUM(C))).
+-define(IS_HEX(C), (?IS_NUM(C) or (((C) >= $A) and ((C) =< $F)) or (((C) >= $a) and ((C) =< $f)))).
+
+next_token({tk, Row, Col}, []) ->
+    {ok, {{eof, "", Row, Col, Col}, {tk, Row, Col}, []}};
+next_token({tk, Row, Col}, " " ++ Rest) ->
+    next_token({tk, Row + 1, Col}, Rest);
+next_token({tk, Row, Col}, "\t" ++ Rest) ->
+    next_token({tk, Row + 1, Col}, Rest);
+next_token({tk, _, Col}, "\r\n" ++ Rest) ->
+    next_token({tk, 1, Col + 1}, Rest);
+next_token({tk, _, Col}, "\r" ++ Rest) ->
+    next_token({tk, 1, Col + 1}, Rest);
+next_token({tk, _, Col}, "\n" ++ Rest) ->
+    next_token({tk, 1, Col + 1}, Rest);
+next_token(Tk, [C | _] = String) when ?IS_ALPHA(C) ->
+    alphanum_token(Tk, Tk, String, []);
+next_token(Tk, [C | _] = String) when ?IS_NUM(C) ->
+    num_token(Tk, Tk, String, [], 0);
+next_token({tk, Row, Col}, [$#, C | Rest]) when ?IS_HEX(C) ->
+    bytes_token({tk, Row, Col}, {tk, Row + 1, Col}, [C | Rest], "#", []);
+next_token({tk, Row, Col}, "\"" ++ Rest) ->
+    string_token({tk, Row, Col}, {tk, Row + 1, Col}, Rest, "\"", <<>>);
+next_token({tk, Row, Col}, [Char | Rest]) ->
+    Token = {character, [Char], Char, Row, Col, Col},
+    {ok, {Token, {tk, Row + 1, Col}, Rest}}.
+
+alphanum_token(Start, {tk, Row, Col}, [C | Rest], Acc) when ?IS_ALPHANUM(C) ->
+    alphanum_token(Start, {tk, Row, Col}, Rest, [C | Acc]);
+alphanum_token({tk, _, Start}, {tk, Row, End}, String, Acc) ->
+    AlphaString = lists:reverse(Acc),
+    Token = {alphanum, AlphaString, AlphaString, Row, Start, End},
+    {ok, {Token, {tk, Row, End}, String}}.
+
+num_token(Start, {tk, Row, Col}, [C | Rest], Chars, Value) when ?IS_NUM(C) ->
+    NewValue = Value * 10 + (C - $0),
+    num_token(Start, {tk, Row + 1, Col}, Rest, [C | Chars], NewValue);
+num_token(Start, {tk, Row, Col}, [$_, C | Rest], Chars, Value) when ?IS_NUM(C) ->
+    NewValue = Value * 10 + (C - $0),
+    num_token(Start, {tk, Row + 2, Col}, Rest, [C, $_ | Chars], NewValue);
+num_token({tk, _, Start}, {tk, Row, End}, String, Chars, Value) ->
+    NumString = lists:reverse(Chars),
+    Token = {integer, NumString, Value, Row, Start, End},
+    {ok, {Token, {tk, Row, End}, String}}.
+
+bytes_token(Start, {tk, Row, Col}, [C | Rest], Chars, Digits) when ?IS_HEX(C) ->
+    Digit = convert_digit(C),
+    bytes_token(Start, {tk, Row + 1, Col}, Rest, [C | Chars], [Digit | Digits]);
+bytes_token(Start, {tk, Row, Col}, [$_, C | Rest], Chars, Digits) when ?IS_HEX(C) ->
+    Digit = convert_digit(C),
+    bytes_token(Start, {tk, Row + 1, Col}, Rest, [C, $_ | Chars], [Digit | Digits]);
+bytes_token({tk, _, Start}, {tk, Row, End}, String, Chars, Digits) ->
+    BytesString = lists:reverse(Chars),
+    Value = reverse_combine_nibbles(Digits, <<>>),
+    Token = {bytes, BytesString, Value, Row, Start, End},
+    {ok, {Token, {tk, Row, End}, String}}.
+
+convert_digit(C) when C >= $0, C =< $9 ->
+    C - $0;
+convert_digit(C) when C >= $A, C =< $Z ->
+    C - $A + 10;
+convert_digit(C) when C >= $a, C =< $z ->
+    C - $a + 10.
+
+reverse_combine_nibbles([D1, D2 | Rest], Acc) ->
+    NewAcc = <<D2:4, D1:4, Acc/binary>>,
+    reverse_combine_nibbles(Rest, NewAcc);
+reverse_combine_nibbles([D1], Acc) ->
+    <<0:4, D1:4, Acc/binary>>;
+reverse_combine_nibbles([], Acc) ->
+    Acc.
+
+string_token(Start, {tk, Row, Col}, [$\\, $x, A, B | Rest], SourceChars, Value) ->
+    case escape_hex_code(A, B) of
+        {ok, ByteVal} ->
+            string_token(Start, {tk, Row + 4, Col}, Rest, [B, A, $x, $\ | SourceChars], <<Value/binary, ByteVal>>);
+        error ->
+            {error, {invalid_escape_code, [$\\, $x, A, B], Row, Col}}
+    end;
+string_token(Start, {tk, Row, Col}, [$\\, C | Rest], SourceChars, Value) ->
+    case escape_char(C) of
+        {ok, ByteVal} ->
+            string_token(Start, {tk, Row + 2, Col}, Rest, [C, $\ | SourceChars], <<Value/binary, ByteVal>>);
+        error ->
+            {error, {invalid_escape_code, [C], Row, Col}}
+    end;
+string_token({tk, _, Start}, {tk, Row, End}, [$" | Rest], SourceChars, Value) ->
+    SourceStr = lists:reverse([$" | SourceChars]),
+    Token = {string, SourceStr, Value, Row, Start, End},
+    {ok, {Token, {tk, Row, End}, Rest}};
+string_token(Start, {tk, Row, Col}, [C | Rest], SourceChars, Value) ->
+    string_token(Start, {tk, Row + 1, Col}, Rest, [C | SourceChars], <<Value/binary, C>>).
+
+escape_hex_code(A, B) when ?IS_HEX(A), ?IS_HEX(B) ->
+    % As of writing this, the Sophia compiler will convert this byte from
+    % extended ASCII to unicode... But it really shouldn't. The literal parser
+    % does what the compiler should do.
+    Byte = convert_digit(A) * 16 + convert_digit(B),
+    {ok, Byte};
+escape_hex_code(_, _) ->
+    error.
+
+escape_char($b)  -> {ok, $\b};
+escape_char($e)  -> {ok, $\e};
+escape_char($f)  -> {ok, $\f};
+escape_char($n)  -> {ok, $\n};
+escape_char($r)  -> {ok, $\r};
+escape_char($t)  -> {ok, $\t};
+escape_char($v)  -> {ok, $\v};
+escape_char($")  -> {ok, $\"};
+escape_char($\\) -> {ok, $\\};
+escape_char(_)   -> error.
+
+%%% Sophia Literal Parser
+
+%%% This parser is a simple recursive descent parser, written explicitly in
+%%% erlang.
+%%%
+%%% There are no infix operators in the subset we want to parse, so recursive
+%%% descent is fine with no special tricks, no shunting yard algorithm, no
+%%% parser generators, etc.
+%%%
+%%% If we were writing this in C then we might want to work iteratively with an
+%%% array of finite state machines, i.e. with a pushdown automaton, instead of
+%%% using recursion. This is a tried and true method of making fast parsers.
+%%% Recall, however, that the BEAM *is* a stack machine, written in C, so
+%%% rather than writing confusing iterative code in Erlang, to simulate a
+%%% pushdown automaton inside another simulated stack machine... we should just
+%%% write the recursive code, thus programming the BEAM to implement the
+%%% pushdown automaton that we want.
+
+parse_expression(Type, Tk, String) ->
+    case next_token(Tk, String) of
+        {ok, {Token, NewTk, NewString}} ->
+            parse_expression2(Type, NewTk, NewString, Token);
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+parse_expression2(Type, Tk, String, {integer, _, Value, Row, Start, End}) ->
+    case Type of
+        {_, _, integer} ->
+            {ok, {Value, Tk, String}};
+        {_, _, unknown_type} ->
+            {ok, {Value, Tk, String}};
+        {O, N, _} ->
+            {error, {wrong_type, O, N, integer, Row, Start, End}}
+    end;
+parse_expression2(Type, Tk, String, {bytes, _, Value, Row, Start, End}) ->
+    Len = byte_size(Value),
+    Result = {bytes, Value},
+    case Type of
+        {_, _, {bytes, [any]}} ->
+            {ok, {Result, Tk, String}};
+        {_, _, {bytes, [Len]}} ->
+            {ok, {Result, Tk, String}};
+        {_, _, {bytes, [ExpectedLen]}} ->
+            {error, {bytes_wrong_size, ExpectedLen, Len, Row, Start, End}};
+        {_, _, unknown_type} ->
+            {ok, {Result, Tk, String}};
+        {O, N, _} ->
+            {error, {wrong_type, O, N, {bytes, [Len]}, Row, Start, End}}
+    end;
+parse_expression2(Type, Tk, String, {string, _, Value, Row, Start, End}) ->
+    case Type of
+        {_, _, string} ->
+            {ok, {Value, Tk, String}};
+        {_, _, unknown_type} ->
+            {ok, {Value, Tk, String}};
+        {O, N, _} ->
+            {error, {wrong_type, O, N, string, Row, Start, End}}
+    end;
+parse_expression2(Type, Tk, String, {character, "[", _, Row, Start, _}) ->
+    parse_list(Type, Tk, String, Row, Start);
+parse_expression2(Type, Tk, String, {character, "(", _, Row, Start, _}) ->
+    parse_tuple(Type, Tk, String, Row, Start);
+parse_expression2(Type, Tk, String, {character, "{", _, Row, Start, _}) ->
+    parse_record_or_map(Type, Tk, String, Row, Start);
+parse_expression2(Type, Tk, String, {alphanum, Ident, _, Row, Start, End}) ->
+    parse_variant(Type, Tk, String, Ident, Row, Start, End);
+parse_expression2(_, _, _, {_, S, _, Row, Start, End}) ->
+    {error, {unexpected_token, S, Row, Start, End}}.
+
+unknown_type() ->
+    {unknown_type, already_normalized, unknown_type}.
+
+expect_tokens([], Tk, String) ->
+    {ok, {Tk, String}};
+expect_tokens([Str | Rest], Tk, String) ->
+    case next_token(Tk, String) of
+        {ok, {{_, Str, _, _, _, _}, NewTk, NewString}} ->
+            expect_tokens(Rest, NewTk, NewString);
+        {ok, {{_, Actual, _, Row, Start, End}}} ->
+            {error, {unexpected_token, Actual, Row, Start, End}}
+    end.
+
+%%% List Parsing
+
+parse_list({_, _, {list, [Inner]}}, Tk, String, Row, Start) ->
+    parse_list_loop(Inner, Tk, String, "]", Row, Start, []);
+parse_list({_, _, unknown_type}, Tk, String, Row, Start) ->
+    parse_list_loop(unknown_type(), Tk, String, "]", Row, Start, []);
+parse_list({O, N, _}, _, _, Row, Start) ->
+    {error, {wrong_type, O, N, list, Row, Start, Start}}.
+
+parse_list_loop(Inner, Tk, String, CloseChar, Row, Start, Acc) ->
+    case next_token(Tk, String) of
+        {ok, {{character, CloseChar, _, _, _, _}, NewTk, NewString}} ->
+            {ok, {lists:reverse(Acc), NewTk, NewString}};
+        {ok, {Token, NewTk, NewString}} ->
+            parse_list_loop2(Inner, NewTk, NewString, CloseChar, Row, Start, Acc, Token)
+    end.
+
+parse_list_loop2(Inner, Tk, String, CloseChar, Row, Start, Acc, Token) ->
+    case parse_expression2(Inner, Tk, String, Token) of
+        {ok, {Value, NewTk, NewString}} ->
+            parse_list_loop3(Inner, NewTk, NewString, CloseChar, Row, Start, [Value | Acc]);
+        {error, Reason} ->
+            Wrapper = choose_list_error_wrapper(CloseChar),
+            % TODO: Are tuple indices off by one from list indices?
+            Wrapped = wrap_error(Reason, {Wrapper, length(Acc)}),
+            {error, Wrapped}
+    end.
+
+parse_list_loop3(Inner, Tk, String, CloseChar, Row, Start, Acc) ->
+    case next_token(Tk, String) of
+        {ok, {{character, CloseChar, _, _, _, _}, NewTk, NewString}} ->
+            {ok, {lists:reverse(Acc), NewTk, NewString}};
+        {ok, {{character, ",", _, _, _, _}, NewTk, NewString}} ->
+            parse_list_loop(Inner, NewTk, NewString, CloseChar, Row, Start, Acc);
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+choose_list_error_wrapper("]") -> list_element;
+choose_list_error_wrapper(")") -> tuple_element.
+
+%%% Tuple Parsing
+
+parse_tuple({_, _, {tuple, Types}}, Tk, String, Row, Start) ->
+    case parse_multivalue(Types, Tk, String, Row, Start, []) of
+        {ok, {TermList, NewTk, NewString}} ->
+            Result = {tuple, list_to_tuple(TermList)},
+            {ok, {Result, NewTk, NewString}};
+        {error, Reason} ->
+            {error, Reason}
+    end;
+parse_tuple({_, _, unknown_type}, Tk, String, Row, Start) ->
+    % An untyped tuple is a list of untyped terms, and weirdly our list parser
+    % works perfectly for that, as long as we change the closing character to
+    % be ")" instead of "]".
+    case parse_list_loop(unknown_type(), Tk, String, ")", Row, Start, []) of
+        {ok, {TermList, NewTk, NewString}} ->
+            Result = {tuple, list_to_tuple(TermList)},
+            {ok, {Result, NewTk, NewString}};
+        {error, Reason} ->
+            {error, Reason}
+    end;
+parse_tuple({O, N, _}, _, _, Row, Start) ->
+    {error, {wrong_type, O, N, tuple, Row, Start, Start}}.
+
+parse_multivalue(ElemTypes, Tk, String, Row, Start, Acc) ->
+    case next_token(Tk, String) of
+        {ok, {{character, ")", Row2, Start2, _}, NewTk, NewString}} ->
+            check_multivalue_long_enough(ElemTypes, NewTk, NewString, Row2, Start2, Acc);
+        {ok, {Token, NewTk, NewString}} ->
+            parse_multivalue2(ElemTypes, NewTk, NewString, Row, Start, Acc, Token)
+    end.
+
+parse_multivalue2([Next | Rest], Tk, String, Row, Start, Acc, Token) ->
+    case parse_expression2(Next, Tk, String, Token) of
+        {ok, {Value, NewTk, NewString}} ->
+            parse_multivalue3(Rest, NewTk, NewString, Row, Start, [Value | Acc]);
+        {error, Reason} ->
+            Wrapper = choose_list_error_wrapper(")"),
+            % TODO: Are tuple indices off by one from list indices?
+            Wrapped = wrap_error(Reason, {Wrapper, length(Acc)}),
+            {error, Wrapped}
+    end;
+parse_multivalue2([], Tk, String, _, _, Acc, {character, ")", _, _, _}) ->
+    {ok, {lists:reverse(Acc), Tk, String}};
+parse_multivalue2([], _, _, _, _, _, {_, S, _, Row, Start, End}) ->
+    {error, {unexpected_token, S, Row, Start, End}}.
+
+parse_multivalue3(ElemTypes, Tk, String, Row, Start, Acc) ->
+    case next_token(Tk, String) of
+        {ok, {{character, ")", _, Row2, Start2, _}, NewTk, NewString}} ->
+            check_multivalue_long_enough(ElemTypes, NewTk, NewString, Row2, Start2, Acc);
+        {ok, {{character, ",", _, _, _, _}, NewTk, NewString}} ->
+            parse_multivalue(ElemTypes, NewTk, NewString, Row, Start, Acc);
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+check_multivalue_long_enough([], Tk, String, _, _, Acc) ->
+    {ok, {lists:reverse(Acc), Tk, String}};
+check_multivalue_long_enough(Remaining, _, _, Row, Col, Got) ->
+    GotCount = length(Got),
+    ExpectCount = length(Remaining) + GotCount,
+    {error, {not_enough_elements, ExpectCount, GotCount, Row, Col}}.
+
+%%% Variant parsing
+
+parse_variant({_, _, {variant, Variants}}, Tk, String, Ident, Row, Start, End) ->
+    parse_variant2(Variants, Tk, String, Ident, Row, Start, End);
+parse_variant({_, _, unknown_type}, _, _, _, Row, Start, End) ->
+    {error, {unresolved_variant, Row, Start, End}};
+parse_variant({O, N, _}, _, _, _, Row, Start, End) ->
+    % In normal code, identifiers can have many meanings, which can result in
+    % lots of different errors. In this Sophia 'object notation', identifiers
+    % can only ever be variant constructors, (sort of like the Sophia version
+    % of atoms,) and so immediately lead to a type error if we aren't expecting
+    % a variant.
+    {error, {wrong_type, O, N, variant, Row, Start, End}}.
+
+parse_variant2(Variants, Tk, String, Ident, Row, Start, End) ->
+    case lookup_variant(Ident, Variants, 0) of
+        {ok, {Tag, ElemTypes}} ->
+            GetArity = fun({_, OtherElemTypes}) -> length(OtherElemTypes) end,
+            Arities = lists:map(GetArity, Variants),
+            parse_variant3(Arities, Tag, ElemTypes, Tk, String);
+        error ->
+            {error, {invalid_constructor, Ident, Row, Start, End}}
+    end.
+
+parse_variant3(Arities, Tag, [], Tk, String) ->
+    % Parsing of 0-arity variants is different.
+    Result = {variant, Arities, Tag, {}},
+    {ok, {Result, Tk, String}};
+parse_variant3(Arities, Tag, ElemTypes, Tk, String) ->
+    case next_token(Tk, String) of
+        {ok, {{character, "(", _, Row, Start, _}, NewTk, NewString}} ->
+            parse_variant4(Arities, Tag, ElemTypes, NewTk, NewString, Row, Start);
+        {ok, {{_, Actual, _, Row, Start, End}}} ->
+            {error, {unexpected_token, Actual, Row, Start, End}}
+    end.
+
+parse_variant4(Arities, Tag, ElemTypes, Tk, String, Row, Start) ->
+    case parse_multivalue(ElemTypes, Tk, String, Row, Start, []) of
+        {ok, {Terms, NewTk, NewString}} ->
+            Result = {variant, Arities, Tag, list_to_tuple(Terms)},
+            {ok, {Result, NewTk, NewString}};
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+lookup_variant(_, [], _) ->
+    error;
+lookup_variant(Ident, [{Ident, ElemTypes} | _], Tag) ->
+    {ok, {Tag, ElemTypes}};
+lookup_variant(Ident, [_ | Rest], Tag) ->
+    lookup_variant(Ident, Rest, Tag + 1).
+
+%%% Record parsing
+
+parse_record_or_map({_, _, {map, [KeyType, ValueType]}}, Tk, String, _, _) ->
+    parse_map(KeyType, ValueType, Tk, String, #{});
+parse_record_or_map({_, _, {record, Fields}}, Tk, String, _, _) ->
+    parse_record(Fields, Tk, String, #{});
+parse_record_or_map({_, _, unknown_type}, Tk, String, _, _) ->
+    case next_token(Tk, String) of
+        {ok, {{character, "}", _, _, _, _}, NewTk, NewString}} ->
+            {ok, {#{}, NewTk, NewString}};
+        {ok, {{character, "[", _, _, _, _}, NewTk, NewString}} ->
+            parse_map2(unknown_type(), unknown_type(), NewTk, NewString, #{});
+        {ok, {{alphanum, _, _, Row, Start, End}, _, _}} ->
+            {error, {unresolved_record, Row, Start, End}};
+        {ok, {{_, S, _, Row, Start, End}, _, _}} ->
+            {error, {unexpected_token, S, Row, Start, End}}
+    end;
+parse_record_or_map({O, N, _}, _, _, Row, Start) ->
+    {error, {wrong_type, O, N, map, Row, Start, Start}}.
+
+parse_record(Fields, Tk, String, Acc) ->
+    case next_token(Tk, String) of
+        {ok, {{alphanum, Ident, _, Row, Start, End}, NewTk, NewString}} ->
+            parse_record2(Fields, NewTk, NewString, Acc, Ident, Row, Start, End);
+        {ok, {{character, "}", _, Row, Start, End}, NewTk, NewString}} ->
+            parse_record_end(Fields, NewTk, NewString, Acc, Row, Start, End);
+        {ok, {{_, S, _, Row, Start, End}, _, _}} ->
+            {error, {unexpected_token, S, Row, Start, End}};
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+parse_record2(Fields, Tk, String, Acc, Ident, Row, Start, End) ->
+    case lists:keyfind(Ident, 1, Fields) of
+        {_, Type} ->
+            parse_record3(Fields, Tk, String, Acc, Ident, Row, Start, End, Type);
+        false ->
+            {error, {invalid_field, Ident, Row, Start, End}}
+    end.
+
+parse_record3(Fields, Tk, String, Acc, Ident, Row, Start, End, Type) ->
+    case maps:is_key(Ident, Acc) of
+        false ->
+            parse_record4(Fields, Tk, String, Acc, Ident, Type);
+        true ->
+            {error, {field_already_present, Ident, Row, Start, End}}
+    end.
+
+parse_record4(Fields, Tk, String, Acc, Ident, Type) ->
+    case expect_tokens(["="], Tk, String) of
+        {ok, {NewTk, NewString}} ->
+            parse_record5(Fields, NewTk, NewString, Acc, Ident, Type);
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+parse_record5(Fields, Tk, String, Acc, Ident, Type) ->
+    case parse_expression(Type, Tk, String) of
+        {ok, {Result, NewTk, NewString}} ->
+            NewAcc = maps:put(Ident, Result, Acc),
+            parse_record6(Fields, NewTk, NewString, NewAcc);
+        {error, Reason} ->
+            wrap_error(Reason, {record_field, Ident})
+    end.
+
+parse_record6(Fields, Tk, String, Acc) ->
+    case next_token(Tk, String) of
+        {ok, {{character, ",", _, _, _, _}, NewTk, NewString}} ->
+            parse_record(Fields, NewTk, NewString, Acc);
+        {ok, {{character, "}", _, Row, Start, End}, NewTk, NewString}} ->
+            parse_record_end(Fields, NewTk, NewString, Acc, Row, Start, End);
+        {ok, {{_, S, _, Row, Start, End}, _, _}} ->
+            {error, {unexpected_token, S, Row, Start, End}};
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+parse_record_end(Fields, Tk, String, FieldValues, Row, Start, End) ->
+    case parse_record_final_loop(Fields, FieldValues, []) of
+        {ok, Result} ->
+            {ok, {Result, Tk, String}};
+        {error, {missing_field, Name}} ->
+            {error, {missing_field, Name, Row, Start, End}}
+    end.
+
+parse_record_final_loop([{Name, _} | Rest], FieldValues, Acc) ->
+    case maps:find(Name, FieldValues) of
+        {ok, Value} ->
+            parse_record_final_loop(Rest, FieldValues, [Value | Acc]);
+        error ->
+            {error, {missing_field, Name}}
+    end;
+parse_record_final_loop([], _, FieldsReverse) ->
+    Fields = lists:reverse(FieldsReverse),
+    Tuple = list_to_tuple(Fields),
+    {ok, {tuple, Tuple}}.
+
+
+%%% Map Parsing
+
+parse_map(KeyType, ValueType, Tk, String, Acc) ->
+    case next_token(Tk, String) of
+        {ok, {{character, "[", _, _, _, _}, NewTk, NewString}} ->
+            parse_map2(KeyType, ValueType, NewTk, NewString, Acc);
+        {ok, {{character, "}", _, _, _, _}, NewTk, NewString}} ->
+            {ok, {Acc, NewTk, NewString}};
+        {ok, {{_, S, _, Row, Start, End}}} ->
+            {error, {unexpected_token, S, Row, Start, End}}
+    end.
+
+parse_map2(KeyType, ValueType, Tk, String, Acc) ->
+    case parse_expression(KeyType, Tk, String) of
+        {ok, {Result, NewTk, NewString}} ->
+            parse_map3(KeyType, ValueType, NewTk, NewString, Acc, Result);
+        {error, Reason} ->
+            wrap_error(Reason, {map_key, maps:size(Acc)})
+    end.
+
+parse_map3(KeyType, ValueType, Tk, String, Acc, Key) ->
+    case expect_tokens(["]", "="], Tk, String) of
+        {ok, {NewTk, NewString}} ->
+            parse_map4(KeyType, ValueType, NewTk, NewString, Acc, Key);
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+parse_map4(KeyType, ValueType, Tk, String, Acc, Key) ->
+    case parse_expression(ValueType, Tk, String) of
+        {ok, {Result, NewTk, NewString}} ->
+            NewAcc = maps:put(Key, Result, Acc),
+            parse_map5(KeyType, ValueType, NewTk, NewString, NewAcc);
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+parse_map5(KeyType, ValueType, Tk, String, Acc) ->
+    case next_token(Tk, String) of
+        {ok, {{character, ",", _, _, _, _}, NewTk, NewString}} ->
+            parse_map(KeyType, ValueType, NewTk, NewString, Acc);
+        {ok, {{character, "}", _, _, _, _}, NewTk, NewString}} ->
+            {ok, {Acc, NewTk, NewString}};
+        {ok, {{_, S, _, Row, Start, End}}} ->
+            {error, {unexpected_token, S, Row, Start, End}}
+    end.
+
+% TODO
+wrap_error(Reason, _) -> Reason.
+
+%%% Tests
+
+check_sophia_to_fate(Type, Sophia, Fate) ->
+    case parse_literal(Type, Sophia) of
+        {ok, Fate} ->
+            ok;
+        {ok, FateActual} ->
+            erlang:error({to_fate_failed, Sophia, Fate, {ok, FateActual}});
+        {error, Reason} ->
+            erlang:error({to_fate_failed, Sophia, Fate, {error, Reason}})
+    end.
+
+compile_entrypoint_code_and_type(Source, Entrypoint) ->
+    {ok, #{fate_code := FateCode, aci := ACI}} = so_compiler:from_string(Source, [{aci, json}]),
+
+    % Find the fcode for the correct entrypoint.
+    {fcode, Bodies, NamesMap, _} = FateCode,
+    Names = maps:to_list(NamesMap),
+    Name = unicode:characters_to_binary(Entrypoint),
+    {Hash, Name} = lists:keyfind(Name, 2, Names),
+    {_, _, Code} = maps:get(Hash, Bodies),
+
+    % Generate the AACI, and get the AACI type info for the correct entrypoint.
+    AACI = hz_aaci:prepare_aaci(ACI),
+    {ok, {_, Type}} = hz_aaci:get_function_signature(AACI, "f"),
+
+    {Code, Type}.
+
+extract_return_value(#{0 := [{'RETURNR', {immediate, FATE}}]}) ->
+    FATE;
+extract_return_value(Code) ->
+    erlang:exit({invalid_literal_fcode, Code}).
+
+check_parser(Sophia) ->
+    % Compile the literal using the compiler, to check that it is valid Sophia
+    % syntax, and to get an AACI object to pass to the parser.
+    Source = "contract C = entrypoint f() = " ++ Sophia,
+    {Code, Type} = compile_entrypoint_code_and_type(Source, "f"),
+    Fate = extract_return_value(Code),
+
+    % Also check that the FATE term is valid, by running it through gmb.
+    gmb_fate_encoding:serialize(Fate),
+
+    % Now check that our parser produces that output.
+    check_sophia_to_fate(Type, Sophia, Fate),
+    % Also check that it can be parsed without type information.
+    check_sophia_to_fate(unknown_type(), Sophia, Fate).
+
+check_parser_with_typedef(Typedef, Sophia) ->
+    % Compile the type definitions alongside the usual literal expression.
+    Source = "contract C =\n  " ++ Typedef ++ "\n  entrypoint f() = " ++ Sophia,
+    {Code, Type} = compile_entrypoint_code_and_type(Source, "f"),
+    Fate = extract_return_value(Code),
+
+    % Check the FATE term as usual.
+    gmb_fate_encoding:serialize(Fate),
+
+    % Do a typed parse, as usual, but there are probably record/variant
+    % definitions in the AACI, so untyped parses probably don't work.
+    check_sophia_to_fate(Type, Sophia, Fate).
+
+anon_types_test() ->
+    % Integers.
+    check_parser("123"),
+    check_parser("1_2_3"),
+    % Bytes.
+    check_parser("#DEAD000BEEF"),
+    check_parser("#DE_AD0_00B_EEF"),
+    % Strings.
+    check_parser("\"hello world\""),
+    check_parser("\"  \\b\\e\\f\\n\\r\\t\\v\\\"\\\\  \""),
+    check_parser("\"\\x00\\x11\\x77\""),
+    % List of integers.
+    check_parser("[1, 2, 3]"),
+    % List of lists.
+    check_parser("[[], [1], [2, 3]]"),
+    % Tuple.
+    check_parser("(1, [2, 3], (4, 5))"),
+    % Map.
+    check_parser("{[1] = 2, [3] = 4}"),
+
+    ok.
+
+records_test() ->
+    TypeDef = "record pair = {x: int, y: int}",
+    Sophia = "{x = 1, y = 2}",
+    check_parser_with_typedef(TypeDef, Sophia),
+    % The above won't run an untyped parse on the expression, but we can. It
+    % will error, though.
+    {error, {unresolved_record, _, _, _}} = parse_literal(unknown_type(), Sophia).
+
+variant_test() ->
+    TypeDef = "datatype multi('a) = Zero | One('a) | Two('a, 'a)",
+
+    check_parser_with_typedef(TypeDef, "Zero"),
+    check_parser_with_typedef(TypeDef, "One(0)"),
+    check_parser_with_typedef(TypeDef, "Two(0, 1)"),
+    check_parser_with_typedef(TypeDef, "Two([], [1, 2, 3])"),
+
+    {error, {unresolved_variant, _, _, _}} = parse_literal(unknown_type(), "Zero"),
+
+    ok.
+
+
Author	SHA1	Message	Date
Jarvis Carroll	49cd8b6687	Parse strings	2026-01-29 06:18:06 +00:00
Jarvis Carroll	966b4b2748	Calculate scalar values during lexing This saves some effort and probably some performance for things like integers, but I'm mainly doing this in anticipation of string literals, because it would just be ridiculous to read code that lexes string literals twice.	2026-01-29 04:06:19 +00:00
Jarvis Carroll	fe182a5233	Handle underscores in integers/bytes This forces us to test for alpha/num/hex enough times that it's now worth making macros for these things.	2026-01-29 03:03:11 +00:00
Jarvis Carroll	f1696e2b9e	Bytes lexing I don't handle underscores in bytes correctly... Nor in integers, for that matter.	2026-01-29 02:01:16 +00:00
Jarvis Carroll	2bf384ca82	Infer correct values for tests automatically Now tests compare the literal parser against the output of the compiler. The little example contracts we are compiling for the AACI already had the FATE value in them, in the form of the instruction {'RETURNR', {immediate, FateValue}} so we just extract that and use it for the tests.	2026-01-27 06:42:55 +00:00
Jarvis Carroll	4f2a3c6c6f	Variant parsing	2026-01-23 06:18:39 +00:00
Jarvis Carroll	7df04a81be	Tuple parsing	2026-01-23 02:45:23 +00:00
Jarvis Carroll	6f02d4c4e6	Record parsing	2026-01-23 00:48:06 +00:00
Jarvis Carroll	56e63051bc	Map parsing	2026-01-16 05:46:27 +00:00
Jarvis Carroll	3f1c9bd626	List parsing Slowly chipping away at cases...	2026-01-15 09:38:04 +00:00
Jarvis Carroll	97e32574c4	set up parsing structure We tokenize, and then do the simplest possible recursive descent. We don't want to evaluate anything, so infix operators are out, meaning no shunting yard or tree rearranging or LR(1) shenanigans are necessary, just write the code. If we want to 'peek', just take the next token, and pass it around from that point on, until it can actually be consumed.	2026-01-15 01:52:30 +00:00
Jarvis Carroll	6f5525afcf	Rename get_function_signature hz_aaci:aaci_get_function_signature is a bit redundant.	2026-01-15 01:50:50 +00:00
Jarvis Carroll	4f1958b210	use lists:unzip/1 Just a little thing I noticed could be improved.	2026-01-13 01:19:29 +00:00
Jarvis Carroll	3da9bd570b	split coerce/3 into two functions Also renamed coerce_bindings to erlang_args_to_fate, to match.	2026-01-09 04:39:58 +00:00
Jarvis Carroll	d2163c1ff8	split AACI out of hz.erl So far the interface to hz.erl is mostly unchanged, apart from prepare_aaci/1 Maybe prepare_aaci should be re-exported, but using it is exactly in line with the 'inconvenient but more flexible primitives' that hz_aaci.erl is meant to represent, so, maybe that is a fine place to have to go for it, dunno.	2026-01-07 09:40:55 +00:00