Make Hakuzaru Great Again #22

Merged
zxq9 merged 46 commits from parser into master 2026-05-10 15:26:44 +09:00
2 changed files with 132 additions and 0 deletions
Showing only changes of commit 97e32574c4 - Show all commits
+2
View File
@@ -22,6 +22,8 @@
fate_to_erlang/2,
erlang_args_to_fate/2,
get_function_signature/2]).
% Internal stuff that is useful for writing AACI unit tests.
-export([annotate_type/2]).
%%% Types
+130
View File
@@ -0,0 +1,130 @@
-module(hz_sophia).
-vsn("0.8.2").
-author("Jarvis Carroll <spiveehere@gmail.com>").
-copyright("Jarvis Carroll <spiveehere@gmail.com>").
-license("GPL-3.0-or-later").
-include_lib("eunit/include/eunit.hrl").
%% Parse String as a single literal of the given Type.
%%
%% Tokenizing starts at position {tk, 1, 1}. After one expression has been
%% parsed, parse_literal2/3 verifies that nothing but end-of-input follows.
%% Returns {ok, Value} on success; any {error, Reason} from the expression
%% parser is passed through unchanged.
parse_literal(Type, String) ->
    Origin = {tk, 1, 1},
    case parse_expression(Type, Origin, String) of
        {error, _} = Failure ->
            Failure;
        {ok, {Value, Pos, Remaining}} ->
            parse_literal2(Value, Pos, Remaining)
    end.
%% Second half of parse_literal/2: an expression has already been parsed into
%% Result, so the only acceptable next token is end-of-input.
%%
%% Returns {ok, Result} if the next token is eof,
%% {error, {unexpected_token, Text, Row, Start, End}} for any other token, and
%% passes tokenizer errors through unchanged.
parse_literal2(Result, Tk, String) ->
    Next = next_token(Tk, String),
    case Next of
        {error, _} = Failure ->
            Failure;
        {ok, {{eof, _, _, _, _}, _, _}} ->
            % Nothing but (optional) whitespace followed the expression.
            {ok, Result};
        {ok, {{_, Text, Line, From, To}, _, _}} ->
            % Trailing input after a complete expression is an error.
            {error, {unexpected_token, Text, Line, From, To}}
    end.
%%% Tokenizer
%% Read the next token from the input.
%%
%% The first argument is the cursor {tk, Row, Col} of the first unread
%% character; the second is the unread input. Returns
%% {ok, {Token, NewTk, Rest}} where Token is {Kind, Text, Row, Start, End},
%% or {error, {unknown_char, Row, Col, [Char]}} for a character that cannot
%% start any token.
%%
%% Spaces and tabs are skipped, each advancing the cursor by one COLUMN on the
%% current row. (Previously they advanced the row, which made every position
%% reported after leading whitespace wrong.) Newlines are not treated as
%% whitespace here: they fall through to the unknown_char clause.
next_token({tk, Row, Col}, []) ->
    % End of input: a zero-width eof token at the current position.
    {ok, {{eof, "", Row, Col, Col}, {tk, Row, Col}, []}};
next_token({tk, Row, Col}, " " ++ Rest) ->
    next_token({tk, Row, Col + 1}, Rest);
next_token({tk, Row, Col}, "\t" ++ Rest) ->
    next_token({tk, Row, Col + 1}, Rest);
next_token(Tk, [N | _] = String) when N >= $0, N =< $9 ->
    num_token(Tk, Tk, String, []);
next_token(Tk, [C | _] = String)
  when C >= $A, C =< $Z; C >= $a, C =< $z; C =:= $_ ->
    % Letters and underscore all start an alphanumeric token.
    alphanum_token(Tk, Tk, String, []);
next_token({tk, Row, Col}, [Char | _]) ->
    {error, {unknown_char, Row, Col, [Char]}}.
%% Consume a run of decimal digits and build an integer token.
%%
%% Arguments: the cursor at which the token started, the current cursor, the
%% unread input, and the digits consumed so far (in reverse order).
%%
%% Returns {ok, {{integer, Digits, Row, Start, End}, {tk, Row, End}, Rest}}.
%% Start is the column of the first digit and End is the column just past the
%% last digit, which is also where tokenizing resumes.
%%
%% Each digit advances the COLUMN. (Previously the row was advanced instead,
%% so End always equalled Start and the reported row drifted by one per
%% digit.)
num_token(Start, {tk, Row, Col}, [N | Rest], Acc) when N >= $0, N =< $9 ->
    num_token(Start, {tk, Row, Col + 1}, Rest, [N | Acc]);
num_token({tk, _, Start}, {tk, Row, End}, String, Acc) ->
    % First non-digit (or end of input): the accumulator holds the digits in
    % reverse, so flip it back before emitting the token.
    NumString = lists:reverse(Acc),
    Token = {integer, NumString, Row, Start, End},
    {ok, {Token, {tk, Row, End}, String}}.
%% Consume a run of letters, digits and underscores and build an alphanum
%% token.
%%
%% Arguments: the cursor at which the token started, the current cursor, the
%% unread input, and the characters consumed so far (in reverse order).
%%
%% Returns {ok, {{alphanum, Text, Row, Start, End}, {tk, Row, End}, Rest}}.
%% Start is the column of the first character and End is the column just past
%% the last character, which is also where tokenizing resumes.
%%
%% Each consumed character advances the column. (Previously the cursor was
%% never advanced, so End always equalled Start and every later token was
%% reported at a stale position.)
alphanum_token(Start, {tk, Row, Col}, [C | Rest], Acc)
  when C >= $A, C =< $Z; C >= $a, C =< $z; C >= $0, C =< $9; C =:= $_ ->
    alphanum_token(Start, {tk, Row, Col + 1}, Rest, [C | Acc]);
alphanum_token({tk, _, Start}, {tk, Row, End}, String, Acc) ->
    % First character that cannot continue the token (or end of input): the
    % accumulator is reversed, so flip it back before emitting the token.
    AlphaString = lists:reverse(Acc),
    Token = {alphanum, AlphaString, Row, Start, End},
    {ok, {Token, {tk, Row, End}, String}}.
%%% Sophia Literal Parser
%%% This parser is a simple recursive descent parser, written explicitly in
%%% Erlang.
%%%
%%% There are no infix operators in the subset we want to parse, so recursive
%%% descent is fine with no special tricks, no shunting yard algorithm, no
%%% parser generators, etc.
%%%
%%% If we were writing this in C then we might want to work iteratively with an
%%% array of finite state machines, i.e. with a pushdown automaton, instead of
%%% using recursion. This is a tried and true method of making fast parsers.
%%% Recall, however, that the BEAM *is* a stack machine, written in C, so
%%% rather than writing confusing iterative code in Erlang, to simulate a
%%% pushdown automaton inside another simulated stack machine... we should just
%%% write the recursive code, thus programming the BEAM to implement the
%%% pushdown automaton that we want.
%% Parse one expression of the given Type, starting at cursor Tk in String.
%%
%% Reads the next token and hands it to parse_expression2/4. Returns whatever
%% parse_expression2/4 returns: {ok, {Value, NewTk, Rest}} or {error, Reason}.
%%
%% Tokenizer failures (e.g. {unknown_char, ...}) are returned as
%% {error, Reason} rather than crashing with badmatch. The input string is
%% untrusted, and parse_literal/2 already expects an {error, Reason} result
%% from this function.
parse_expression(Type, Tk, String) ->
    case next_token(Tk, String) of
        {ok, {Token, NewTk, NewString}} ->
            parse_expression2(Type, NewTk, NewString, Token);
        {error, Reason} ->
            {error, Reason}
    end.
%% Interpret the token that begins an expression.
%%
%% An integer token is converted to its numeric value and type-checked against
%% Type; on success the result is {ok, {Value, Tk, String}}, where Tk and
%% String are the cursor and input following the token. Any other token
%% (including eof) yields {error, {unexpected_token, Text, Row, Start, End}}.
parse_expression2(Type, Tk, String, {integer, Digits, Line, From, To}) ->
    Parsed = {list_to_integer(Digits), Tk, String},
    check_type(integer, Type, Line, From, To, Parsed);
parse_expression2(_Type, _Tk, _String, {_Kind, Text, Line, From, To}) ->
    {error, {unexpected_token, Text, Line, From, To}}.
%% Check the type of a parsed literal against the requested type.
%%
%% Expected is the type the literal actually has; the second argument is a
%% three-element tuple whose third element is the requested type. Returns
%% {ok, Result} when the two agree, or when the requested type is
%% unknown_type. Otherwise returns
%% {error, {wrong_type, O, N, Expected, Row, Start, End}}, where O and N are
%% the first two elements of the type tuple, passed through untouched.
check_type(Expected, {_, _, Requested}, _, _, _, Result)
  when Requested =:= Expected; Requested =:= unknown_type ->
    % unknown_type lets callers opt out of type-checking altogether, since
    % FATE is dynamically typed anyway.
    {ok, Result};
check_type(Expected, {O, N, _}, Row, Start, End, _) ->
    Mismatch = {wrong_type, O, N, Expected, Row, Start, End},
    {error, Mismatch}.
%%% Tests
%% Test helper: parse Sophia as a literal of Type and require that the result
%% is exactly Fate.
%%
%% Crashes with badmatch if parsing does not return {ok, _}, and raises
%% {to_fate_failed, Expected, Actual} if the parsed value differs from Fate.
%% Returns ok otherwise.
check_sophia_to_fate(Type, Sophia, Fate) ->
    {ok, Parsed} = parse_literal(Type, Sophia),
    Parsed =:= Fate orelse erlang:error({to_fate_failed, Fate, Parsed}),
    ok.
%% Test helper: check that Sophia parses to Fate both with the real Type and
%% with type-checking disabled, then make sure the expected FATE value can be
%% serialized.
check_parser(Type, Sophia, Fate) ->
    % Same type tuple, but with its third element replaced by unknown_type so
    % that the parser skips type-checking.
    Untyped = setelement(3, Type, unknown_type),
    lists:foreach(fun(T) -> check_sophia_to_fate(T, Sophia, Fate) end,
                  [Type, Untyped]),
    % Finally, check that the FATE result is something that gmb understands.
    gmb_fate_encoding:serialize(Fate),
    ok.
%% EUnit test: the decimal literal "123" parses to the integer 123.
int_test() ->
    {ok, IntegerType} = hz_aaci:annotate_type(integer, #{}),
    Literal = "123",
    check_parser(IntegerType, Literal, 123).