Calculate scalar values during lexing

This saves some effort, and probably improves performance, for things like integers, but I'm mainly doing this in anticipation of string literals: it would be ridiculous to have code that lexes every string literal twice.
This commit is contained in:
Jarvis Carroll 2026-01-29 04:06:19 +00:00
parent fe182a5233
commit 966b4b2748

View File

@ -21,7 +21,7 @@ parse_literal2(Result, Tk, String) ->
case next_token(Tk, String) of case next_token(Tk, String) of
{ok, {{eof, _, _, _, _}, _, _}} -> {ok, {{eof, _, _, _, _}, _, _}} ->
{ok, Result}; {ok, Result};
{ok, {{_, S, Row, Start, End}, _, _}} -> {ok, {{_, S, _, Row, Start, End}, _, _}} ->
{error, {unexpected_token, S, Row, Start, End}}; {error, {unexpected_token, S, Row, Start, End}};
{error, Reason} -> {error, Reason} ->
{error, Reason} {error, Reason}
@ -49,38 +49,58 @@ next_token({tk, _, Col}, "\n" ++ Rest) ->
next_token(Tk, [C | _] = String) when ?IS_ALPHA(C) -> next_token(Tk, [C | _] = String) when ?IS_ALPHA(C) ->
alphanum_token(Tk, Tk, String, []); alphanum_token(Tk, Tk, String, []);
next_token(Tk, [C | _] = String) when ?IS_NUM(C) -> next_token(Tk, [C | _] = String) when ?IS_NUM(C) ->
num_token(Tk, Tk, String, []); num_token(Tk, Tk, String, [], 0);
next_token({tk, Row, Col}, [$#, C | Rest]) when ?IS_HEX(C) -> next_token({tk, Row, Col}, [$#, C | Rest]) when ?IS_HEX(C) ->
bytes_token({tk, Row, Col}, {tk, Row + 2, Col}, Rest, [C, $#]); bytes_token({tk, Row, Col}, {tk, Row + 1, Col}, [C | Rest], "#", []);
next_token({tk, Row, Col}, [Char | Rest]) -> next_token({tk, Row, Col}, [Char | Rest]) ->
Token = {character, [Char], Row, Col, Col}, Token = {character, [Char], Char, Row, Col, Col},
{ok, {Token, {tk, Row + 1, Col}, Rest}}. {ok, {Token, {tk, Row + 1, Col}, Rest}}.
alphanum_token(Start, {tk, Row, Col}, [C | Rest], Acc) when ?IS_ALPHANUM(C) -> alphanum_token(Start, {tk, Row, Col}, [C | Rest], Acc) when ?IS_ALPHANUM(C) ->
alphanum_token(Start, {tk, Row, Col}, Rest, [C | Acc]); alphanum_token(Start, {tk, Row, Col}, Rest, [C | Acc]);
alphanum_token({tk, _, Start}, {tk, Row, End}, String, Acc) -> alphanum_token({tk, _, Start}, {tk, Row, End}, String, Acc) ->
AlphaString = lists:reverse(Acc), AlphaString = lists:reverse(Acc),
Token = {alphanum, AlphaString, Row, Start, End}, Token = {alphanum, AlphaString, AlphaString, Row, Start, End},
{ok, {Token, {tk, Row, End}, String}}. {ok, {Token, {tk, Row, End}, String}}.
num_token(Start, {tk, Row, Col}, [C | Rest], Acc) when ?IS_NUM(C) -> num_token(Start, {tk, Row, Col}, [C | Rest], Chars, Value) when ?IS_NUM(C) ->
num_token(Start, {tk, Row + 1, Col}, Rest, [C | Acc]); NewValue = Value * 10 + (C - $0),
num_token(Start, {tk, Row, Col}, [$_, C | Rest], Acc) when ?IS_NUM(C) -> num_token(Start, {tk, Row + 1, Col}, Rest, [C | Chars], NewValue);
num_token(Start, {tk, Row + 2, Col}, Rest, [C, $_ | Acc]); num_token(Start, {tk, Row, Col}, [$_, C | Rest], Chars, Value) when ?IS_NUM(C) ->
num_token({tk, _, Start}, {tk, Row, End}, String, Acc) -> NewValue = Value * 10 + (C - $0),
NumString = lists:reverse(Acc), num_token(Start, {tk, Row + 2, Col}, Rest, [C, $_ | Chars], NewValue);
Token = {integer, NumString, Row, Start, End}, num_token({tk, _, Start}, {tk, Row, End}, String, Chars, Value) ->
NumString = lists:reverse(Chars),
Token = {integer, NumString, Value, Row, Start, End},
{ok, {Token, {tk, Row, End}, String}}. {ok, {Token, {tk, Row, End}, String}}.
bytes_token(Start, {tk, Row, Col}, [C | Rest], Acc) when ?IS_HEX(C) -> bytes_token(Start, {tk, Row, Col}, [C | Rest], Chars, Digits) when ?IS_HEX(C) ->
bytes_token(Start, {tk, Row + 1, Col}, Rest, [C | Acc]); Digit = convert_digit(C),
bytes_token(Start, {tk, Row, Col}, [$_, C | Rest], Acc) when ?IS_HEX(C) -> bytes_token(Start, {tk, Row + 1, Col}, Rest, [C | Chars], [Digit | Digits]);
bytes_token(Start, {tk, Row + 1, Col}, Rest, [C, $_ | Acc]); bytes_token(Start, {tk, Row, Col}, [$_, C | Rest], Chars, Digits) when ?IS_HEX(C) ->
bytes_token({tk, _, Start}, {tk, Row, End}, String, Acc) -> Digit = convert_digit(C),
BytesString = lists:reverse(Acc), bytes_token(Start, {tk, Row + 1, Col}, Rest, [C, $_ | Chars], [Digit | Digits]);
Token = {bytes, BytesString, Row, Start, End}, bytes_token({tk, _, Start}, {tk, Row, End}, String, Chars, Digits) ->
BytesString = lists:reverse(Chars),
Value = reverse_combine_nibbles(Digits, <<>>),
Token = {bytes, BytesString, Value, Row, Start, End},
{ok, {Token, {tk, Row, End}, String}}. {ok, {Token, {tk, Row, End}, String}}.
convert_digit(C) when C >= $0, C =< $9 ->
C - $0;
convert_digit(C) when C >= $A, C =< $Z ->
C - $A + 10;
convert_digit(C) when C >= $a, C =< $z ->
C - $a + 10.
reverse_combine_nibbles([D1, D2 | Rest], Acc) ->
NewAcc = <<D2:4, D1:4, Acc/binary>>,
reverse_combine_nibbles(Rest, NewAcc);
reverse_combine_nibbles([D1], Acc) ->
<<0:4, D1:4, Acc/binary>>;
reverse_combine_nibbles([], Acc) ->
Acc.
%%% Sophia Literal Parser %%% Sophia Literal Parser
@ -104,8 +124,7 @@ parse_expression(Type, Tk, String) ->
{ok, {Token, NewTk, NewString}} = next_token(Tk, String), {ok, {Token, NewTk, NewString}} = next_token(Tk, String),
parse_expression2(Type, NewTk, NewString, Token). parse_expression2(Type, NewTk, NewString, Token).
parse_expression2(Type, Tk, String, {integer, S, Row, Start, End}) -> parse_expression2(Type, Tk, String, {integer, _, Value, Row, Start, End}) ->
Value = convert_int(S),
case Type of case Type of
{_, _, integer} -> {_, _, integer} ->
{ok, {Value, Tk, String}}; {ok, {Value, Tk, String}};
@ -114,8 +133,7 @@ parse_expression2(Type, Tk, String, {integer, S, Row, Start, End}) ->
{O, N, _} -> {O, N, _} ->
{error, {wrong_type, O, N, integer, Row, Start, End}} {error, {wrong_type, O, N, integer, Row, Start, End}}
end; end;
parse_expression2(Type, Tk, String, {bytes, "#" ++ S, Row, Start, End}) -> parse_expression2(Type, Tk, String, {bytes, _, Value, Row, Start, End}) ->
Value = convert_bytes(S),
Len = byte_size(Value), Len = byte_size(Value),
Result = {bytes, Value}, Result = {bytes, Value},
case Type of case Type of
@ -130,15 +148,15 @@ parse_expression2(Type, Tk, String, {bytes, "#" ++ S, Row, Start, End}) ->
{O, N, _} -> {O, N, _} ->
{error, {wrong_type, O, N, integer, Row, Start, End}} {error, {wrong_type, O, N, integer, Row, Start, End}}
end; end;
parse_expression2(Type, Tk, String, {character, "[", Row, Start, _}) -> parse_expression2(Type, Tk, String, {character, "[", _, Row, Start, _}) ->
parse_list(Type, Tk, String, Row, Start); parse_list(Type, Tk, String, Row, Start);
parse_expression2(Type, Tk, String, {character, "(", Row, Start, _}) -> parse_expression2(Type, Tk, String, {character, "(", _, Row, Start, _}) ->
parse_tuple(Type, Tk, String, Row, Start); parse_tuple(Type, Tk, String, Row, Start);
parse_expression2(Type, Tk, String, {character, "{", Row, Start, _}) -> parse_expression2(Type, Tk, String, {character, "{", _, Row, Start, _}) ->
parse_record_or_map(Type, Tk, String, Row, Start); parse_record_or_map(Type, Tk, String, Row, Start);
parse_expression2(Type, Tk, String, {alphanum, Ident, Row, Start, End}) -> parse_expression2(Type, Tk, String, {alphanum, Ident, _, Row, Start, End}) ->
parse_variant(Type, Tk, String, Ident, Row, Start, End); parse_variant(Type, Tk, String, Ident, Row, Start, End);
parse_expression2(_, _, _, {_, S, Row, Start, End}) -> parse_expression2(_, _, _, {_, S, _, Row, Start, End}) ->
{error, {unexpected_token, S, Row, Start, End}}. {error, {unexpected_token, S, Row, Start, End}}.
unknown_type() -> unknown_type() ->
@ -148,54 +166,12 @@ expect_tokens([], Tk, String) ->
{ok, {Tk, String}}; {ok, {Tk, String}};
expect_tokens([Str | Rest], Tk, String) -> expect_tokens([Str | Rest], Tk, String) ->
case next_token(Tk, String) of case next_token(Tk, String) of
{ok, {{_, Str, _, _, _}, NewTk, NewString}} -> {ok, {{_, Str, _, _, _, _}, NewTk, NewString}} ->
expect_tokens(Rest, NewTk, NewString); expect_tokens(Rest, NewTk, NewString);
{ok, {{_, Actual, Row, Start, End}}} -> {ok, {{_, Actual, _, Row, Start, End}}} ->
{error, {unexpected_token, Actual, Row, Start, End}} {error, {unexpected_token, Actual, Row, Start, End}}
end. end.
convert_int(Chars) ->
convert_int(Chars, 0).
convert_int("_" ++ Chars, Result) ->
convert_int(Chars, Result);
convert_int([N | Chars], Result) ->
Digit = N - $0,
NewResult = Result * 10 + Digit,
convert_int(Chars, NewResult);
convert_int([], Result) ->
Result.
convert_bytes(Chars) ->
% We do this as two reversing foldl type loops. One removes underscores and
% converts the ASCII into integers, and the other peels off pairs of
% numbers to form bytes.
Digits = reverse_convert_digits(Chars, []),
reverse_combine_nibbles(Digits, <<>>).
reverse_convert_digits("_" ++ Rest, Acc) ->
reverse_convert_digits(Rest, Acc);
reverse_convert_digits([C | Rest], Acc) ->
Digit = convert_digit(C),
reverse_convert_digits(Rest, [Digit | Acc]);
reverse_convert_digits([], Acc) ->
Acc.
convert_digit(C) when C >= $0, C =< $9 ->
C - $0;
convert_digit(C) when C >= $A, C =< $Z ->
C - $A + 10;
convert_digit(C) when C >= $a, C =< $z ->
C - $a + 10.
reverse_combine_nibbles([D1, D2 | Rest], Acc) ->
NewAcc = <<D2:4, D1:4, Acc/binary>>,
reverse_combine_nibbles(Rest, NewAcc);
reverse_combine_nibbles([D1], Acc) ->
<<0:4, D1:4, Acc/binary>>;
reverse_combine_nibbles([], Acc) ->
Acc.
%%% List Parsing %%% List Parsing
parse_list({_, _, {list, [Inner]}}, Tk, String, Row, Start) -> parse_list({_, _, {list, [Inner]}}, Tk, String, Row, Start) ->
@ -207,7 +183,7 @@ parse_list({O, N, _}, _, _, Row, Start) ->
parse_list_loop(Inner, Tk, String, CloseChar, Row, Start, Acc) -> parse_list_loop(Inner, Tk, String, CloseChar, Row, Start, Acc) ->
case next_token(Tk, String) of case next_token(Tk, String) of
{ok, {{character, CloseChar, _, _, _}, NewTk, NewString}} -> {ok, {{character, CloseChar, _, _, _, _}, NewTk, NewString}} ->
{ok, {lists:reverse(Acc), NewTk, NewString}}; {ok, {lists:reverse(Acc), NewTk, NewString}};
{ok, {Token, NewTk, NewString}} -> {ok, {Token, NewTk, NewString}} ->
parse_list_loop2(Inner, NewTk, NewString, CloseChar, Row, Start, Acc, Token) parse_list_loop2(Inner, NewTk, NewString, CloseChar, Row, Start, Acc, Token)
@ -226,9 +202,9 @@ parse_list_loop2(Inner, Tk, String, CloseChar, Row, Start, Acc, Token) ->
parse_list_loop3(Inner, Tk, String, CloseChar, Row, Start, Acc) -> parse_list_loop3(Inner, Tk, String, CloseChar, Row, Start, Acc) ->
case next_token(Tk, String) of case next_token(Tk, String) of
{ok, {{character, CloseChar, _, _, _}, NewTk, NewString}} -> {ok, {{character, CloseChar, _, _, _, _}, NewTk, NewString}} ->
{ok, {lists:reverse(Acc), NewTk, NewString}}; {ok, {lists:reverse(Acc), NewTk, NewString}};
{ok, {{character, ",", _, _, _}, NewTk, NewString}} -> {ok, {{character, ",", _, _, _, _}, NewTk, NewString}} ->
parse_list_loop(Inner, NewTk, NewString, CloseChar, Row, Start, Acc); parse_list_loop(Inner, NewTk, NewString, CloseChar, Row, Start, Acc);
{error, Reason} -> {error, Reason} ->
{error, Reason} {error, Reason}
@ -281,14 +257,14 @@ parse_multivalue2([Next | Rest], Tk, String, Row, Start, Acc, Token) ->
end; end;
parse_multivalue2([], Tk, String, _, _, Acc, {character, ")", _, _, _}) -> parse_multivalue2([], Tk, String, _, _, Acc, {character, ")", _, _, _}) ->
{ok, {lists:reverse(Acc), Tk, String}}; {ok, {lists:reverse(Acc), Tk, String}};
parse_multivalue2([], _, _, _, _, _, {_, S, Row, Start, End}) -> parse_multivalue2([], _, _, _, _, _, {_, S, _, Row, Start, End}) ->
{error, {unexpected_token, S, Row, Start, End}}. {error, {unexpected_token, S, Row, Start, End}}.
parse_multivalue3(ElemTypes, Tk, String, Row, Start, Acc) -> parse_multivalue3(ElemTypes, Tk, String, Row, Start, Acc) ->
case next_token(Tk, String) of case next_token(Tk, String) of
{ok, {{character, ")", Row2, Start2, _}, NewTk, NewString}} -> {ok, {{character, ")", _, Row2, Start2, _}, NewTk, NewString}} ->
check_multivalue_long_enough(ElemTypes, NewTk, NewString, Row2, Start2, Acc); check_multivalue_long_enough(ElemTypes, NewTk, NewString, Row2, Start2, Acc);
{ok, {{character, ",", _, _, _}, NewTk, NewString}} -> {ok, {{character, ",", _, _, _, _}, NewTk, NewString}} ->
parse_multivalue(ElemTypes, NewTk, NewString, Row, Start, Acc); parse_multivalue(ElemTypes, NewTk, NewString, Row, Start, Acc);
{error, Reason} -> {error, Reason} ->
{error, Reason} {error, Reason}
@ -331,9 +307,9 @@ parse_variant3(Arities, Tag, [], Tk, String) ->
{ok, {Result, Tk, String}}; {ok, {Result, Tk, String}};
parse_variant3(Arities, Tag, ElemTypes, Tk, String) -> parse_variant3(Arities, Tag, ElemTypes, Tk, String) ->
case next_token(Tk, String) of case next_token(Tk, String) of
{ok, {{character, "(", Row, Start, _}, NewTk, NewString}} -> {ok, {{character, "(", _, Row, Start, _}, NewTk, NewString}} ->
parse_variant4(Arities, Tag, ElemTypes, NewTk, NewString, Row, Start); parse_variant4(Arities, Tag, ElemTypes, NewTk, NewString, Row, Start);
{ok, {{_, Actual, Row, Start, End}}} -> {ok, {{_, Actual, _, Row, Start, End}}} ->
{error, {unexpected_token, Actual, Row, Start, End}} {error, {unexpected_token, Actual, Row, Start, End}}
end. end.
@ -361,13 +337,13 @@ parse_record_or_map({_, _, {record, Fields}}, Tk, String, _, _) ->
parse_record(Fields, Tk, String, #{}); parse_record(Fields, Tk, String, #{});
parse_record_or_map({_, _, unknown_type}, Tk, String, _, _) -> parse_record_or_map({_, _, unknown_type}, Tk, String, _, _) ->
case next_token(Tk, String) of case next_token(Tk, String) of
{ok, {{character, "}", _, _, _}, NewTk, NewString}} -> {ok, {{character, "}", _, _, _, _}, NewTk, NewString}} ->
{ok, {#{}, NewTk, NewString}}; {ok, {#{}, NewTk, NewString}};
{ok, {{character, "[", _, _, _}, NewTk, NewString}} -> {ok, {{character, "[", _, _, _, _}, NewTk, NewString}} ->
parse_map2(unknown_type(), unknown_type(), NewTk, NewString, #{}); parse_map2(unknown_type(), unknown_type(), NewTk, NewString, #{});
{ok, {{alphanum, _, Row, Start, End}, _, _}} -> {ok, {{alphanum, _, _, Row, Start, End}, _, _}} ->
{error, {unresolved_record, Row, Start, End}}; {error, {unresolved_record, Row, Start, End}};
{ok, {{_, S, Row, Start, End}, _, _}} -> {ok, {{_, S, _, Row, Start, End}, _, _}} ->
{error, {unexpected_token, S, Row, Start, End}} {error, {unexpected_token, S, Row, Start, End}}
end; end;
parse_record_or_map({O, N, _}, _, _, Row, Start) -> parse_record_or_map({O, N, _}, _, _, Row, Start) ->
@ -375,11 +351,11 @@ parse_record_or_map({O, N, _}, _, _, Row, Start) ->
parse_record(Fields, Tk, String, Acc) -> parse_record(Fields, Tk, String, Acc) ->
case next_token(Tk, String) of case next_token(Tk, String) of
{ok, {{alphanum, Ident, Row, Start, End}, NewTk, NewString}} -> {ok, {{alphanum, Ident, _, Row, Start, End}, NewTk, NewString}} ->
parse_record2(Fields, NewTk, NewString, Acc, Ident, Row, Start, End); parse_record2(Fields, NewTk, NewString, Acc, Ident, Row, Start, End);
{ok, {{character, "}", Row, Start, End}, NewTk, NewString}} -> {ok, {{character, "}", _, Row, Start, End}, NewTk, NewString}} ->
parse_record_end(Fields, NewTk, NewString, Acc, Row, Start, End); parse_record_end(Fields, NewTk, NewString, Acc, Row, Start, End);
{ok, {{_, S, Row, Start, End}, _, _}} -> {ok, {{_, S, _, Row, Start, End}, _, _}} ->
{error, {unexpected_token, S, Row, Start, End}}; {error, {unexpected_token, S, Row, Start, End}};
{error, Reason} -> {error, Reason} ->
{error, Reason} {error, Reason}
@ -420,11 +396,11 @@ parse_record5(Fields, Tk, String, Acc, Ident, Type) ->
parse_record6(Fields, Tk, String, Acc) -> parse_record6(Fields, Tk, String, Acc) ->
case next_token(Tk, String) of case next_token(Tk, String) of
{ok, {{character, ",", _, _, _}, NewTk, NewString}} -> {ok, {{character, ",", _, _, _, _}, NewTk, NewString}} ->
parse_record(Fields, NewTk, NewString, Acc); parse_record(Fields, NewTk, NewString, Acc);
{ok, {{character, "}", Row, Start, End}, NewTk, NewString}} -> {ok, {{character, "}", _, Row, Start, End}, NewTk, NewString}} ->
parse_record_end(Fields, NewTk, NewString, Acc, Row, Start, End); parse_record_end(Fields, NewTk, NewString, Acc, Row, Start, End);
{ok, {{_, S, Row, Start, End}, _, _}} -> {ok, {{_, S, _, Row, Start, End}, _, _}} ->
{error, {unexpected_token, S, Row, Start, End}}; {error, {unexpected_token, S, Row, Start, End}};
{error, Reason} -> {error, Reason} ->
{error, Reason} {error, Reason}
@ -455,11 +431,11 @@ parse_record_final_loop([], _, FieldsReverse) ->
parse_map(KeyType, ValueType, Tk, String, Acc) -> parse_map(KeyType, ValueType, Tk, String, Acc) ->
case next_token(Tk, String) of case next_token(Tk, String) of
{ok, {{character, "[", _, _, _}, NewTk, NewString}} -> {ok, {{character, "[", _, _, _, _}, NewTk, NewString}} ->
parse_map2(KeyType, ValueType, NewTk, NewString, Acc); parse_map2(KeyType, ValueType, NewTk, NewString, Acc);
{ok, {{character, "}", _, _, _}, NewTk, NewString}} -> {ok, {{character, "}", _, _, _, _}, NewTk, NewString}} ->
{ok, {Acc, NewTk, NewString}}; {ok, {Acc, NewTk, NewString}};
{ok, {{_, S, Row, Start, End}}} -> {ok, {{_, S, _, Row, Start, End}}} ->
{error, {unexpected_token, S, Row, Start, End}} {error, {unexpected_token, S, Row, Start, End}}
end. end.
@ -490,11 +466,11 @@ parse_map4(KeyType, ValueType, Tk, String, Acc, Key) ->
parse_map5(KeyType, ValueType, Tk, String, Acc) -> parse_map5(KeyType, ValueType, Tk, String, Acc) ->
case next_token(Tk, String) of case next_token(Tk, String) of
{ok, {{character, ",", _, _, _}, NewTk, NewString}} -> {ok, {{character, ",", _, _, _, _}, NewTk, NewString}} ->
parse_map(KeyType, ValueType, NewTk, NewString, Acc); parse_map(KeyType, ValueType, NewTk, NewString, Acc);
{ok, {{character, "}", _, _, _}, NewTk, NewString}} -> {ok, {{character, "}", _, _, _, _}, NewTk, NewString}} ->
{ok, {Acc, NewTk, NewString}}; {ok, {Acc, NewTk, NewString}};
{ok, {{_, S, Row, Start, End}}} -> {ok, {{_, S, _, Row, Start, End}}} ->
{error, {unexpected_token, S, Row, Start, End}} {error, {unexpected_token, S, Row, Start, End}}
end. end.