Parse long hex escape codes

The compiler itself does not handle these escapes consistently for codepoints above 127, but they should work fine for our purposes.
This commit is contained in:
Jarvis Carroll 2026-02-03 00:41:00 +00:00
parent 272ed01fdc
commit 17f635af61

View File

@ -103,12 +103,13 @@ reverse_combine_nibbles([D1], Acc) ->
reverse_combine_nibbles([], Acc) -> reverse_combine_nibbles([], Acc) ->
Acc. Acc.
string_token(Start, {tk, Row, Col}, [$\\, $x, A, B | Rest], SourceChars, Value) -> string_token(Start, {tk, Row, Col}, "\\x" ++ String, SourceChars, Value) ->
case escape_hex_code(A, B) of case escape_hex_code({tk, Row, Col}, {tk, Row + 2, Col}, String, "x\\" ++ SourceChars) of
{ok, ByteVal} -> {ok, {Codepoint, NewSourceChars, NewTk, NewString}} ->
string_token(Start, {tk, Row + 4, Col}, Rest, [B, A, $x, $\ | SourceChars], <<Value/binary, ByteVal>>); NewValue = <<Value/binary, Codepoint/utf8>>,
error -> string_token(Start, NewTk, NewString, NewSourceChars, NewValue);
{error, {invalid_escape_code, [$\\, $x, A, B], Row, Col}} {error, Reason} ->
{error, Reason}
end; end;
string_token(Start, {tk, Row, Col}, [$\\, C | Rest], SourceChars, Value) -> string_token(Start, {tk, Row, Col}, [$\\, C | Rest], SourceChars, Value) ->
case escape_char(C) of case escape_char(C) of
@ -122,16 +123,34 @@ string_token({tk, _, Start}, {tk, Row, End}, [$" | Rest], SourceChars, Value) ->
Token = {string, SourceStr, Value, Row, Start, End}, Token = {string, SourceStr, Value, Row, Start, End},
{ok, {Token, {tk, Row, End}, Rest}}; {ok, {Token, {tk, Row, End}, Rest}};
string_token(Start, {tk, Row, Col}, [C | Rest], SourceChars, Value) -> string_token(Start, {tk, Row, Col}, [C | Rest], SourceChars, Value) ->
string_token(Start, {tk, Row + 1, Col}, Rest, [C | SourceChars], <<Value/binary, C>>). % TODO: ERTS probably had to convert this FROM utf8 at some point, so why
% bother, if we need to convert it back? I guess we could accept iolists if
% we really wanted to waste time on this point...
string_token(Start, {tk, Row + 1, Col}, Rest, [C | SourceChars], <<Value/binary, C/utf8>>).
escape_hex_code(A, B) when ?IS_HEX(A), ?IS_HEX(B) -> escape_hex_code(Start, {tk, Row, Col}, "{" ++ String, SourceChars) ->
escape_long_hex_code(Start, {tk, Row + 1, Col}, String, "{" ++ SourceChars, 0);
escape_hex_code(_, {tk, Row, Col}, [A, B | String], SourceChars) when ?IS_HEX(A), ?IS_HEX(B) ->
% As of writing this, the Sophia compiler will convert this byte from % As of writing this, the Sophia compiler will convert this byte from
% extended ASCII to unicode... But it really shouldn't. The literal parser % extended ASCII to unicode... But it really shouldn't. The literal parser
% does what the compiler should do. % does what the compiler should do.
Byte = convert_digit(A) * 16 + convert_digit(B), Byte = convert_digit(A) * 16 + convert_digit(B),
{ok, Byte}; {ok, {Byte, [B, A | SourceChars], {tk, Row + 2, Col}, String}};
escape_hex_code(_, _) -> escape_hex_code({tk, Row1, Col1}, _, _, _) ->
error. {error, {invalid_escape_code, "\\x", Row1, Col1}}.
%% Parse the body of a long hex escape, i.e. everything after "\x{".
%%
%% Arguments:
%%   Start       - position at which the escape began; used only to locate
%%                 errors about the escape as a whole.
%%   {tk, _, _}  - current position, advanced by one per consumed character.
%%   String      - remaining input.
%%   SourceChars - reversed source text consumed so far.
%%   Value       - codepoint accumulated from the hex digits read so far.
%%
%% Returns {ok, {Codepoint, NewSourceChars, NewTk, RestOfInput}} or
%% {error, Reason}.
%%
%% The caller encodes Codepoint with the /utf8 binary segment type, which
%% raises badarg for surrogates (16#D800..16#DFFF) and for values above
%% 16#10FFFF. Such values are rejected here so that malformed input yields an
%% error tuple instead of crashing the parser.
escape_long_hex_code({tk, StartRow, StartCol}, _, "}" ++ _, _, Value)
  when Value >= 16#D800, Value =< 16#DFFF ->
    % Surrogates can only be detected once the escape is complete, since
    % further digits could still move the value out of the surrogate range.
    {error, {invalid_escape_code, "\\x{", StartRow, StartCol}};
escape_long_hex_code(_, {tk, Row, Col}, "}" ++ String, SourceChars, Value) ->
    {ok, {Value, "}" ++ SourceChars, {tk, Row + 1, Col}, String}};
escape_long_hex_code({tk, StartRow, StartCol} = Start, {tk, Row, Col}, [C | String], SourceChars, Value) when ?IS_HEX(C) ->
    case 16 * Value + convert_digit(C) of
        NewValue when NewValue > 16#10FFFF ->
            % Past the last Unicode codepoint; more digits can only make it
            % larger, so fail now rather than growing the integer unboundedly.
            {error, {invalid_escape_code, "\\x{", StartRow, StartCol}};
        NewValue ->
            escape_long_hex_code(Start, {tk, Row + 1, Col}, String, [C | SourceChars], NewValue)
    end;
escape_long_hex_code(_, {tk, Row, Col}, [C | _], _, _) ->
    {error, {invalid_hexadecimal, [C], Row, Col}};
escape_long_hex_code({tk, StartRow, StartCol}, _, [], _, Value)
  when Value >= 16#D800, Value =< 16#DFFF ->
    % Input ended inside the escape with a value that cannot be encoded;
    % report it rather than handing the caller an unencodable codepoint.
    {error, {invalid_escape_code, "\\x{", StartRow, StartCol}};
escape_long_hex_code(_, Tk, [], SourceChars, Value) ->
    % Just return as if the escape code were closed, and let the string parser
    % produce an unclosed string error instead.
    {ok, {Value, SourceChars, Tk, []}}.
escape_char($b) -> {ok, $\b}; escape_char($b) -> {ok, $\b};
escape_char($e) -> {ok, $\e}; escape_char($e) -> {ok, $\e};
@ -747,7 +766,7 @@ anon_types_test() ->
string_escape_codes_test() -> string_escape_codes_test() ->
check_parser("\" \\b\\e\\f\\n\\r\\t\\v\\\"\\\\ \""), check_parser("\" \\b\\e\\f\\n\\r\\t\\v\\\"\\\\ \""),
check_parser("\"\\x00\\x11\\x77\\x4a\\x4A\""), check_parser("\"\\x00\\x11\\x77\\x4a\\x4A\""),
check_parser("\"\\x{7F}\\x{07F}\\x{007F}\\x{0007F}\""), check_parser("\"\\x{0}\\x{7}\\x{7F}\\x{07F}\\x{007F}\\x{0007F}\\x{0000007F}\""),
ok. ok.
records_test() -> records_test() ->