From 17f635af618463c68e219fee73e74821b6903c7b Mon Sep 17 00:00:00 2001 From: Jarvis Carroll Date: Tue, 3 Feb 2026 00:41:00 +0000 Subject: [PATCH] Parse long hex escape codes This doesn't work super consistently in the compiler, for codepoints above 127, but it should work fine for us, so, oh well! --- src/hz_sophia.erl | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/src/hz_sophia.erl b/src/hz_sophia.erl index 5017a92..32bbde4 100644 --- a/src/hz_sophia.erl +++ b/src/hz_sophia.erl @@ -103,12 +103,13 @@ reverse_combine_nibbles([D1], Acc) -> reverse_combine_nibbles([], Acc) -> Acc. -string_token(Start, {tk, Row, Col}, [$\\, $x, A, B | Rest], SourceChars, Value) -> - case escape_hex_code(A, B) of - {ok, ByteVal} -> - string_token(Start, {tk, Row + 4, Col}, Rest, [B, A, $x, $\ | SourceChars], <>); - error -> - {error, {invalid_escape_code, [$\\, $x, A, B], Row, Col}} +string_token(Start, {tk, Row, Col}, "\\x" ++ String, SourceChars, Value) -> + case escape_hex_code({tk, Row, Col}, {tk, Row + 2, Col}, String, "x\\" ++ SourceChars) of + {ok, {Codepoint, NewSourceChars, NewTk, NewString}} -> + NewValue = <>, + string_token(Start, NewTk, NewString, NewSourceChars, NewValue); + {error, Reason} -> + {error, Reason} end; string_token(Start, {tk, Row, Col}, [$\\, C | Rest], SourceChars, Value) -> case escape_char(C) of @@ -122,16 +123,34 @@ string_token({tk, _, Start}, {tk, Row, End}, [$" | Rest], SourceChars, Value) -> Token = {string, SourceStr, Value, Row, Start, End}, {ok, {Token, {tk, Row, End}, Rest}}; string_token(Start, {tk, Row, Col}, [C | Rest], SourceChars, Value) -> - string_token(Start, {tk, Row + 1, Col}, Rest, [C | SourceChars], <>). + % TODO: ERTS probably had to convert this FROM utf8 at some point, so why + % bother, if we need to convert it back? I guess we could accept iolists if + % we really wanted to waste time on this point... + string_token(Start, {tk, Row + 1, Col}, Rest, [C | SourceChars], <>). -escape_hex_code(A, B) when ?IS_HEX(A), ?IS_HEX(B) -> +escape_hex_code(Start, {tk, Row, Col}, "{" ++ String, SourceChars) -> + escape_long_hex_code(Start, {tk, Row + 1, Col}, String, "{" ++ SourceChars, 0); +escape_hex_code(_, {tk, Row, Col}, [A, B | String], SourceChars) when ?IS_HEX(A), ?IS_HEX(B) -> % As of writing this, the Sophia compiler will convert this byte from % extended ASCII to unicode... But it really shouldn't. The literal parser % does what the compiler should do. Byte = convert_digit(A) * 16 + convert_digit(B), - {ok, Byte}; -escape_hex_code(_, _) -> - error. + {ok, {Byte, [B, A | SourceChars], {tk, Row + 2, Col}, String}}; +escape_hex_code({tk, Row1, Col1}, _, _, _) -> + {error, {invalid_escape_code, "\\x", Row1, Col1}}. + +escape_long_hex_code(_, {tk, Row, Col}, "}" ++ String, SourceChars, Value) -> + {ok, {Value, "}" ++ SourceChars, {tk, Row + 1, Col}, String}}; +escape_long_hex_code(Start, {tk, Row, Col}, [C | String], SourceChars, Value) when ?IS_HEX(C) -> + NewSourceChars = [C | SourceChars], + NewValue = 16 * Value + convert_digit(C), + escape_long_hex_code(Start, {tk, Row + 1, Col}, String, NewSourceChars, NewValue); +escape_long_hex_code(_, {tk, Row, Col}, [C | _], _, _) -> + {error, {invalid_hexadecimal, [C], Row, Col}}; +escape_long_hex_code(_, Tk, [], SourceChars, Value) -> + % Just return as if the escape code were closed, and let the string parser + % produce an unclosed string error instead. + {ok, {Value, SourceChars, Tk, []}}. escape_char($b) -> {ok, $\b}; escape_char($e) -> {ok, $\e}; @@ -747,7 +766,7 @@ anon_types_test() -> string_escape_codes_test() -> check_parser("\" \\b\\e\\f\\n\\r\\t\\v\\\"\\\\ \""), check_parser("\"\\x00\\x11\\x77\\x4a\\x4A\""), - check_parser("\"\\x{7F}\\x{07F}\\x{007F}\\x{0007F}\""), + check_parser("\"\\x{0}\\x{7}\\x{7F}\\x{07F}\\x{007F}\\x{0007F}\\x{0000007F}\""), ok. records_test() ->