From e98edd4eefc0d2a6f196fb4ce9823837d3691390 Mon Sep 17 00:00:00 2001 From: Hans Svensson Date: Mon, 24 Feb 2020 15:35:54 +0100 Subject: [PATCH] Handle UTF-8 in character literals Also handle `\x{hhh..}` in strings... Character literals have to be a single character, not composite. + tests (and the corresponding fix to the char literal pretty printer) --- src/aeso_pretty.erl | 9 ++++-- src/aeso_scan.erl | 58 +++++++++++++++++++++----------------- test/aeso_parser_tests.erl | 5 ++-- test/contracts/utf8.aes | 21 ++++++++++++++ 4 files changed, 63 insertions(+), 30 deletions(-) create mode 100644 test/contracts/utf8.aes diff --git a/src/aeso_pretty.erl b/src/aeso_pretty.erl index bf00107..919c9ef 100644 --- a/src/aeso_pretty.erl +++ b/src/aeso_pretty.erl @@ -369,8 +369,13 @@ expr_p(_, {char, _, C}) -> case C of $' -> text("'\\''"); $" -> text("'\"'"); - _ -> S = lists:flatten(io_lib:format("~p", [[C]])), - text("'" ++ tl(lists:droplast(S)) ++ "'") + _ when C < 16#80 -> + S = lists:flatten(io_lib:format("~p", [[C]])), + text("'" ++ tl(lists:droplast(S)) ++ "'"); + _ -> + S = lists:flatten( + io_lib:format("'~ts'", [list_to_binary(aeso_scan:utf8_encode([C]))])), + text(S) end; %% -- Names expr_p(_, E = {id, _, _}) -> name(E); diff --git a/src/aeso_scan.erl b/src/aeso_scan.erl index 1c30016..e81757f 100644 --- a/src/aeso_scan.erl +++ b/src/aeso_scan.erl @@ -7,7 +7,7 @@ %%%------------------------------------------------------------------- -module(aeso_scan). --export([scan/1]). +-export([scan/1, utf8_encode/1]). -import(aeso_scan_lib, [token/1, token/2, symbol/0, skip/0, override/2, push/2, pop/1]). 
@@ -28,7 +28,13 @@ lexer() -> QID = ["(", CON, "\\.)+", ID], QCON = ["(", CON, "\\.)+", CON], OP = "[=!<>+\\-*/:&|?~@^]+", - CHAR = "'([^'\\\\]|(\\\\.))'", + %% Five cases for a character + %% * 1 7-bit ascii, not \ or ' + %% * 2-4 8-bit values (UTF8) + %% * \ followed by a known modifier [befnrtv'\] + %% * \xhh + %% * \x{hhh...} + CHAR = "'(([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\}))'", STRING = "\"([^\"\\\\]|(\\\\.))*\"", CommentStart = {"/\\*", push(comment, skip())}, @@ -77,34 +83,34 @@ scan(String) -> %% -- Helpers ---------------------------------------------------------------- parse_string([$" | Chars]) -> - unescape(Chars). + unicode:characters_to_nfc_binary(unescape(Chars)). -parse_char([$', $\\, Code, $']) -> - case Code of - $' -> $'; - $\\ -> $\\; - $b -> $\b; - $e -> $\e; - $f -> $\f; - $n -> $\n; - $r -> $\r; - $t -> $\t; - $v -> $\v; - _ -> {error, "Bad control sequence: \\" ++ [Code]} - end; -parse_char([$', C, $']) -> C. +parse_char([$' | Chars]) -> + case unicode:characters_to_nfc_list(unescape($', Chars, [])) of + [Char] -> Char; + _Bad -> {error, "Bad character literal: '" ++ Chars} + end. -unescape(Str) -> unescape(Str, []). +utf8_encode(Cs) -> + binary_to_list(unicode:characters_to_binary(Cs)). -unescape([$"], Acc) -> +unescape(Str) -> unescape($", Str, []). 
+ +unescape(Delim, [Delim], Acc) -> list_to_binary(lists:reverse(Acc)); -unescape([$\\, $x, D1, D2 | Chars ], Acc) -> +unescape(Delim, [$\\, $x, ${ | Chars ], Acc) -> + {Ds, [_ | Cs]} = lists:splitwith(fun($}) -> false ; (_) -> true end, Chars), + C = list_to_integer(Ds, 16), + Utf8Cs = binary_to_list(unicode:characters_to_binary([C])), + unescape(Delim, Cs, [Utf8Cs | Acc]); +unescape(Delim, [$\\, $x, D1, D2 | Chars ], Acc) -> C = list_to_integer([D1, D2], 16), - unescape(Chars, [C | Acc]); -unescape([$\\, Code | Chars], Acc) -> - Ok = fun(C) -> unescape(Chars, [C | Acc]) end, + Utf8Cs = binary_to_list(unicode:characters_to_binary([C])), + unescape(Delim, Chars, [Utf8Cs | Acc]); +unescape(Delim, [$\\, Code | Chars], Acc) -> + Ok = fun(C) -> unescape(Delim, Chars, [C | Acc]) end, case Code of - $" -> Ok($"); + Delim -> Ok(Delim); $\\ -> Ok($\\); $b -> Ok($\b); $e -> Ok($\e); @@ -115,8 +121,8 @@ unescape([$\\, Code | Chars], Acc) -> $v -> Ok($\v); _ -> error("Bad control sequence: \\" ++ [Code]) %% TODO end; -unescape([C | Chars], Acc) -> - unescape(Chars, [C | Acc]). +unescape(Delim, [C | Chars], Acc) -> + unescape(Delim, Chars, [C | Acc]). strip_underscores(S) -> lists:filter(fun(C) -> C /= $_ end, S). diff --git a/test/aeso_parser_tests.erl b/test/aeso_parser_tests.erl index 21d7ff7..c978b0a 100644 --- a/test/aeso_parser_tests.erl +++ b/test/aeso_parser_tests.erl @@ -63,7 +63,8 @@ simple_contracts_test_() -> %% Parse tests of example contracts [ {lists:concat(["Parse the ", Contract, " contract."]), fun() -> roundtrip_contract(Contract) end} - || Contract <- [counter, voting, all_syntax, '05_greeter', aeproof, multi_sig, simple_storage, fundme, dutch_auction] ] + || Contract <- [counter, voting, all_syntax, '05_greeter', aeproof, + multi_sig, simple_storage, fundme, dutch_auction, utf8] ] }. 
parse_contract(Name) -> @@ -85,7 +86,7 @@ parse_expr(Text) -> round_trip(Text) -> Contract = parse_string(Text), Text1 = prettypr:format(aeso_pretty:decls(strip_stdlib(Contract))), - Contract1 = parse_string(Text1), + Contract1 = parse_string(aeso_scan:utf8_encode(Text1)), NoSrcLoc = remove_line_numbers(Contract), NoSrcLoc1 = remove_line_numbers(Contract1), ?assertMatch(NoSrcLoc, diff(NoSrcLoc, NoSrcLoc1)). diff --git a/test/contracts/utf8.aes b/test/contracts/utf8.aes new file mode 100644 index 0000000..3e82b6b --- /dev/null +++ b/test/contracts/utf8.aes @@ -0,0 +1,21 @@ +contract UTF8 = + entrypoint f1() : char = '1' + entrypoint f2() : char = '+' + entrypoint f3() : char = 'd' + entrypoint f4() : char = 'X' + entrypoint f5() : char = 'å' + entrypoint f6() : char = 'Ä' + entrypoint f7() : char = 'æ' + entrypoint f8() : char = 'ë' + entrypoint f9() : char = 'ẻ' + entrypoint f10() : char = '\x27' + entrypoint f11() : char = '\x{2200}' + entrypoint f12() : char = '💩' + entrypoint f13() : char = '\n' + + + + // entrypoint f13() : char = 'e̊' + // entrypoint f14() : char = '\Ì' + + // '💩' vs. map('a,'b)