From e98edd4eefc0d2a6f196fb4ce9823837d3691390 Mon Sep 17 00:00:00 2001
From: Hans Svensson <hans.svensson@quviq.com>
Date: Mon, 24 Feb 2020 15:35:54 +0100
Subject: [PATCH] Handle UTF-8 in character literals

Also handle `\x{hhh..}` in strings. Character literals have to be a single character, not composite.

+ tests (and the corresponding fix to the char literal pretty printer)
---
 src/aeso_pretty.erl        |  9 ++++--
 src/aeso_scan.erl          | 58 +++++++++++++++++++++-----------------
 test/aeso_parser_tests.erl |  5 ++--
 test/contracts/utf8.aes    | 21 ++++++++++++++
 4 files changed, 63 insertions(+), 30 deletions(-)
 create mode 100644 test/contracts/utf8.aes

diff --git a/src/aeso_pretty.erl b/src/aeso_pretty.erl
index bf00107..919c9ef 100644
--- a/src/aeso_pretty.erl
+++ b/src/aeso_pretty.erl
@@ -369,8 +369,13 @@ expr_p(_, {char, _, C}) ->
     case C of
         $' -> text("'\\''");
         $" -> text("'\"'");
-        _  -> S = lists:flatten(io_lib:format("~p", [[C]])),
-              text("'" ++ tl(lists:droplast(S)) ++ "'")
+        _ when C < 16#80 ->
+            S = lists:flatten(io_lib:format("~p", [[C]])),
+            text("'" ++ tl(lists:droplast(S)) ++ "'");
+        _  ->
+            S = lists:flatten(
+                  io_lib:format("'~ts'", [list_to_binary(aeso_scan:utf8_encode([C]))])),
+            text(S)
     end;
 %% -- Names
 expr_p(_, E = {id, _, _})   -> name(E);
diff --git a/src/aeso_scan.erl b/src/aeso_scan.erl
index 1c30016..e81757f 100644
--- a/src/aeso_scan.erl
+++ b/src/aeso_scan.erl
@@ -7,7 +7,7 @@
 %%%-------------------------------------------------------------------
 -module(aeso_scan).
 
--export([scan/1]).
+-export([scan/1, utf8_encode/1]).
 
 -import(aeso_scan_lib, [token/1, token/2, symbol/0, skip/0,
                         override/2, push/2, pop/1]).
@@ -28,7 +28,13 @@ lexer() ->
     QID      = ["(", CON, "\\.)+", ID],
     QCON     = ["(", CON, "\\.)+", CON],
     OP       = "[=!<>+\\-*/:&|?~@^]+",
-    CHAR     = "'([^'\\\\]|(\\\\.))'",
+    %% Five cases for a character
+    %%  * 1 7-bit ascii, not \ or '
+    %%  * 2-4 8-bit values (UTF8)
+    %%  * \ followed by a known escape character: one of b, e, f, n, r, t, v, ' or \
+    %%  * \xhh
+    %%  * \x{hhh...}
+    CHAR     = "'(([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\}))'",
     STRING   = "\"([^\"\\\\]|(\\\\.))*\"",
 
     CommentStart = {"/\\*", push(comment, skip())},
@@ -77,34 +83,34 @@ scan(String) ->
 %% -- Helpers ----------------------------------------------------------------
 
 parse_string([$" | Chars]) ->
-    unescape(Chars).
+    unicode:characters_to_nfc_binary(unescape(Chars)).
 
-parse_char([$', $\\, Code, $']) ->
-    case Code of
-        $'  -> $';
-        $\\ -> $\\;
-        $b  -> $\b;
-        $e  -> $\e;
-        $f  -> $\f;
-        $n  -> $\n;
-        $r  -> $\r;
-        $t  -> $\t;
-        $v  -> $\v;
-        _   -> {error, "Bad control sequence: \\" ++ [Code]}
-    end;
-parse_char([$', C, $']) -> C.
+parse_char([$' | Chars]) ->
+    case unicode:characters_to_nfc_list(unescape($', Chars, [])) of
+        [Char] -> Char;
+        _Bad   -> {error, "Bad character literal: '" ++ Chars}
+    end.
 
-unescape(Str) -> unescape(Str, []).
+utf8_encode(Cs) ->
+    binary_to_list(unicode:characters_to_binary(Cs)).
 
-unescape([$"], Acc) ->
+unescape(Str) -> unescape($", Str, []).
+
+unescape(Delim, [Delim], Acc) ->
     list_to_binary(lists:reverse(Acc));
-unescape([$\\, $x, D1, D2 | Chars ], Acc) ->
+unescape(Delim, [$\\, $x, ${ | Chars ], Acc) ->
+    {Ds, [_ | Cs]} = lists:splitwith(fun($}) -> false ; (_) -> true end, Chars),
+    C = list_to_integer(Ds, 16),
+    Utf8Cs = binary_to_list(unicode:characters_to_binary([C])),
+    unescape(Delim, Cs, [Utf8Cs | Acc]);
+unescape(Delim, [$\\, $x, D1, D2 | Chars ], Acc) ->
     C = list_to_integer([D1, D2], 16),
-    unescape(Chars, [C | Acc]);
-unescape([$\\, Code | Chars], Acc) ->
-    Ok = fun(C) -> unescape(Chars, [C | Acc]) end,
+    Utf8Cs = binary_to_list(unicode:characters_to_binary([C])),
+    unescape(Delim, Chars, [Utf8Cs | Acc]);
+unescape(Delim, [$\\, Code | Chars], Acc) ->
+    Ok = fun(C) -> unescape(Delim, Chars, [C | Acc]) end,
     case Code of
-        $"  -> Ok($");
+        Delim -> Ok(Delim);
         $\\ -> Ok($\\);
         $b  -> Ok($\b);
         $e  -> Ok($\e);
@@ -115,8 +121,8 @@ unescape([$\\, Code | Chars], Acc) ->
         $v  -> Ok($\v);
         _   -> error("Bad control sequence: \\" ++ [Code])  %% TODO
     end;
-unescape([C | Chars], Acc) ->
-    unescape(Chars, [C | Acc]).
+unescape(Delim, [C | Chars], Acc) ->
+    unescape(Delim, Chars, [C | Acc]).
 
 strip_underscores(S) ->
     lists:filter(fun(C) -> C /= $_ end, S).
diff --git a/test/aeso_parser_tests.erl b/test/aeso_parser_tests.erl
index 21d7ff7..c978b0a 100644
--- a/test/aeso_parser_tests.erl
+++ b/test/aeso_parser_tests.erl
@@ -63,7 +63,8 @@ simple_contracts_test_() ->
      %% Parse tests of example contracts
      [ {lists:concat(["Parse the ", Contract, " contract."]),
         fun() -> roundtrip_contract(Contract) end}
-        || Contract <- [counter, voting, all_syntax, '05_greeter', aeproof, multi_sig, simple_storage, fundme, dutch_auction] ]
+        || Contract <- [counter, voting, all_syntax, '05_greeter', aeproof,
+                        multi_sig, simple_storage, fundme, dutch_auction, utf8] ]
     }.
 
 parse_contract(Name) ->
@@ -85,7 +86,7 @@ parse_expr(Text) ->
 round_trip(Text) ->
     Contract  = parse_string(Text),
     Text1     = prettypr:format(aeso_pretty:decls(strip_stdlib(Contract))),
-    Contract1 = parse_string(Text1),
+    Contract1 = parse_string(aeso_scan:utf8_encode(Text1)),
     NoSrcLoc  = remove_line_numbers(Contract),
     NoSrcLoc1 = remove_line_numbers(Contract1),
     ?assertMatch(NoSrcLoc, diff(NoSrcLoc, NoSrcLoc1)).
diff --git a/test/contracts/utf8.aes b/test/contracts/utf8.aes
new file mode 100644
index 0000000..3e82b6b
--- /dev/null
+++ b/test/contracts/utf8.aes
@@ -0,0 +1,21 @@
+contract UTF8 =
+  entrypoint f1() : char = '1'
+  entrypoint f2() : char = '+'
+  entrypoint f3() : char = 'd'
+  entrypoint f4() : char = 'X'
+  entrypoint f5() : char = 'å'
+  entrypoint f6() : char = 'Ä'
+  entrypoint f7() : char = 'æ'
+  entrypoint f8() : char = 'ë'
+  entrypoint f9() : char = 'ẻ'
+  entrypoint f10() : char = '\x27'
+  entrypoint f11() : char = '\x{2200}'
+  entrypoint f12() : char = '💩'
+  entrypoint f13() : char = '\n'
+
+
+
+  // entrypoint f13() : char = 'e̊'
+  // entrypoint f14() : char = '\Ì'
+
+  // '💩' vs. map('a,'b)