Handle UTF-8 in character literals
Also handle `\x{hhh..}` in strings. A character literal has to be a single character, not a composite. Adds tests (and the corresponding fix to the char literal pretty printer).
This commit is contained in:
parent
2bad76314f
commit
e98edd4eef
@ -369,8 +369,13 @@ expr_p(_, {char, _, C}) ->
|
||||
case C of
|
||||
$' -> text("'\\''");
|
||||
$" -> text("'\"'");
|
||||
_ -> S = lists:flatten(io_lib:format("~p", [[C]])),
|
||||
text("'" ++ tl(lists:droplast(S)) ++ "'")
|
||||
_ when C < 16#80 ->
|
||||
S = lists:flatten(io_lib:format("~p", [[C]])),
|
||||
text("'" ++ tl(lists:droplast(S)) ++ "'");
|
||||
_ ->
|
||||
S = lists:flatten(
|
||||
io_lib:format("'~ts'", [list_to_binary(aeso_scan:utf8_encode([C]))])),
|
||||
text(S)
|
||||
end;
|
||||
%% -- Names
|
||||
expr_p(_, E = {id, _, _}) -> name(E);
|
||||
|
@ -7,7 +7,7 @@
|
||||
%%%-------------------------------------------------------------------
|
||||
-module(aeso_scan).
|
||||
|
||||
-export([scan/1]).
|
||||
-export([scan/1, utf8_encode/1]).
|
||||
|
||||
-import(aeso_scan_lib, [token/1, token/2, symbol/0, skip/0,
|
||||
override/2, push/2, pop/1]).
|
||||
@ -28,7 +28,13 @@ lexer() ->
|
||||
QID = ["(", CON, "\\.)+", ID],
|
||||
QCON = ["(", CON, "\\.)+", CON],
|
||||
OP = "[=!<>+\\-*/:&|?~@^]+",
|
||||
CHAR = "'([^'\\\\]|(\\\\.))'",
|
||||
%% Five cases for a character
|
||||
%% * 1 7-bit ascii, not \ or '
|
||||
%% * 2-4 8-bit values (UTF8)
|
||||
%% * \ followed by a known modifier [befnrtv'\]
|
||||
%% * \xhh
|
||||
%% * \x{hhh...}
|
||||
CHAR = "'(([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\}))'",
|
||||
STRING = "\"([^\"\\\\]|(\\\\.))*\"",
|
||||
|
||||
CommentStart = {"/\\*", push(comment, skip())},
|
||||
@ -77,34 +83,34 @@ scan(String) ->
|
||||
%% -- Helpers ----------------------------------------------------------------
|
||||
|
||||
parse_string([$" | Chars]) ->
|
||||
unescape(Chars).
|
||||
unicode:characters_to_nfc_binary(unescape(Chars)).
|
||||
|
||||
parse_char([$', $\\, Code, $']) ->
|
||||
case Code of
|
||||
$' -> $';
|
||||
$\\ -> $\\;
|
||||
$b -> $\b;
|
||||
$e -> $\e;
|
||||
$f -> $\f;
|
||||
$n -> $\n;
|
||||
$r -> $\r;
|
||||
$t -> $\t;
|
||||
$v -> $\v;
|
||||
_ -> {error, "Bad control sequence: \\" ++ [Code]}
|
||||
end;
|
||||
parse_char([$', C, $']) -> C.
|
||||
parse_char([$' | Chars]) ->
|
||||
case unicode:characters_to_nfc_list(unescape($', Chars, [])) of
|
||||
[Char] -> Char;
|
||||
_Bad -> {error, "Bad character literal: '" ++ Chars}
|
||||
end.
|
||||
|
||||
unescape(Str) -> unescape(Str, []).
|
||||
utf8_encode(Cs) ->
|
||||
binary_to_list(unicode:characters_to_binary(Cs)).
|
||||
|
||||
unescape([$"], Acc) ->
|
||||
unescape(Str) -> unescape($", Str, []).
|
||||
|
||||
unescape(Delim, [Delim], Acc) ->
|
||||
list_to_binary(lists:reverse(Acc));
|
||||
unescape([$\\, $x, D1, D2 | Chars ], Acc) ->
|
||||
unescape(Delim, [$\\, $x, ${ | Chars ], Acc) ->
|
||||
{Ds, [_ | Cs]} = lists:splitwith(fun($}) -> false ; (_) -> true end, Chars),
|
||||
C = list_to_integer(Ds, 16),
|
||||
Utf8Cs = binary_to_list(unicode:characters_to_binary([C])),
|
||||
unescape(Delim, Cs, [Utf8Cs | Acc]);
|
||||
unescape(Delim, [$\\, $x, D1, D2 | Chars ], Acc) ->
|
||||
C = list_to_integer([D1, D2], 16),
|
||||
unescape(Chars, [C | Acc]);
|
||||
unescape([$\\, Code | Chars], Acc) ->
|
||||
Ok = fun(C) -> unescape(Chars, [C | Acc]) end,
|
||||
Utf8Cs = binary_to_list(unicode:characters_to_binary([C])),
|
||||
unescape(Delim, Chars, [Utf8Cs | Acc]);
|
||||
unescape(Delim, [$\\, Code | Chars], Acc) ->
|
||||
Ok = fun(C) -> unescape(Delim, Chars, [C | Acc]) end,
|
||||
case Code of
|
||||
$" -> Ok($");
|
||||
Delim -> Ok(Delim);
|
||||
$\\ -> Ok($\\);
|
||||
$b -> Ok($\b);
|
||||
$e -> Ok($\e);
|
||||
@ -115,8 +121,8 @@ unescape([$\\, Code | Chars], Acc) ->
|
||||
$v -> Ok($\v);
|
||||
_ -> error("Bad control sequence: \\" ++ [Code]) %% TODO
|
||||
end;
|
||||
unescape([C | Chars], Acc) ->
|
||||
unescape(Chars, [C | Acc]).
|
||||
unescape(Delim, [C | Chars], Acc) ->
|
||||
unescape(Delim, Chars, [C | Acc]).
|
||||
|
||||
strip_underscores(S) ->
|
||||
lists:filter(fun(C) -> C /= $_ end, S).
|
||||
|
@ -63,7 +63,8 @@ simple_contracts_test_() ->
|
||||
%% Parse tests of example contracts
|
||||
[ {lists:concat(["Parse the ", Contract, " contract."]),
|
||||
fun() -> roundtrip_contract(Contract) end}
|
||||
|| Contract <- [counter, voting, all_syntax, '05_greeter', aeproof, multi_sig, simple_storage, fundme, dutch_auction] ]
|
||||
|| Contract <- [counter, voting, all_syntax, '05_greeter', aeproof,
|
||||
multi_sig, simple_storage, fundme, dutch_auction, utf8] ]
|
||||
}.
|
||||
|
||||
parse_contract(Name) ->
|
||||
@ -85,7 +86,7 @@ parse_expr(Text) ->
|
||||
round_trip(Text) ->
|
||||
Contract = parse_string(Text),
|
||||
Text1 = prettypr:format(aeso_pretty:decls(strip_stdlib(Contract))),
|
||||
Contract1 = parse_string(Text1),
|
||||
Contract1 = parse_string(aeso_scan:utf8_encode(Text1)),
|
||||
NoSrcLoc = remove_line_numbers(Contract),
|
||||
NoSrcLoc1 = remove_line_numbers(Contract1),
|
||||
?assertMatch(NoSrcLoc, diff(NoSrcLoc, NoSrcLoc1)).
|
||||
|
21
test/contracts/utf8.aes
Normal file
21
test/contracts/utf8.aes
Normal file
@ -0,0 +1,21 @@
|
||||
contract UTF8 =
|
||||
entrypoint f1() : char = '1'
|
||||
entrypoint f2() : char = '+'
|
||||
entrypoint f3() : char = 'd'
|
||||
entrypoint f4() : char = 'X'
|
||||
entrypoint f5() : char = 'å'
|
||||
entrypoint f6() : char = 'Ä'
|
||||
entrypoint f7() : char = 'æ'
|
||||
entrypoint f8() : char = 'ë'
|
||||
entrypoint f9() : char = 'ẻ'
|
||||
entrypoint f10() : char = '\x27'
|
||||
entrypoint f11() : char = '\x{2200}'
|
||||
entrypoint f12() : char = '💩'
|
||||
entrypoint f13() : char = '\n'
|
||||
|
||||
|
||||
|
||||
// entrypoint f13() : char = 'e̊'
|
||||
// entrypoint f14() : char = '\Ì'
|
||||
|
||||
// '💩' vs. map('a,'b)
|
Loading…
x
Reference in New Issue
Block a user