%%% -*- erlang-indent-level:4; indent-tabs-mode: nil -*- %%%------------------------------------------------------------------- %%% @copyright (C) 2017, Aeternity Anstalt %%% @doc The Sophia lexer. %%% %%% @end %%%------------------------------------------------------------------- -module(aeso_scan). -export([scan/1, utf8_encode/1]). -import(aeso_scan_lib, [token/1, token/2, symbol/0, skip/0, override/2, push/2, pop/1]). lexer() -> Number = fun(Digit) -> [Digit, "+(_", Digit, "+)*"] end, DIGIT = "[0-9]", HEXDIGIT = "[0-9a-fA-F]", LOWER = "[a-z_]", UPPER = "[A-Z]", CON = [UPPER, "[a-zA-Z0-9_]*"], INT = Number(DIGIT), HEX = ["0x", Number(HEXDIGIT)], BYTES = ["#", Number(HEXDIGIT)], WS = "[\\000-\\ ]+", ID = [LOWER, "[a-zA-Z0-9_']*"], TVAR = ["'", ID], QID = ["(", CON, "\\.)+", ID], QCON = ["(", CON, "\\.)+", CON], OP = "[=!<>+\\-*/:&|?~@^]+", %% Five cases for a character %% * 1 7-bit ascii, not \ or ' %% * 2-4 8-bit values (UTF8) %% * \ followed by a known modifier [aernrtv] %% * \xhh %% * \x{hhh...} CHAR = "'(([\\x00-\\x26\\x28-\\x5b\\x5d-\\x7f])|([\\x00-\\xff][\\x80-\\xff]{1,3})|(\\\\[befnrtv'\\\\])|(\\\\x[0-9a-fA-F]{2,2})|(\\\\x\\{[0-9a-fA-F]*\\}))'", STRING = "\"([^\"\\\\]|(\\\\.))*\"", CommentStart = {"/\\*", push(comment, skip())}, CommentRules = [ CommentStart , {"\\*/", pop(skip())} , {"[^/*]+|[/*]", skip()} ], Keywords = ["contract", "include", "let", "switch", "type", "record", "datatype", "if", "elif", "else", "function", "stateful", "payable", "true", "false", "mod", "public", "entrypoint", "private", "indexed", "namespace", "interface", "main", "using", "as", "for", "hiding", "band", "bor", "bxor", "bnot" ], KW = string:join(Keywords, "|"), Rules = %% Comments and whitespace [ CommentStart , {"//.*", skip()} , {WS, skip()} %% Special characters , {"\\.\\.|[,.;()\\[\\]{}]", symbol()} %% Literals , {CHAR, token(char, fun parse_char/1)} , {STRING, token(string, fun parse_string/1)} , {HEX, token(hex, fun parse_hex/1)} , {INT, token(int, fun parse_int/1)} , {BYTES, token(bytes, fun parse_bytes/1)} %% Identifiers (qualified first!) , {QID, token(qid, fun(S) -> string:tokens(S, ".") end)} , {QCON, token(qcon, fun(S) -> string:tokens(S, ".") end)} , {TVAR, token(tvar)} , override({ID, token(id)}, {KW, symbol()}) %% Keywords override identifiers. Need to , {CON, token(con)} %% use override to avoid lexing "lettuce" %% as ['let', {id, "tuce"}]. %% Operators , {OP, symbol()} ], [{code, Rules}, {comment, CommentRules}]. scan(String) -> Lexer = aeso_scan_lib:compile(lexer()), aeso_scan_lib:string(Lexer, code, String). %% -- Helpers ---------------------------------------------------------------- parse_string([$" | Chars]) -> unicode:characters_to_nfc_binary(unescape(Chars)). parse_char([$' | Chars]) -> case unicode:characters_to_nfc_list(unescape($', Chars, [])) of [Char] -> Char; _Bad -> {error, "Bad character literal: '" ++ Chars} end. utf8_encode(Cs) -> binary_to_list(unicode:characters_to_binary(Cs)). unescape(Str) -> unescape($", Str, []). unescape(Delim, [Delim], Acc) -> list_to_binary(lists:reverse(Acc)); unescape(Delim, [$\\, $x, ${ | Chars ], Acc) -> {Ds, [_ | Cs]} = lists:splitwith(fun($}) -> false ; (_) -> true end, Chars), C = list_to_integer(Ds, 16), Utf8Cs = binary_to_list(unicode:characters_to_binary([C])), unescape(Delim, Cs, [Utf8Cs | Acc]); unescape(Delim, [$\\, $x, D1, D2 | Chars ], Acc) -> C = list_to_integer([D1, D2], 16), Utf8Cs = binary_to_list(unicode:characters_to_binary([C])), unescape(Delim, Chars, [Utf8Cs | Acc]); unescape(Delim, [$\\, Code | Chars], Acc) -> Ok = fun(C) -> unescape(Delim, Chars, [C | Acc]) end, case Code of Delim -> Ok(Delim); $\\ -> Ok($\\); $b -> Ok($\b); $e -> Ok($\e); $f -> Ok($\f); $n -> Ok($\n); $r -> Ok($\r); $t -> Ok($\t); $v -> Ok($\v); _ -> error("Bad control sequence: \\" ++ [Code]) %% TODO end; unescape(Delim, [C | Chars], Acc) -> unescape(Delim, Chars, [C | Acc]). strip_underscores(S) -> lists:filter(fun(C) -> C /= $_ end, S). parse_hex("0x" ++ S) -> list_to_integer(strip_underscores(S), 16). parse_int(S) -> list_to_integer(strip_underscores(S)). parse_bytes("#" ++ S0) -> S = strip_underscores(S0), N = list_to_integer(S, 16), Digits = (length(S) + 1) div 2, <>.