Fix lexer row/column calculations.
This commit is contained in:
parent
17f635af61
commit
493bdb990c
@ -4,10 +4,13 @@
|
|||||||
-copyright("Jarvis Carroll <spiveehere@gmail.com>").
|
-copyright("Jarvis Carroll <spiveehere@gmail.com>").
|
||||||
-license("GPL-3.0-or-later").
|
-license("GPL-3.0-or-later").
|
||||||
|
|
||||||
-export([check_parser/1]).
|
-export([parse_literal/1, parse_literal/2, check_parser/1]).
|
||||||
|
|
||||||
-include_lib("eunit/include/eunit.hrl").
|
-include_lib("eunit/include/eunit.hrl").
|
||||||
|
|
||||||
|
% Parse a literal from String without supplying an expected type: the type
% argument handed to parse_literal/2 is whatever unknown_type() returns.
parse_literal(String) ->
    Type = unknown_type(),
    parse_literal(Type, String).
|
||||||
|
|
||||||
parse_literal(Type, String) ->
|
parse_literal(Type, String) ->
|
||||||
case parse_expression(Type, {tk, 1, 1}, String) of
|
case parse_expression(Type, {tk, 1, 1}, String) of
|
||||||
{ok, {Result, NewTk, NewString}} ->
|
{ok, {Result, NewTk, NewString}} ->
|
||||||
@ -37,55 +40,55 @@ parse_literal2(Result, Tk, String) ->
|
|||||||
% Scan the next token from the input.
%
% Takes the tokenizer position {tk, Row, Col} and the remaining input as a
% character list. Whitespace is skipped. The result is either
% {ok, {Token, NewPosition, RemainingInput}} or whatever the token-specific
% scanner that takes over returns.
%
% Positions are tracked as row and column: ordinary characters advance the
% column, line separators advance the row and reset the column to 1. The
% single-character and eof tokens built here carry Row, Col, Col, i.e. the same
% column as both first and last column of the token.
next_token({tk, Row, Col}, []) ->
    {ok, {{eof, "", Row, Col, Col}, {tk, Row, Col}, []}};
% Horizontal whitespace occupies one column (a tab is counted as one column).
next_token({tk, Row, Col}, " " ++ Rest) ->
    next_token({tk, Row, Col + 1}, Rest);
next_token({tk, Row, Col}, "\t" ++ Rest) ->
    next_token({tk, Row, Col + 1}, Rest);
% Line separators. "\r\n" must be matched before the lone "\r" clause so that
% a Windows line ending is counted as a single line break.
next_token({tk, Row, _}, "\r\n" ++ Rest) ->
    next_token({tk, Row + 1, 1}, Rest);
next_token({tk, Row, _}, "\r" ++ Rest) ->
    next_token({tk, Row + 1, 1}, Rest);
next_token({tk, Row, _}, "\n" ++ Rest) ->
    next_token({tk, Row + 1, 1}, Rest);
% Multi-character tokens: hand off to the dedicated scanner, passing the
% current position both as the token start and as the running position.
next_token(Tk, [C | _] = String) when ?IS_ALPHA(C) ->
    alphanum_token(Tk, Tk, String, []);
next_token(Tk, [C | _] = String) when ?IS_NUM(C) ->
    num_token(Tk, Tk, String, [], 0);
% A '#' only starts a bytes literal when a hex digit follows it; the '#' itself
% is consumed here, so the running position starts one column further on.
next_token({tk, Row, Col}, [$#, C | Rest]) when ?IS_HEX(C) ->
    bytes_token({tk, Row, Col}, {tk, Row, Col + 1}, [C | Rest], "#", []);
% The opening quote is consumed here and recorded as the first source char.
next_token({tk, Row, Col}, "\"" ++ Rest) ->
    string_token({tk, Row, Col}, {tk, Row, Col + 1}, Rest, "\"", <<>>);
% Anything else is a single-character token spanning exactly one column.
next_token({tk, Row, Col}, [Char | Rest]) ->
    Token = {character, [Char], Char, Row, Col, Col},
    {ok, {Token, {tk, Row, Col + 1}, Rest}}.
|
||||||
|
|
||||||
% Scan an alphanumeric token.
%
% Start is the position of the token's first character; the second argument is
% the running position, i.e. the position of the next unconsumed character.
% Acc collects the consumed characters in reverse order.
%
% Returns {ok, {Token, NextPosition, RemainingInput}} where Token is
% {alphanum, Text, Text, Row, FirstCol, LastCol} with an inclusive LastCol.
alphanum_token(Start, {tk, Row, Col}, [C | Rest], Acc) when ?IS_ALPHANUM(C) ->
    % Every consumed character moves the running position one column right.
    alphanum_token(Start, {tk, Row, Col + 1}, Rest, [C | Acc]);
alphanum_token({tk, _, Start}, {tk, Row, End}, String, Acc) ->
    AlphaString = lists:reverse(Acc),
    % End is the column *after* the token, so the last column is End - 1.
    Token = {alphanum, AlphaString, AlphaString, Row, Start, End - 1},
    {ok, {Token, {tk, Row, End}, String}}.
|
||||||
|
|
||||||
% Scan a decimal integer token, allowing a single '_' between digits.
%
% Start is the position of the first digit; the second argument is the running
% position (the next unconsumed character). Chars collects the consumed source
% characters in reverse order and Value accumulates the decimal value.
%
% Returns {ok, {Token, NextPosition, RemainingInput}} where Token is
% {integer, SourceText, Value, Row, FirstCol, LastCol} with an inclusive
% LastCol.
num_token(Start, {tk, Row, Col}, [C | Rest], Chars, Value) when ?IS_NUM(C) ->
    NewValue = Value * 10 + (C - $0),
    num_token(Start, {tk, Row, Col + 1}, Rest, [C | Chars], NewValue);
% An underscore is only a separator when a digit follows it. Both characters
% are consumed at once, so the column advances by two.
num_token(Start, {tk, Row, Col}, [$_, C | Rest], Chars, Value) when ?IS_NUM(C) ->
    NewValue = Value * 10 + (C - $0),
    num_token(Start, {tk, Row, Col + 2}, Rest, [C, $_ | Chars], NewValue);
num_token({tk, _, Start}, {tk, Row, End}, String, Chars, Value) ->
    NumString = lists:reverse(Chars),
    % End is the column *after* the token, so the last column is End - 1.
    Token = {integer, NumString, Value, Row, Start, End - 1},
    {ok, {Token, {tk, Row, End}, String}}.
|
||||||
|
|
||||||
% Scan the hex digits of a bytes literal, allowing a single '_' between
% digits.
%
% Start is the position of the token's first character; the second argument is
% the running position (the next unconsumed character). Chars collects the
% consumed source characters in reverse order (the caller seeds it with the
% characters it already consumed) and Digits collects the converted hex digit
% values in reverse order.
%
% Returns {ok, {Token, NextPosition, RemainingInput}} where Token is
% {bytes, SourceText, Value, Row, FirstCol, LastCol} with an inclusive LastCol
% and Value built by reverse_combine_nibbles/2.
bytes_token(Start, {tk, Row, Col}, [C | Rest], Chars, Digits) when ?IS_HEX(C) ->
    Digit = convert_digit(C),
    bytes_token(Start, {tk, Row, Col + 1}, Rest, [C | Chars], [Digit | Digits]);
% An underscore is only a separator when a hex digit follows it. Both the
% underscore and the digit are consumed here, so the column must advance by
% two; advancing by one would shift the position of every later token on the
% line one column to the left.
bytes_token(Start, {tk, Row, Col}, [$_, C | Rest], Chars, Digits) when ?IS_HEX(C) ->
    Digit = convert_digit(C),
    bytes_token(Start, {tk, Row, Col + 2}, Rest, [C, $_ | Chars], [Digit | Digits]);
bytes_token({tk, _, Start}, {tk, Row, End}, String, Chars, Digits) ->
    BytesString = lists:reverse(Chars),
    Value = reverse_combine_nibbles(Digits, <<>>),
    % End is the column *after* the token, so the last column is End - 1.
    Token = {bytes, BytesString, Value, Row, Start, End - 1},
    {ok, {Token, {tk, Row, End}, String}}.
|
||||||
|
|
||||||
convert_digit(C) when C >= $0, C =< $9 ->
|
convert_digit(C) when C >= $0, C =< $9 ->
|
||||||
@ -104,7 +107,7 @@ reverse_combine_nibbles([], Acc) ->
|
|||||||
Acc.
|
Acc.
|
||||||
|
|
||||||
string_token(Start, {tk, Row, Col}, "\\x" ++ String, SourceChars, Value) ->
|
string_token(Start, {tk, Row, Col}, "\\x" ++ String, SourceChars, Value) ->
|
||||||
case escape_hex_code({tk, Row, Col}, {tk, Row + 2, Col}, String, "x\\" ++ SourceChars) of
|
case escape_hex_code({tk, Row, Col}, {tk, Row, Col + 2}, String, "x\\" ++ SourceChars) of
|
||||||
{ok, {Codepoint, NewSourceChars, NewTk, NewString}} ->
|
{ok, {Codepoint, NewSourceChars, NewTk, NewString}} ->
|
||||||
NewValue = <<Value/binary, Codepoint/utf8>>,
|
NewValue = <<Value/binary, Codepoint/utf8>>,
|
||||||
string_token(Start, NewTk, NewString, NewSourceChars, NewValue);
|
string_token(Start, NewTk, NewString, NewSourceChars, NewValue);
|
||||||
@ -114,37 +117,37 @@ string_token(Start, {tk, Row, Col}, "\\x" ++ String, SourceChars, Value) ->
|
|||||||
string_token(Start, {tk, Row, Col}, [$\\, C | Rest], SourceChars, Value) ->
|
string_token(Start, {tk, Row, Col}, [$\\, C | Rest], SourceChars, Value) ->
|
||||||
case escape_char(C) of
|
case escape_char(C) of
|
||||||
{ok, ByteVal} ->
|
{ok, ByteVal} ->
|
||||||
string_token(Start, {tk, Row + 2, Col}, Rest, [C, $\ | SourceChars], <<Value/binary, ByteVal>>);
|
string_token(Start, {tk, Row, Col + 2}, Rest, [C, $\ | SourceChars], <<Value/binary, ByteVal>>);
|
||||||
error ->
|
error ->
|
||||||
{error, {invalid_escape_code, [C], Row, Col}}
|
{error, {invalid_escape_code, [C], Row, Col}}
|
||||||
end;
|
end;
|
||||||
string_token({tk, _, Start}, {tk, Row, End}, [$" | Rest], SourceChars, Value) ->
|
string_token({tk, _, Start}, {tk, Row, Col}, [$" | Rest], SourceChars, Value) ->
|
||||||
SourceStr = lists:reverse([$" | SourceChars]),
|
SourceStr = lists:reverse([$" | SourceChars]),
|
||||||
Token = {string, SourceStr, Value, Row, Start, End},
|
Token = {string, SourceStr, Value, Row, Start, Col},
|
||||||
{ok, {Token, {tk, Row, End}, Rest}};
|
{ok, {Token, {tk, Row, Col + 1}, Rest}};
|
||||||
string_token(Start, {tk, Row, Col}, [C | Rest], SourceChars, Value) ->
|
string_token(Start, {tk, Row, Col}, [C | Rest], SourceChars, Value) ->
|
||||||
% TODO: ERTS probably had to convert this FROM utf8 at some point, so why
|
% TODO: ERTS probably had to convert this FROM utf8 at some point, so why
|
||||||
% bother, if we need to convert it back? I guess we could accept iolists if
|
% bother, if we need to convert it back? I guess we could accept iolists if
|
||||||
% we really wanted to waste time on this point...
|
% we really wanted to waste time on this point...
|
||||||
string_token(Start, {tk, Row + 1, Col}, Rest, [C | SourceChars], <<Value/binary, C/utf8>>).
|
string_token(Start, {tk, Row, Col + 1}, Rest, [C | SourceChars], <<Value/binary, C/utf8>>).
|
||||||
|
|
||||||
% Decode the part of a hex escape that follows "\x" inside a string literal.
%
% Start is the position reported if the escape is malformed; the second
% argument is the running position (the next unconsumed character). SourceChars
% holds the source characters consumed so far, in reverse order.
%
% Two forms are accepted:
%   * "{...}" - handed to escape_long_hex_code/5, whose result is returned;
%   * exactly two hex digits - decoded to a single value.
%
% Returns {ok, {Value, NewSourceChars, NextPosition, RemainingInput}} on
% success, or {error, {invalid_escape_code, "\\x", Row, Col}} pointing at
% Start when neither form matches.
escape_hex_code(Start, {tk, Row, Col}, "{" ++ String, SourceChars) ->
    % The opening brace occupies one column.
    escape_long_hex_code(Start, {tk, Row, Col + 1}, String, "{" ++ SourceChars, 0);
escape_hex_code(_, {tk, Row, Col}, [A, B | String], SourceChars) when ?IS_HEX(A), ?IS_HEX(B) ->
    % As of writing this, the Sophia compiler will convert this byte from
    % extended ASCII to unicode... But it really shouldn't. The literal parser
    % does what the compiler should do.
    Byte = convert_digit(A) * 16 + convert_digit(B),
    % Two digits were consumed, so the column advances by two.
    {ok, {Byte, [B, A | SourceChars], {tk, Row, Col + 2}, String}};
escape_hex_code({tk, Row1, Col1}, _, _, _) ->
    {error, {invalid_escape_code, "\\x", Row1, Col1}}.
|
||||||
|
|
||||||
escape_long_hex_code(_, {tk, Row, Col}, "}" ++ String, SourceChars, Value) ->
|
escape_long_hex_code(_, {tk, Row, Col}, "}" ++ String, SourceChars, Value) ->
|
||||||
{ok, {Value, "}" ++ SourceChars, {tk, Row + 1, Col}, String}};
|
{ok, {Value, "}" ++ SourceChars, {tk, Row, Col + 1}, String}};
|
||||||
escape_long_hex_code(Start, {tk, Row, Col}, [C | String], SourceChars, Value) when ?IS_HEX(C) ->
|
escape_long_hex_code(Start, {tk, Row, Col}, [C | String], SourceChars, Value) when ?IS_HEX(C) ->
|
||||||
NewSourceChars = [C | SourceChars],
|
NewSourceChars = [C | SourceChars],
|
||||||
NewValue = 16 * Value + convert_digit(C),
|
NewValue = 16 * Value + convert_digit(C),
|
||||||
escape_long_hex_code(Start, {tk, Row + 1, Col}, String, NewSourceChars, NewValue);
|
escape_long_hex_code(Start, {tk, Row, Col + 1}, String, NewSourceChars, NewValue);
|
||||||
escape_long_hex_code(_, {tk, Row, Col}, [C | _], _, _) ->
|
escape_long_hex_code(_, {tk, Row, Col}, [C | _], _, _) ->
|
||||||
{error, {invalid_hexadecimal, [C], Row, Col}};
|
{error, {invalid_hexadecimal, [C], Row, Col}};
|
||||||
escape_long_hex_code(_, Tk, [], SourceChars, Value) ->
|
escape_long_hex_code(_, Tk, [], SourceChars, Value) ->
|
||||||
@ -826,4 +829,28 @@ variant_test() ->
|
|||||||
|
|
||||||
ok.
|
ok.
|
||||||
|
|
||||||
|
lexer_offset_test() ->
    % Builds the error term the parser is expected to return for a rejected
    % token: its source text, row, first column and last column.
    Unexpected = fun(Text, Row, First, Last) ->
        {error, {unexpected_token, Text, Row, First, Last}}
    end,
    % Expected is already bound inside the fun, so the match below is an
    % assertion: any other result fails the test with a badmatch.
    Check = fun({Input, Expected}) -> Expected = parse_literal(Input) end,

    % Test that various tokens report their position correctly.
    lists:foreach(Check, [
        {"123 456", Unexpected("456", 1, 5, 7)},
        {"123 [0]", Unexpected("[", 1, 5, 5)},
        {"123 abc", Unexpected("abc", 1, 5, 7)},
        {"123 #AA", Unexpected("#AA", 1, 5, 7)},
        {"123 \"x\"", Unexpected("\"x\"", 1, 5, 7)},
        {"123 \"\\x{123}\"", Unexpected("\"\\x{123}\"", 1, 5, 13)}
    ]),

    % Check that the tokenizer knows its position correctly *after* various
    % tokens.
    Trailing = Unexpected("123", 1, 5, 7),
    Check({"[0] 123", Trailing}),
    ABCType = {"mytype", already_normalized, {variant, [{"abc", []}]}},
    Trailing = parse_literal(ABCType, "abc 123"),
    lists:foreach(Check, [
        {"#AA 123", Trailing},
        {"\"x\" 123", Trailing},
        {"\"\\x{123}\" 123", Unexpected("123", 1, 11, 13)}
    ]),

    % Check that the tokenizer accounts for various line separators correctly.
    NextLine = Unexpected("abc", 2, 1, 3),
    lists:foreach(Check, [
        {"123\nabc", NextLine},
        {"123\r\nabc", NextLine},
        {"123\rabc", NextLine}
    ]),

    ok.
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user