From 493bdb990c97492ebd0c743cbc6e69066bb30ea3 Mon Sep 17 00:00:00 2001 From: Jarvis Carroll Date: Tue, 3 Feb 2026 01:42:17 +0000 Subject: [PATCH] Fix lexer row/column calculations. --- src/hz_sophia.erl | 87 +++++++++++++++++++++++++++++++---------------- 1 file changed, 57 insertions(+), 30 deletions(-) diff --git a/src/hz_sophia.erl b/src/hz_sophia.erl index 32bbde4..b43a48c 100644 --- a/src/hz_sophia.erl +++ b/src/hz_sophia.erl @@ -4,10 +4,13 @@ -copyright("Jarvis Carroll "). -license("GPL-3.0-or-later"). --export([check_parser/1]). +-export([parse_literal/1, parse_literal/2, check_parser/1]). -include_lib("eunit/include/eunit.hrl"). +parse_literal(String) -> + parse_literal(unknown_type(), String). + parse_literal(Type, String) -> case parse_expression(Type, {tk, 1, 1}, String) of {ok, {Result, NewTk, NewString}} -> @@ -37,55 +40,55 @@ parse_literal2(Result, Tk, String) -> next_token({tk, Row, Col}, []) -> {ok, {{eof, "", Row, Col, Col}, {tk, Row, Col}, []}}; next_token({tk, Row, Col}, " " ++ Rest) -> - next_token({tk, Row + 1, Col}, Rest); + next_token({tk, Row, Col + 1}, Rest); next_token({tk, Row, Col}, "\t" ++ Rest) -> - next_token({tk, Row + 1, Col}, Rest); -next_token({tk, _, Col}, "\r\n" ++ Rest) -> - next_token({tk, 1, Col + 1}, Rest); -next_token({tk, _, Col}, "\r" ++ Rest) -> - next_token({tk, 1, Col + 1}, Rest); -next_token({tk, _, Col}, "\n" ++ Rest) -> - next_token({tk, 1, Col + 1}, Rest); + next_token({tk, Row, Col + 1}, Rest); +next_token({tk, Row, _}, "\r\n" ++ Rest) -> + next_token({tk, Row + 1, 1}, Rest); +next_token({tk, Row, _}, "\r" ++ Rest) -> + next_token({tk, Row + 1, 1}, Rest); +next_token({tk, Row, _}, "\n" ++ Rest) -> + next_token({tk, Row + 1, 1}, Rest); next_token(Tk, [C | _] = String) when ?IS_ALPHA(C) -> alphanum_token(Tk, Tk, String, []); next_token(Tk, [C | _] = String) when ?IS_NUM(C) -> num_token(Tk, Tk, String, [], 0); next_token({tk, Row, Col}, [$#, C | Rest]) when ?IS_HEX(C) -> - bytes_token({tk, Row, Col}, {tk, Row + 1, 
Col}, [C | Rest], "#", []); + bytes_token({tk, Row, Col}, {tk, Row, Col + 1}, [C | Rest], "#", []); next_token({tk, Row, Col}, "\"" ++ Rest) -> - string_token({tk, Row, Col}, {tk, Row + 1, Col}, Rest, "\"", <<>>); + string_token({tk, Row, Col}, {tk, Row, Col + 1}, Rest, "\"", <<>>); next_token({tk, Row, Col}, [Char | Rest]) -> Token = {character, [Char], Char, Row, Col, Col}, - {ok, {Token, {tk, Row + 1, Col}, Rest}}. + {ok, {Token, {tk, Row, Col + 1}, Rest}}. alphanum_token(Start, {tk, Row, Col}, [C | Rest], Acc) when ?IS_ALPHANUM(C) -> - alphanum_token(Start, {tk, Row, Col}, Rest, [C | Acc]); + alphanum_token(Start, {tk, Row, Col + 1}, Rest, [C | Acc]); alphanum_token({tk, _, Start}, {tk, Row, End}, String, Acc) -> AlphaString = lists:reverse(Acc), - Token = {alphanum, AlphaString, AlphaString, Row, Start, End}, + Token = {alphanum, AlphaString, AlphaString, Row, Start, End - 1}, {ok, {Token, {tk, Row, End}, String}}. num_token(Start, {tk, Row, Col}, [C | Rest], Chars, Value) when ?IS_NUM(C) -> NewValue = Value * 10 + (C - $0), - num_token(Start, {tk, Row + 1, Col}, Rest, [C | Chars], NewValue); + num_token(Start, {tk, Row, Col + 1}, Rest, [C | Chars], NewValue); num_token(Start, {tk, Row, Col}, [$_, C | Rest], Chars, Value) when ?IS_NUM(C) -> NewValue = Value * 10 + (C - $0), - num_token(Start, {tk, Row + 2, Col}, Rest, [C, $_ | Chars], NewValue); + num_token(Start, {tk, Row, Col + 2}, Rest, [C, $_ | Chars], NewValue); num_token({tk, _, Start}, {tk, Row, End}, String, Chars, Value) -> NumString = lists:reverse(Chars), - Token = {integer, NumString, Value, Row, Start, End}, + Token = {integer, NumString, Value, Row, Start, End - 1}, {ok, {Token, {tk, Row, End}, String}}. 
bytes_token(Start, {tk, Row, Col}, [C | Rest], Chars, Digits) when ?IS_HEX(C) -> Digit = convert_digit(C), - bytes_token(Start, {tk, Row + 1, Col}, Rest, [C | Chars], [Digit | Digits]); + bytes_token(Start, {tk, Row, Col + 1}, Rest, [C | Chars], [Digit | Digits]); bytes_token(Start, {tk, Row, Col}, [$_, C | Rest], Chars, Digits) when ?IS_HEX(C) -> Digit = convert_digit(C), - bytes_token(Start, {tk, Row + 1, Col}, Rest, [C, $_ | Chars], [Digit | Digits]); + bytes_token(Start, {tk, Row, Col + 2}, Rest, [C, $_ | Chars], [Digit | Digits]); bytes_token({tk, _, Start}, {tk, Row, End}, String, Chars, Digits) -> BytesString = lists:reverse(Chars), Value = reverse_combine_nibbles(Digits, <<>>), - Token = {bytes, BytesString, Value, Row, Start, End}, + Token = {bytes, BytesString, Value, Row, Start, End - 1}, {ok, {Token, {tk, Row, End}, String}}. convert_digit(C) when C >= $0, C =< $9 -> @@ -104,7 +107,7 @@ reverse_combine_nibbles([], Acc) -> Acc. string_token(Start, {tk, Row, Col}, "\\x" ++ String, SourceChars, Value) -> - case escape_hex_code({tk, Row, Col}, {tk, Row + 2, Col}, String, "x\\" ++ SourceChars) of + case escape_hex_code({tk, Row, Col}, {tk, Row, Col + 2}, String, "x\\" ++ SourceChars) of {ok, {Codepoint, NewSourceChars, NewTk, NewString}} -> NewValue = <>, string_token(Start, NewTk, NewString, NewSourceChars, NewValue); @@ -114,37 +117,37 @@ string_token(Start, {tk, Row, Col}, "\\x" ++ String, SourceChars, Value) -> string_token(Start, {tk, Row, Col}, [$\\, C | Rest], SourceChars, Value) -> case escape_char(C) of {ok, ByteVal} -> - string_token(Start, {tk, Row + 2, Col}, Rest, [C, $\ | SourceChars], <>); + string_token(Start, {tk, Row, Col + 2}, Rest, [C, $\\ | SourceChars], <>); error -> {error, {invalid_escape_code, [C], Row, Col}} end; -string_token({tk, _, Start}, {tk, Row, End}, [$" | Rest], SourceChars, Value) -> +string_token({tk, _, Start}, {tk, Row, Col}, [$" | Rest], SourceChars, Value) -> SourceStr = lists:reverse([$" | SourceChars]), - Token =
{string, SourceStr, Value, Row, Start, End}, - {ok, {Token, {tk, Row, End}, Rest}}; + Token = {string, SourceStr, Value, Row, Start, Col}, + {ok, {Token, {tk, Row, Col + 1}, Rest}}; string_token(Start, {tk, Row, Col}, [C | Rest], SourceChars, Value) -> % TODO: ERTS probably had to convert this FROM utf8 at some point, so why % bother, if we need to convert it back? I guess we could accept iolists if % we really wanted to waste time on this point... - string_token(Start, {tk, Row + 1, Col}, Rest, [C | SourceChars], <>). + string_token(Start, {tk, Row, Col + 1}, Rest, [C | SourceChars], <>). escape_hex_code(Start, {tk, Row, Col}, "{" ++ String, SourceChars) -> - escape_long_hex_code(Start, {tk, Row + 1, Col}, String, "{" ++ SourceChars, 0); + escape_long_hex_code(Start, {tk, Row, Col + 1}, String, "{" ++ SourceChars, 0); escape_hex_code(_, {tk, Row, Col}, [A, B | String], SourceChars) when ?IS_HEX(A), ?IS_HEX(B) -> % As of writing this, the Sophia compiler will convert this byte from % extended ASCII to unicode... But it really shouldn't. The literal parser % does what the compiler should do. Byte = convert_digit(A) * 16 + convert_digit(B), - {ok, {Byte, [B, A | SourceChars], {tk, Row + 2, Col}, String}}; + {ok, {Byte, [B, A | SourceChars], {tk, Row, Col + 2}, String}}; escape_hex_code({tk, Row1, Col1}, _, _, _) -> {error, {invalid_escape_code, "\\x", Row1, Col1}}. 
escape_long_hex_code(_, {tk, Row, Col}, "}" ++ String, SourceChars, Value) -> - {ok, {Value, "}" ++ SourceChars, {tk, Row + 1, Col}, String}}; + {ok, {Value, "}" ++ SourceChars, {tk, Row, Col + 1}, String}}; escape_long_hex_code(Start, {tk, Row, Col}, [C | String], SourceChars, Value) when ?IS_HEX(C) -> NewSourceChars = [C | SourceChars], NewValue = 16 * Value + convert_digit(C), - escape_long_hex_code(Start, {tk, Row + 1, Col}, String, NewSourceChars, NewValue); + escape_long_hex_code(Start, {tk, Row, Col + 1}, String, NewSourceChars, NewValue); escape_long_hex_code(_, {tk, Row, Col}, [C | _], _, _) -> {error, {invalid_hexadecimal, [C], Row, Col}}; escape_long_hex_code(_, Tk, [], SourceChars, Value) -> @@ -826,4 +829,28 @@ variant_test() -> ok. +lexer_offset_test() -> + % Test that various tokens report their position correctly. + {error, {unexpected_token, "456", 1, 5, 7}} = parse_literal("123 456"), + {error, {unexpected_token, "[", 1, 5, 5}} = parse_literal("123 [0]"), + {error, {unexpected_token, "abc", 1, 5, 7}} = parse_literal("123 abc"), + {error, {unexpected_token, "#AA", 1, 5, 7}} = parse_literal("123 #AA"), + {error, {unexpected_token, "\"x\"", 1, 5, 7}} = parse_literal("123 \"x\""), + {error, {unexpected_token, "\"\\x{123}\"", 1, 5, 13}} = parse_literal("123 \"\\x{123}\""), + + % Check that the tokenizer knows its position correctly *after* various + % tokens. + {error, {unexpected_token, "123", 1, 5, 7}} = parse_literal("[0] 123"), + ABCType = {"mytype", already_normalized, {variant, [{"abc", []}]}}, + {error, {unexpected_token, "123", 1, 5, 7}} = parse_literal(ABCType, "abc 123"), + {error, {unexpected_token, "123", 1, 5, 7}} = parse_literal("#AA 123"), + {error, {unexpected_token, "123", 1, 5, 7}} = parse_literal("\"x\" 123"), + {error, {unexpected_token, "123", 1, 11, 13}} = parse_literal("\"\\x{123}\" 123"), + + % Check that the tokenizer accounts for various line separators correctly. 
+ {error, {unexpected_token, "abc", 2, 1, 3}} = parse_literal("123\nabc"), + {error, {unexpected_token, "abc", 2, 1, 3}} = parse_literal("123\r\nabc"), + {error, {unexpected_token, "abc", 2, 1, 3}} = parse_literal("123\rabc"), + + ok.