Register individual types, more docs

This commit is contained in:
Ulf Wiger 2025-04-05 13:20:30 +02:00
parent 4663a0f57e
commit 3ede4f22e1
2 changed files with 319 additions and 61 deletions

View File

@ -13,3 +13,91 @@ Test
----
$ rebar3 eunit
Dynamic encoding
----
The module `gmser_dyn` offers dynamic encoding support, encoding most 'regular'
Erlang data types into an internal RLP representation.
Main API:
* `encode(term()) -> iolist()`
* `encode_typed(template(), term()) -> iolist()`
* `decode(iolist()) -> term()`
* `serialize(term()) -> binary()`
* `serialize_typed(template(), term()) -> binary()`
* `deserialize(binary()) -> term()`
The basic types supported by the encoder are:
* `non_neg_integer()` (`int` , code: 248)
* `binary()` (`binary`, code: 249)
* `boolean()` (`bool` , code: 250)
* `list()` (`list` , code: 251)
* `map()` (`map` , code: 252)
* `tuple()` (`tuple` , code: 253)
* `gmser_id:id()` (`id` , code: 254)
* `atom()` (`label` , code: 255)
When encoding `map` types, the map elements are first sorted.
When specifying a map type for template-driven encoding, use
the `#{items => [{Key, Value}]}` construct.
Labels
----
Labels correspond to (existing) atoms in Erlang.
Decoding of a label results in a call to `binary_to_existing_atom/2`, so will
fail if the corresponding atom does not already exist.
It's possible to cache labels for more compact encoding.
Note that when caching labels, the same cache mapping needs to be used on the
decoder side.
Labels are encoded as `[<<255>>, << AtomToBinary/binary >>]`.
If a cached label is used, the encoding becomes `[<<255>>, [Ix]]`, where
`Ix` is the integer-encoded index value of the cached label.
Examples
----
Dynamically encoded objects have the basic structure `[<<0>>,V,Obj]`, where `V` is the
integer-coded version, and `Obj` is the top-level encoding of the form `[Tag,Data]`.
```erlang
E = fun(T) -> io:fwrite("~w~n", [gmser_dyn:encode(T)]) end.
E(17) -> [<<0>>,<<1>>,[<<248>>,<<17>>]]
E(<<"abc">>) -> [<<0>>,<<1>>,[<<249>>,<<97,98,99>>]]
E(true) -> [<<0>>,<<1>>,[<<250>>,<<1>>]]
E(false) -> [<<0>>,<<1>>,[<<250>>,<<0>>]]
E([1,2]) -> [<<0>>,<<1>>,[<<251>>,[[<<248>>,<<1>>],[<<248>>,<<2>>]]]]
E({1,2}) -> [<<0>>,<<1>>,[<<253>>,[[<<248>>,<<1>>],[<<248>>,<<2>>]]]]
E(#{a=>1, b=>2}) ->
[<<0>>,<<1>>,[<<252>>,[[[<<255>>,<<97>>],[<<248>>,<<1>>]],[[<<255>>,<<98>>],[<<248>>,<<2>>]]]]]
E(gmser_id:create(account,<<1:256>>)) ->
[<<0>>,<<1>>,[<<254>>,<<1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1>>]]
```
Note that tuples and lists are encoded the same way, except for the initial type tag.
Maps are encoded as `[<<252>>, [KV1, KV2, ...]]`, where `[KV1, KV2, ...]` is the sorted
list of key-value tuples from `maps:to_list(Map)`, but with the `tuple` type tag omitted.
Template-driven encoding
----
Templates can be provided to the encoder by either naming an already registered
type, or by passing a template directly. The template will then be enforced, and
used to slightly compress the encoding.
In the following example, as the encoder knows that `{11,12}` is encoded as a
tuple of two integers, it can omit the inner type tags.
```erlang
ET = fun(Type,Term) -> io:fwrite("~w~n", [gmser_dyn:encode_typed(Type,Term)]) end.
ET({int,int}, {11,12}) ->[<<0>>,<<1>>,[<<253>>,[<<11>>,<<12>>]]]
ET({int,int}, {11,a}) ->
** exception error: {illegal,int,a} ...
```

View File

@ -14,11 +14,17 @@
, deserialize/1
, deserialize/2 ]).
%% register a type schema, inspect existing schema
-export([ register_types/1
, registered_types/0
, types_from_list/1
, revert_to_default_types/0
, dynamic_types/0 ]).
%% Register individual types, or cache labels
-export([ register_type/3
, cache_label/2 ]).
-import(gmserialization, [ decode_field/2 ]).
-define(VSN, 1).
@ -88,7 +94,7 @@ dynamic_types() ->
, 252 => map
, 253 => tuple
, 254 => id
, 255 => label }
, 255 => label}
, rev =>
#{ int => 248
, binary => 249
@ -98,6 +104,8 @@ dynamic_types() ->
, tuple => 253
, id => 254
, label => 255}
, labels => #{}
, rev_labels => #{}
, templates =>
#{ int => int
, binary => binary
@ -113,66 +121,13 @@ dynamic_types() ->
vsn(Types) ->
maps:get(vsn, Types, ?VSN).
%% Install a user-supplied type schema on top of the built-in one.
%% `Types' may contain `codes' (Code => Tag) and `templates' (Tag => Template);
%% each defaults to an empty map. The reverse map is derived from the
%% supplied codes via rev_codes/1. User entries take precedence over the
%% entries from dynamic_types() (second argument of maps:merge/2 wins).
%% The merged schema is checked with assert_sizes/1 and assert_mappings/1
%% and then stored in persistent_term under {?MODULE, types}.
register_types(Types) when is_map(Types) ->
Codes = maps:get(codes, Types, #{}),
Rev = rev_codes(Codes),
Templates = maps:get(templates, Types, #{}),
#{codes := Codes0, rev := Rev0, templates := Templates0} =
dynamic_types(),
Merged = #{ codes => maps:merge(Codes0, Codes)
, rev => maps:merge(Rev0, Rev)
, templates => maps:merge(Templates0, Templates) },
assert_sizes(Merged),
assert_mappings(Merged),
persistent_term:put({?MODULE, types}, Merged).
%% Replace the stored schema with the one returned by dynamic_types().
revert_to_default_types() ->
persistent_term:put({?MODULE, types}, dynamic_types()).
%% Compare the sizes of the `codes', `rev' and `templates' maps.
%% Returns `ok' when all three are equal, or when there are more templates
%% than codes (logged as a warning only). Raises
%% {duplicate_mappings, Duplicates, Types} or
%% {missing_mappings, Missing, Types} otherwise.
%% NOTE(review): when templates outnumber codes, the reverse-map size is
%% not compared with the codes size, so duplicate tags are not reported in
%% that case - confirm whether this is intended.
assert_sizes(#{codes := Codes, rev := Rev, templates := Ts} = Types) ->
assert_sizes(map_size(Codes), map_size(Rev), map_size(Ts), Types).
assert_sizes(Sz, Sz, Sz, _) ->
ok;
assert_sizes(Sz, RSz, Sz, Types) when RSz =/= Sz ->
%% Wrong size reverse mapping must mean duplicate mappings
%% We auto-generate the reverse-mappings, so we know there aren't
%% too many of them
?LOG_ERROR("Reverse mapping size doesn't match codes size", []),
Codes = maps:get(codes, Types),
CodeVals = maps:values(Codes),
%% Whatever is left after removing one occurrence of each distinct tag
%% is a tag that appears more than once.
Duplicates = CodeVals -- lists:usort(CodeVals),
error({duplicate_mappings, Duplicates, Types});
assert_sizes(Sz, _, TSz, Types) when Sz > TSz ->
?LOG_ERROR("More codes than templates", []),
Tags = maps:keys(maps:get(rev, Types)),
Templates = maps:get(templates, Types),
Missing = [T || T <- Tags,
not is_map_key(T, Templates)],
error({missing_mappings, Missing, Types});
assert_sizes(Sz, _, TSz, Types) when TSz > Sz ->
%% More mappings than codes. May not be horrible.
%% We check that all codes have mappings elsewhere.
?LOG_WARNING("More templates than codes in ~p", [Types]),
ok.
%% Verify that every tag in the reverse map has an entry in `templates'.
%% Returns `ok', or raises {missing_templates, Missing, Types} after
%% logging the offending tags.
assert_mappings(#{rev := Rev, templates := Ts} = Types) ->
Tags = maps:keys(Rev),
case [T || T <- Tags,
not is_map_key(T, Ts)] of
[] ->
ok;
Missing ->
?LOG_ERROR("Missing templates for ~p", [Missing]),
error({missing_templates, Missing, Types})
end.
%% Invert a Code => Tag map into a Tag => Code map.
%% If several codes share a tag, only one of them survives in the result.
rev_codes(Codes) ->
L = maps:to_list(Codes),
maps:from_list([{V, K} || {K, V} <- L]).
%% Return the currently active type schema.
%%
%% Looks up the schema stored under {?MODULE, types} in persistent_term.
%% When nothing has been registered, the built-in schema from
%% dynamic_types() is returned instead. The default is only built when it
%% is actually needed, rather than on every call.
registered_types() ->
    case persistent_term:get({?MODULE, types}, undefined) of
        undefined ->
            dynamic_types();
        Types when is_map(Types) ->
            Types
    end.
template(TagOrCode, Vsn, Types) ->
{Tag, Template} = get_template(TagOrCode, Types),
@ -190,6 +145,9 @@ dyn_template_(F, Vsn) ->
true -> F
end.
%% Look up the cache code for label `Lbl' in the `labels' map of the schema.
%% Returns {ok, Code} if the label is cached, otherwise `error'
%% (the return convention of maps:find/2).
find_cached_label(Lbl, #{labels := Lbls}) ->
maps:find(Lbl, Lbls).
decode_(Fields, Vsn, Types, Acc) ->
{_Tag, Term, Rest} = decode_field_(Fields, Vsn, Types),
Acc1 = [Term | Acc],
@ -228,7 +186,7 @@ encode_typed_(Code, Term, Vsn, #{codes := Codes} = Types) when is_map_key(Code,
{_Tag, Template} = template(Code, Vsn, Types),
[encode_basic(int, Code), encode_from_template(Template, Term, Vsn, Types)];
encode_typed_(Tag, Term, Vsn, #{templates := Ts} = Types) when is_map_key(Tag, Ts) ->
Template = maps:get(Tag, Ts),
Template = dyn_template_(maps:get(Tag, Ts), Vsn),
[emit_code(Tag, Types), encode_from_template(Template, Term, Vsn, Types)];
encode_typed_(MaybeTemplate, Term, Vsn, Types) ->
encode_maybe_template(MaybeTemplate, Term, Vsn, Types).
@ -286,6 +244,9 @@ decode_from_template(Type, V, Vsn, Types) when is_tuple(Type), is_list(V) ->
Zipped = lists:zip(tuple_to_list(Type), V),
Items = [decode_from_template(T1, V1, Vsn, Types) || {T1, V1} <- Zipped],
list_to_tuple(Items);
decode_from_template(label, [C], _, #{rev_labels := RLbls}) ->
Code = decode_basic(int, C),
maps:get(Code, RLbls);
decode_from_template(Type, Fld, _, _) when Type == int
; Type == binary
; Type == bool
@ -310,6 +271,13 @@ encode_from_template([Type], List, _, Vsn, Types) ->
[encode_from_template(Type, V, false, Vsn, Types) || V <- List];
encode_from_template(Type, List, _, Vsn, Types) when is_list(Type), is_list(List) ->
encode_fields(Type, List, Vsn, Types);
encode_from_template(label, V, _, _, Types) ->
case find_cached_label(V, Types) of
error ->
encode_basic(label, V);
{ok, Code} when is_integer(Code) ->
[encode_basic(int, Code)]
end;
encode_from_template(Type, V, _, _, _Types) when Type == id
; Type == binary
; Type == bool
@ -365,6 +333,152 @@ rlp_decode(Bin) ->
%% Thin wrapper: hand the field structure to gmser_rlp:encode/1.
rlp_encode(Fields) ->
gmser_rlp:encode(Fields).
%% ===========================================================================
%% Type registration and validation code
%% Install a user-supplied type schema on top of the built-in one.
%%
%% `Types' may contain `codes' (Code => Tag), `templates' (Tag => Template)
%% and `labels' (Label => Code); each defaults to an empty map. The reverse
%% code map is derived from the supplied codes, and user entries take
%% precedence over the built-in ones. The merged schema is validated, gets
%% its `rev_labels' map from assert_label_cache/1, and is then stored via
%% put_types/1.
register_types(Types) when is_map(Types) ->
    UserCodes = maps:get(codes, Types, #{}),
    UserRev = rev_codes(UserCodes),
    UserTemplates = maps:get(templates, Types, #{}),
    UserLabels = maps:get(labels, Types, #{}),
    #{ codes := DefCodes
     , rev := DefRev
     , labels := DefLabels
     , templates := DefTemplates } = dynamic_types(),
    Schema = #{ codes => maps:merge(DefCodes, UserCodes)
              , rev => maps:merge(DefRev, UserRev)
              , templates => maps:merge(DefTemplates, UserTemplates)
              , labels => maps:merge(DefLabels, UserLabels) },
    assert_sizes(Schema),
    assert_mappings(Schema),
    put_types(assert_label_cache(Schema)).
%% Store `Types' as the active schema in persistent_term,
%% under the key {?MODULE, types}.
put_types(Types) ->
persistent_term:put({?MODULE, types}, Types).
%% Build a schema map from a list of elements, starting from the
%% built-in schema returned by dynamic_types(). Each element is folded in
%% by elem_to_type/2, which accepts {Tag, Code, Template} and
%% {labels, [{Label, Code}]}. The result is returned, not stored.
types_from_list(L) ->
lists:foldl(fun elem_to_type/2, dynamic_types(), L).
%% Add a single type to the currently registered schema.
%%
%% `Code' must be a non-negative integer. Raises `code_exists' if the code
%% is already mapped (checked first), or `tag_exists' if the tag is already
%% mapped. On success the updated schema is stored and returned.
register_type(Code, Tag, Template) when is_integer(Code), Code >= 0 ->
    Types = registered_types(),
    #{codes := Codes, rev := Rev, templates := Temps} = Types,
    CodeTaken = is_map_key(Code, Codes),
    TagTaken = is_map_key(Tag, Rev),
    if
        CodeTaken ->
            error(code_exists);
        TagTaken ->
            error(tag_exists);
        true ->
            Updated = Types#{ codes := Codes#{Code => Tag}
                            , rev := Rev#{Tag => Code}
                            , templates := Temps#{Tag => Template} },
            put_types(Updated),
            Updated
    end.
%% Add a label (atom) to the label cache of the registered schema.
%%
%% `Code' must be a non-negative integer. Raises `label_exists' if the
%% label is already cached (checked first), or `code_exists' if the code is
%% already in use. On success the updated schema is stored and returned.
cache_label(Code, Label) when is_integer(Code), Code >= 0, is_atom(Label) ->
    Types = registered_types(),
    #{labels := Lbls, rev_labels := RevLbls} = Types,
    LabelTaken = is_map_key(Label, Lbls),
    CodeTaken = is_map_key(Code, RevLbls),
    if
        LabelTaken ->
            error(label_exists);
        CodeTaken ->
            error(code_exists);
        true ->
            Updated = Types#{ labels := Lbls#{Label => Code}
                            , rev_labels := RevLbls#{Code => Label} },
            put_types(Updated),
            Updated
    end.
%% Fold one element of a type list into the schema accumulator.
%%
%% {Tag, Code, Template} adds a type; raises {duplicate_tag, Tag} (checked
%% first) or {duplicate_code, Code} if either is already present.
%% {labels, Lbls} adds each label entry via add_label/2.
%% Anything else raises {invalid_type_list_element, Elem}.
elem_to_type({Tag, Code, Template}, Acc) when is_atom(Tag), is_integer(Code) ->
    #{codes := Codes, rev := Rev, templates := Temps} = Acc,
    TagTaken = is_map_key(Tag, Rev),
    CodeTaken = is_map_key(Code, Codes),
    if
        TagTaken ->
            error({duplicate_tag, Tag});
        CodeTaken ->
            error({duplicate_code, Code});
        true ->
            %% `templates' is known to exist (matched above), so `:=' is
            %% equivalent to `=>' here and matches the other two updates.
            Acc#{ codes := Codes#{Code => Tag}
                , rev := Rev#{Tag => Code}
                , templates := Temps#{Tag => Template} }
    end;
elem_to_type({labels, Lbls}, Acc) ->
    lists:foldl(fun add_label/2, Acc, Lbls);
elem_to_type(Elem, _) ->
    error({invalid_type_list_element, Elem}).
%% Add one {Label, Code} entry to the label cache of a schema accumulator.
%%
%% `Label' must be an atom and `Code' a non-negative integer. Code 0 is
%% accepted, in line with cache_label/2, which guards on `Code >= 0'.
%% Raises {duplicate_label, Label} (checked first) or
%% {duplicate_label_code, Code} if either is already cached, and
%% {invalid_label_elem, Elem} for any other input.
add_label({L, Code}, #{labels := Lbls, rev_labels := RevLbls} = Acc)
  when is_atom(L), is_integer(Code), Code >= 0 ->
    case {is_map_key(L, Lbls), is_map_key(Code, RevLbls)} of
        {false, false} ->
            Acc#{ labels := Lbls#{L => Code}
                , rev_labels := RevLbls#{Code => L} };
        {true, _} ->
            error({duplicate_label, L});
        {_, true} ->
            error({duplicate_label_code, Code})
    end;
add_label(Elem, _) ->
    error({invalid_label_elem, Elem}).
%% Discard any registered types and cached labels by storing the schema
%% returned by dynamic_types() under {?MODULE, types}.
revert_to_default_types() ->
persistent_term:put({?MODULE, types}, dynamic_types()).
%% Sanity-check the relative sizes of the `codes', `rev' and `templates'
%% maps of a schema.
%%
%% Returns `ok' when the sizes are consistent. Raises:
%%   {duplicate_mappings, Tags, Types} - several codes map to the same tag
%%   {missing_mappings, Tags, Types}   - a coded tag has no template
%% Having more templates than codes is only logged as a warning.
assert_sizes(#{codes := Codes, rev := Rev, templates := Ts} = Types) ->
    assert_sizes(map_size(Codes), map_size(Rev), map_size(Ts), Types).

assert_sizes(Sz, Sz, Sz, _) ->
    ok;
assert_sizes(Sz, RSz, Sz, Types) when RSz =/= Sz ->
    %% Wrong size reverse mapping must mean duplicate mappings
    %% We auto-generate the reverse-mappings, so we know there aren't
    %% too many of them
    raise_duplicate_mappings(Types);
assert_sizes(Sz, _, TSz, Types) when Sz > TSz ->
    ?LOG_ERROR("More codes than templates", []),
    Tags = maps:keys(maps:get(rev, Types)),
    Templates = maps:get(templates, Types),
    Missing = [T || T <- Tags,
                    not is_map_key(T, Templates)],
    error({missing_mappings, Missing, Types});
assert_sizes(Sz, RSz, TSz, Types) when TSz > Sz, RSz < Sz ->
    %% Extra templates must not mask duplicate tags: fewer reverse entries
    %% than codes means at least two codes share a tag.
    raise_duplicate_mappings(Types);
assert_sizes(Sz, _, TSz, Types) when TSz > Sz ->
    %% More mappings than codes. May not be horrible.
    %% We check that all codes have mappings elsewhere.
    ?LOG_WARNING("More templates than codes in ~p", [Types]),
    ok.

%% Log and raise {duplicate_mappings, Duplicates, Types}, where Duplicates
%% lists the tags that more than one code maps to.
raise_duplicate_mappings(Types) ->
    ?LOG_ERROR("Reverse mapping size doesn't match codes size", []),
    CodeVals = maps:values(maps:get(codes, Types)),
    %% Removing one occurrence of each distinct tag leaves the repeats.
    Duplicates = CodeVals -- lists:usort(CodeVals),
    error({duplicate_mappings, Duplicates, Types}).
%% Ensure that every tag known to the reverse map also has a template.
%% Returns `ok'; otherwise logs the offending tags and raises
%% {missing_templates, Missing, Types}.
assert_mappings(#{rev := Rev, templates := Ts} = Types) ->
    LacksTemplate = fun(Tag) -> not is_map_key(Tag, Ts) end,
    case lists:filter(LacksTemplate, maps:keys(Rev)) of
        [] ->
            ok;
        Missing ->
            ?LOG_ERROR("Missing templates for ~p", [Missing]),
            error({missing_templates, Missing, Types})
    end.
%% Validate the label cache and attach its reverse map.
%%
%% Checks, in order, that all labels are atoms, that all codes are integers
%% and that no code is used twice. Raises `non_atoms_in_label_cache',
%% `non_integer_label_cache_codes' or `non_unique_label_cache_codes'
%% respectively. On success returns `Types' with `rev_labels'
%% (Code => Label) set.
assert_label_cache(#{labels := Labels} = Types) ->
    case lists:all(fun erlang:is_atom/1, maps:keys(Labels)) of
        true -> ok;
        false -> error(non_atoms_in_label_cache)
    end,
    Swapped = lists:map(fun({Lbl, Code}) -> {Code, Lbl} end, maps:to_list(Labels)),
    case lists:all(fun({Code, _}) -> is_integer(Code) end, Swapped) of
        true -> ok;
        false -> error(non_integer_label_cache_codes)
    end,
    RevLabels = maps:from_list(Swapped),
    %% A shared code collapses into one reverse entry, shrinking the map.
    if
        map_size(RevLabels) == map_size(Labels) ->
            Types#{rev_labels => RevLabels};
        true ->
            error(non_unique_label_cache_codes)
    end.
%% Invert a Code => Tag map into a Tag => Code map.
%% If several codes share a tag, only one of them survives in the result.
rev_codes(Codes) ->
    Swap = fun({Code, Tag}) -> {Tag, Code} end,
    maps:from_list(lists:map(Swap, maps:to_list(Codes))).
%% ===========================================================================
%% Unit tests
-ifdef(TEST).
@ -402,6 +516,10 @@ user_types_test_() ->
end,
[ ?_test(t_reg_typed_tuple())
, ?_test(t_reg_chain_objects_array())
, ?_test(t_reg_template_fun())
, ?_test(t_reg_template_vsnd_fun())
, ?_test(t_reg_label_cache())
, ?_test(t_reg_label_cache2())
]}.
t_round_trip(T) ->
@ -440,4 +558,56 @@ t_reg_chain_objects_array() ->
Enc = encode_typed(coa, Values),
Values = decode(Enc).
%% A template given as a zero-arity fun can be registered and then used
%% for a typed encode / decode round trip.
t_reg_template_fun() ->
    Pair = {3,4},
    Registered = register_type(1010, tup2f0, fun() -> {int,int} end),
    ?debugFmt("New = ~p", [Registered]),
    Pair = decode(encode_typed(tup2f0, Pair)),
    ok.
%% A template given as a one-arity fun (here only defined for argument 1)
%% can be registered and then used for a typed encode / decode round trip.
t_reg_template_vsnd_fun() ->
    Pair = {3,4},
    Registered = register_type(1011, tup2f1, fun(1) -> {int,int} end),
    ?debugFmt("New = ~p", [Registered]),
    Pair = decode(encode_typed(tup2f1, Pair)),
    ok.
%% Compare encodings of the label '1' with and without a label cache.
%% Statement order matters: each register_types/1 call replaces the
%% stored schema that the following encode/decode calls use.
t_reg_label_cache() ->
%% Baseline: no label cache registered yet.
Enc0 = gmser_dyn:encode('1'),
?debugFmt("Enc0 (no cache): ~w", [Enc0]),
MyTypes1 = #{codes => #{1003 => lbl_tup2}, templates => #{ lbl_tup2 => {label,label} }},
register_types(MyTypes1),
Enc0a = gmser_dyn:encode_typed(lbl_tup2, {'1','1'}),
?debugFmt("Enc0a (no cache): ~w", [Enc0a]),
{'1','1'} = gmser_dyn:decode(Enc0a),
%% Same schema, now with '1' cached under code 49.
MyTypes2 = MyTypes1#{labels => #{'1' => 49}}, % atom_to_list('1') == [49]
register_types(MyTypes2),
Enc1 = gmser_dyn:encode('1'),
Enc1a = gmser_dyn:encode_typed(lbl_tup2, {'1','1'}),
?debugFmt("Enc1 (w/ cache): ~w", [Enc1]),
?debugFmt("Enc1a (w/ cache): ~w", [Enc1a]),
{'1','1'} = gmser_dyn:decode(Enc1a),
%% Caching must change the encoding ...
true = Enc0 =/= Enc1,
%% ... and explicitly typed encoding must agree with dynamic encoding.
Enc2 = gmser_dyn:encode_typed(label, '1'),
?debugFmt("Enc2 (typed): ~w", [Enc2]),
true = Enc2 == Enc1,
true = Enc0a =/= Enc1a.
%% Build a schema with types_from_list/1 (one tuple type plus one cached
%% label), register it, and pin the exact encoding of a cached-label tuple.
t_reg_label_cache2() ->
TFromL = gmser_dyn:types_from_list(
[ {lbl_tup2, 1003, {label, label}}
, {labels,
[{'1', 49}]}
]),
?debugFmt("TFromL = ~w", [TFromL]),
register_types(TFromL),
Tup = {'1', '1'},
Enc = gmser_dyn:encode_typed(lbl_tup2, Tup),
%% <<3,235>> is the type code 1003 (3*256 + 235); each label appears as
%% the single-element list [<<49>>], i.e. its cached code.
[<<0>>,<<1>>,[<<3,235>>,[[<<49>>],[<<49>>]]]] = Enc,
Tup = gmser_dyn:decode(Enc).
-endif.