From 4663a0f57ee9c0608ffd639ff38681587a382712 Mon Sep 17 00:00:00 2001 From: Ulf Wiger Date: Sun, 30 Mar 2025 23:00:10 +0200 Subject: [PATCH 1/3] gmser_dyn.erl fairly complete --- src/gmser_dyn.erl | 443 ++++++++++++++++++++++++++++++++++++++++ src/gmserialization.erl | 4 + 2 files changed, 447 insertions(+) create mode 100644 src/gmser_dyn.erl diff --git a/src/gmser_dyn.erl b/src/gmser_dyn.erl new file mode 100644 index 0000000..d241562 --- /dev/null +++ b/src/gmser_dyn.erl @@ -0,0 +1,443 @@ +-module(gmser_dyn). + +-export([ encode/1 + , encode/2 + , encode_typed/2 + , encode_typed/3 + , decode/1 + , decode/2 ]). + +-export([ serialize/1 + , serialize/2 + , serialize_typed/2 + , serialize_typed/3 + , deserialize/1 + , deserialize/2 ]). + +-export([ register_types/1 + , registered_types/0 + , revert_to_default_types/0 + , dynamic_types/0 ]). + +-import(gmserialization, [ decode_field/2 ]). + +-define(VSN, 1). + +-include_lib("kernel/include/logger.hrl"). + +-ifdef(TEST). +-compile([export_all, nowarn_export_all]). +-include_lib("eunit/include/eunit.hrl"). +-endif. + +serialize(Term) -> rlp_encode(encode(Term)). +serialize(Term, Types) -> rlp_encode(encode(Term, Types)). +serialize_typed(Type, Term) -> rlp_encode(encode_typed(Type, Term)). +serialize_typed(Type, Term, Types) -> rlp_encode(encode_typed(Type, Term, Types)). + +deserialize(Binary) -> decode(rlp_decode(Binary)). +deserialize(Binary, Types) -> decode(rlp_decode(Binary), Types). + + +encode(Term) -> + encode(Term, registered_types()). + +encode(Term, Types) -> + encode(Term, vsn(Types), Types). + +encode(Term, Vsn, Types) -> + [ encode_basic(int, 0) + , encode_basic(int, Vsn) + , encode_(Term, Vsn, Types) ]. + +encode_typed(Type, Term) -> + encode_typed(Type, Term, registered_types()). + +encode_typed(Type, Term, Types) -> + encode_typed(Type, Term, vsn(Types), Types). + +encode_typed(Type, Term, Vsn, Types) -> + [ encode_basic(int, 0) + , encode_basic(int, Vsn) + , encode_typed_(Type, Term, Vsn, Types) ]. + +decode(Fields) -> + decode(Fields, registered_types()). + +decode(Fields0, Types) -> + case decode_tag_and_vsn(Fields0) of + {0, Vsn, Fields} -> + [Val] = decode_(Fields, Vsn, Types, []), + Val; + Other -> + error({illegal_serialization, Other}) + end. + +decode_tag_and_vsn([TagBin, VsnBin | Fields]) -> + {decode_basic(int, TagBin), + decode_basic(int, VsnBin), + Fields}. + +dynamic_types() -> + #{ vsn => ?VSN + , codes => + #{ 248 => int + , 249 => binary + , 250 => bool + , 251 => list + , 252 => map + , 253 => tuple + , 254 => id + , 255 => label } + , rev => + #{ int => 248 + , binary => 249 + , bool => 250 + , list => 251 + , map => 252 + , tuple => 253 + , id => 254 + , label => 255} + , templates => + #{ int => int + , binary => binary + , bool => bool + , list => list + , map => map + , tuple => tuple + , id => id + , label => label + } + }. + +vsn(Types) -> + maps:get(vsn, Types, ?VSN). + +register_types(Types) when is_map(Types) -> + Codes = maps:get(codes, Types, #{}), + Rev = rev_codes(Codes), + Templates = maps:get(templates, Types, #{}), + #{codes := Codes0, rev := Rev0, templates := Templates0} = + dynamic_types(), + Merged = #{ codes => maps:merge(Codes0, Codes) + , rev => maps:merge(Rev0, Rev) + , templates => maps:merge(Templates0, Templates) }, + assert_sizes(Merged), + assert_mappings(Merged), + persistent_term:put({?MODULE, types}, Merged). + +revert_to_default_types() -> + persistent_term:put({?MODULE, types}, dynamic_types()). + +assert_sizes(#{codes := Codes, rev := Rev, templates := Ts} = Types) -> + assert_sizes(map_size(Codes), map_size(Rev), map_size(Ts), Types). + +assert_sizes(Sz, Sz, Sz, _) -> + ok; +assert_sizes(Sz, RSz, Sz, Types) when RSz =/= Sz -> + %% Wrong size reverse mapping must mean duplicate mappings + %% We auto-generate the reverse-mappings, so we know there aren't + %% too many of them + ?LOG_ERROR("Reverse mapping size doesn't match codes size", []), + Codes = maps:get(codes, Types), + CodeVals = maps:values(Codes), + Duplicates = CodeVals -- lists:usort(CodeVals), + error({duplicate_mappings, Duplicates, Types}); +assert_sizes(Sz, _, TSz, Types) when Sz > TSz -> + ?LOG_ERROR("More codes than templates", []), + Tags = maps:keys(maps:get(rev, Types)), + Templates = maps:get(templates, Types), + Missing = [T || T <- Tags, + not is_map_key(T, Templates)], + error({missing_mappings, Missing, Types}); +assert_sizes(Sz, _, TSz, Types) when TSz > Sz -> + %% More mappings than codes. May not be horrible. + %% We check that all codes have mappings elsewhere. + ?LOG_WARNING("More templates than codes in ~p", [Types]), + ok. + +assert_mappings(#{rev := Rev, templates := Ts} = Types) -> + Tags = maps:keys(Rev), + case [T || T <- Tags, + not is_map_key(T, Ts)] of + [] -> + ok; + Missing -> + ?LOG_ERROR("Missing templates for ~p", [Missing]), + error({missing_templates, Missing, Types}) + end. + +rev_codes(Codes) -> + L = maps:to_list(Codes), + maps:from_list([{V, K} || {K, V} <- L]). + +registered_types() -> + persistent_term:get({?MODULE, types}, dynamic_types()). + +template(TagOrCode, Vsn, Types) -> + {Tag, Template} = get_template(TagOrCode, Types), + {Tag, dyn_template_(Template, Vsn)}. + +get_template(Code, #{codes := Codes, templates := Ts}) when is_integer(Code) -> + Tag = maps:get(Code, Codes), + {Tag, maps:get(Tag, Ts)}; +get_template(Tag, #{templates := Ts}) when is_atom(Tag) -> + {Tag, maps:get(Tag, Ts)}. + +dyn_template_(F, Vsn) -> + if is_function(F, 0) -> F(); + is_function(F, 1) -> F(Vsn); + true -> F + end. + +decode_(Fields, Vsn, Types, Acc) -> + {_Tag, Term, Rest} = decode_field_(Fields, Vsn, Types), + Acc1 = [Term | Acc], + case Rest of + [] -> + lists:reverse(Acc1); + _ -> + decode_(Rest, Vsn, Types, Acc1) + end. + +decode_field_([H|T], Vsn, Types) -> + {CodeBin, Field, Rest} = + case H of + [C, F] -> {C, F, T}; + C when is_binary(C) -> {C, hd(T), tl(T)} + end, + Code = decode_basic(int, CodeBin), + {Tag, Template} = template(Code, Vsn, Types), + %% [Fld|Rest] = Fields, + Val = decode_from_template(Template, Field, Vsn, Types), + {Tag, Val, Rest}. + +encode_(Term, Vsn, Types) -> + encode_(Term, true, Vsn, Types). + +encode_(Term, Emit, Vsn, Types) -> + {Tag, Template} = auto_template(Term), + Enc = encode_from_template(Template, Term, Vsn, Types), + if Emit -> + [emit_code(Tag, Types), Enc]; + true -> + Enc + end. + +encode_typed_(Code, Term, Vsn, #{codes := Codes} = Types) when is_map_key(Code, Codes) -> + {_Tag, Template} = template(Code, Vsn, Types), + [encode_basic(int, Code), encode_from_template(Template, Term, Vsn, Types)]; +encode_typed_(Tag, Term, Vsn, #{templates := Ts} = Types) when is_map_key(Tag, Ts) -> + Template = maps:get(Tag, Ts), + [emit_code(Tag, Types), encode_from_template(Template, Term, Vsn, Types)]; +encode_typed_(MaybeTemplate, Term, Vsn, Types) -> + encode_maybe_template(MaybeTemplate, Term, Vsn, Types). + +encode_maybe_template(Pat, Term, Vsn, Types) when is_list(Pat); + is_tuple(Pat); + is_map(Pat) -> + {Tag, _} = auto_template(Pat), + [emit_code(Tag, Types), + encode_from_template(Pat, Term, Vsn, Types)]; +encode_maybe_template(Other, Term, _Vsn, _Types) -> + error({illegal_template, Other, Term}). + +auto_template({id,Tag,V}) when Tag == account + ; Tag == name + ; Tag == commitment + ; Tag == contract + ; Tag == channel + ; Tag == associate_chain + ; Tag == entry -> + if is_binary(V) -> {id, id}; + true -> + %% close, but no cigar + {tuple, tuple} + end; +auto_template(T) -> + if is_map(T) -> {map, map}; + is_list(T) -> {list, list}; + is_tuple(T) -> {tuple, tuple}; + is_binary(T) -> {binary, binary}; + is_boolean(T) -> {bool, bool}; + is_atom(T) -> {label, label}; % binary_to_existing_atom() + is_integer(T), + T >= 0 -> {int, int}; + true -> + error(invalid_type) + end. + +decode_from_template(list, Fld, Vsn, Types) -> + decode_(Fld, Vsn, Types, []); +decode_from_template(map, Fld, Vsn, Types) -> + TupleFields = [F || F <- Fld], + Items = [decode_from_template(tuple, T, Vsn, Types) + || T <- TupleFields], + maps:from_list(Items); +decode_from_template(tuple, Fld, Vsn, Types) -> + Items = decode_(Fld, Vsn, Types, []), + list_to_tuple(Items); +decode_from_template([Type], Fields, Vsn, Types) -> + [decode_from_template(Type, F, Vsn, Types) + || F <- Fields]; +decode_from_template(Type, V, Vsn, Types) when is_list(Type), is_list(V) -> + decode_fields(Type, V, Vsn, Types); +decode_from_template(Type, V, Vsn, Types) when is_tuple(Type), is_list(V) -> + Zipped = lists:zip(tuple_to_list(Type), V), + Items = [decode_from_template(T1, V1, Vsn, Types) || {T1, V1} <- Zipped], + list_to_tuple(Items); +decode_from_template(Type, Fld, _, _) when Type == int + ; Type == binary + ; Type == bool + ; Type == id + ; Type == label -> + decode_basic(Type, Fld). + +encode_from_template(Type, V, Vsn, Types) -> + encode_from_template(Type, V, true, Vsn, Types). + +encode_from_template(list, L, _, Vsn, Types) when is_list(L) -> + [encode_(V, Vsn, Types) || V <- L]; +encode_from_template(map, M, _, Vsn, Types) when is_map(M) -> + [encode_({K,V}, false, Vsn, Types) + || {K, V} <- lists:sort(maps:to_list(M))]; +encode_from_template(tuple, T, _, Vsn, Types) when is_tuple(T) -> + [encode_(V, Vsn, Types) || V <- tuple_to_list(T)]; +encode_from_template(T, V, _, Vsn, Types) when tuple_size(T) =:= tuple_size(V) -> + Zipped = lists:zip(tuple_to_list(T), tuple_to_list(V)), + [encode_from_template(T1, V1, false, Vsn, Types) || {T1, V1} <- Zipped]; +encode_from_template([Type], List, _, Vsn, Types) -> + [encode_from_template(Type, V, false, Vsn, Types) || V <- List]; +encode_from_template(Type, List, _, Vsn, Types) when is_list(Type), is_list(List) -> + encode_fields(Type, List, Vsn, Types); +encode_from_template(Type, V, _, _, _Types) when Type == id + ; Type == binary + ; Type == bool + ; Type == int + ; Type == label -> + encode_basic(Type, V); +encode_from_template(Type, V, _, _, _) -> + error({illegal, Type, V}). + + +%% Basically, dynamically encoding a statically defined object +encode_fields([{Field, Type}|TypesLeft], + [{Field, Val}|FieldsLeft], Vsn, Types) -> + [ encode_from_template(Type, Val, Vsn, Types) + | encode_fields(TypesLeft, FieldsLeft, Vsn, Types)]; +encode_fields([{_Field, Type}|TypesLeft], + [Val |FieldsLeft], Vsn, Types) -> + %% Not sure if we want to try this ... + [ encode_from_template(Type, Val, Vsn, Types) + | encode_fields(TypesLeft, FieldsLeft, Vsn, Types)]; +encode_fields([Type|TypesLeft], + [Val |FieldsLeft], Vsn, Types) when is_atom(Type) -> + %% Not sure about this either ... + [ encode_from_template(Type, Val, Vsn, Types) + | encode_fields(TypesLeft, FieldsLeft, Vsn, Types)]; +encode_fields([], [], _, _) -> + []. + +decode_fields([{Tag, Type}|TypesLeft], + [Field |FieldsLeft], Vsn, Types) -> + + [ {Tag, decode_from_template(Type, Field, Vsn, Types)} + | decode_fields(TypesLeft, FieldsLeft, Vsn, Types)]; +decode_fields([], [], _, _) -> + []. + +emit_code(Tag, #{rev := Tags}) -> + encode_basic(int, maps:get(Tag, Tags)). + +decode_basic(label, Fld) -> + binary_to_existing_atom(decode_basic(binary, Fld), utf8); +decode_basic(Type, Fld) -> + gmserialization:decode_field(Type, Fld). + +encode_basic(label, A) when is_atom(A) -> + encode_basic(binary, atom_to_binary(A, utf8)); +encode_basic(Type, Fld) -> + gmserialization:encode_field(Type, Fld). + +rlp_decode(Bin) -> + gmser_rlp:decode(Bin). + +rlp_encode(Fields) -> + gmser_rlp:encode(Fields). + + +-ifdef(TEST). + +trace() -> + dbg:tracer(), + dbg:tpl(?MODULE, x), + dbg:p(all, [c]). + +notrace() -> + dbg:ctpl('_'), + dbg:stop(). + +round_trip_test_() -> + [?_test(t_round_trip(T)) || + T <- t_sample_types() + ]. + +t_sample_types() -> + [ 5 + , <<"a">> + , [1,2,3] + , {<<"a">>,1} + , #{<<"a">> => 1} + , [#{1 => <<"c">>, [17] => true}] + , true + ]. + +user_types_test_() -> + {foreach, + fun() -> + revert_to_default_types() + end, + fun(_) -> + revert_to_default_types() + end, + [ ?_test(t_reg_typed_tuple()) + , ?_test(t_reg_chain_objects_array()) + ]}. + +t_round_trip(T) -> + ?debugVal(T), + ?assertMatch({T, T}, {T, decode(encode(T))}). + +t_reg_typed_tuple() -> + Type = {int, int, int}, + MyTypes = #{ codes => #{ 1001 => int_tup3 } + , templates => #{ int_tup3 => Type } + }, + register_types(MyTypes), + GoodTerm = {2,3,4}, + ?debugFmt("Type: ~p, GoodTerm = ~p", [Type, GoodTerm]), + Enc = encode_typed(int_tup3, GoodTerm), + GoodTerm = decode(Enc), + t_bad_typed_encode(int_tup3, {1,2,<<"a">>}, {illegal,int,<<"a">>}), + t_bad_typed_encode(int_tup3, {1,2,3,4}, {illegal, {int,int,int}, {1,2,3,4}}). + +t_bad_typed_encode(Type, Term, Error) -> + try encode_typed(Type, Term), + error({expected_error, Error}) + catch + error:Error -> + ok + end. + +t_reg_chain_objects_array() -> + Template = [{foo, {int, binary}}, {bar, [{int, int}]}, {baz, {int}}], + ?debugFmt("Template = ~p", [Template]), + MyTypes = #{ codes => #{ 1002 => coa } + , templates => #{ coa => Template } }, + register_types(MyTypes), + Values = [{foo, {1, <<"foo">>}}, {bar, [{1, 2}, {3, 4}, {5, 6}]}, {baz, {1}}], + ?debugFmt("Values = ~p", [Values]), + Enc = encode_typed(coa, Values), + Values = decode(Enc). + +-endif. diff --git a/src/gmserialization.erl b/src/gmserialization.erl index 2c12e6b..327823f 100644 --- a/src/gmserialization.erl +++ b/src/gmserialization.erl @@ -10,9 +10,11 @@ -vsn("0.1.2"). -export([ decode_fields/2 + , decode_field/2 , deserialize/5 , deserialize_tag_and_vsn/1 , encode_fields/2 + , encode_field/2 , serialize/4 ]). %%%=================================================================== @@ -23,6 +25,8 @@ , fields/0 ]). +-export_type([ encodable_term/0 ]). + -type template() :: [{field_name(), type()}]. -type field_name() :: atom(). -type type() :: 'int' -- 2.30.2 From 3ede4f22e1b80b97a6b9441686e931d55232253d Mon Sep 17 00:00:00 2001 From: Ulf Wiger Date: Sat, 5 Apr 2025 13:20:30 +0200 Subject: [PATCH 2/3] Register individual types, more docs --- README.md | 88 ++++++++++++++ src/gmser_dyn.erl | 292 ++++++++++++++++++++++++++++++++++++---------- 2 files changed, 319 insertions(+), 61 deletions(-) diff --git a/README.md b/README.md index a31eca6..f5d0dff 100644 --- a/README.md +++ b/README.md @@ -13,3 +13,91 @@ Test ---- $ rebar3 eunit + +Dynamic encoding +---- + +The module `gmser_dyn` offers dynamic encoding support, encoding most 'regular' +Erlang data types into an internal RLP representation. + +Main API: +* `encode(term()) -> iolist()` +* `encode_typed(template(), term()) -> iolist()` +* `decode(iolist()) -> term()` + +* `serialize(term()) -> binary()` +* `serialize_typed(template(), term()) -> binary()` +* `deserialize(binary()) -> term()` + +The basic types supported by the encoder are: +* `non_neg_integer()` (`int` , code: 248) +* `binary()` (`binary`, code: 249) +* `boolean()` (`bool` , code: 250) +* `list()` (`list` , code: 251) +* `map()` (`map` , code: 252) +* `tuple()` (`tuple` , code: 253) +* `gmser_id:id()` (`id` , code: 254) +* `atom()` (`label` , code: 255) + +When encoding `map` types, the map elements are first sorted. + +When specifying a map type for template-driven encoding, use +the `#{items => [{Key, Value}]}` construct. + +Labels +---- + +Labels correspond to (existing) atoms in Erlang. +Decoding of a label results in a call to `binary_to_existing_atom/2`, so will +fail if the corresponding atom does not already exist. + +It's possible to cache labels for more compact encoding. +Note that when caching labels, the same cache mapping needs to be used on the +decoder side. + +Labels are encoded as `[<<255>>, << AtomToBinary/binary >>]`. +If a cached label is used, the encoding becomes `[<<255>, [Ix]]`, where +`Ix` is the integer-encoded index value of the cached label. + +Examples +---- + +Dynamically encoded objects have the basic structure `[<<0>>,V,Obj]`, where `V` is the +integer-coded version, and `Obj` is the top-level encoding on the form `[Tag,Data]`. + +```erlang +E = fun(T) -> io:fwrite("~w~n", [gmser_dyn:encode(T)]) end. + +E(17) -> [<<0>>,<<1>>,[<<248>>,<<17>>]] +E(<<"abc">>) -> [<<0>>,<<1>>,[<<249>>,<<97,98,99>>]] +E(true) -> [<<0>>,<<1>>,[<<250>>,<<1>>]] +E(false) -> [<<0>>,<<1>>,[<<250>>,<<0>>]] +E([1,2]) -> [<<0>>,<<1>>,[<<251>>,[[<<248>>,<<1>>],[<<248>>,<<2>>]]]] +E({1,2}) -> [<<0>>,<<1>>,[<<253>>,[[<<248>>,<<1>>],[<<248>>,<<2>>]]]] +E(#{a=>1, b=>2}) -> + [<<0>>,<<1>>,[<<252>>,[[[<<255>>,<<97>>],[<<248>>,<<1>>]],[[<<255>>,<<98>>],[<<248>>,<<2>>]]]]] +E(gmser_id:create(account,<<1:256>>)) -> + [<<0>>,<<1>>,[<<254>>,<<1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1>>]] +``` + +Note that tuples and list are encoded the same way, except for the initial type tag. +Maps are encoded as `[, [KV1, KV2, ...]]`, where `[KV1, KV2, ...]` is the sorted +list of key-value tuples from `map:to_list(Map)`, but with the `tuple` type tag omitted. + +Template-driven encoding +---- + +Templates can be provided to the encoder by either naming an already registered +type, or by passing a template directly. The template will then be enforced, and +used to slightly compress the encoding. + +In the following example, as the encoder knows that `{11,12}` is encoded as a +tuple of two integers, it can omit the inner type tags. + +```erlang +ET = fun(Type,Term) -> io:fwrite("~w~n", [gmser_dyn:encode_typed(Type,Term)]) end. + +ET({int,int}, {11,12}) ->[<<0>>,<<1>>,[<<253>>,[<<11>>,<<12>>]]] +ET({int,int}, {11,a}) -> +** exception error: {illegal,int,a} ... +``` diff --git a/src/gmser_dyn.erl b/src/gmser_dyn.erl index d241562..2a194ef 100644 --- a/src/gmser_dyn.erl +++ b/src/gmser_dyn.erl @@ -14,11 +14,17 @@ , deserialize/1 , deserialize/2 ]). +%% register a type schema, inspect existing schema -export([ register_types/1 , registered_types/0 + , types_from_list/1 , revert_to_default_types/0 , dynamic_types/0 ]). +%% Register individual types, or cache labels +-export([ register_type/3 + , cache_label/2 ]). + -import(gmserialization, [ decode_field/2 ]). -define(VSN, 1). @@ -88,7 +94,7 @@ dynamic_types() -> , 252 => map , 253 => tuple , 254 => id - , 255 => label } + , 255 => label} , rev => #{ int => 248 , binary => 249 @@ -98,6 +104,8 @@ dynamic_types() -> , tuple => 253 , id => 254 , label => 255} + , labels => #{} + , rev_labels => #{} , templates => #{ int => int , binary => binary @@ -113,66 +121,13 @@ dynamic_types() -> vsn(Types) -> maps:get(vsn, Types, ?VSN). -register_types(Types) when is_map(Types) -> - Codes = maps:get(codes, Types, #{}), - Rev = rev_codes(Codes), - Templates = maps:get(templates, Types, #{}), - #{codes := Codes0, rev := Rev0, templates := Templates0} = - dynamic_types(), - Merged = #{ codes => maps:merge(Codes0, Codes) - , rev => maps:merge(Rev0, Rev) - , templates => maps:merge(Templates0, Templates) }, - assert_sizes(Merged), - assert_mappings(Merged), - persistent_term:put({?MODULE, types}, Merged). - -revert_to_default_types() -> - persistent_term:put({?MODULE, types}, dynamic_types()). - -assert_sizes(#{codes := Codes, rev := Rev, templates := Ts} = Types) -> - assert_sizes(map_size(Codes), map_size(Rev), map_size(Ts), Types). - -assert_sizes(Sz, Sz, Sz, _) -> - ok; -assert_sizes(Sz, RSz, Sz, Types) when RSz =/= Sz -> - %% Wrong size reverse mapping must mean duplicate mappings - %% We auto-generate the reverse-mappings, so we know there aren't - %% too many of them - ?LOG_ERROR("Reverse mapping size doesn't match codes size", []), - Codes = maps:get(codes, Types), - CodeVals = maps:values(Codes), - Duplicates = CodeVals -- lists:usort(CodeVals), - error({duplicate_mappings, Duplicates, Types}); -assert_sizes(Sz, _, TSz, Types) when Sz > TSz -> - ?LOG_ERROR("More codes than templates", []), - Tags = maps:keys(maps:get(rev, Types)), - Templates = maps:get(templates, Types), - Missing = [T || T <- Tags, - not is_map_key(T, Templates)], - error({missing_mappings, Missing, Types}); -assert_sizes(Sz, _, TSz, Types) when TSz > Sz -> - %% More mappings than codes. May not be horrible. - %% We check that all codes have mappings elsewhere. - ?LOG_WARNING("More templates than codes in ~p", [Types]), - ok. - -assert_mappings(#{rev := Rev, templates := Ts} = Types) -> - Tags = maps:keys(Rev), - case [T || T <- Tags, - not is_map_key(T, Ts)] of - [] -> - ok; - Missing -> - ?LOG_ERROR("Missing templates for ~p", [Missing]), - error({missing_templates, Missing, Types}) - end. - -rev_codes(Codes) -> - L = maps:to_list(Codes), - maps:from_list([{V, K} || {K, V} <- L]). - registered_types() -> - persistent_term:get({?MODULE, types}, dynamic_types()). + case persistent_term:get({?MODULE, types}, undefined) of + undefined -> + dynamic_types(); + Types when is_map(Types) -> + Types + end. template(TagOrCode, Vsn, Types) -> {Tag, Template} = get_template(TagOrCode, Types), @@ -190,6 +145,9 @@ dyn_template_(F, Vsn) -> true -> F end. +find_cached_label(Lbl, #{labels := Lbls}) -> + maps:find(Lbl, Lbls). + decode_(Fields, Vsn, Types, Acc) -> {_Tag, Term, Rest} = decode_field_(Fields, Vsn, Types), Acc1 = [Term | Acc], @@ -228,7 +186,7 @@ encode_typed_(Code, Term, Vsn, #{codes := Codes} = Types) when is_map_key(Code, {_Tag, Template} = template(Code, Vsn, Types), [encode_basic(int, Code), encode_from_template(Template, Term, Vsn, Types)]; encode_typed_(Tag, Term, Vsn, #{templates := Ts} = Types) when is_map_key(Tag, Ts) -> - Template = maps:get(Tag, Ts), + Template = dyn_template_(maps:get(Tag, Ts), Vsn), [emit_code(Tag, Types), encode_from_template(Template, Term, Vsn, Types)]; encode_typed_(MaybeTemplate, Term, Vsn, Types) -> encode_maybe_template(MaybeTemplate, Term, Vsn, Types). @@ -286,6 +244,9 @@ decode_from_template(Type, V, Vsn, Types) when is_tuple(Type), is_list(V) -> Zipped = lists:zip(tuple_to_list(Type), V), Items = [decode_from_template(T1, V1, Vsn, Types) || {T1, V1} <- Zipped], list_to_tuple(Items); +decode_from_template(label, [C], _, #{rev_labels := RLbls}) -> + Code = decode_basic(int, C), + maps:get(Code, RLbls); decode_from_template(Type, Fld, _, _) when Type == int ; Type == binary ; Type == bool @@ -310,6 +271,13 @@ encode_from_template([Type], List, _, Vsn, Types) -> [encode_from_template(Type, V, false, Vsn, Types) || V <- List]; encode_from_template(Type, List, _, Vsn, Types) when is_list(Type), is_list(List) -> encode_fields(Type, List, Vsn, Types); +encode_from_template(label, V, _, _, Types) -> + case find_cached_label(V, Types) of + error -> + encode_basic(label, V); + {ok, Code} when is_integer(Code) -> + [encode_basic(int, Code)] + end; encode_from_template(Type, V, _, _, _Types) when Type == id ; Type == binary ; Type == bool @@ -365,6 +333,152 @@ rlp_decode(Bin) -> rlp_encode(Fields) -> gmser_rlp:encode(Fields). +%% =========================================================================== +%% Type registration and validation code + +register_types(Types) when is_map(Types) -> + Codes = maps:get(codes, Types, #{}), + Rev = rev_codes(Codes), + Templates = maps:get(templates, Types, #{}), + Labels = maps:get(labels, Types, #{}), + #{codes := Codes0, rev := Rev0, labels := Labels0, templates := Templates0} = + dynamic_types(), + Merged = #{ codes => maps:merge(Codes0, Codes) + , rev => maps:merge(Rev0, Rev) + , templates => maps:merge(Templates0, Templates) + , labels => maps:merge(Labels0, Labels) }, + assert_sizes(Merged), + assert_mappings(Merged), + Merged1 = assert_label_cache(Merged), + put_types(Merged1). + +put_types(Types) -> + persistent_term:put({?MODULE, types}, Types). + +types_from_list(L) -> + lists:foldl(fun elem_to_type/2, dynamic_types(), L). + +register_type(Code, Tag, Template) when is_integer(Code), Code >= 0 -> + #{codes := Codes, rev := Rev, templates := Temps} = Types = registered_types(), + case {is_map_key(Code, Codes), is_map_key(Tag, Rev)} of + {false, false} -> + New = Types#{ codes := Codes#{Code => Tag} + , rev := Rev#{Tag => Code} + , templates := Temps#{Tag => Template} }, + put_types(New), + New; + {true, _} -> error(code_exists); + {_, true} -> error(tag_exists) + end. + +cache_label(Code, Label) when is_integer(Code), Code >= 0, is_atom(Label) -> + #{labels := Lbls, rev_labels := RevLbls} = Types = registered_types(), + case {is_map_key(Label, Lbls), is_map_key(Code, RevLbls)} of + {false, false} -> + New = Types#{ labels := Lbls#{Label => Code} + , rev_labels := RevLbls#{Code => Label} }, + put_types(New), + New; + {true,_} -> error(label_exists); + {_,true} -> error(code_exists) + end. + +elem_to_type({Tag, Code, Template}, Acc) when is_atom(Tag), is_integer(Code) -> + #{codes := Codes, rev := Rev, templates := Temps} = Acc, + case {is_map_key(Tag, Rev), is_map_key(Code, Codes)} of + {false, false} -> + Acc#{ codes := Codes#{Code => Tag} + , rev := Rev#{Tag => Code} + , templates => Temps#{Tag => Template} + }; + {true, _} -> error({duplicate_tag, Tag}); + {_, true} -> error({duplicate_code, Code}) + end; +elem_to_type({labels, Lbls}, Acc) -> + lists:foldl(fun add_label/2, Acc, Lbls); +elem_to_type(Elem, _) -> + error({invalid_type_list_element, Elem}). + +add_label({L, Code}, #{labels := Lbls, rev_labels := RevLbls} = Acc) + when is_atom(L), is_integer(Code), Code > 0 -> + case {is_map_key(L, Lbls), is_map_key(Code, RevLbls)} of + {false, false} -> + Acc#{labels := Lbls#{L => Code}, + rev_labels := RevLbls#{Code => L}}; + {true, _} -> error({duplicate_label, L}); + {_, true} -> error({duplicate_label_code, Code}) + end; +add_label(Elem, _) -> + error({invalid_label_elem, Elem}). + + +revert_to_default_types() -> + persistent_term:put({?MODULE, types}, dynamic_types()). + +assert_sizes(#{codes := Codes, rev := Rev, templates := Ts} = Types) -> + assert_sizes(map_size(Codes), map_size(Rev), map_size(Ts), Types). + +assert_sizes(Sz, Sz, Sz, _) -> + ok; +assert_sizes(Sz, RSz, Sz, Types) when RSz =/= Sz -> + %% Wrong size reverse mapping must mean duplicate mappings + %% We auto-generate the reverse-mappings, so we know there aren't + %% too many of them + ?LOG_ERROR("Reverse mapping size doesn't match codes size", []), + Codes = maps:get(codes, Types), + CodeVals = maps:values(Codes), + Duplicates = CodeVals -- lists:usort(CodeVals), + error({duplicate_mappings, Duplicates, Types}); +assert_sizes(Sz, _, TSz, Types) when Sz > TSz -> + ?LOG_ERROR("More codes than templates", []), + Tags = maps:keys(maps:get(rev, Types)), + Templates = maps:get(templates, Types), + Missing = [T || T <- Tags, + not is_map_key(T, Templates)], + error({missing_mappings, Missing, Types}); +assert_sizes(Sz, _, TSz, Types) when TSz > Sz -> + %% More mappings than codes. May not be horrible. + %% We check that all codes have mappings elsewhere. + ?LOG_WARNING("More templates than codes in ~p", [Types]), + ok. + +assert_mappings(#{rev := Rev, templates := Ts} = Types) -> + Tags = maps:keys(Rev), + case [T || T <- Tags, + not is_map_key(T, Ts)] of + [] -> + ok; + Missing -> + ?LOG_ERROR("Missing templates for ~p", [Missing]), + error({missing_templates, Missing, Types}) + end. + +assert_label_cache(#{labels := Labels} = Types) -> + Ls = maps:keys(Labels), + case [L || L <- Ls, not is_atom(L)] of + [] -> ok; + _NonAtoms -> + error(non_atoms_in_label_cache) + end, + Rev = [{C,L} || {L,C} <- maps:to_list(Labels)], + case [C || {C,_} <- Rev, not is_integer(C)] of + [] -> ok; + _NonInts -> error(non_integer_label_cache_codes) + end, + RevLabels = maps:from_list(Rev), + case map_size(RevLabels) == map_size(Labels) of + true -> + Types#{rev_labels => RevLabels}; + false -> + error(non_unique_label_cache_codes) + end. + +rev_codes(Codes) -> + L = maps:to_list(Codes), + maps:from_list([{V, K} || {K, V} <- L]). + +%% =========================================================================== +%% Unit tests -ifdef(TEST). @@ -402,6 +516,10 @@ user_types_test_() -> end, [ ?_test(t_reg_typed_tuple()) , ?_test(t_reg_chain_objects_array()) + , ?_test(t_reg_template_fun()) + , ?_test(t_reg_template_vsnd_fun()) + , ?_test(t_reg_label_cache()) + , ?_test(t_reg_label_cache2()) ]}. t_round_trip(T) -> @@ -440,4 +558,56 @@ t_reg_chain_objects_array() -> Enc = encode_typed(coa, Values), Values = decode(Enc). +t_reg_template_fun() -> + Template = fun() -> {int,int} end, + New = register_type(1010, tup2f0, Template), + ?debugFmt("New = ~p", [New]), + E = encode_typed(tup2f0, {3,4}), + {3,4} = decode(E), + ok. + +t_reg_template_vsnd_fun() -> + Template = fun(1) -> {int,int} end, + New = register_type(1011, tup2f1, Template), + ?debugFmt("New = ~p", [New]), + E = encode_typed(tup2f1, {3,4}), + {3,4} = decode(E), + ok. + + +t_reg_label_cache() -> + Enc0 = gmser_dyn:encode('1'), + ?debugFmt("Enc0 (no cache): ~w", [Enc0]), + MyTypes1 = #{codes => #{1003 => lbl_tup2}, templates => #{ lbl_tup2 => {label,label} }}, + register_types(MyTypes1), + Enc0a = gmser_dyn:encode_typed(lbl_tup2, {'1','1'}), + ?debugFmt("Enc0a (no cache): ~w", [Enc0a]), + {'1','1'} = gmser_dyn:decode(Enc0a), + MyTypes2 = MyTypes1#{labels => #{'1' => 49}}, % atom_to_list('1') == [49] + register_types(MyTypes2), + Enc1 = gmser_dyn:encode('1'), + Enc1a = gmser_dyn:encode_typed(lbl_tup2, {'1','1'}), + ?debugFmt("Enc1 (w/ cache): ~w", [Enc1]), + ?debugFmt("Enc1a (w/ cache): ~w", [Enc1a]), + {'1','1'} = gmser_dyn:decode(Enc1a), + true = Enc0 =/= Enc1, + Enc2 = gmser_dyn:encode_typed(label, '1'), + ?debugFmt("Enc2 (typed): ~w", [Enc2]), + true = Enc2 == Enc1, + true = Enc0a =/= Enc1a. + +t_reg_label_cache2() -> + TFromL = gmser_dyn:types_from_list( + [ {lbl_tup2, 1003, {label, label}} + , {labels, + [{'1', 49}]} + ]), + ?debugFmt("TFromL = ~w", [TFromL]), + register_types(TFromL), + Tup = {'1', '1'}, + Enc = gmser_dyn:encode_typed(lbl_tup2, Tup), + [<<0>>,<<1>>,[<<3,235>>,[[<<49>>],[<<49>>]]]] = Enc, + Tup = gmser_dyn:decode(Enc). + + -endif. -- 2.30.2 From dd1c2455f06aef1fc96d2c942131c0dcc26e2ed8 Mon Sep 17 00:00:00 2001 From: Ulf Wiger Date: Sat, 5 Apr 2025 21:44:36 +0200 Subject: [PATCH 3/3] Fix type-driven encode, more docs --- README.md | 25 ++++++--- src/gmser_dyn.erl | 130 ++++++++++++++++++++++++++++++---------------- 2 files changed, 102 insertions(+), 53 deletions(-) diff --git a/README.md b/README.md index f5d0dff..94dc396 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,10 @@ Main API: * `serialize_typed(template(), term()) -> binary()` * `deserialize(binary()) -> term()` +In the examples below, we use the `decode` functions, to illustrate +how the type information is represented. The fully serialized form is +produced by the `serialize` functions. + The basic types supported by the encoder are: * `non_neg_integer()` (`int` , code: 248) * `binary()` (`binary`, code: 249) @@ -88,16 +92,23 @@ Template-driven encoding ---- Templates can be provided to the encoder by either naming an already registered -type, or by passing a template directly. The template will then be enforced, and -used to slightly compress the encoding. +type, or by passing a template directly. In both cases, the encoder will enforce +the type information in the template. -In the following example, as the encoder knows that `{11,12}` is encoded as a -tuple of two integers, it can omit the inner type tags. +If the template has been registered, the encoder omits inner type tags (still +inserting the top-level tag), leading to some compression of the output. +This also means that the serialized term cannot be decoded without the same +schema information on the decoder side. + +In the case of a directly provided template, all type information is inserted, +such that the serialized term can be decoded without any added type information. +The template types are still enforced during encoding. ```erlang ET = fun(Type,Term) -> io:fwrite("~w~n", [gmser_dyn:encode_typed(Type,Term)]) end. -ET({int,int}, {11,12}) ->[<<0>>,<<1>>,[<<253>>,[<<11>>,<<12>>]]] -ET({int,int}, {11,a}) -> -** exception error: {illegal,int,a} ... +ET([{int,int}], [{1,2}]) -> [<<0>>,<<1>>,[<<251>>,[[[<<248>>,<<1>>],[<<248>>,<<2>>]]]]] + +gmser_dyn:register_type(1000,lt2i,[{int,int}]). +ET(lt2i, [{1,2}]) -> [<<0>>,<<1>>,[<<3,232>>,[[<<1>>,<<2>>]]]] ``` diff --git a/src/gmser_dyn.erl b/src/gmser_dyn.erl index 2a194ef..52590f5 100644 --- a/src/gmser_dyn.erl +++ b/src/gmser_dyn.erl @@ -182,21 +182,33 @@ encode_(Term, Emit, Vsn, Types) -> Enc end. -encode_typed_(Code, Term, Vsn, #{codes := Codes} = Types) when is_map_key(Code, Codes) -> +encode_typed_(Type, Term, Vsn, Types) -> + encode_typed_(Type, Term, true, Vsn, Types). + +encode_typed_(any, Term, _, Vsn, Types) -> + encode_(Term, true, Vsn, Types); +encode_typed_(Code, Term, Emit, Vsn, #{codes := Codes} = Types) when is_map_key(Code, Codes) -> {_Tag, Template} = template(Code, Vsn, Types), - [encode_basic(int, Code), encode_from_template(Template, Term, Vsn, Types)]; -encode_typed_(Tag, Term, Vsn, #{templates := Ts} = Types) when is_map_key(Tag, Ts) -> + maybe_emit(Emit, Code, encode_from_template(Template, Term, false, Vsn, Types)); +encode_typed_(Tag, Term, Emit, Vsn, #{templates := Ts, rev := Rev} = Types) + when is_map_key(Tag, Ts) -> Template = dyn_template_(maps:get(Tag, Ts), Vsn), - [emit_code(Tag, Types), encode_from_template(Template, Term, Vsn, Types)]; -encode_typed_(MaybeTemplate, Term, Vsn, Types) -> + Code = maps:get(Tag, Rev), + maybe_emit(Emit, Code, encode_from_template(Template, Term, false, Vsn, Types)); +encode_typed_(MaybeTemplate, Term, _, Vsn, Types) -> encode_maybe_template(MaybeTemplate, Term, Vsn, Types). +maybe_emit(true, Code, Enc) -> + [encode_basic(int, Code), Enc]; +maybe_emit(false, _, Enc) -> + Enc. + encode_maybe_template(Pat, Term, Vsn, Types) when is_list(Pat); is_tuple(Pat); is_map(Pat) -> {Tag, _} = auto_template(Pat), [emit_code(Tag, Types), - encode_from_template(Pat, Term, Vsn, Types)]; + encode_from_template(Pat, Term, true, Vsn, Types)]; encode_maybe_template(Other, Term, _Vsn, _Types) -> error({illegal_template, Other, Term}). @@ -247,63 +259,74 @@ decode_from_template(Type, V, Vsn, Types) when is_tuple(Type), is_list(V) -> decode_from_template(label, [C], _, #{rev_labels := RLbls}) -> Code = decode_basic(int, C), maps:get(Code, RLbls); -decode_from_template(Type, Fld, _, _) when Type == int - ; Type == binary - ; Type == bool - ; Type == id - ; Type == label -> - decode_basic(Type, Fld). +decode_from_template(Type, Fld, _, Types) when Type == int + ; Type == binary + ; Type == bool + ; Type == id + ; Type == label -> + decode_basic(Type, Fld, Types). encode_from_template(Type, V, Vsn, Types) -> encode_from_template(Type, V, true, Vsn, Types). +encode_from_template(any, V, _, Vsn, Types) -> + encode_(V, true, Vsn, Types); encode_from_template(list, L, _, Vsn, Types) when is_list(L) -> + assert_type(is_list(L), list, L), [encode_(V, Vsn, Types) || V <- L]; -encode_from_template(map, M, _, Vsn, Types) when is_map(M) -> +encode_from_template(map, M, _, Vsn, Types) -> + assert_type(is_map(M), map, M), [encode_({K,V}, false, Vsn, Types) || {K, V} <- lists:sort(maps:to_list(M))]; -encode_from_template(tuple, T, _, Vsn, Types) when is_tuple(T) -> - [encode_(V, Vsn, Types) || V <- tuple_to_list(T)]; -encode_from_template(T, V, _, Vsn, Types) when tuple_size(T) =:= tuple_size(V) -> +encode_from_template(tuple, T, Emit, Vsn, Types) -> + assert_type(is_tuple(T), tuple, T), + [encode_(V, Emit, Vsn, Types) || V <- tuple_to_list(T)]; +encode_from_template(T, V, Emit, Vsn, Types) when is_tuple(T) -> + assert_type(is_tuple(V), T, V), + assert_type(tuple_size(T) =:= tuple_size(V), T, V), Zipped = lists:zip(tuple_to_list(T), tuple_to_list(V)), - [encode_from_template(T1, V1, false, Vsn, Types) || {T1, V1} <- Zipped]; -encode_from_template([Type], List, _, Vsn, Types) -> - [encode_from_template(Type, V, false, Vsn, Types) || V <- List]; -encode_from_template(Type, List, _, Vsn, Types) when is_list(Type), is_list(List) -> - encode_fields(Type, List, Vsn, Types); -encode_from_template(label, V, _, _, Types) -> + [encode_from_template(T1, V1, Emit, Vsn, Types) || {T1, V1} <- Zipped]; +encode_from_template([Type] = T, List, Emit, Vsn, Types) -> + assert_type(is_list(List), T, List), + [encode_from_template(Type, V, Emit, Vsn, Types) || V <- List]; +encode_from_template(Type, List, Emit, Vsn, Types) when is_list(Type), is_list(List) -> + encode_fields(Type, List, Emit, Vsn, Types); +encode_from_template(label, V, Emit, _, Types) -> + assert_type(is_atom(V), label, V), case find_cached_label(V, Types) of error -> - encode_basic(label, V); + encode_basic(label, V, Emit, Types); {ok, Code} when is_integer(Code) -> [encode_basic(int, Code)] end; -encode_from_template(Type, V, _, _, _Types) when Type == id - ; Type == binary - ; Type == bool - ; Type == int - ; Type == label -> - encode_basic(Type, V); -encode_from_template(Type, V, _, _, _) -> - error({illegal, Type, V}). +encode_from_template(Type, V, Emit, _, Types) when Type == id + ; Type == binary + ; Type == bool + ; Type == int + ; Type == label -> + encode_basic(Type, V, Emit, Types); +encode_from_template(Type, V, Emit, Vsn, Types) -> + encode_typed_(Type, V, Emit, Vsn, Types). + %% error({illegal, Type, V}). + +assert_type(true, _, _) -> ok; +assert_type(_, Type, V) -> error({illegal, Type, V}). %% Basically, dynamically encoding a statically defined object encode_fields([{Field, Type}|TypesLeft], - [{Field, Val}|FieldsLeft], Vsn, Types) -> - [ encode_from_template(Type, Val, Vsn, Types) - | encode_fields(TypesLeft, FieldsLeft, Vsn, Types)]; -encode_fields([{_Field, Type}|TypesLeft], - [Val |FieldsLeft], Vsn, Types) -> - %% Not sure if we want to try this ... - [ encode_from_template(Type, Val, Vsn, Types) - | encode_fields(TypesLeft, FieldsLeft, Vsn, Types)]; + [{Field, Val}|FieldsLeft], Emit, Vsn, Types) -> + [ encode_from_template(Type, Val, Emit, Vsn, Types) + | encode_fields(TypesLeft, FieldsLeft, Emit, Vsn, Types)]; +encode_fields([{_Field, _Type} = FT|_TypesLeft], + [Val |_FieldsLeft], _Emit, _Vsn, _Types) -> + error({illegal_field, FT, Val}); encode_fields([Type|TypesLeft], - [Val |FieldsLeft], Vsn, Types) when is_atom(Type) -> - %% Not sure about this either ... - [ encode_from_template(Type, Val, Vsn, Types) - | encode_fields(TypesLeft, FieldsLeft, Vsn, Types)]; -encode_fields([], [], _, _) -> + [Val |FieldsLeft], Emit, Vsn, Types) when is_atom(Type) -> + %% Not sure about this ... + [ encode_from_template(Type, Val, Emit, Vsn, Types) + | encode_fields(TypesLeft, FieldsLeft, Emit, Vsn, Types)]; +encode_fields([], [], _, _, _) -> []. decode_fields([{Tag, Type}|TypesLeft], @@ -317,11 +340,26 @@ decode_fields([], [], _, _) -> emit_code(Tag, #{rev := Tags}) -> encode_basic(int, maps:get(Tag, Tags)). +decode_basic(Type, [Tag,V], #{codes := Codes}) -> + case decode_basic(int, Tag) of + Code when map_get(Code, Codes) == Type -> + decode_basic(Type, V); + _ -> + error(illegal) + end; +decode_basic(Type, V, _) -> + decode_basic(Type, V). + decode_basic(label, Fld) -> binary_to_existing_atom(decode_basic(binary, Fld), utf8); decode_basic(Type, Fld) -> gmserialization:decode_field(Type, Fld). +encode_basic(Tag, V, true, Types) -> + [emit_code(Tag, Types), encode_basic(Tag, V)]; +encode_basic(Tag, V, false, _) -> + encode_basic(Tag, V). + encode_basic(label, A) when is_atom(A) -> encode_basic(binary, atom_to_binary(A, utf8)); encode_basic(Type, Fld) -> @@ -593,8 +631,8 @@ t_reg_label_cache() -> true = Enc0 =/= Enc1, Enc2 = gmser_dyn:encode_typed(label, '1'), ?debugFmt("Enc2 (typed): ~w", [Enc2]), - true = Enc2 == Enc1, - true = Enc0a =/= Enc1a. + ?assertEqual(Enc2, Enc1), + ?assertNotEqual(Enc0a, Enc1a). t_reg_label_cache2() -> TFromL = gmser_dyn:types_from_list( -- 2.30.2