mnesia_rocksdb/src/mnesia_rocksdb_lib.erl
Ulf Wiger d5dafb5b7e Refactor to support column families, direct rocksdb access
Expose low-level helpers, fix dialyzer warnings

WIP column families and mrdb API

Basic functionality in place

started adding documentation

remove doc/ from .gitignore

add doc/* files

recognize pre-existing tabs at startup

wip: most of the functionality in place (not yet merge ops)

wip: adding transaction support

wip: add transaction test case (currently dumps core)

First draft, mnesia plugin user guide

Fix note formatting

WIP working on indexing

Index iterators, dialyzer, xref fixes

open db with optimistic transactions

Use rocksdb-1.7.0

Use seanhinde rocksdb patch, enable rollback

Call the right transaction_get() function

WIP add 'snap_tx' activity type

tx restart using mrdb_mutex

Fix test suite sync bugs

WIP instrumented for debugging

WIP working on migration test case

Add migration test suite

Migration works, subscribe to schema changes

WIP fix batch handling

Manage separate batches per db_ref

Add mrdb:fold/3

Add some docs, erlang_ls config

Use seanhinde's rocksdb vsn
2022-03-25 15:48:19 +01:00

342 lines
9.4 KiB
Erlang

%%% @doc RocksDB update wrappers, in separate module for easy tracing and mocking.
%%%
-module(mnesia_rocksdb_lib).
-export([ put/4
, write/3
, delete/3
]).
-export([ open_rocksdb/3
, data_mountpoint/1
, create_mountpoint/1
, tabname/1
]).
-export([ default_encoding/3
, check_encoding/2
, valid_obj_type/2
, valid_key_type/2 ]).
-export([ keypos/1
, encode_key/1, encode_key/2
, decode_key/1, decode_key/2
, encode_val/1, encode_val/2
, decode_val/1, decode_val/3
, encode/2
, decode/2
]).
-include("mnesia_rocksdb.hrl").
-include_lib("hut/include/hut.hrl").
%% @doc Store V under key K.
%% The first argument is either a map holding db and cf entries, in which
%% case the write goes to that column family, or any other term, which is
%% passed to rocksdb:put/4 unchanged.
put(DbRef, K, V, Opts) ->
    case DbRef of
        #{db := Db, cf := CfHandle} ->
            rocksdb:put(Db, CfHandle, K, V, Opts);
        _ ->
            rocksdb:put(DbRef, K, V, Opts)
    end.
%% @doc Apply a list of {put, K, V} and {delete, K} operations to the
%% column family named by the db and cf entries of the first argument.
%% The operations are written as a single batch.
write(#{db := Db, cf := CfHandle}, Ops, Opts) ->
    write_as_batch(Ops, Db, CfHandle, Opts).
%% @doc Delete key K.
%% Like put/4, the first argument may be a map holding db and cf entries,
%% in which case the key is deleted from that column family. Any other
%% term is passed to rocksdb:delete/3 unchanged.
delete(#{db := Ref, cf := CF}, K, Opts) ->
    rocksdb:delete(Ref, CF, K, Opts);
delete(Ref, K, Opts) ->
    rocksdb:delete(Ref, K, Opts).
%% Collect the operations in L into one rocksdb batch against column
%% family CF and write it to Ref. Returns the result of
%% rocksdb:write_batch/3. Each element of L must be {put, K, V} or
%% {delete, K}; a batch operation that does not return ok raises badmatch.
write_as_batch(L, Ref, CF, Opts) ->
    {ok, Batch} = rocksdb:batch(),
    try
        lists:foreach(
          fun({put, K, V}) ->
                  ok = rocksdb:batch_put(Batch, CF, K, V);
             ({delete, K}) ->
                  ok = rocksdb:batch_delete(Batch, CF, K)
          end, L),
        rocksdb:write_batch(Ref, Batch, Opts)
    after
        %% Release the batch explicitly, also when an operation above
        %% failed, instead of leaving it to garbage collection.
        rocksdb:release_batch(Batch)
    end.
%% @doc Make sure the data directory for Tab exists.
%% Returns ok when the directory did not exist (an attempt is then made to
%% create it) or when it exists below the mnesia directory. Returns
%% {error, exists} when it exists somewhere else.
create_mountpoint(Tab) ->
    Path = data_mountpoint(Tab),
    case filelib:is_dir(Path) of
        true ->
            case lists:prefix(mnesia_lib:dir(), Path) of
                true  -> ok;
                false -> {error, exists}
            end;
        false ->
            %% NOTE(review): the result of make_dir is ignored, so ok is
            %% returned even if the directory could not be created.
            _ = file:make_dir(Path),
            ok
    end.
%% @doc Path of the data directory for Tab: the name given by tabname/1
%% with the suffix .extrdb, joined onto the mnesia dir setting.
data_mountpoint(Tab) ->
    BaseDir = mnesia_monitor:get_env(dir),
    DirName = tabname(Tab) ++ ".extrdb",
    filename:join(BaseDir, DirName).
%% @doc Build the name string used for a table identifier.
%% Handles admin tables, index tables (with the position given either as
%% an atom wrapped in a 1-tuple or as an integer), retainers and plain
%% table names.
tabname({admin, Alias}) ->
    lists:append(["mnesia_rocksdb-", atom_to_list(Alias), "-_db"]);
tabname({Tab, index, {{Pos}, _}}) ->
    lists:append([atom_to_list(Tab), "-=", atom_to_list(Pos), "=-_ix"]);
tabname({Tab, index, {Pos, _}}) ->
    lists:append([atom_to_list(Tab), "-", integer_to_list(Pos), "-_ix"]);
tabname({Tab, retainer, Name}) ->
    lists:append([atom_to_list(Tab), "-", retainername(Name), "-_RET"]);
tabname(Tab) when is_atom(Tab) ->
    atom_to_list(Tab) ++ "-_tab".
%% @doc Default {KeyEncoding, ValueEncoding} for a table.
%% Index and retainer tables have fixed defaults. For other tables the key
%% encoding follows the table type (term for set, sext for ordered_set and
%% bag). The value encoding is {value, term} when there are exactly two
%% attributes and {object, term} when there are more.
default_encoding(Tab, Type, As) ->
    case Tab of
        {_, index, _} ->
            {sext, {value, raw}};
        {_, retainer, _} ->
            {term, {value, term}};
        _ ->
            KeyEnc = case Type of
                         set         -> term;
                         ordered_set -> sext;
                         bag         -> sext
                     end,
            ValEnc = case As of
                         [_, _]        -> {value, term};
                         [_, _, _ | _] -> {object, term}
                     end,
            {KeyEnc, ValEnc}
    end.
%% @doc Validate and normalise an encoding specification.
%% Returns {ok, {KeyEnc, ValEnc}} on success. Validation failures are
%% thrown by the helpers as {error, Reason} and are returned here as
%% ordinary values.
check_encoding(Encoding, Attributes) ->
    try check_encoding_(Encoding, Attributes) of
        Checked -> Checked
    catch
        throw:Thrown -> Thrown
    end.
%% Check the key part and then the value part of an encoding pair.
%% Anything that is not a 2-tuple is rejected by throwing
%% {error, {invalid_encoding, Term}}.
check_encoding_({KeyEnc, ValEnc}, Attrs) ->
    CheckedKey = check_key_encoding(KeyEnc),
    CheckedVal = check_value_encoding(ValEnc, Attrs),
    {ok, {CheckedKey, CheckedVal}};
check_encoding_(Other, _Attrs) ->
    throw({error, {invalid_encoding, Other}}).
%% Accept the key encodings sext, term and raw, returning them unchanged.
%% Anything else throws {error, {invalid_key_encoding, Term}}.
check_key_encoding(sext) -> sext;
check_key_encoding(term) -> term;
check_key_encoding(raw)  -> raw;
check_key_encoding(Other) ->
    throw({error, {invalid_key_encoding, Other}}).
%% Normalise a value encoding to {value | object, Codec}.
%%  - raw is accepted only with exactly two attributes.
%%  - {value, Codec} is accepted only with exactly two attributes.
%%  - {object, Codec} is accepted for any attribute list.
%%  - bare term or sext gets its kind from val_encoding_type/1.
%% Anything else throws {error, {invalid_value_encoding, Enc}}.
check_value_encoding(Enc, Attrs) ->
    case {Enc, Attrs} of
        {raw, [_, _]} ->
            {value, raw};
        {{value, C}, [_, _]} when C =:= term; C =:= raw; C =:= sext ->
            Enc;
        {{object, C}, _} when C =:= term; C =:= raw; C =:= sext ->
            Enc;
        {C, _} when C =:= term; C =:= sext ->
            {val_encoding_type(Attrs), C};
        _ ->
            throw({error, {invalid_value_encoding, Enc}})
    end.
%% Return value for a list of exactly two attributes and object for a
%% list with more than two.
val_encoding_type(Attrs) ->
    case Attrs of
        [_, _ | Rest] ->
            case Rest of
                [] -> value;
                _  -> object
            end
    end.
%% @doc Check an object against the binary constraints of the encoding.
%% When the key encoding is tagged binary, element 2 of the object must be
%% a binary. When the value encoding is {value, binary}, the third element
%% of a 3-tuple object must be a binary. In all other cases the object is
%% accepted.
%% NOTE(review): check_key_encoding/1 accepts only sext, term and raw, so
%% it is unclear where a binary encoding tag would come from - confirm
%% against callers.
valid_obj_type(#{encoding := Enc}, Obj) ->
    obj_fits_encoding(Enc, Obj).

%% Clause order matters: the combined key and value check comes first.
obj_fits_encoding({binary, {value, binary}}, {_, K, V}) ->
    is_binary(K) andalso is_binary(V);
obj_fits_encoding({binary, _}, Obj) ->
    is_binary(element(2, Obj));
obj_fits_encoding({_, {value, binary}}, {_, _, V}) ->
    is_binary(V);
obj_fits_encoding(_, _) ->
    %% No restrictions on object type
    %% unless key and/or value typed to binary
    true.
%% @doc Check a key against the key encoding.
%% A key encoding tagged binary requires the key to be a binary. Any other
%% encoding accepts every key.
valid_key_type(#{encoding := {binary, _}}, Key) ->
    is_binary(Key);
valid_key_type(#{encoding := _}, _Key) ->
    true.
%% @doc Encode a key with the sext encoding.
-spec encode_key(any()) -> binary().
encode_key(Key) ->
    sext:encode(Key).
%% @doc Encode Term with the given codec.
%% term uses term_to_binary/1, raw passes a binary through unchanged (and
%% accepts nothing but binaries), sext uses sext:encode/1.
encode(Term, term) ->
    term_to_binary(Term);
encode(Bin, raw) when is_binary(Bin) ->
    Bin;
encode(Term, sext) ->
    sext:encode(Term).
%% @doc Encode a key for a given table reference.
%% When the second argument is a map whose encoding entry is a pair, the
%% first element of that pair selects the codec. For any other second
%% argument the key is sext-encoded.
encode_key(Key, Ref) ->
    Codec = case Ref of
                #{encoding := {KeyEnc, _}} -> KeyEnc;
                _ -> sext
            end,
    encode(Key, Codec).
%% @doc Decode a sext-encoded key.
-spec decode_key(binary()) -> any().
decode_key(EncodedKey) ->
    decode(EncodedKey, sext).
%% @doc Decode a key for a given table reference or codec.
%% When the second argument is a map whose encoding entry is a pair, the
%% first element of that pair selects the codec. Otherwise the second
%% argument is itself passed to decode/2 as the codec.
decode_key(CodedKey, RefOrEnc) ->
    Codec = case RefOrEnc of
                #{encoding := {KeyEnc, _}} -> KeyEnc;
                _ -> RefOrEnc
            end,
    decode(CodedKey, Codec).
%% @doc Decode Val with the given codec.
%% sext accepts only data that sext:partial_decode/1 reports as fully
%% decoded; anything else raises error badarg. raw returns Val unchanged.
%% term uses binary_to_term/1.
decode(Val, sext) ->
    case sext:partial_decode(Val) of
        {full, Result, _} ->
            Result;
        _ ->
            %% The second argument of error/2 is the argument list of the
            %% current call, not the offending value on its own.
            error(badarg, [Val, sext])
    end;
decode(Val, raw) ->
    Val;
decode(Val, term) ->
    binary_to_term(Val).
%% @doc Encode a value with the term codec.
-spec encode_val(any()) -> binary().
encode_val(Term) ->
    encode(Term, term).
%% @doc Encode the value part of an object.
%% With an atom as second argument the whole term is encoded with that
%% codec. For index tables the value is always the empty binary.
%% Otherwise the value encoding of the table decides: with kind value (and
%% an attr_pos map of size 2) only element 3 of the object is encoded;
%% with kind object the whole object is encoded with element 2 replaced by
%% the empty list.
encode_val(Val, Codec) when is_atom(Codec) ->
    encode(Val, Codec);
encode_val(_Val, #{name := {_, index, _}}) ->
    <<>>;
encode_val(Val, #{encoding := {_, ValEnc}, attr_pos := AttrPos}) ->
    {Kind, Codec} = enc_type(ValEnc),
    Payload =
        case {map_size(AttrPos), Kind} of
            {2, value} ->
                element(3, Val);
            {_, object} ->
                %% Element 2 is blanked; decode_val/3 puts the key back.
                setelement(2, Val, [])
        end,
    encode(Payload, Codec).
%% Normalise a value encoding to a {Kind, Codec} pair. Pairs tagged value
%% or object are returned as they are; a bare atom is treated as an object
%% encoding.
enc_type({value, _} = Tagged) ->
    Tagged;
enc_type({object, _} = Tagged) ->
    Tagged;
enc_type(Codec) when is_atom(Codec) ->
    {object, Codec}.
%% @doc Decode a value that was written with term_to_binary/1.
-spec decode_val(binary()) -> any().
decode_val(EncodedVal) ->
    erlang:binary_to_term(EncodedVal).
%% @doc Rebuild an object from its stored value and its key.
%% An empty value in an index table yields the 1-tuple holding the key.
%% Otherwise the value encoding of Ref decides: with kind object the
%% decoded term gets the key written into element 2; with kind value the
%% object is assembled by make_rec/3.
decode_val(<<>>, Key, #{name := {_, index, _}}) ->
    {Key};
decode_val(Bin, Key, Ref) ->
    {Kind, Codec} = value_encoding(Ref),
    case Kind of
        value ->
            make_rec(Key, decode(Bin, Codec), Ref);
        object ->
            setelement(2, decode(Bin, Codec), Key)
    end.
%% Assemble an object from a key and a decoded value.
%%  - ordered index tables: the 1-tuple holding the key
%%  - tables with a record_name property: {RecordName, Key, Val}
%%  - otherwise a pair whose order follows the key entry of attr_pos
make_rec(Key, _Val, #{name := {_, index, {_, ordered}}}) ->
    {Key};
make_rec(Key, Val, #{properties := #{record_name := RecName}}) ->
    {RecName, Key, Val};
make_rec(Key, Val, #{attr_pos := AttrPos}) ->
    %% no record name
    case AttrPos of
        #{key := 2} -> {Val, Key};  %% key stored in the second position
        #{key := 1} -> {Key, Val}
    end.
%% Return the {Kind, Codec} value encoding for a reference.
%% A pair of atoms is taken as is. A map with an encoding pair has its
%% value part normalised by enc_type/1. Any other map gets {object, term}.
value_encoding({Kind, Codec} = Pair) when is_atom(Kind), is_atom(Codec) ->
    Pair;
value_encoding(#{encoding := {_, ValEnc}}) ->
    enc_type(ValEnc);
value_encoding(#{}) ->
    {object, term}.
%% @doc Key position used for a table identifier: 1 for admin and index
%% tables, 2 for retainers and plain table names.
keypos(Tab) when is_atom(Tab) ->
    2;
keypos({_, retainer, _}) ->
    2;
keypos({_, index, _}) ->
    1;
keypos({admin, _}) ->
    1.
%% ======================================================================
%% Private functions
%% ======================================================================
%% Turn a retainer name into a flat string. Atoms are converted directly.
%% Lists that list_to_binary/1 accepts are flattened to their bytes.
%% Everything else is printed with io_lib:write/1.
retainername(Name) when is_atom(Name) ->
    atom_to_list(Name);
retainername(Name) when is_list(Name) ->
    try list_to_binary(Name) of
        Bin ->
            binary_to_list(Bin)
    catch
        error:_ ->
            %% Not an iolist: fall back to the printed form.
            lists:flatten(io_lib:write(Name))
    end;
retainername(Name) ->
    lists:flatten(io_lib:write(Name)).
%% @doc Open the database at MPd with the given column families.
%% RdbOpts are merged over the default open options, and the number of
%% attempts comes from get_retries/0.
open_rocksdb(MPd, RdbOpts, CFs) ->
    OpenOpts = rocksdb_open_opts_(RdbOpts),
    Retries = get_retries(),
    open_rocksdb(MPd, OpenOpts, CFs, Retries).
%% Code adapted from basho/riak_kv_eleveldb_backend.erl
%% Start the open loop with at least one attempt and no previous error.
open_rocksdb(MPd, Opts, CFs, Retries) ->
    Attempts = max(1, Retries),
    open_db(MPd, Opts, CFs, Attempts, undefined).
%% Open the database as an optimistic transaction db, retrying on lock
%% errors. Returns {ok, Ref, CFRefs} on success. When no attempts are left
%% the last lock error is returned as {error, LastError}. Any other error
%% is returned immediately.
open_db(_MPd, _Opts, _CFs, 0, LastError) ->
    {error, LastError};
open_db(MPd, Opts, CFs, RetriesLeft, _PrevError) ->
    case rocksdb:open_optimistic_transaction_db(MPd, Opts, CFs) of
        {error, {db_open, OpenErr} = Reason} ->
            %% Check specifically for lock error, this can be caused if
            %% a crashed mnesia takes some time to flush rocksdb information
            %% out to disk. The process is gone, but the NIF resource cleanup
            %% may not have completed.
            IsLockError = lists:prefix("IO error: lock ", OpenErr),
            if
                IsLockError ->
                    SleepFor = get_retry_delay(),
                    ?log(debug, ("Open - Rocksdb backend retrying ~p in ~p ms"
                                 " after error ~s"), [MPd, SleepFor, OpenErr]),
                    timer:sleep(SleepFor),
                    open_db(MPd, Opts, CFs, RetriesLeft - 1, Reason);
                true ->
                    {error, Reason}
            end;
        {error, _} = Failure ->
            Failure;
        {ok, _Ref, _CFRefs} = Ok ->
            ?log(debug, "Open - Rocksdb: ~s (~p) -> ~p", [MPd, Opts, Ok]),
            Ok
    end.
%% Number of attempts open_rocksdb/3 allows when opening the database.
get_retries() -> 30.
%% Time in milliseconds that open_db/5 sleeps before retrying after a
%% lock error.
get_retry_delay() -> 100.
%% Merge user options over the default open options. Each {Key, Value}
%% option replaces the default with the same key or is added if there is
%% none.
rocksdb_open_opts_(RdbOpts) ->
    Override = fun({Key, _} = Opt, Acc) ->
                       lists:keystore(Key, 1, Acc, Opt)
               end,
    lists:foldl(Override, default_open_opts(), RdbOpts).
%% Default options for opening the database. Cache size, write buffer size
%% and compression can be overridden through the OS environment variables
%% ROCKSDB_CACHE_SIZE, ROCKSDB_WRITE_BUFFER_SIZE and ROCKSDB_COMPRESSION.
default_open_opts() ->
    CacheSize =
        list_to_integer(get_env_default("ROCKSDB_CACHE_SIZE", "32212254")),
    WriteBufferSize =
        list_to_integer(get_env_default(
                          "ROCKSDB_WRITE_BUFFER_SIZE", "4194304")),
    Compression =
        list_to_atom(get_env_default("ROCKSDB_COMPRESSION", "true")),
    [ {create_if_missing, true}
    , {cache_size, CacheSize}
    , {block_size, 1024}
    , {max_open_files, 30}
    , {write_buffer_size, WriteBufferSize}
    , {compression, Compression}
    , {use_bloomfilter, true}
    ].
%% Value of the OS environment variable Key, or Default when it is not
%% set.
get_env_default(Key, Default) ->
    os:getenv(Key, Default).