
Expose low-level helpers, fix dialyzer warnings WIP column families and mrdb API Basic functionality in place started adding documentation remove doc/ from .gitignore add doc/* files recognize pre-existing tabs at startup wip: most of the functionality in place (not yet merge ops) wip: adding transaction support wip: add transaction test case (currently dumps core) First draft, mnesia plugin user guide Fix note formatting WIP working on indexing Index iterators, dialyzer, xref fixes open db with optimistic transactions Use rocksdb-1.7.0 Use seanhinde rocksdb patch, enable rollback Call the right transaction_get() function WIP add 'snap_tx' activity type tx restart using mrdb_mutex Fix test suite sync bugs WIP instrumented for debugging WIP working on migration test case Add migration test suite Migration works, subscribe to schema changes WIP fix batch handling Manage separate batches per db_ref Add mrdb:fold/3 Add some docs, erlang_ls config Use seanhinde's rocksdb vsn
342 lines
9.4 KiB
Erlang
342 lines
9.4 KiB
Erlang
%%% @doc RocksDB update wrappers, in separate module for easy tracing and mocking.
|
|
%%%
|
|
-module(mnesia_rocksdb_lib).
|
|
|
|
-export([ put/4
|
|
, write/3
|
|
, delete/3
|
|
]).
|
|
|
|
-export([ open_rocksdb/3
|
|
, data_mountpoint/1
|
|
, create_mountpoint/1
|
|
, tabname/1
|
|
]).
|
|
|
|
-export([ default_encoding/3
|
|
, check_encoding/2
|
|
, valid_obj_type/2
|
|
, valid_key_type/2 ]).
|
|
|
|
-export([ keypos/1
|
|
, encode_key/1, encode_key/2
|
|
, decode_key/1, decode_key/2
|
|
, encode_val/1, encode_val/2
|
|
, decode_val/1, decode_val/3
|
|
, encode/2
|
|
, decode/2
|
|
]).
|
|
|
|
-include("mnesia_rocksdb.hrl").
|
|
-include_lib("hut/include/hut.hrl").
|
|
|
|
%% Store Key/Value through the given handle.
%% A handle map carrying `db' and `cf' entries is written to that column
%% family via rocksdb:put/5; any other handle is handed to rocksdb:put/4
%% unchanged. The result of the rocksdb call is returned as-is.
put(#{db := DbRef, cf := CfRef}, Key, Value, WriteOpts) ->
    rocksdb:put(DbRef, CfRef, Key, Value, WriteOpts);
put(DbRef, Key, Value, WriteOpts) ->
    rocksdb:put(DbRef, Key, Value, WriteOpts).
|
|
|
|
%% Apply a list of `{put, K, V}' / `{delete, K}' operations to the column
%% family described by the handle map, as a single rocksdb batch.
%% Only handle maps carrying `db' and `cf' are accepted.
write(#{db := DbRef, cf := CfRef}, Ops, WriteOpts) ->
    write_as_batch(Ops, DbRef, CfRef, WriteOpts).
|
|
|
|
%% Delete key K through the given handle.
%% Mirrors put/4: a handle map carrying `db' and `cf' entries deletes from
%% that column family via rocksdb:delete/4; any other handle is handed to
%% rocksdb:delete/3 unchanged. The result of the rocksdb call is returned
%% as-is.
delete(#{db := Ref, cf := CF}, K, Opts) ->
    rocksdb:delete(Ref, CF, K, Opts);
delete(Ref, K, Opts) ->
    rocksdb:delete(Ref, K, Opts).
|
|
|
|
|
|
%% Collect the operations in L into a fresh rocksdb batch and write it to
%% Ref in one go. Each element of L must be `{put, K, V}' or `{delete, K}';
%% anything else crashes the fun with function_clause.
%% Returns the result of rocksdb:write_batch/3.
write_as_batch(L, Ref, CF, Opts) ->
    {ok, Batch} = rocksdb:batch(),
    try
        lists:foreach(
          fun({put, K, V}) ->
                  ok = rocksdb:batch_put(Batch, CF, K, V);
             ({delete, K}) ->
                  ok = rocksdb:batch_delete(Batch, CF, K)
          end, L),
        rocksdb:write_batch(Ref, Batch, Opts)
    after
        %% Release the batch explicitly, also when one of the batch
        %% operations above crashed, instead of leaving the resource
        %% around until it is garbage collected.
        rocksdb:release_batch(Batch)
    end.
|
|
|
|
%% Make sure the data directory for Tab exists.
%% Returns `ok' when the directory was missing (creation is attempted) or
%% when it already exists underneath the mnesia directory, and
%% `{error, exists}' when it exists somewhere else.
create_mountpoint(Tab) ->
    MountPoint = data_mountpoint(Tab),
    case filelib:is_dir(MountPoint) of
        true ->
            MnesiaDir = mnesia_lib:dir(),
            case lists:prefix(MnesiaDir, MountPoint) of
                false ->
                    {error, exists};
                true ->
                    ok
            end;
        false ->
            %% The outcome of make_dir/1 is not inspected; `ok' is
            %% returned regardless.
            _ = file:make_dir(MountPoint),
            ok
    end.
|
|
|
|
%% Path of the data directory for Tab: the mnesia `dir' setting joined
%% with the table's file name (see tabname/1) plus the ".extrdb" suffix.
data_mountpoint(Tab) ->
    MnesiaDir = mnesia_monitor:get_env(dir),
    DirName = tabname(Tab) ++ ".extrdb",
    filename:join(MnesiaDir, DirName).
|
|
|
|
%% Build the string name used for a table identifier:
%%   {admin, Alias}               -> "mnesia_rocksdb-Alias-_db"
%%   {Tab, index, {{Attr}, _}}    -> "Tab-=Attr=-_ix"   (Attr is an atom)
%%   {Tab, index, {Pos, _}}       -> "Tab-Pos-_ix"      (Pos is an integer)
%%   {Tab, retainer, Name}        -> "Tab-Name-_RET"
%%   Tab (atom)                   -> "Tab-_tab"
tabname({admin, Alias}) ->
    "mnesia_rocksdb-" ++ atom_to_list(Alias) ++ "-_db";
tabname({Tab, index, {{Attr}, _}}) ->
    atom_to_list(Tab) ++ "-=" ++ atom_to_list(Attr) ++ "=-_ix";
tabname({Tab, index, {Pos, _}}) ->
    atom_to_list(Tab) ++ "-" ++ integer_to_list(Pos) ++ "-_ix";
tabname({Tab, retainer, Name}) ->
    atom_to_list(Tab) ++ "-" ++ retainername(Name) ++ "-_RET";
tabname(Tab) when is_atom(Tab) ->
    atom_to_list(Tab) ++ "-_tab".
|
|
|
|
%% Default `{KeyEncoding, ValueEncoding}' for a table.
%% Index tables get `{sext, {value, raw}}' and retainers
%% `{term, {value, term}}'. For every other table the key encoding follows
%% the table type (`set' -> term, `ordered_set' and `bag' -> sext) and the
%% value encoding follows the attribute list: exactly two attributes give
%% `{value, term}', more than two give `{object, term}'.
default_encoding({_, index, _}, _, _) ->
    {sext, {value, raw}};
default_encoding({_, retainer, _}, _, _) ->
    {term, {value, term}};
default_encoding(_, Type, As) ->
    KeyEncoding =
        case Type of
            set -> term;
            T when T =:= ordered_set; T =:= bag -> sext
        end,
    ValueEncoding =
        case As of
            [_, _] -> {value, term};
            [_, _ | _] -> {object, term}
        end,
    {KeyEncoding, ValueEncoding}.
|
|
|
|
%% Validate and normalise an encoding specification against the table's
%% attribute list. Returns whatever check_encoding_/2 returns, or the term
%% it throws (the throws in this module are `{error, Reason}' tuples).
check_encoding(Encoding, Attributes) ->
    try check_encoding_(Encoding, Attributes) of
        Result ->
            Result
    catch
        throw:Thrown ->
            Thrown
    end.
|
|
|
|
%% Worker for check_encoding/2. A `{Key, Val}' pair is checked part by
%% part (key first) and returned as `{ok, {Key1, Val1}}'; anything that is
%% not a pair is rejected by throwing `{error, {invalid_encoding, E}}'.
check_encoding_({KeyEnc, ValEnc}, As) ->
    CheckedKey = check_key_encoding(KeyEnc),
    CheckedVal = check_value_encoding(ValEnc, As),
    {ok, {CheckedKey, CheckedVal}};
check_encoding_(Other, _) ->
    throw({error, {invalid_encoding, Other}}).
|
|
|
|
%% Accept the key encodings `sext', `term' and `raw' (returned unchanged);
%% throw `{error, {invalid_key_encoding, E}}' for anything else.
check_key_encoding(sext) ->
    sext;
check_key_encoding(term) ->
    term;
check_key_encoding(raw) ->
    raw;
check_key_encoding(Other) ->
    throw({error, {invalid_key_encoding, Other}}).
|
|
|
|
%% Normalise a value encoding to `{value | object, term | raw | sext}'.
%%  - `raw' and `{value, _}' are only accepted for two-attribute tables;
%%  - `{object, _}' is accepted for any attribute list;
%%  - bare `term' / `sext' get their value/object tag from the attribute
%%    list (see val_encoding_type/1).
%% Everything else throws `{error, {invalid_value_encoding, E}}'.
check_value_encoding(raw, [_, _]) ->
    {value, raw};
check_value_encoding({value, Enc} = Spec, [_, _])
  when Enc =:= term; Enc =:= raw; Enc =:= sext ->
    Spec;
check_value_encoding({object, Enc} = Spec, _)
  when Enc =:= term; Enc =:= raw; Enc =:= sext ->
    Spec;
check_value_encoding(Enc, As) when Enc =:= term; Enc =:= sext ->
    {val_encoding_type(As), Enc};
check_value_encoding(Other, _) ->
    throw({error, {invalid_value_encoding, Other}}).
|
|
|
|
%% `value' for an attribute list of exactly two elements, `object' for a
%% longer one. Shorter lists are not handled and raise case_clause.
val_encoding_type(Attrs) ->
    case Attrs of
        [_KeyAttr, _ValAttr] ->
            value;
        [_KeyAttr, _ValAttr | _More] ->
            object
    end.
|
|
|
|
%% Check that Obj is compatible with the table's encoding.
%% There are no restrictions on the object unless the key and/or the
%% value is typed as `binary', in which case the corresponding element
%% must be a binary.
valid_obj_type(#{encoding := Enc}, Obj) ->
    obj_fits_encoding(Enc, Obj).

%% Clause order matters: the most specific encoding pattern comes first.
obj_fits_encoding({binary, {value, binary}}, {_, Key, Val}) ->
    is_binary(Key) andalso is_binary(Val);
obj_fits_encoding({binary, _}, Obj) ->
    %% Only the key (second element of the object) is constrained here.
    is_binary(element(2, Obj));
obj_fits_encoding({_, {value, binary}}, {_, _, Val}) ->
    is_binary(Val);
obj_fits_encoding(_, _) ->
    true.
|
|
|
|
%% Check that Key is compatible with the table's key encoding: a key
%% typed as `binary' must be a binary, any other encoding accepts any key.
valid_key_type(#{encoding := Enc}, Key) ->
    case Enc of
        {binary, _} ->
            is_binary(Key);
        _ ->
            true
    end.
|
|
|
|
|
|
%% Encode a key with the `sext' encoding.
-spec encode_key(any()) -> binary().
encode_key(Term) ->
    encode(Term, sext).
|
|
|
|
%% Encode Term according to the given encoding:
%%   sext - sext:encode/1
%%   raw  - the term must already be a binary and is returned untouched
%%   term - term_to_binary/1
encode(Term, sext) ->
    sext:encode(Term);
encode(Bin, raw) when is_binary(Bin) ->
    Bin;
encode(Term, term) ->
    term_to_binary(Term).
|
|
|
|
|
|
%% Encode a key using the key encoding found in the table reference map.
%% When the second argument carries no `encoding := {KeyEnc, _}' entry,
%% the `sext' encoding is used.
encode_key(Term, #{encoding := {KeyEnc, _}}) ->
    encode(Term, KeyEnc);
encode_key(Term, _) ->
    encode(Term, sext).
|
|
|
|
%% Decode a key that was encoded with the `sext' encoding.
-spec decode_key(binary()) -> any().
decode_key(Bin) ->
    decode(Bin, sext).
|
|
|
|
%% Decode a key. The second argument is either a table reference map, in
%% which case its key encoding is used, or the encoding itself.
decode_key(Bin, #{encoding := {KeyEnc, _}}) ->
    decode(Bin, KeyEnc);
decode_key(Bin, KeyEnc) ->
    decode(Bin, KeyEnc).
|
|
|
|
%% Decode Val according to the given encoding:
%%   sext - sext:partial_decode/1; only a `{full, Result, _}' answer is
%%          accepted, anything else raises `badarg'
%%   raw  - Val is returned untouched
%%   term - binary_to_term/1
decode(Val, sext) ->
    case sext:partial_decode(Val) of
        {full, Result, _} ->
            Result;
        _ ->
            %% The second argument of erlang:error/2 is the argument list
            %% of the current call, so it is given as a list here.
            error(badarg, [Val, sext])
    end;
decode(Val, raw) ->
    Val;
decode(Val, term) ->
    binary_to_term(Val).
|
|
|
|
%% Encode a value with the `term' encoding.
-spec encode_val(any()) -> binary().
encode_val(Term) ->
    encode(Term, term).
|
|
|
|
%% Encode the value part of an object.
%%  - With an atom as second argument, Val is encoded with that encoding.
%%  - For index tables the stored value is always the empty binary.
%%  - Otherwise the table's value encoding decides: a `value' encoding on
%%    a two-attribute table stores only element 3 of the object, an
%%    `object' encoding stores the whole object with element 2 blanked
%%    out (replaced by []).
encode_val(Val, Enc) when is_atom(Enc) ->
    encode(Val, Enc);
encode_val(_, #{name := {_, index, _}}) ->
    <<>>;
encode_val(Obj, #{encoding := {_, ValEnc}, attr_pos := AttrPos}) ->
    {Kind, Enc} = enc_type(ValEnc),
    case {map_size(AttrPos), Kind} of
        {2, value} ->
            encode(element(3, Obj), Enc);
        {_, object} ->
            encode(setelement(2, Obj, []), Enc)
    end.
|
|
|
|
%% Normalise a value encoding to `{value | object, Enc}'. A tagged
%% encoding is returned unchanged; a bare atom is tagged as `object'.
enc_type({value, _} = Tagged) ->
    Tagged;
enc_type({object, _} = Tagged) ->
    Tagged;
enc_type(Enc) when is_atom(Enc) ->
    {object, Enc}.
|
|
|
|
%% Decode a value stored in the external term format (binary_to_term/1).
-spec decode_val(binary()) -> any().
decode_val(Bin) ->
    binary_to_term(Bin).
|
|
|
|
%% Rebuild an object from its stored value and its (already decoded) key.
%%  - An empty value in an index table yields the one-element tuple {Key}.
%%  - With an `object' encoding the decoded tuple gets Key put back into
%%    element 2.
%%  - With a `value' encoding the record is assembled by make_rec/3.
decode_val(<<>>, Key, #{name := {_, index, _}}) ->
    {Key};
decode_val(Bin, Key, Ref) ->
    {Kind, Enc} = value_encoding(Ref),
    case Kind of
        object ->
            Decoded = decode(Bin, Enc),
            setelement(2, Decoded, Key);
        value ->
            Decoded = decode(Bin, Enc),
            make_rec(Key, Decoded, Ref)
    end.
|
|
|
|
%% Assemble an object from a key and a value:
%%  - ordered index tables carry no value, the result is {Key};
%%  - tables with a `record_name' property yield {RecordName, Key, Val};
%%  - tables without a record name yield a pair whose layout follows the
%%    key position recorded in `attr_pos'.
make_rec(Key, _Val, #{name := {_, index, {_, ordered}}}) ->
    {Key};
make_rec(Key, Val, #{properties := #{record_name := RecName}}) ->
    {RecName, Key, Val};
make_rec(Key, Val, #{attr_pos := AttrPos}) ->
    %% no record name
    case AttrPos of
        #{key := 1} ->
            {Key, Val};
        #{key := 2} ->
            %% key stored second: value goes first
            {Val, Key}
    end.
|
|
|
|
%% Value encoding as a `{Type, Enc}' pair.
%% A map with an `encoding' entry has its value encoding normalised via
%% enc_type/1; any other map defaults to `{object, term}'; a pair of
%% atoms is returned unchanged.
value_encoding(#{encoding := {_, ValEnc}}) ->
    enc_type(ValEnc);
value_encoding(#{}) ->
    {object, term};
value_encoding({Kind, Enc} = Pair) when is_atom(Kind), is_atom(Enc) ->
    Pair.
|
|
|
|
%% Key position used for a table identifier: 1 for admin and index
%% tables, 2 for retainers and for ordinary (atom-named) tables.
-spec keypos(term()) -> 1 | 2.
keypos({admin, _Alias}) ->
    1;
keypos({_Tab, index, _Ix}) ->
    1;
keypos({_Tab, retainer, _Name}) ->
    2;
keypos(Tab) when is_atom(Tab) ->
    2.
|
|
|
|
%% ======================================================================
|
|
%% Private functions
|
|
%% ======================================================================
|
|
|
|
%% String form of a retainer name.
%% Atoms are converted with atom_to_list/1. Lists that are valid iolists
%% are flattened to a plain byte list; all other terms (including lists
%% that are not iolists) are rendered with io_lib:write/1.
retainername(Name) when is_atom(Name) ->
    atom_to_list(Name);
retainername(Name) when is_list(Name) ->
    try list_to_binary(Name) of
        Bin ->
            binary_to_list(Bin)
    catch
        error:_ ->
            written_form(Name)
    end;
retainername(Name) ->
    written_form(Name).

%% Flat string with the printed representation of Term.
written_form(Term) ->
    lists:flatten(io_lib:write(Term)).
|
|
|
|
%% Open the rocksdb database at mount point MPd with the given column
%% families. RdbOpts are merged over the default open options, and the
%% open is retried up to get_retries() times on lock errors.
open_rocksdb(MPd, RdbOpts, CFs) ->
    OpenOpts = rocksdb_open_opts_(RdbOpts),
    MaxRetries = get_retries(),
    open_rocksdb(MPd, OpenOpts, CFs, MaxRetries).
|
|
|
|
%% Code adapted from basho/riak_kv_eleveldb_backend.erl
%% At least one open attempt is always made, whatever Retries says.
open_rocksdb(MPd, Opts, CFs, Retries) ->
    Attempts = max(1, Retries),
    open_db(MPd, Opts, CFs, Attempts, undefined).
|
|
|
|
%% Try to open the database as an optimistic transaction db.
%% A `db_open' error whose message starts with "IO error: lock " is
%% retried after a delay of get_retry_delay() ms, until no retries are
%% left; the last such error is then returned as `{error, LastError}'.
%% Every other error is returned immediately.
open_db(_, _, _, 0, LastError) ->
    {error, LastError};
open_db(MPd, Opts, CFs, RetriesLeft, _) ->
    case rocksdb:open_optimistic_transaction_db(MPd, Opts, CFs) of
        {ok, _Ref, _CFRefs} = Opened ->
            ?log(debug, "Open - Rocksdb: ~s (~p) -> ~p", [MPd, Opts, Opened]),
            Opened;
        %% Check specifically for lock error, this can be caused if
        %% a crashed mnesia takes some time to flush rocksdb information
        %% out to disk. The process is gone, but the NIF resource cleanup
        %% may not have completed.
        {error, {db_open, OpenErr} = Reason} ->
            case lists:prefix("IO error: lock ", OpenErr) of
                false ->
                    {error, Reason};
                true ->
                    SleepFor = get_retry_delay(),
                    ?log(debug, ("Open - Rocksdb backend retrying ~p in ~p ms"
                                 " after error ~s"), [MPd, SleepFor, OpenErr]),
                    timer:sleep(SleepFor),
                    open_db(MPd, Opts, CFs, RetriesLeft - 1, Reason)
            end;
        {error, _} = OtherError ->
            OtherError
    end.
|
|
|
|
%% Maximum number of attempts made when opening a database.
-spec get_retries() -> pos_integer().
get_retries() ->
    30.
|
|
%% Delay, in milliseconds, between two attempts to open a database.
-spec get_retry_delay() -> pos_integer().
get_retry_delay() ->
    100.
|
|
|
|
%% Merge user-supplied `{Key, Value}' options over the default open
%% options. A user option replaces the default with the same key; options
%% without a default are appended.
rocksdb_open_opts_(RdbOpts) ->
    lists:foldl(fun override_open_opt/2, default_open_opts(), RdbOpts).

%% Store one `{Key, Value}' option in the accumulated option list.
override_open_opt({Key, _} = Opt, Acc) ->
    lists:keystore(Key, 1, Acc, Opt).
|
|
|
|
%% Default options used when opening a database. Cache size, write buffer
%% size and compression can be overridden through the OS environment
%% variables ROCKSDB_CACHE_SIZE, ROCKSDB_WRITE_BUFFER_SIZE and
%% ROCKSDB_COMPRESSION; the first two must hold integers.
default_open_opts() ->
    CacheSize =
        list_to_integer(get_env_default("ROCKSDB_CACHE_SIZE", "32212254")),
    WriteBufferSize =
        list_to_integer(get_env_default(
                          "ROCKSDB_WRITE_BUFFER_SIZE", "4194304")),
    %% The compression setting is turned into an atom as-is.
    Compression =
        list_to_atom(get_env_default("ROCKSDB_COMPRESSION", "true")),
    [ {create_if_missing, true}
    , {cache_size, CacheSize}
    , {block_size, 1024}
    , {max_open_files, 30}
    , {write_buffer_size, WriteBufferSize}
    , {compression, Compression}
    , {use_bloomfilter, true}
    ].
|
|
|
|
%% Value of the OS environment variable Key, or Default when the variable
%% is not set.
get_env_default(Key, Default) ->
    os:getenv(Key, Default).
|