diff --git a/.gitignore b/.gitignore index eefd59f..5cff9a2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1 @@ -/doc /_build \ No newline at end of file diff --git a/Makefile b/Makefile index 91ff3d3..b530ed2 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ suite=$(if $(SUITE), suite=$(SUITE), ) REBAR3=$(shell which rebar3 || echo ./rebar3) -.PHONY: all check test clean run +.PHONY: all check test clean run dialyzer xref all: $(REBAR3) compile @@ -18,7 +18,13 @@ eunit: ct: $(REBAR3) ct $(suite) -test: eunit ct +test: dialyzer xref eunit ct + +dialyzer: + $(REBAR3) dialyzer + +xref: + $(REBAR3) xref conf_clean: @: diff --git a/README.md b/README.md index 0a6187b..6f2b2d9 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,53 @@ -# mnesia_rocksdb -A RocksDB backend for Mnesia. -This permits Erlang/OTP applications to use RocksDB as a backend for -mnesia tables. It is based on Klarna's `mnesia_eleveldb`. +# Mnesia Rocksdb - Rocksdb backend plugin for Mnesia # -## Prerequisites +Copyright (c) 2013-21 Klarna AB -- rocksdb (included as dependency) -- Erlang/OTP 20.0 or newer (https://github.com/erlang/otp) +__Authors:__ Ulf Wiger ([`ulf@wiger.net`](mailto:ulf@wiger.net)). -## Getting started +The Mnesia DBMS, part of Erlang/OTP, supports 'backend plugins', making +it possible to utilize more capable key-value stores than the `dets` +module (limited to 2 GB per table). Unfortunately, this support is +undocumented. Below, some informal documentation for the plugin system +is provided. + + +### Table of Contents ### + + +1. [Usage](https://github.com/aeternity/mnesia_rocksdb/blob/g3553-refactor-plugin-migration-tmp-220318/doc/README.md#Usage) +1. [Prerequisites](https://github.com/aeternity/mnesia_rocksdb/blob/g3553-refactor-plugin-migration-tmp-220318/doc/README.md#Prerequisites) +1. [Getting started](https://github.com/aeternity/mnesia_rocksdb/blob/g3553-refactor-plugin-migration-tmp-220318/doc/README.md#Getting_started) +1. 
[Special features](https://github.com/aeternity/mnesia_rocksdb/blob/g3553-refactor-plugin-migration-tmp-220318/doc/README.md#Special_features) +1. [Customization](https://github.com/aeternity/mnesia_rocksdb/blob/g3553-refactor-plugin-migration-tmp-220318/doc/README.md#Customization) +1. [Handling of errors in write operations](https://github.com/aeternity/mnesia_rocksdb/blob/g3553-refactor-plugin-migration-tmp-220318/doc/README.md#Handling_of_errors_in_write_operations) +1. [Caveats](https://github.com/aeternity/mnesia_rocksdb/blob/g3553-refactor-plugin-migration-tmp-220318/doc/README.md#Caveats) + +1. [Mnesia backend plugins](https://github.com/aeternity/mnesia_rocksdb/blob/g3553-refactor-plugin-migration-tmp-220318/doc/README.md#Mnesia_backend_plugins) +1. [Background](https://github.com/aeternity/mnesia_rocksdb/blob/g3553-refactor-plugin-migration-tmp-220318/doc/README.md#Background) +1. [Design](https://github.com/aeternity/mnesia_rocksdb/blob/g3553-refactor-plugin-migration-tmp-220318/doc/README.md#Design) + +1. [Mnesia index plugins](https://github.com/aeternity/mnesia_rocksdb/blob/g3553-refactor-plugin-migration-tmp-220318/doc/README.md#Mnesia_index_plugins) + +1. [Rocksdb](https://github.com/aeternity/mnesia_rocksdb/blob/g3553-refactor-plugin-migration-tmp-220318/doc/README.md#Rocksdb) + + + +### Usage ### + + +#### Prerequisites #### + +* rocksdb (included as dependency) + +* sext (included as dependency) + +* Erlang/OTP 21.0 or newer (https://github.com/erlang/otp) + + + +#### Getting started #### Call `mnesia_rocksdb:register()` immediately after starting mnesia. @@ -18,30 +55,32 @@ starting mnesia. Put `{rocksdb_copies, [node()]}` into the table definitions of tables you want to be in RocksDB. -## Special features -RocksDB tables support efficient selects on *prefix keys*. +#### Special features #### + +RocksDB tables support efficient selects on _prefix keys_. 
The backend uses the `sext` module (see -https://github.com/uwiger/sext) for mapping between Erlang terms and the +[`https://github.com/uwiger/sext`](https://github.com/uwiger/sext)) for mapping between Erlang terms and the binary data stored in the tables. This provides two useful properties: -- The records are stored in the Erlang term order of their keys. -- A prefix of a composite key is ordered just before any key for which - it is a prefix. For example, `{x, '_'}` is a prefix for keys `{x, a}`, - `{x, b}` and so on. +* The records are stored in the Erlang term order of their keys. + +* A prefix of a composite key is ordered just before any key for which + it is a prefix. For example, `{x, '_'}` is a prefix for keys `{x, a}`,`{x, b}` and so on. + This means that a prefix key identifies the start of the sequence of entries whose keys match the prefix. The backend uses this to optimize selects on prefix keys. -## Customization +### Customization RocksDB supports a number of customization options. These can be specified by providing a `{Key, Value}` list named `rocksdb_opts` under `user_properties`, for example: -```erlang +``` mnesia:create_table(foo, [{rocksdb_copies, [node()]}, ... {user_properties, @@ -53,6 +92,7 @@ Consult the [RocksDB documentation](https://github.com/facebook/rocksdb/wiki/Set for information on configuration parameters. Also see the section below on handling write errors. The default configuration for tables in `mnesia_rocksdb` is: + ``` default_open_opts() -> [ {create_if_missing, true} @@ -74,60 +114,169 @@ This is experimental, and mostly copied from `mnesia_leveldb`. Consult the source code in `mnesia_rocksdb_tuning.erl` and `mnesia_rocksdb_params.erl`. Contributions are welcome. -## Handling of errors in write operations -The RocksDB update operations return either `ok` or `{error, any()}`. 
-Since the actual updates are performed after the 'point-of-no-return', -returning an `error` result will cause mnesia to behave unpredictably, -since the operations are expected to simply work. - -### Option 1: `on_write_error` - -An `on_write_error` option can be provided, per-table, in the `rocksdb_opts` -user property (see [Customization](#customization) above). -Supported values indicate at which level an error indication should be reported. -Mnesia may save reported events in RAM, and may also print them, -depending on the debug level (controlled with `mnesia:set_debug_level/1`). - -Mnesia debug levels are, in increasing detail, `none | verbose | debug | trace` -The supported values for `on_write_error` are: - - | Value | Saved at debug level | Printed at debug level | Action | - | ------- | -------------------- | ---------------------- | --------- | - | debug | unless none | verbose, debug, trace | ignore | - | verbose | unless none | verbose, debug, trace | ignore | - | warning | always | always | ignore | - | error | always | always | exception | - | fatal | always | always | core dump | - -### Option 2: `on_write_error_store` - -An `on_write_error_store` option can be provided, per-table, in the `rocksdb_opts` -user property (see [Customization](#customization) above). -When set, the backend will use the value of the option as the name for an ETS table -which is used as storage for runtime write errors. The table must be set up outside -of the backend by the clients themselves. - -Entries to the table are in the form of a tuple `{{Table, Key}, Error, InsertedAt}` -where `Table` refers to the Mnesia table name, `Key` is the primary key being used by Mnesia, -`Error` is the error encountered by the backend, and `InsertedAt` refers to the time -the error was encountered as system time in milliseconds. - -The backend will only insert entries and otherwise not manage the table. 
Thus, clients -are expected to clean up the table during runtime to prevent memory leakage. - -## Caveats +#### Caveats #### Avoid placing `bag` tables in RocksDB. Although they work, each write requires additional reads, causing substantial runtime overheads. There are better ways to represent and process bag data (see above about -*prefix keys*). +_prefix keys_). The `mnesia:table_info(T, size)` call always returns zero for RocksDB tables. RocksDB itself does not track the number of elements in a table, and -although it is possible to make the mnesia_rocksdb backend maintain a size +although it is possible to make the `mnesia_rocksdb` backend maintain a size counter, it incurs a high runtime overhead for writes and deletes since it forces them to first do a read to check the existence of the key. If you depend on having an up to date size count at all times, you need to maintain it yourself. If you only need the size occasionally, you may traverse the table to count the elements. + + +### Mnesia backend plugins ### + + +#### Background #### + +Mnesia was initially designed to be a RAM-only DBMS, and Erlang's +`ets` tables were developed for this purpose. In order to support +persistence, e.g. for configuration data, a disk-based version of `ets` +(called `dets`) was created. The `dets` API mimicks the `ets` API, +and `dets` is quite convenient and fast for (nowadays) small datasets. +However, using a 32-bit bucket system, it is limited to 2GB of data. +It also doesn't support ordered sets. When used in Mnesia, dets-based +tables are called `disc_only_copies`. + +To circumvent these limitations, another table type, called `disc_copies` +was added. This is a combination of `ets` and `disk_log`, where Mnesia +periodically snapshots the `ets` data to a log file on disk, and meanwhile +maintains a log of updates, which can be applied at startup. 
These tables +are quite performant (especially on read access), but all data is kept in +RAM, which can become a serious limitation. + +A backend plugin system was proposed by Ulf Wiger in 2016, and further +developed with Klarna's support, to finally become included in OTP 19. +Klarna uses a LevelDb backend, but Aeternity, in 2017, instead chose +to implement a Rocksdb backend plugin. + + +### Design ### + +As backend plugins were added on a long-since legacy-stable Mnesia, +they had to conform to the existing code structure. For this reason, +the plugin callbacks hook into the already present low-level access +API in the `mnesia_lib` module. As a consequence, backend plugins have +the same access semantics and granularity as `ets` and `dets`. This +isn't much of a disadvantage for key-value stores like LevelDb and RocksDB, +but a more serious issue is that the update part of this API is called +on _after_ the point of no return. That is, Mnesia does not expect +these updates to fail, and has no recourse if they do. As an aside, +this could also happen if a `disc_only_copies` table exceeds the 2 GB +limit (mnesia will not check it, and `dets` will not complain, but simply +drop the update.) + + +### Mnesia index plugins ### + +When adding support for backend plugins, index plugins were also added. Unfortunately, they remain undocumented. + +An index plugin can be added in one of two ways: + +1. When creating a schema, provide `{index_plugins, [{Name, Module, Function}]}` options. + +1. Call the function `mnesia_schema:add_index_plugin(Name, Module, Function)` + + +`Name` must be an atom wrapped as a 1-tuple, e.g. `{words}`. + +The plugin callback is called as `Module:Function(Table, Pos, Obj)`, where `Pos=={words}` in +our example. It returns a list of index terms. + +Example + +Given the following index plugin implementation: + +``` +-module(words). +-export([words_f/3]). + +words_f(_,_,Obj) when is_tuple(Obj) -> + words_(tuple_to_list(Obj)). 
+ +words_(Str) when is_binary(Str) -> + string:lexemes(Str, [$\s, $\n, [$\r,$\n]]); +words_(L) when is_list(L) -> + lists:flatmap(fun words_/1, L); +words_(_) -> + []. +``` + +We can register the plugin and use it in table definitions: + +``` +Eshell V12.1.3 (abort with ^G) +1> mnesia:start(). +ok +2> mnesia_schema:add_index_plugin({words}, words, words_f). +{atomic,ok} +3> mnesia:create_table(i, [{index, [{words}]}]). +{atomic,ok} +``` + +Note that in this case, we had neither a backend plugin, nor even a persistent schema. +Index plugins can be used with all table types. The registered indexing function (arity 3) must exist +as an exported function along the node's code path. + +To see what happens when we insert an object, we can turn on call trace. + +``` +4> dbg:tracer(). +{ok,<0.108.0>} +5> dbg:tp(words, x). +{ok,[{matched,nonode@nohost,3},{saved,x}]} +6> dbg:p(all,[c]). +{ok,[{matched,nonode@nohost,60}]} +7> mnesia:dirty_write({i,<<"one two">>, [<<"three">>, <<"four">>]}). +(<0.84.0>) call words:words_f(i,{words},{i,<<"one two">>,[<<"three">>,<<"four">>]}) +(<0.84.0>) returned from words:words_f/3 -> [<<"one">>,<<"two">>,<<"three">>, + <<"four">>] +(<0.84.0>) call words:words_f(i,{words},{i,<<"one two">>,[<<"three">>,<<"four">>]}) +(<0.84.0>) returned from words:words_f/3 -> [<<"one">>,<<"two">>,<<"three">>, + <<"four">>] +ok +8> dbg:ctp('_'), dbg:stop(). +ok +9> mnesia:dirty_index_read(i, <<"one">>, {words}). +[{i,<<"one two">>,[<<"three">>,<<"four">>]}] +``` + +(The fact that the indexing function is called twice, seems like a performance bug.) + +We can observe that the indexing callback is able to operate on the whole object. +It needs to be side-effect free and efficient, since it will be called at least once for each update +(if an old object exists in the table, the indexing function will be called on it too, before it is +replaced by the new object.) + + +### Rocksdb ### + + +### Usage ### + + + +## Modules ## + + + + + + + + + + + + + +
mnesia_rocksdb
mnesia_rocksdb_admin
mnesia_rocksdb_app
mnesia_rocksdb_lib
mnesia_rocksdb_params
mnesia_rocksdb_sup
mnesia_rocksdb_tuning
mrdb
mrdb_index
mrdb_mutex
mrdb_select
+ diff --git a/doc/README.md b/doc/README.md new file mode 100644 index 0000000..28c149d --- /dev/null +++ b/doc/README.md @@ -0,0 +1,282 @@ + + +# Mnesia Rocksdb - Rocksdb backend plugin for Mnesia # + +Copyright (c) 2013-21 Klarna AB + +__Authors:__ Ulf Wiger ([`ulf@wiger.net`](mailto:ulf@wiger.net)). + +The Mnesia DBMS, part of Erlang/OTP, supports 'backend plugins', making +it possible to utilize more capable key-value stores than the `dets` +module (limited to 2 GB per table). Unfortunately, this support is +undocumented. Below, some informal documentation for the plugin system +is provided. + + +### Table of Contents ### + + +1. [Usage](#Usage) +1. [Prerequisites](#Prerequisites) +1. [Getting started](#Getting_started) +1. [Special features](#Special_features) +1. [Customization](#Customization) +1. [Handling of errors in write operations](#Handling_of_errors_in_write_operations) +1. [Caveats](#Caveats) + +1. [Mnesia backend plugins](#Mnesia_backend_plugins) +1. [Background](#Background) +1. [Design](#Design) + +1. [Mnesia index plugins](#Mnesia_index_plugins) + +1. [Rocksdb](#Rocksdb) + + + +### Usage ### + + +#### Prerequisites #### + +* rocksdb (included as dependency) + +* sext (included as dependency) + +* Erlang/OTP 21.0 or newer (https://github.com/erlang/otp) + + + +#### Getting started #### + +Call `mnesia_rocksdb:register()` immediately after +starting mnesia. + +Put `{rocksdb_copies, [node()]}` into the table definitions of +tables you want to be in RocksDB. + + +#### Special features #### + +RocksDB tables support efficient selects on _prefix keys_. + +The backend uses the `sext` module (see +[`https://github.com/uwiger/sext`](https://github.com/uwiger/sext)) for mapping between Erlang terms and the +binary data stored in the tables. This provides two useful properties: + +* The records are stored in the Erlang term order of their keys. + +* A prefix of a composite key is ordered just before any key for which + it is a prefix. 
For example, `{x, '_'}` is a prefix for keys `{x, a}`,`{x, b}` and so on. + + +This means that a prefix key identifies the start of the sequence of +entries whose keys match the prefix. The backend uses this to optimize +selects on prefix keys. + +### Customization + +RocksDB supports a number of customization options. These can be specified +by providing a `{Key, Value}` list named `rocksdb_opts` under `user_properties`, +for example: + +``` +mnesia:create_table(foo, [{rocksdb_copies, [node()]}, + ... + {user_properties, + [{rocksdb_opts, [{max_open_files, 1024}]}] + }]) +``` + +Consult the [RocksDB documentation](https://github.com/facebook/rocksdb/wiki/Setup-Options-and-Basic-Tuning) +for information on configuration parameters. Also see the section below on handling write errors. + +The default configuration for tables in `mnesia_rocksdb` is: + +``` +default_open_opts() -> + [ {create_if_missing, true} + , {cache_size, + list_to_integer(get_env_default("ROCKSDB_CACHE_SIZE", "32212254"))} + , {block_size, 1024} + , {max_open_files, 100} + , {write_buffer_size, + list_to_integer(get_env_default( + "ROCKSDB_WRITE_BUFFER_SIZE", "4194304"))} + , {compression, + list_to_atom(get_env_default("ROCKSDB_COMPRESSION", "true"))} + , {use_bloomfilter, true} + ]. +``` + +It is also possible, for larger databases, to produce a tuning parameter file. +This is experimental, and mostly copied from `mnesia_leveldb`. Consult the +source code in `mnesia_rocksdb_tuning.erl` and `mnesia_rocksdb_params.erl`. +Contributions are welcome. + + +#### Caveats #### + +Avoid placing `bag` tables in RocksDB. Although they work, each write +requires additional reads, causing substantial runtime overheads. There +are better ways to represent and process bag data (see above about +_prefix keys_). + +The `mnesia:table_info(T, size)` call always returns zero for RocksDB +tables. 
RocksDB itself does not track the number of elements in a table, and +although it is possible to make the `mnesia_rocksdb` backend maintain a size +counter, it incurs a high runtime overhead for writes and deletes since it +forces them to first do a read to check the existence of the key. If you +depend on having an up to date size count at all times, you need to maintain +it yourself. If you only need the size occasionally, you may traverse the +table to count the elements. + + +### Mnesia backend plugins ### + + +#### Background #### + +Mnesia was initially designed to be a RAM-only DBMS, and Erlang's +`ets` tables were developed for this purpose. In order to support +persistence, e.g. for configuration data, a disk-based version of `ets` +(called `dets`) was created. The `dets` API mimicks the `ets` API, +and `dets` is quite convenient and fast for (nowadays) small datasets. +However, using a 32-bit bucket system, it is limited to 2GB of data. +It also doesn't support ordered sets. When used in Mnesia, dets-based +tables are called `disc_only_copies`. + +To circumvent these limitations, another table type, called `disc_copies` +was added. This is a combination of `ets` and `disk_log`, where Mnesia +periodically snapshots the `ets` data to a log file on disk, and meanwhile +maintains a log of updates, which can be applied at startup. These tables +are quite performant (especially on read access), but all data is kept in +RAM, which can become a serious limitation. + +A backend plugin system was proposed by Ulf Wiger in 2016, and further +developed with Klarna's support, to finally become included in OTP 19. +Klarna uses a LevelDb backend, but Aeternity, in 2017, instead chose +to implement a Rocksdb backend plugin. + + +### Design ### + +As backend plugins were added on a long-since legacy-stable Mnesia, +they had to conform to the existing code structure. 
For this reason, +the plugin callbacks hook into the already present low-level access +API in the `mnesia_lib` module. As a consequence, backend plugins have +the same access semantics and granularity as `ets` and `dets`. This +isn't much of a disadvantage for key-value stores like LevelDb and RocksDB, +but a more serious issue is that the update part of this API is called +on _after_ the point of no return. That is, Mnesia does not expect +these updates to fail, and has no recourse if they do. As an aside, +this could also happen if a `disc_only_copies` table exceeds the 2 GB +limit (mnesia will not check it, and `dets` will not complain, but simply +drop the update.) + + +### Mnesia index plugins ### + +When adding support for backend plugins, index plugins were also added. Unfortunately, they remain undocumented. + +An index plugin can be added in one of two ways: + +1. When creating a schema, provide `{index_plugins, [{Name, Module, Function}]}` options. + +1. Call the function `mnesia_schema:add_index_plugin(Name, Module, Function)` + + +`Name` must be an atom wrapped as a 1-tuple, e.g. `{words}`. + +The plugin callback is called as `Module:Function(Table, Pos, Obj)`, where `Pos=={words}` in +our example. It returns a list of index terms. + +Example + +Given the following index plugin implementation: + +``` +-module(words). +-export([words_f/3]). + +words_f(_,_,Obj) when is_tuple(Obj) -> + words_(tuple_to_list(Obj)). + +words_(Str) when is_binary(Str) -> + string:lexemes(Str, [$\s, $\n, [$\r,$\n]]); +words_(L) when is_list(L) -> + lists:flatmap(fun words_/1, L); +words_(_) -> + []. +``` + +We can register the plugin and use it in table definitions: + +``` +Eshell V12.1.3 (abort with ^G) +1> mnesia:start(). +ok +2> mnesia_schema:add_index_plugin({words}, words, words_f). +{atomic,ok} +3> mnesia:create_table(i, [{index, [{words}]}]). +{atomic,ok} +``` + +Note that in this case, we had neither a backend plugin, nor even a persistent schema. 
+Index plugins can be used with all table types. The registered indexing function (arity 3) must exist +as an exported function along the node's code path. + +To see what happens when we insert an object, we can turn on call trace. + +``` +4> dbg:tracer(). +{ok,<0.108.0>} +5> dbg:tp(words, x). +{ok,[{matched,nonode@nohost,3},{saved,x}]} +6> dbg:p(all,[c]). +{ok,[{matched,nonode@nohost,60}]} +7> mnesia:dirty_write({i,<<"one two">>, [<<"three">>, <<"four">>]}). +(<0.84.0>) call words:words_f(i,{words},{i,<<"one two">>,[<<"three">>,<<"four">>]}) +(<0.84.0>) returned from words:words_f/3 -> [<<"one">>,<<"two">>,<<"three">>, + <<"four">>] +(<0.84.0>) call words:words_f(i,{words},{i,<<"one two">>,[<<"three">>,<<"four">>]}) +(<0.84.0>) returned from words:words_f/3 -> [<<"one">>,<<"two">>,<<"three">>, + <<"four">>] +ok +8> dbg:ctp('_'), dbg:stop(). +ok +9> mnesia:dirty_index_read(i, <<"one">>, {words}). +[{i,<<"one two">>,[<<"three">>,<<"four">>]}] +``` + +(The fact that the indexing function is called twice, seems like a performance bug.) + +We can observe that the indexing callback is able to operate on the whole object. +It needs to be side-effect free and efficient, since it will be called at least once for each update +(if an old object exists in the table, the indexing function will be called on it too, before it is +replaced by the new object.) + + +### Rocksdb ### + + +### Usage ### + + + +## Modules ## + + + + + + + + + + + + + +
mnesia_rocksdb
mnesia_rocksdb_admin
mnesia_rocksdb_app
mnesia_rocksdb_lib
mnesia_rocksdb_params
mnesia_rocksdb_sup
mnesia_rocksdb_tuning
mrdb
mrdb_index
mrdb_mutex
mrdb_select
+ diff --git a/doc/edoc-info b/doc/edoc-info new file mode 100644 index 0000000..b425e49 --- /dev/null +++ b/doc/edoc-info @@ -0,0 +1,5 @@ +%% encoding: UTF-8 +{application,mnesia_rocksdb}. +{modules,[mnesia_rocksdb,mnesia_rocksdb_admin,mnesia_rocksdb_app, + mnesia_rocksdb_lib,mnesia_rocksdb_params,mnesia_rocksdb_sup, + mnesia_rocksdb_tuning,mrdb,mrdb_index,mrdb_mutex,mrdb_select]}. diff --git a/doc/erlang.png b/doc/erlang.png new file mode 100644 index 0000000..987a618 Binary files /dev/null and b/doc/erlang.png differ diff --git a/doc/mnesia_rocksdb.md b/doc/mnesia_rocksdb.md new file mode 100644 index 0000000..cdb753e --- /dev/null +++ b/doc/mnesia_rocksdb.md @@ -0,0 +1,547 @@ + + +# Module mnesia_rocksdb # +* [Description](#description) +* [Data Types](#types) +* [Function Index](#index) +* [Function Details](#functions) + +rocksdb storage backend for Mnesia. + +__Behaviours:__ [`gen_server`](gen_server.md), [`mnesia_backend_type`](mnesia_backend_type.md). + + + +## Description ## +This module implements a mnesia backend callback plugin. +It's specifically documented to try to explain the workings of +backend plugins. + + + +## Data Types ## + + + + +### alias() ### + + +

+alias() = atom()
+
+ + + + +### data_tab() ### + + +

+data_tab() = atom()
+
+ + + + +### error() ### + + +

+error() = {error, any()}
+
+ + + + +### index_info() ### + + +

+index_info() = {index_pos(), index_type()}
+
+ + + + +### index_pos() ### + + +

+index_pos() = integer() | {atom()}
+
+ + + + +### index_tab() ### + + +

+index_tab() = {data_tab(), index, index_info()}
+
+ + + + +### index_type() ### + + +

+index_type() = ordered
+
+ + + + +### retainer_name() ### + + +

+retainer_name() = any()
+
+ + + + +### retainer_tab() ### + + +

+retainer_tab() = {data_tab(), retainer, retainer_name()}
+
+ + + + +### table() ### + + +

+table() = data_tab() | index_tab() | retainer_tab()
+
+ + + + +### table_type() ### + + +

+table_type() = set | ordered_set | bag
+
+ + + +## Function Index ## + + +
add_aliases/1
check_definition/4
close_table/2
code_change/3
create_schema/1
create_schema/2
create_table/3
decode_key/1
decode_key/2
decode_val/1
decode_val/3
default_alias/0
delete/3
delete_table/2
encode_key/1
encode_key/2
encode_val/1
encode_val/2
first/2
fixtable/3
handle_call/3
handle_cast/2
handle_info/2
index_is_consistent/3
info/3
init/1
init_backend/0Called by mnesia_schema in order to initialize the backend.
insert/3
is_index_consistent/2
ix_listvals/3
ix_prefixes/3
last/2
load_table/4
lookup/3
match_delete/3
next/3
prev/3
real_suffixes/0
receive_data/5
receive_done/4
receiver_first_message/4
register/0Equivalent to register(rocksdb_copies).
register/1Convenience function for registering a mnesia_rocksdb backend plugin.
remove_aliases/1
repair_continuation/2
select/1
select/3
select/4
semantics/2
sender_handle_info/5
sender_init/4
show_table/1A debug function that shows the rocksdb table content.
show_table/2
slot/3
start_proc/6
sync_close_table/2
terminate/2
tmp_suffixes/0
update_counter/4
validate_key/6
validate_record/6
+ + + + +## Function Details ## + + + +### add_aliases/1 ### + +`add_aliases(Aliases) -> any()` + + + +### check_definition/4 ### + +`check_definition(Alias, Tab, Nodes, Props) -> any()` + + + +### close_table/2 ### + +`close_table(Alias, Tab) -> any()` + + + +### code_change/3 ### + +`code_change(FromVsn, St, Extra) -> any()` + + + +### create_schema/1 ### + +`create_schema(Nodes) -> any()` + + + +### create_schema/2 ### + +`create_schema(Nodes, Aliases) -> any()` + + + +### create_table/3 ### + +`create_table(Alias, Tab, Props) -> any()` + + + +### decode_key/1 ### + +`decode_key(Key) -> any()` + + + +### decode_key/2 ### + +`decode_key(Key, Metadata) -> any()` + + + +### decode_val/1 ### + +`decode_val(Val) -> any()` + + + +### decode_val/3 ### + +`decode_val(Val, Key, Metadata) -> any()` + + + +### default_alias/0 ### + +`default_alias() -> any()` + + + +### delete/3 ### + +`delete(Alias, Tab, Key) -> any()` + + + +### delete_table/2 ### + +`delete_table(Alias, Tab) -> any()` + + + +### encode_key/1 ### + +`encode_key(Key) -> any()` + + + +### encode_key/2 ### + +`encode_key(Key, Metadata) -> any()` + + + +### encode_val/1 ### + +`encode_val(Val) -> any()` + + + +### encode_val/2 ### + +`encode_val(Val, Metadata) -> any()` + + + +### first/2 ### + +`first(Alias, Tab) -> any()` + + + +### fixtable/3 ### + +`fixtable(Alias, Tab, Bool) -> any()` + + + +### handle_call/3 ### + +`handle_call(X1, From, St) -> any()` + + + +### handle_cast/2 ### + +`handle_cast(X1, St) -> any()` + + + +### handle_info/2 ### + +`handle_info(EXIT, St) -> any()` + + + +### index_is_consistent/3 ### + +`index_is_consistent(Alias, X2, Bool) -> any()` + + + +### info/3 ### + +`info(Alias, Tab, Item) -> any()` + + + +### init/1 ### + +`init(X1) -> any()` + + + +### init_backend/0 ### + +`init_backend() -> any()` + +Called by mnesia_schema in order to intialize the backend + +This is called when the backend is registered with the first alias, or ... + +See OTP issue #425 (16 Feb 2021). 
This callback is supposed to be called +before first use of the backend, but unfortunately, it is only called at +mnesia startup and when a backend module is registered MORE THAN ONCE. +This means we need to handle this function being called multiple times. + +The bug has been fixed as of OTP 24.0-rc3 + +If processes need to be started, this can be done using +`mnesia_ext_sup:start_proc(Name, Mod, F, Args [, Opts])` +where Opts are parameters for the supervised child: + +* `restart` (default: `transient`) +* `shutdown` (default: `120000`) +* `type` (default: `worker`) +* `modules` (default: `[Mod]`) + + + +### insert/3 ### + +`insert(Alias, Tab, Obj) -> any()` + + + +### is_index_consistent/2 ### + +`is_index_consistent(Alias, X2) -> any()` + + + +### ix_listvals/3 ### + +`ix_listvals(Tab, Pos, Obj) -> any()` + + + +### ix_prefixes/3 ### + +`ix_prefixes(Tab, Pos, Obj) -> any()` + + + +### last/2 ### + +`last(Alias, Tab) -> any()` + + + +### load_table/4 ### + +`load_table(Alias, Tab, LoadReason, Opts) -> any()` + + + +### lookup/3 ### + +`lookup(Alias, Tab, Key) -> any()` + + + +### match_delete/3 ### + +`match_delete(Alias, Tab, Pat) -> any()` + + + +### next/3 ### + +`next(Alias, Tab, Key) -> any()` + + + +### prev/3 ### + +`prev(Alias, Tab, Key) -> any()` + + + +### real_suffixes/0 ### + +`real_suffixes() -> any()` + + + +### receive_data/5 ### + +`receive_data(Data, Alias, Tab, Sender, State) -> any()` + + + +### receive_done/4 ### + +`receive_done(Alias, Tab, Sender, State) -> any()` + + + +### receiver_first_message/4 ### + +`receiver_first_message(Pid, Msg, Alias, Tab) -> any()` + + + +### register/0 ### + +

+register() -> {ok, alias()} | {error, term()}
+
+
+ +Equivalent to [`register(rocksdb_copies)`](#register-1). + + + +### register/1 ### + +

+register(Alias::alias()) -> {ok, alias()} | error()
+
+
+ +Convenience function for registering a mnesia_rocksdb backend plugin + +The function used to register a plugin is `mnesia_schema:add_backend_type(Alias, Module)` +where `Module` implements a backend_type behavior. `Alias` is an atom, and is used +in the same way as `ram_copies` etc. The default alias is `rocksdb_copies`. + + + +### remove_aliases/1 ### + +`remove_aliases(Aliases) -> any()` + + + +### repair_continuation/2 ### + +`repair_continuation(Cont, Ms) -> any()` + + + +### select/1 ### + +`select(Cont) -> any()` + + + +### select/3 ### + +`select(Alias, Tab, Ms) -> any()` + + + +### select/4 ### + +`select(Alias, IxTab, Ms, Limit) -> any()` + + + +### semantics/2 ### + +`semantics(Alias, X2) -> any()` + + + +### sender_handle_info/5 ### + +`sender_handle_info(Msg, Alias, Tab, ReceiverPid, Cont) -> any()` + + + +### sender_init/4 ### + +`sender_init(Alias, Tab, RemoteStorage, Pid) -> any()` + + + +### show_table/1 ### + +`show_table(Tab) -> any()` + +A debug function that shows the rocksdb table content + + + +### show_table/2 ### + +`show_table(Tab, Limit) -> any()` + + + +### slot/3 ### + +`slot(Alias, Tab, Pos) -> any()` + + + +### start_proc/6 ### + +`start_proc(Alias, Tab, Type, ProcName, Props, RdbOpts) -> any()` + + + +### sync_close_table/2 ### + +`sync_close_table(Alias, Tab) -> any()` + + + +### terminate/2 ### + +`terminate(Reason, St) -> any()` + + + +### tmp_suffixes/0 ### + +`tmp_suffixes() -> any()` + + + +### update_counter/4 ### + +`update_counter(Alias, Tab, C, Val) -> any()` + + + +### validate_key/6 ### + +`validate_key(Alias, Tab, RecName, Arity, Type, Key) -> any()` + + + +### validate_record/6 ### + +`validate_record(Alias, Tab, RecName, Arity, Type, Obj) -> any()` + diff --git a/doc/mnesia_rocksdb_admin.md b/doc/mnesia_rocksdb_admin.md new file mode 100644 index 0000000..71ec35d --- /dev/null +++ b/doc/mnesia_rocksdb_admin.md @@ -0,0 +1,326 @@ + + +# Module mnesia_rocksdb_admin # +* [Data Types](#types) +* [Function Index](#index) 
+* [Function Details](#functions) + +__Behaviours:__ [`gen_server`](gen_server.md). + + + +## Data Types ## + + + + +### alias() ### + + +

+alias() = atom()
+
+ + + + +### backend() ### + + +

+backend() = #{db_ref => db_ref(), cf_info => #{table() => cf()}}
+
+ + + + +### cf() ### + + +

+cf() = mrdb:db_ref()
+
+ + + + +### db_ref() ### + + +

+db_ref() = rocksdb:db_handle()
+
+ + + + +### gen_server_noreply() ### + + +

+gen_server_noreply() = {noreply, st()} | {stop, reason(), st()}
+
+ + + + +### gen_server_reply() ### + + +

+gen_server_reply() = {reply, reply(), st()} | {stop, reason(), reply(), st()}
+
+ + + + +### properties() ### + + +

+properties() = [{atom(), any()}]
+
+ + + + +### reason() ### + + +

+reason() = any()
+
+ + + + +### reply() ### + + +

+reply() = any()
+
+ + + + +### req() ### + + +

+req() = {create_table, table(), properties()} | {delete_table, table()} | {load_table, table()} | {related_resources, table()} | {get_ref, table()} | {add_aliases, [alias()]} | {write_table_property, tabname(), tuple()} | {remove_aliases, [alias()]} | {migrate, [{tabname(), map()}]} | {prep_close, table()} | {close_table, table()}
+
+ + + + +### st() ### + + +

+st() = #st{backends = #{alias() => backend()}, standalone = #{{alias(), table()} => cf()}, default_opts = [{atom(), term()}]}
+
+ + + + +### table() ### + + +

+table() = tabname() | {admin, alias()} | {tabname(), index, any()} | {tabname(), retainer, any()}
+
+ + + + +### tabname() ### + + +

+tabname() = atom()
+
+ + + +## Function Index ## + + +
add_aliases/1
close_table/2
code_change/3
create_table/3
delete_table/2
ensure_started/0
get_ref/1
get_ref/2
handle_call/3
handle_cast/2
handle_info/2
init/1
load_table/2
meta/0
migrate_standalone/2
prep_close/2
read_info/1
read_info/2
read_info/4
related_resources/2
remove_aliases/1
request_ref/2
start_link/0
terminate/2
write_info/4
write_table_property/3
+ + + + +## Function Details ## + + + +### add_aliases/1 ### + +`add_aliases(Aliases) -> any()` + + + +### close_table/2 ### + +`close_table(Alias, Name) -> any()` + + + +### code_change/3 ### + +`code_change(FromVsn, St, Extra) -> any()` + + + +### create_table/3 ### + +`create_table(Alias, Name, Props) -> any()` + + + +### delete_table/2 ### + +

+delete_table(Alias::alias(), Name::tabname()) -> ok
+
+
+ + + +### ensure_started/0 ### + +

+ensure_started() -> ok
+
+
+ + + +### get_ref/1 ### + +`get_ref(Name) -> any()` + + + +### get_ref/2 ### + +`get_ref(Name, Default) -> any()` + + + +### handle_call/3 ### + +

+handle_call(Req::{alias(), req()}, From::any(), St::st()) -> gen_server_reply()
+
+
+ + + +### handle_cast/2 ### + +

+handle_cast(Msg::any(), St::st()) -> gen_server_noreply()
+
+
+ + + +### handle_info/2 ### + +

+handle_info(Msg::any(), St::st()) -> gen_server_noreply()
+
+
+ + + +### init/1 ### + +`init(X1) -> any()` + + + +### load_table/2 ### + +`load_table(Alias, Name) -> any()` + + + +### meta/0 ### + +`meta() -> any()` + + + +### migrate_standalone/2 ### + +`migrate_standalone(Alias, Tabs) -> any()` + + + +### prep_close/2 ### + +`prep_close(Alias, Tab) -> any()` + + + +### read_info/1 ### + +`read_info(TRec) -> any()` + + + +### read_info/2 ### + +`read_info(Alias, Tab) -> any()` + + + +### read_info/4 ### + +`read_info(Alias, Tab, K, Default) -> any()` + + + +### related_resources/2 ### + +`related_resources(Alias, Name) -> any()` + + + +### remove_aliases/1 ### + +`remove_aliases(Aliases) -> any()` + + + +### request_ref/2 ### + +`request_ref(Alias, Name) -> any()` + + + +### start_link/0 ### + +`start_link() -> any()` + + + +### terminate/2 ### + +`terminate(X1, St) -> any()` + + + +### write_info/4 ### + +`write_info(Alias, Tab, K, V) -> any()` + + + +### write_table_property/3 ### + +`write_table_property(Alias, Tab, Prop) -> any()` + diff --git a/doc/mnesia_rocksdb_app.md b/doc/mnesia_rocksdb_app.md new file mode 100644 index 0000000..7759199 --- /dev/null +++ b/doc/mnesia_rocksdb_app.md @@ -0,0 +1,32 @@ + + +# Module mnesia_rocksdb_app # +* [Function Index](#index) +* [Function Details](#functions) + +__Behaviours:__ [`application`](application.md). + + + +## Function Index ## + + +
start/2
stop/1
+ + + + +## Function Details ## + + + +### start/2 ### + +`start(StartType, StartArgs) -> any()` + + + +### stop/1 ### + +`stop(State) -> any()` + diff --git a/doc/mnesia_rocksdb_lib.md b/doc/mnesia_rocksdb_lib.md new file mode 100644 index 0000000..263b268 --- /dev/null +++ b/doc/mnesia_rocksdb_lib.md @@ -0,0 +1,168 @@ + + +# Module mnesia_rocksdb_lib # +* [Description](#description) +* [Function Index](#index) +* [Function Details](#functions) + +RocksDB update wrappers, in separate module for easy tracing and mocking. + + + +## Description ## + + +## Function Index ## + + +
check_encoding/2
create_mountpoint/1
data_mountpoint/1
decode/2
decode_key/1
decode_key/2
decode_val/1
decode_val/3
default_encoding/3
delete/3
encode/2
encode_key/1
encode_key/2
encode_val/1
encode_val/2
keypos/1
open_rocksdb/3
put/4
tabname/1
valid_key_type/2
valid_obj_type/2
write/3
+ + + + +## Function Details ## + + + +### check_encoding/2 ### + +`check_encoding(Encoding, Attributes) -> any()` + + + +### create_mountpoint/1 ### + +`create_mountpoint(Tab) -> any()` + + + +### data_mountpoint/1 ### + +`data_mountpoint(Tab) -> any()` + + + +### decode/2 ### + +`decode(Val, X2) -> any()` + + + +### decode_key/1 ### + +

+decode_key(CodedKey::binary()) -> any()
+
+
+ + + +### decode_key/2 ### + +`decode_key(CodedKey, Enc) -> any()` + + + +### decode_val/1 ### + +

+decode_val(CodedVal::binary()) -> any()
+
+
+ + + +### decode_val/3 ### + +`decode_val(CodedVal, K, Ref) -> any()` + + + +### default_encoding/3 ### + +`default_encoding(X1, Type, As) -> any()` + + + +### delete/3 ### + +`delete(Ref, K, Opts) -> any()` + + + +### encode/2 ### + +`encode(Value, X2) -> any()` + + + +### encode_key/1 ### + +

+encode_key(Key::any()) -> binary()
+
+
+ + + +### encode_key/2 ### + +`encode_key(Key, X2) -> any()` + + + +### encode_val/1 ### + +

+encode_val(Val::any()) -> binary()
+
+
+ + + +### encode_val/2 ### + +`encode_val(Val, Enc) -> any()` + + + +### keypos/1 ### + +`keypos(Tab) -> any()` + + + +### open_rocksdb/3 ### + +`open_rocksdb(MPd, RdbOpts, CFs) -> any()` + + + +### put/4 ### + +`put(Ref, K, V, Opts) -> any()` + + + +### tabname/1 ### + +`tabname(Tab) -> any()` + + + +### valid_key_type/2 ### + +`valid_key_type(X1, Key) -> any()` + + + +### valid_obj_type/2 ### + +`valid_obj_type(X1, Obj) -> any()` + + + +### write/3 ### + +`write(X1, L, Opts) -> any()` + diff --git a/doc/mnesia_rocksdb_params.md b/doc/mnesia_rocksdb_params.md new file mode 100644 index 0000000..97fdb04 --- /dev/null +++ b/doc/mnesia_rocksdb_params.md @@ -0,0 +1,80 @@ + + +# Module mnesia_rocksdb_params # +* [Function Index](#index) +* [Function Details](#functions) + +__Behaviours:__ [`gen_server`](gen_server.md). + + + +## Function Index ## + + +
code_change/3
delete/1
handle_call/3
handle_cast/2
handle_info/2
init/1
lookup/2
start_link/0
store/2
terminate/2
+ + + + +## Function Details ## + + + +### code_change/3 ### + +`code_change(X1, S, X3) -> any()` + + + +### delete/1 ### + +`delete(Tab) -> any()` + + + +### handle_call/3 ### + +`handle_call(X1, X2, S) -> any()` + + + +### handle_cast/2 ### + +`handle_cast(X1, S) -> any()` + + + +### handle_info/2 ### + +`handle_info(X1, S) -> any()` + + + +### init/1 ### + +`init(X1) -> any()` + + + +### lookup/2 ### + +`lookup(Tab, Default) -> any()` + + + +### start_link/0 ### + +`start_link() -> any()` + + + +### store/2 ### + +`store(Tab, Params) -> any()` + + + +### terminate/2 ### + +`terminate(X1, X2) -> any()` + diff --git a/doc/mnesia_rocksdb_sup.md b/doc/mnesia_rocksdb_sup.md new file mode 100644 index 0000000..439c02b --- /dev/null +++ b/doc/mnesia_rocksdb_sup.md @@ -0,0 +1,32 @@ + + +# Module mnesia_rocksdb_sup # +* [Function Index](#index) +* [Function Details](#functions) + +__Behaviours:__ [`supervisor`](supervisor.md). + + + +## Function Index ## + + +
init/1
start_link/0
+ + + + +## Function Details ## + + + +### init/1 ### + +`init(X1) -> any()` + + + +### start_link/0 ### + +`start_link() -> any()` + diff --git a/doc/mnesia_rocksdb_tuning.md b/doc/mnesia_rocksdb_tuning.md new file mode 100644 index 0000000..2c688dc --- /dev/null +++ b/doc/mnesia_rocksdb_tuning.md @@ -0,0 +1,126 @@ + + +# Module mnesia_rocksdb_tuning # +* [Function Index](#index) +* [Function Details](#functions) + + + +## Function Index ## + + +
cache/1
calc_sizes/0
calc_sizes/1
count_rdb_tabs/0
count_rdb_tabs/1
default/1
describe_env/0
get_avail_ram/0
get_maxfiles/0
get_maxfiles/1
ideal_max_files/0
ideal_max_files/1
max_files/1
rdb_indexes/0
rdb_indexes/1
rdb_tabs/0
rdb_tabs/1
write_buffer/1
+ + + + +## Function Details ## + + + +### cache/1 ### + +`cache(X1) -> any()` + + + +### calc_sizes/0 ### + +`calc_sizes() -> any()` + + + +### calc_sizes/1 ### + +`calc_sizes(D) -> any()` + + + +### count_rdb_tabs/0 ### + +`count_rdb_tabs() -> any()` + + + +### count_rdb_tabs/1 ### + +`count_rdb_tabs(Db) -> any()` + + + +### default/1 ### + +`default(X1) -> any()` + + + +### describe_env/0 ### + +`describe_env() -> any()` + + + +### get_avail_ram/0 ### + +`get_avail_ram() -> any()` + + + +### get_maxfiles/0 ### + +`get_maxfiles() -> any()` + + + +### get_maxfiles/1 ### + +`get_maxfiles(X1) -> any()` + + + +### ideal_max_files/0 ### + +`ideal_max_files() -> any()` + + + +### ideal_max_files/1 ### + +`ideal_max_files(D) -> any()` + + + +### max_files/1 ### + +`max_files(X1) -> any()` + + + +### rdb_indexes/0 ### + +`rdb_indexes() -> any()` + + + +### rdb_indexes/1 ### + +`rdb_indexes(Db) -> any()` + + + +### rdb_tabs/0 ### + +`rdb_tabs() -> any()` + + + +### rdb_tabs/1 ### + +`rdb_tabs(Db) -> any()` + + + +### write_buffer/1 ### + +`write_buffer(X1) -> any()` + diff --git a/doc/mrdb.md b/doc/mrdb.md new file mode 100644 index 0000000..c27d810 --- /dev/null +++ b/doc/mrdb.md @@ -0,0 +1,943 @@ + + +# Module mrdb # +* [Description](#description) +* [Data Types](#types) +* [Function Index](#index) +* [Function Details](#functions) + +Mid-level access API for Mnesia-managed rocksdb tables. + + + +## Description ## + +This module implements access functions for the mnesia_rocksdb +backend plugin. The functions are designed to also support +direct access to rocksdb with little overhead. Such direct +access will maintain existing indexes, but not support +replication. + +Each table has a metadata structure stored as a persistent +term for fast access. 
The structure of the metadata is as +follows: + +``` + #{ name := + , db_ref := + , cf_handle := + , batch := + , tx_handle := + , attr_pos := #{AttrName := Pos} + , mode := + , properties := + , type := column_family | standalone + } +``` + +Helper functions like `as_batch(Ref, fun(R) -> ... end)` and +`with_iterator(Ref, fun(I) -> ... end)` add some extra +convenience on top of the `rocksdb` API. + +Note that no automatic provision exists to manage concurrent +updates via mnesia AND direct access to this API. It's advisable +to use ONE primary mode of access. If replication is used, +the mnesia API will support this, but direct `mrdb` updates will +not be replicated. + + +## Data Types ## + + + + +### activity_type() ### + + +

+activity_type() = mrdb_activity_type() | mnesia_activity_type()
+
+ + + + +### admin_tab() ### + + +

+admin_tab() = {admin, alias()}
+
+ + + + +### alias() ### + + +

+alias() = atom()
+
+ + + + +### attr_pos() ### + + +

+attr_pos() = #{atom() => pos()}
+
+ + + + +### batch_handle() ### + + +

+batch_handle() = rocksdb:batch_handle()
+
+ + + + +### cf_handle() ### + + +

+cf_handle() = rocksdb:cf_handle()
+
+ + + + +### db_handle() ### + + +

+db_handle() = rocksdb:db_handle()
+
+ + + + +### db_ref() ### + + +

+db_ref() = #{name => table(), alias => atom(), vsn => non_neg_integer(), db_ref => db_handle(), cf_handle => cf_handle(), semantics => semantics(), encoding => encoding(), attr_pos => attr_pos(), type => column_family | standalone, status => open | closed | pre_existing, properties => properties(), mode => mnesia, ix_vals_f => fun((tuple()) -> [any()]), batch => batch_handle(), tx_handle => tx_handle(), term() => term()}
+
+ + + + +### encoding() ### + + +

+encoding() = raw | sext | term | {key_encoding(), val_encoding()}
+
+ + + + +### error() ### + + +

+error() = {error, any()}
+
+ + + + +### index() ### + + +

+index() = {tab_name(), index, any()}
+
+ + + + +### index_position() ### + + +

+index_position() = atom() | pos()
+
+ + + + +### iterator_action() ### + + +

+iterator_action() = first | last | next | prev | binary() | {seek, binary()} | {seek_for_prev, binary()}
+
+ + + + +### itr_handle() ### + + +

+itr_handle() = rocksdb:itr_handle()
+
+ + + + +### key() ### + + +

+key() = any()
+
+ + + + +### key_encoding() ### + + +

+key_encoding() = raw | sext | term
+
+ + + + +### mnesia_activity_type() ### + + +

+mnesia_activity_type() = transaction | sync_transaction | async_dirty | sync_dirty
+
+ + + + +### mrdb_activity_type() ### + + +

+mrdb_activity_type() = tx | {tx, tx_options()} | batch
+
+ + + + +### mrdb_iterator() ### + + +

+mrdb_iterator() = #mrdb_iter{i = itr_handle(), ref = db_ref()}
+
+ + + + +### obj() ### + + +

+obj() = tuple()
+
+ + + + +### pos() ### + + +

+pos() = non_neg_integer()
+
+ + + + +### properties() ### + + +

+properties() = #{record_name => atom(), attributes => [atom()], index => [{pos(), bag | ordered}]}
+
+ + + + +### read_options() ### + + +

+read_options() = [{verify_checksums, boolean()} | {fill_cache, boolean()} | {iterate_upper_bound, binary()} | {iterate_lower_bound, binary()} | {tailing, boolean()} | {total_order_seek, boolean()} | {prefix_same_as_start, boolean()} | {snapshot, snapshot_handle()}]
+
+ + + + +### ref_or_tab() ### + + +

+ref_or_tab() = table() | db_ref()
+
+ + + + +### retainer() ### + + +

+retainer() = {tab_name(), retainer, any()}
+
+ + + + +### retries() ### + + +

+retries() = non_neg_integer()
+
+ + + + +### semantics() ### + + +

+semantics() = bag | set
+
+ + + + +### snapshot_handle() ### + + +

+snapshot_handle() = rocksdb:snapshot_handle()
+
+ + + + +### tab_name() ### + + +

+tab_name() = atom()
+
+ + + + +### table() ### + + +

+table() = atom() | admin_tab() | index() | retainer()
+
+ + + + +### tx_handle() ### + + +

+tx_handle() = rocksdb:transaction_handle()
+
+ + + + +### tx_options() ### + + +

+tx_options() = #{retries => retries(), no_snapshot => boolean()}
+
+ + + + +### val_encoding() ### + + +

+val_encoding() = {value | object, term | raw} | raw
+
+ + + + +### write_options() ### + + +

+write_options() = [{sync, boolean()} | {disable_wal, boolean()} | {ignore_missing_column_families, boolean()} | {no_slowdown, boolean()} | {low_pri, boolean()}]
+
+ + + +## Function Index ## + + +
abort/1Aborts an ongoing activity/2

activity/3Run an activity (similar to mnesia:activity/2).
alias_of/1Returns the alias of a given table or table reference.
as_batch/2Creates a rocksdb batch context and executes the fun F in it.

as_batch/3Same as as_batch/2, but with the ability to pass Opts to rocksdb:write_batch/2
batch_write/2
batch_write/3
current_context/0
delete/2
delete/3
delete_object/2
delete_object/3
ensure_ref/1
ensure_ref/2
first/1
first/2
fold/3
fold/4
fold/5
get_batch/1
get_ref/1
index_read/3
insert/2
insert/3
iterator/1
iterator/2
iterator_close/1
iterator_move/2
last/1
last/2
match_delete/2
new_tx/1
new_tx/2
next/2
next/3
prev/2
prev/3
rdb_delete/2
rdb_delete/3
rdb_fold/4
rdb_fold/5
rdb_get/2
rdb_get/3
rdb_iterator/1
rdb_iterator/2
rdb_iterator_move/2
rdb_put/3
rdb_put/4
read/2
read/3
read_info/1
read_info/2

release_snapshot/1Release a snapshot created by snapshot/1.
select/1
select/2
select/3
snapshot/1Create a snapshot of the database instance associated with the +table reference, table name or alias.
tx_commit/1
tx_ref/2
update_counter/3
update_counter/4
with_iterator/2
with_iterator/3
with_rdb_iterator/2
with_rdb_iterator/3
write_info/3
+ + + + +## Function Details ## + + + +### abort/1 ### + +`abort(Reason) -> any()` + +Aborts an ongoing [`activity/2`](#activity-2) + + + +### activity/3 ### + +

+activity(Type::activity_type(), Alias::alias(), F::fun(() -> Res)) -> Res
+
+
+
+Run an activity (similar to [`//mnesia/mnesia:activity/2`](http://www.erlang.org/doc/man/mnesia.html#activity-2)).
+
+Supported activity types are:
+
+* `transaction` - An optimistic `rocksdb` transaction
+
+* `{tx, TxOpts}` - A `rocksdb` transaction with slight modifications
+
+* `batch` - A `rocksdb` batch operation
+
+
+By default, transactions are combined with a snapshot with 1 retry.
+The snapshot ensures that writes from concurrent transactions don't leak into the transaction context.
+A transaction will be retried if it detects that the commit set conflicts with recent changes.
+A mutex is used to ensure that only one of potentially conflicting `mrdb` transactions is run at a time.
+The re-run transaction may still fail, if new transactions, or non-transaction writes interfere with
+the commit set. It will then be re-run again, until the retry count is exhausted.
+
+Valid `TxOpts` are `#{no_snapshot => boolean(), retries => retries()}`.
+
+To simplify code adaptation, `tx | transaction | sync_transaction` are synonyms, and
+`batch | async_dirty | sync_dirty` are synonyms.
+
+
+
+### alias_of/1 ###
+

+alias_of(Tab::ref_or_tab()) -> alias()
+
+
+ +Returns the alias of a given table or table reference. + + + +### as_batch/2 ### + +

+as_batch(Tab::ref_or_tab(), F::fun((db_ref()) -> Res)) -> Res
+
+
+
+Creates a `rocksdb` batch context and executes the fun `F` in it.
+
+Rocksdb batches aren't tied to a specific DbRef until written.
+This can cause surprising problems if we're juggling multiple
+rocksdb instances (as we do if we have standalone tables).
+At the time of writing, all objects end up in the DbRef the batch
+is written to, albeit not necessarily in the intended column family.
+This will probably change, but no failure mode is really acceptable.
+The implementation ensures that separate batches are created for each
+DbRef, under a unique reference stored in the pdict. When writing,
+all batches are written separately to the corresponding DbRef,
+and when releasing, all batches are released. This will not ensure
+atomicity, but there is no way in rocksdb to achieve atomicity
+across db instances. At least, data should end up where you expect.
+
+
+
+### as_batch/3 ###
+
+`as_batch(Tab, F, Opts) -> any()`
+
+Same as [`as_batch/2`](#as_batch-2), but with the ability to pass `Opts` to `rocksdb:write_batch/2`
+
+
+
+### batch_write/2 ###
+
+`batch_write(Tab, L) -> any()`
+
+
+
+### batch_write/3 ###
+
+`batch_write(Tab, L, Opts) -> any()`
+
+
+
+### current_context/0 ###
+
+`current_context() -> any()`
+
+
+
+### delete/2 ###
+

+delete(Tab::ref_or_tab(), Key::key()) -> ok
+
+
+ + + +### delete/3 ### + +

+delete(Tab::ref_or_tab(), Key::key(), Opts::write_options()) -> ok
+
+
+ + + +### delete_object/2 ### + +`delete_object(Tab, Obj) -> any()` + + + +### delete_object/3 ### + +`delete_object(Tab, Obj, Opts) -> any()` + + + +### ensure_ref/1 ### + +

+ensure_ref(Ref::ref_or_tab()) -> db_ref()
+
+
+ + + +### ensure_ref/2 ### + +`ensure_ref(Ref, R) -> any()` + + + +### first/1 ### + +

+first(Tab::ref_or_tab()) -> key() | $end_of_table
+
+
+ + + +### first/2 ### + +

+first(Tab::ref_or_tab(), Opts::read_options()) -> key() | $end_of_table
+
+
+ + + +### fold/3 ### + +`fold(Tab, Fun, Acc) -> any()` + + + +### fold/4 ### + +`fold(Tab, Fun, Acc, MatchSpec) -> any()` + + + +### fold/5 ### + +`fold(Tab, Fun, Acc, MatchSpec, Limit) -> any()` + + + +### get_batch/1 ### + +`get_batch(X1) -> any()` + + + +### get_ref/1 ### + +

+get_ref(Tab::table()) -> db_ref()
+
+
+ + + +### index_read/3 ### + +`index_read(Tab, Val, Ix) -> any()` + + + +### insert/2 ### + +

+insert(Tab::ref_or_tab(), Obj::obj()) -> ok
+
+
+ + + +### insert/3 ### + +

+insert(Tab::ref_or_tab(), Obj0::obj(), Opts::write_options()) -> ok
+
+
+ + + +### iterator/1 ### + +

+iterator(Tab::ref_or_tab()) -> {ok, mrdb_iterator()} | {error, term()}
+
+
+ + + +### iterator/2 ### + +

+iterator(Tab::ref_or_tab(), Opts::read_options()) -> {ok, mrdb_iterator()} | {error, term()}
+
+
+ + + +### iterator_close/1 ### + +

+iterator_close(Mrdb_iter::mrdb_iterator()) -> ok
+
+
+ + + +### iterator_move/2 ### + +

+iterator_move(Mrdb_iter::mrdb_iterator(), Dir::iterator_action()) -> {ok, tuple()} | {error, any()}
+
+
+ + + +### last/1 ### + +

+last(Tab::ref_or_tab()) -> key() | $end_of_table
+
+
+ + + +### last/2 ### + +

+last(Tab::ref_or_tab(), Opts::read_options()) -> key() | $end_of_table
+
+
+ + + +### match_delete/2 ### + +`match_delete(Tab, Pat) -> any()` + + + +### new_tx/1 ### + +

+new_tx(Tab::table() | db_ref()) -> db_ref()
+
+
+ + + +### new_tx/2 ### + +

+new_tx(Tab::ref_or_tab(), Opts::write_options()) -> db_ref()
+
+
+ + + +### next/2 ### + +

+next(Tab::ref_or_tab(), K::key()) -> key() | $end_of_table
+
+
+ + + +### next/3 ### + +

+next(Tab::ref_or_tab(), K::key(), Opts::read_options()) -> key() | $end_of_table
+
+
+ + + +### prev/2 ### + +

+prev(Tab::ref_or_tab(), K::key()) -> key() | $end_of_table
+
+
+ + + +### prev/3 ### + +

+prev(Tab::ref_or_tab(), K::key(), Opts::read_options()) -> key() | $end_of_table
+
+
+ + + +### rdb_delete/2 ### + +`rdb_delete(R, K) -> any()` + + + +### rdb_delete/3 ### + +`rdb_delete(R, K, Opts) -> any()` + + + +### rdb_fold/4 ### + +`rdb_fold(Tab, Fun, Acc, Prefix) -> any()` + + + +### rdb_fold/5 ### + +`rdb_fold(Tab, Fun, Acc, Prefix, Limit) -> any()` + + + +### rdb_get/2 ### + +`rdb_get(R, K) -> any()` + + + +### rdb_get/3 ### + +`rdb_get(R, K, Opts) -> any()` + + + +### rdb_iterator/1 ### + +`rdb_iterator(R) -> any()` + + + +### rdb_iterator/2 ### + +`rdb_iterator(R, Opts) -> any()` + + + +### rdb_iterator_move/2 ### + +`rdb_iterator_move(I, Dir) -> any()` + + + +### rdb_put/3 ### + +`rdb_put(R, K, V) -> any()` + + + +### rdb_put/4 ### + +`rdb_put(R, K, V, Opts) -> any()` + + + +### read/2 ### + +`read(Tab, Key) -> any()` + + + +### read/3 ### + +`read(Tab, Key, Opts) -> any()` + + + +### read_info/1 ### + +`read_info(Tab) -> any()` + + + +### read_info/2 ### + +`read_info(Tab, K) -> any()` + + + +### release_snapshot/1 ### + +

+release_snapshot(SHandle::snapshot_handle()) -> ok | error()
+
+
+ +release a snapshot created by [`snapshot/1`](#snapshot-1). + + + +### select/1 ### + +`select(Cont) -> any()` + + + +### select/2 ### + +`select(Tab, Pat) -> any()` + + + +### select/3 ### + +`select(Tab, Pat, Limit) -> any()` + + + +### snapshot/1 ### + +

+snapshot(Name::alias() | ref_or_tab()) -> {ok, snapshot_handle()} | error()
+
+
+ +Create a snapshot of the database instance associated with the +table reference, table name or alias. + +Snapshots provide consistent read-only views over the entire state of the key-value store. + + + +### tx_commit/1 ### + +

+tx_commit(TxH::tx_handle() | db_ref()) -> ok
+
+
+ + + +### tx_ref/2 ### + +

+tx_ref(Tab::ref_or_tab() | db_ref() | db_ref(), TxH::tx_handle()) -> db_ref()
+
+
+ + + +### update_counter/3 ### + +`update_counter(Tab, C, Val) -> any()` + + + +### update_counter/4 ### + +`update_counter(Tab, C, Val, Opts) -> any()` + + + +### with_iterator/2 ### + +

+with_iterator(Tab::ref_or_tab(), Fun::fun((mrdb_iterator()) -> Res)) -> Res
+
+
+ + + +### with_iterator/3 ### + +

+with_iterator(Tab::ref_or_tab(), Fun::fun((mrdb_iterator()) -> Res), Opts::read_options()) -> Res
+
+
+ + + +### with_rdb_iterator/2 ### + +

+with_rdb_iterator(Tab::ref_or_tab(), Fun::fun((itr_handle()) -> Res)) -> Res
+
+
+ + + +### with_rdb_iterator/3 ### + +

+with_rdb_iterator(Tab::ref_or_tab(), Fun::fun((itr_handle()) -> Res), Opts::read_options()) -> Res
+
+
+ + + +### write_info/3 ### + +`write_info(Tab, K, V) -> any()` + diff --git a/doc/mrdb_index.md b/doc/mrdb_index.md new file mode 100644 index 0000000..5b890a0 --- /dev/null +++ b/doc/mrdb_index.md @@ -0,0 +1,99 @@ + + +# Module mrdb_index # +* [Data Types](#types) +* [Function Index](#index) +* [Function Details](#functions) + + + +## Data Types ## + + + + +### index_value() ### + + +

+index_value() = any()
+
+ + + + +### iterator_action() ### + + +

+iterator_action() = mrdb:iterator_action()
+
+ + + + +### ix_iterator() ### + + +

+ix_iterator() = #mrdb_ix_iter{i = mrdb:iterator(), type = set | bag, sub = mrdb:ref() | pid()}
+
+ + + + +### object() ### + + +

+object() = tuple()
+
+ + + +## Function Index ## + + +
iterator/2
iterator_close/1
iterator_move/2
with_iterator/3
+ + + + +## Function Details ## + + + +### iterator/2 ### + +

+iterator(Tab::mrdb:ref_or_tab(), IxPos::mrdb:index_position()) -> {ok, ix_iterator()} | {error, term()}
+
+
+ + + +### iterator_close/1 ### + +

+iterator_close(Mrdb_ix_iter::ix_iterator()) -> ok
+
+
+ + + +### iterator_move/2 ### + +

+iterator_move(Mrdb_ix_iter::ix_iterator(), Dir::iterator_action()) -> {ok, index_value(), object()} | {error, term()}
+
+
+ + + +### with_iterator/3 ### + +

+with_iterator(Tab::mrdb:ref_or_tab(), IxPos::mrdb:index_position(), Fun::fun((ix_iterator()) -> Res)) -> Res
+
+
+ diff --git a/doc/mrdb_mutex.md b/doc/mrdb_mutex.md new file mode 100644 index 0000000..cc11dfc --- /dev/null +++ b/doc/mrdb_mutex.md @@ -0,0 +1,30 @@ + + +# Module mrdb_mutex # +* [Function Index](#index) +* [Function Details](#functions) + + + +## Function Index ## + + +
do/2
ensure_tab/0
+ + + + +## Function Details ## + + + +### do/2 ### + +`do(Rsrc, F) -> any()` + + + +### ensure_tab/0 ### + +`ensure_tab() -> any()` + diff --git a/doc/mrdb_select.md b/doc/mrdb_select.md new file mode 100644 index 0000000..295116d --- /dev/null +++ b/doc/mrdb_select.md @@ -0,0 +1,48 @@ + + +# Module mrdb_select # +* [Function Index](#index) +* [Function Details](#functions) + + + +## Function Index ## + + +
fold/5
rdb_fold/5
select/1
select/3
select/4
+ + + + +## Function Details ## + + + +### fold/5 ### + +`fold(Ref, Fun, Acc, MS, Limit) -> any()` + + + +### rdb_fold/5 ### + +`rdb_fold(Ref, Fun, Acc, Prefix, Limit) -> any()` + + + +### select/1 ### + +`select(Cont) -> any()` + + + +### select/3 ### + +`select(Ref, MS, Limit) -> any()` + + + +### select/4 ### + +`select(Ref, MS, AccKeys, Limit) -> any()` + diff --git a/doc/overview.edoc b/doc/overview.edoc new file mode 100644 index 0000000..823beab --- /dev/null +++ b/doc/overview.edoc @@ -0,0 +1,250 @@ +@author Ulf Wiger +@copyright 2013-21 Klarna AB +@title Mnesia Rocksdb - Rocksdb backend plugin for Mnesia + +@doc + +The Mnesia DBMS, part of Erlang/OTP, supports 'backend plugins', making +it possible to utilize more capable key-value stores than the `dets' +module (limited to 2 GB per table). Unfortunately, this support is +undocumented. Below, some informal documentation for the plugin system +is provided. + +== Table of Contents == +
    +
  1. {@section Usage}
  2. +
      +
    1. {@section Prerequisites}
    2. +
    3. {@section Getting started}
    4. +
    5. {@section Special features}
    6. +
    7. {@section Customization}
    8. +
    9. {@section Handling of errors in write operations}
    10. +
    11. {@section Caveats}
    12. +
    +
  3. {@section Mnesia backend plugins}
  4. +
      +
    1. {@section Background}
    2. +
    3. {@section Design}
    4. +
    +
  5. {@section Mnesia index plugins}
  6. +
  7. {@section Rocksdb}
  8. +
+ +== Usage == + +=== Prerequisites === + +
    +
  • rocksdb (included as dependency)
  • +
  • sext (included as dependency)
  • +
  • Erlang/OTP 21.0 or newer (https://github.com/erlang/otp)
  • +
+ +=== Getting started === + +Call `mnesia_rocksdb:register()' immediately after +starting mnesia. + +Put `{rocksdb_copies, [node()]}' into the table definitions of +tables you want to be in RocksDB. + +=== Special features === + +RocksDB tables support efficient selects on prefix keys. + +The backend uses the `sext' module (see +[https://github.com/uwiger/sext]) for mapping between Erlang terms and the +binary data stored in the tables. This provides two useful properties: + +
    +
  • The records are stored in the Erlang term order of their keys.
  • +
  • A prefix of a composite key is ordered just before any key for which + it is a prefix. For example, ``{x, '_'}'' is a prefix for keys `{x, a}', + `{x, b}' and so on.
  • +
+ +This means that a prefix key identifies the start of the sequence of +entries whose keys match the prefix. The backend uses this to optimize +selects on prefix keys. + +### Customization + +RocksDB supports a number of customization options. These can be specified +by providing a `{Key, Value}' list named `rocksdb_opts' under `user_properties', +for example: + +``` +mnesia:create_table(foo, [{rocksdb_copies, [node()]}, + ... + {user_properties, + [{rocksdb_opts, [{max_open_files, 1024}]}] + }]) +''' + +Consult the RocksDB documentation +for information on configuration parameters. Also see the section below on handling write errors. + +The default configuration for tables in `mnesia_rocksdb' is: +``` +default_open_opts() -> + [ {create_if_missing, true} + , {cache_size, + list_to_integer(get_env_default("ROCKSDB_CACHE_SIZE", "32212254"))} + , {block_size, 1024} + , {max_open_files, 100} + , {write_buffer_size, + list_to_integer(get_env_default( + "ROCKSDB_WRITE_BUFFER_SIZE", "4194304"))} + , {compression, + list_to_atom(get_env_default("ROCKSDB_COMPRESSION", "true"))} + , {use_bloomfilter, true} + ]. +''' + +It is also possible, for larger databases, to produce a tuning parameter file. +This is experimental, and mostly copied from `mnesia_leveldb'. Consult the +source code in `mnesia_rocksdb_tuning.erl' and `mnesia_rocksdb_params.erl'. +Contributions are welcome. + +=== Caveats === + +Avoid placing `bag' tables in RocksDB. Although they work, each write +requires additional reads, causing substantial runtime overheads. There +are better ways to represent and process bag data (see above about +prefix keys). + +The `mnesia:table_info(T, size)' call always returns zero for RocksDB +tables. 
RocksDB itself does not track the number of elements in a table, and
+although it is possible to make the `mnesia_rocksdb' backend maintain a size
+counter, it incurs a high runtime overhead for writes and deletes since it
+forces them to first do a read to check the existence of the key. If you
+depend on having an up to date size count at all times, you need to maintain
+it yourself. If you only need the size occasionally, you may traverse the
+table to count the elements.
+
+== Mnesia backend plugins ==
+
+=== Background ===
+
+Mnesia was initially designed to be a RAM-only DBMS, and Erlang's
+`ets' tables were developed for this purpose. In order to support
+persistence, e.g. for configuration data, a disk-based version of `ets'
+(called `dets') was created. The `dets' API mimics the `ets' API,
+and `dets' is quite convenient and fast for (nowadays) small datasets.
+However, using a 32-bit bucket system, it is limited to 2GB of data.
+It also doesn't support ordered sets. When used in Mnesia, dets-based
+tables are called `disc_only_copies'.
+
+To circumvent these limitations, another table type, called `disc_copies'
+was added. This is a combination of `ets' and `disk_log', where Mnesia
+periodically snapshots the `ets' data to a log file on disk, and meanwhile
+maintains a log of updates, which can be applied at startup. These tables
+are quite performant (especially on read access), but all data is kept in
+RAM, which can become a serious limitation.
+
+A backend plugin system was proposed by Ulf Wiger in 2016, and further
+developed with Klarna's support, to finally become included in OTP 19.
+Klarna uses a LevelDb backend, but Aeternity, in 2017, instead chose
+to implement a Rocksdb backend plugin.
+
+== Design ==
+
+As backend plugins were added on a long-since legacy-stable Mnesia,
+they had to conform to the existing code structure. For this reason,
+the plugin callbacks hook into the already present low-level access
+API in the `mnesia_lib' module. 
As a consequence, backend plugins have
+the same access semantics and granularity as `ets' and `dets'. This
+isn't much of a disadvantage for key-value stores like LevelDb and RocksDB,
+but a more serious issue is that the update part of this API is called
+after the point of no return. That is, Mnesia does not expect
+these updates to fail, and has no recourse if they do. As an aside,
+this could also happen if a `disc_only_copies' table exceeds the 2 GB
+limit (mnesia will not check it, and `dets' will not complain, but simply
+drop the update.)
+
+== Mnesia index plugins ==
+
+When adding support for backend plugins, index plugins were also added. Unfortunately, they remain undocumented.
+
+An index plugin can be added in one of two ways:
+
    +
  1. When creating a schema, provide `{index_plugins, [{Name, Module, Function}]}' options.
  2. +
  3. Call the function `mnesia_schema:add_index_plugin(Name, Module, Function)'
  4. +
+ +`Name' must be an atom wrapped as a 1-tuple, e.g. `{words}'. + +The plugin callback is called as `Module:Function(Table, Pos, Obj)', where `Pos=={words}' in +our example. It returns a list of index terms. + +Example + +Given the following index plugin implementation: + +``` +-module(words). +-export([words_f/3]). + +words_f(_,_,Obj) when is_tuple(Obj) -> + words_(tuple_to_list(Obj)). + +words_(Str) when is_binary(Str) -> + string:lexemes(Str, [$\s, $\n, [$\r,$\n]]); +words_(L) when is_list(L) -> + lists:flatmap(fun words_/1, L); +words_(_) -> + []. +''' + +We can register the plugin and use it in table definitions: + +``` +Eshell V12.1.3 (abort with ^G) +1> mnesia:start(). +ok +2> mnesia_schema:add_index_plugin({words}, words, words_f). +{atomic,ok} +3> mnesia:create_table(i, [{index, [{words}]}]). +{atomic,ok} +''' + +Note that in this case, we had neither a backend plugin, nor even a persistent schema. +Index plugins can be used with all table types. The registered indexing function (arity 3) must exist +as an exported function along the node's code path. + +To see what happens when we insert an object, we can turn on call trace. + +``` +4> dbg:tracer(). +{ok,<0.108.0>} +5> dbg:tp(words, x). +{ok,[{matched,nonode@nohost,3},{saved,x}]} +6> dbg:p(all,[c]). +{ok,[{matched,nonode@nohost,60}]} +7> mnesia:dirty_write({i,<<"one two">>, [<<"three">>, <<"four">>]}). +(<0.84.0>) call words:words_f(i,{words},{i,<<"one two">>,[<<"three">>,<<"four">>]}) +(<0.84.0>) returned from words:words_f/3 -> [<<"one">>,<<"two">>,<<"three">>, + <<"four">>] +(<0.84.0>) call words:words_f(i,{words},{i,<<"one two">>,[<<"three">>,<<"four">>]}) +(<0.84.0>) returned from words:words_f/3 -> [<<"one">>,<<"two">>,<<"three">>, + <<"four">>] +ok +8> dbg:ctp('_'), dbg:stop(). +ok +9> mnesia:dirty_index_read(i, <<"one">>, {words}). +[{i,<<"one two">>,[<<"three">>,<<"four">>]}] +''' + +(The fact that the indexing function is called twice, seems like a performance bug.) 
+ +We can observe that the indexing callback is able to operate on the whole object. +It needs to be side-effect free and efficient, since it will be called at least once for each update +(if an old object exists in the table, the indexing function will be called on it too, before it is +replaced by the new object.) + +== Rocksdb == + +== Usage == + +@end \ No newline at end of file diff --git a/doc/plugin-userguide.md b/doc/plugin-userguide.md new file mode 100644 index 0000000..d9712d1 --- /dev/null +++ b/doc/plugin-userguide.md @@ -0,0 +1,300 @@ + + +# Using Mnesia Plugins # + +Copyright (c) 2017-21 Aeternity Anstalt. All Rights Reserved. + +__Authors:__ Ulf Wiger ([`ulf@wiger.net`](mailto:ulf@wiger.net)). + +The Mnesia DBMS, part of Erlang/OTP, supports 'backend plugins', making +it possible to utilize more capable key-value stores than the `dets` +module (limited to 2 GB per table). Unfortunately, this support is +undocumented. Below, some informal documentation for the plugin system +is provided. + +This user guide illustrates these concepts using `mnesia_rocksdb` +as an example. + +We will deal with two types of plugin: +1. backend plugins +2. index plugins + +A backend plugin is a module that implements a `mnesia_backend_type` +behavior. Each plugin can support any number of `aliases`, which +combined with the plugin module make up a `backend_type`. + +When using `mnesia_rocksdb`, the default alias is `rocksdb_copies`, +and it is registered as a `{rocksdb_copies, mnesia_rocksdb}` pair. +Once registered, the alias can be used just like the built-in +backend types `ram_copies`, `disc_copies`, `disc_only_copies`. +Mnesia asks the plugin module which one of the built-in types' +semantics the new type is supposed to mimick: ram-only, ram+disk +or disk-only. This is mainly relevant for how Mnesia checkpoints and +backs up data. + +### Table of Contents ### + +1. [Usage](#Usage) + 1. [Prerequisites](#Prerequisites) + 2. [Getting started](#Getting_started) + 3. 
[New indexing functionality](#New_indexing_functionality) + +## Usage + +### Prerequisites + +- rocksdb (included as dependency) +- sext (included as dependency) +- Erlang/OTP 22.0 or newer (https://github.com/erlang/otp) + +### Getting started + +For the purposes of this user guide, we assume an unnamed, single node +mnesia installation. The only place where plugins are affected by +distributed Mnesia, is in the table sync callbacks. The simplest way +to get all paths in order for experimentation is to check out +`mnesia_rocksdb`, building it, and then calling `rebar3 shell`. Unless +we note otherwise, this is how a node has been started for each example. + +> Erlang shell interactions have been slightly beautified by eliding +> some text and breaking and indenting some lines + +#### Adding a backend type to mnesia + +There are three different ways, all undocumented, to register a +backend plugin in mnesia: + +1. Add a `backend_types` option when creating the schema, using + `mnesia:create_schema/2` + +```erlang +Erlang/OTP 22 [erts-10.7] ... + +Eshell V10.7 (abort with ^G) +1> mnesia:create_schema([node()], + [{backend_types,[{rocksdb_copies,mnesia_rocksdb}]}]). +ok +2> mnesia:start(). +ok +3> mnesia_schema:backend_types(). +[ram_copies,disc_copies,disc_only_copies,rocksdb_copies] +``` + +(In `mnesia_rocksdb`, a shortcut for this exists in `mnesia_rocksdb:create_schema(Nodes)`.) + +2. Add it when starting mnesia, using `mnesia:start/1` (undocumented) + +```erlang +Eshell V10.7 (abort with ^G) +1> mnesia:create_schema([node()]). +ok +2> mnesia:start([{schema,[{backend_types, + [{rocksdb_copies,mnesia_rocksdb}]}]}]). +ok +3> mnesia_schema:backend_types(). +[ram_copies,disc_copies,disc_only_copies] +``` + +3. Call `mnesia_schema:add_backend_type/2` when mnesia is running. + +```erlang +Eshell V10.7 (abort with ^G) +1> mnesia:create_schema([node()]). +ok +2> mnesia:start(). +ok +3> mnesia_schema:add_backend_type(rocksdb_copies,mnesia_rocksdb). 
+{atomic,ok} +4> mnesia_schema:backend_types(). +[ram_copies,disc_copies,disc_only_copies,rocksdb_copies] +``` + +In all cases the schema is updated, and other nodes, and subsequently +added nodes, will automatically receive the information. +The function `mnesia_schema:backend_types()` shows which backend plugin +aliases are registered. + +The information is also displayed when calling `mnesia:info()`: + +```erlang +5> mnesia:info(). +---> Processes holding locks <--- +---> Processes waiting for locks <--- +---> Participant transactions <--- +---> Coordinator transactions <--- +---> Uncertain transactions <--- +---> Active tables <--- +schema : with 1 records occupying 443 words of mem +===> System info in version "4.16.3", debug level = none <=== +opt_disc. Directory "/.../Mnesia.nonode@nohost" is used. +use fallback at restart = false +running db nodes = [nonode@nohost] +stopped db nodes = [] +master node tables = [] +backend types = rocksdb_copies - mnesia_rocksdb +remote = [] +ram_copies = [] +disc_copies = [schema] +disc_only_copies = [] +[{nonode@nohost,disc_copies}] = [schema] +2 transactions committed, 0 aborted, 0 restarted, 0 logged to disc +0 held locks, 0 in queue; 0 local transactions, 0 remote +0 transactions waits for other nodes: [] +ok +``` + +To illustrate how mnesia persists the information in the schema: + +```erlang +6> mnesia:table_info(schema,user_properties). +[{mnesia_backend_types,[{rocksdb_copies,mnesia_rocksdb}]}] +``` + +#### Rocksdb registration shortcut + +Call `mnesia_rocksdb:register()` after starting mnesia. + +#### Creating a table + +Put `{rocksdb_copies, [node()]}` into the table definitions of +tables you want to be in RocksDB. + +```erlang +4> mnesia:create_table(t, [{rocksdb_copies,[node()]}]). +{atomic,ok} +5> rr(mnesia). +[commit,cstruct,cyclic,decision,log_header,mnesia_select, + tid,tidstore] +6> mnesia:table_info(t,cstruct). 
+#cstruct{name = t,type = set,ram_copies = [], + disc_copies = [],disc_only_copies = [], + external_copies = [{{rocksdb_copies,mnesia_rocksdb}, + [nonode@nohost]}], + load_order = 0,access_mode = read_write,majority = false, + index = [],snmp = [],local_content = false,record_name = t, + attributes = [key,val], + user_properties = [],frag_properties = [], + storage_properties = [], + cookie = {{1621758137965715000,-576460752303423420,1}, + nonode@nohost}, + version = {{2,0},[]}} +``` + +In the example above, we take a peek at the `cstruct`, which is the +internal metadata structure for mnesia tables. The attribute showing +that the table has been created with a `rocksdb_copies` instance, is +the `external_copies` attribute. It lists the alias, the callback module +and the nodes, where the instances reside. + +The table works essentially like one of the built-in table types. +If we want to find out which type, we can query the callback module: + +```erlang +8> mnesia_rocksdb:semantics(rocksdb_copies, storage). +disc_only_copies +``` + +Consult the `mnesia_rocksdb` man page for more info on the +`Mod:semantics/2` function. + +### New indexing functionality + +With the introduction of backend plugins, a few improvements were made +to mnesia's indexing support. + +#### Persistent indexes + +In the past, and still with the built-in types, indexes were always +rebuilt on startup. Since backend plugins were introduced mainly in +order to support very large tables, a couple of callback functions +were added in order to detect whether a full rebuild is needed. + +> The callback functions are `Mod:is_index_consistent/2` and +> `Mod:index_is_consistent/3`. +> The first function (figuratively) always returns `false` for indexes +> on built-in table types. Backend plugin modules should always return +> `false` if they have no information. 
After building the index, mnesia +> calls `Mod:index_is_consistent(Alias, IxTab, true)`, and the callback +> is expected to persist this information. `IxTab`, in this case, is +> a logical name for the index 'table': `{Tab, index, PosInfo}` + +#### Ordered indexes + +A problem in the past with mnesia indexing has been that indexes with +very large fan-out were inefficient. Indexes were represented as `bag` +tables, and the cost of inserting a secondary key was proportional to +the number of identical secondary keys already in the index. + +When adding the backend plugin support - also not least because the +first candidate LevelDb didn't do bags well - support for ordered +indexes was added. They turn out to have much more stable performance +for indexes with large fan-out. They also work on all built-in table +types. + +When creating an index, you can specify the type of index as `bag` or +`ordered`. If you omit the type, it will default to `bag` for built-in +table types, and for external types, whatever is the first type in the +list of supported index types returned by `Mod:semantics(Alias, index_types)`. + +> For `mnesia_rocksdb`, only `ordered` is supported, but a bug in mnesia +> makes it ignore this, and try to create a bag index anyway. The +> `mnesia_rocksdb` plugin rejects this. +> Note that while e.g. mnesia_rocksdb supports regular bag tables, they are not +> efficiently implemented. + +Mnesia currently doesn't allow specifying an index type in +`mnesia:add_table_index/2`, so simply indicate the index position, +and let the backend choose the default. + +Having ordered indexes opens up for some new possibilities, but +there are currently no functions in mnesia such as index_first, index_next +etc., or performing a select in index order. + +#### Index plugins + +Index plugins are a great new feature, also almost entirely undocumented. 
+ +An index plugin is a registered indexing function, which can operate +on the entire object, and shall return a list of secondary keys. +When registering an index plugin, it is given an alias, a callback module, +and an function name, not unlike backend plugins. The index plugin alias +must be an atom wrapped inside a 1-tuple, i.e. `{atom()}`. + +To illustrate, we use a sample indexing function implemented in +mnesia_rocksdb, which checks all non-key attributes of an object, +and for each value that is a list, makes each list element a secondary +key value. + +```erlang +9> mnesia_schema:add_index_plugin({lv}, mnesia_rocksdb, ix_listvals). +{atomic,ok} +10> mnesia:add_table_index(t,{lv}). +{atomic,ok} +11> mnesia:dirty_write({t,1,[a,b]}). +ok +12> mnesia:dirty_write({t,2,[b,c]}). +ok +13> mnesia:dirty_index_read(t,a,{lv}). +[{t,1,[a,b]}] +14> mnesia:dirty_index_read(t,b,{lv}). +[{t,1,[a,b]},{t,2,[b,c]}] +15> mnesia:dirty_index_read(t,c,{lv}). +[{t,2,[b,c]}] +``` + +For clarity, this is the implementation of the index callback: + +```erlang +ix_listvals(_Tab, _Pos, Obj) -> + lists:foldl( + fun(V, Acc) when is_list(V) -> + V ++ Acc; + (_, Acc) -> + Acc + end, [], tl(tuple_to_list(Obj))). +``` + +Note that the index callback must be a pure function, as it +is also relied upon when deleting objects. That is, it must +always return the same values when called with a specific +set of input arguments. 
diff --git a/doc/stylesheet.css b/doc/stylesheet.css new file mode 100644 index 0000000..ab170c0 --- /dev/null +++ b/doc/stylesheet.css @@ -0,0 +1,55 @@ +/* standard EDoc style sheet */ +body { + font-family: Verdana, Arial, Helvetica, sans-serif; + margin-left: .25in; + margin-right: .2in; + margin-top: 0.2in; + margin-bottom: 0.2in; + color: #000000; + background-color: #ffffff; +} +h1,h2 { + margin-left: -0.2in; +} +div.navbar { + background-color: #add8e6; + padding: 0.2em; +} +h2.indextitle { + padding: 0.4em; + background-color: #add8e6; +} +h3.function,h3.typedecl { + background-color: #add8e6; + padding-left: 1em; +} +div.spec { + margin-left: 2em; + background-color: #eeeeee; +} +a.module { + text-decoration:none +} +a.module:hover { + background-color: #eeeeee; +} +ul.definitions { + list-style-type: none; +} +ul.index { + list-style-type: none; + background-color: #eeeeee; +} + +/* + * Minor style tweaks + */ +ul { + list-style-type: square; +} +table { + border-collapse: collapse; +} +td { + padding: 3 +} diff --git a/erlang_ls.config b/erlang_ls.config new file mode 100644 index 0000000..40ab7eb --- /dev/null +++ b/erlang_ls.config @@ -0,0 +1,7 @@ +deps_dirs: + - "_build/default/lib/*" +include_paths: + - "src/*" +include_dirs: + - "include" + - "_build/default/lib" diff --git a/include/mnesia_rocksdb.hrl b/include/mnesia_rocksdb.hrl new file mode 100644 index 0000000..188986c --- /dev/null +++ b/include/mnesia_rocksdb.hrl @@ -0,0 +1,15 @@ + +%% Data and meta data (a.k.a. info) are stored in the same table. +%% This is a table of the first byte in data +%% 0 = before meta data +%% 1 = meta data +%% 2 = before data +%% >= 8 = data + +-define(INFO_START, 0). +-define(INFO_TAG, 1). +-define(DATA_START, 2). +-define(BAG_CNT, 32). % Number of bits used for bag object counter +-define(MAX_BAG, 16#FFFFFFFF). + +-define(VSN, 2). 
diff --git a/rebar.config b/rebar.config index 9eb6939..4e3b856 100644 --- a/rebar.config +++ b/rebar.config @@ -4,14 +4,40 @@ {deps, [ {sext, "1.8.0"}, - {rocksdb,"1.7.0"} + {rocksdb, {git, "https://gitlab.com/seanhinde/erlang-rocksdb.git", {ref,"9ae37839"}}}, + {hut, "1.3.0"} ]}. +{xref_checks, [ + undefined_function_calls, + locals_not_used, + deprecated_function_calls +]}. + {profiles, [ {test, [ - {deps, [ {proper, "1.3.0"} - , {meck, "0.9.0"}]} - ]} + {deps, [ {proper, "1.4.0"} + , {meck, "0.9.2"} + , {trace_runner, {git, "https://github.com/uwiger/trace_runner.git", + {ref, "2e56677"}}} + ]} + ]}, + {edown, + %% Use as `rebar3 as edown do edoc` + [ + {deps, [{edown, "0.8.4"}]}, + {edoc_opts, + [{doclet, edown_doclet}, + {app_default, "http://www.erlang.org/doc/man"}, + {doc_path, []}, + {top_level_readme, + {"./README.md", "https://github.com/aeternity/mnesia_rocksdb"}}]}]} ]}. + +{ex_doc, [ + {extras, [<<"README.md">>, <<"LICENSE">>]}, + {main, <<"readme">>}, + {source_url, <<"https://github.com/aeternity/mnesia_rocksdb">>} +]}. diff --git a/rebar.lock b/rebar.lock index faec707..b153003 100644 --- a/rebar.lock +++ b/rebar.lock @@ -1,11 +1,15 @@ {"1.2.0", -[{<<"rocksdb">>,{pkg,<<"rocksdb">>,<<"1.7.0">>},0}, +[{<<"hut">>,{pkg,<<"hut">>,<<"1.3.0">>},0}, + {<<"rocksdb">>, + {git,"https://gitlab.com/seanhinde/erlang-rocksdb.git", + {ref,"9ae378391ffc94200bde24efcd7a4921eba688d0"}}, + 0}, {<<"sext">>,{pkg,<<"sext">>,<<"1.8.0">>},0}]}. 
[ {pkg_hash,[ - {<<"rocksdb">>, <<"5D23319998A7FCE5FFD5D7824116C905CABA7F91BAF8EDDABD0180F1BB272CEF">>}, + {<<"hut">>, <<"71F2F054E657C03F959CF1ACC43F436EA87580696528CA2A55C8AFB1B06C85E7">>}, {<<"sext">>, <<"90A95B889F5C781B70BBCF44278B763148E313C376B60D87CE664CB1C1DD29B5">>}]}, {pkg_hash_ext,[ - {<<"rocksdb">>, <<"A4BDC5DD80ED137161549713062131E8240523787EBE7B51DF61CFB48B1786CE">>}, + {<<"hut">>, <<"7E15D28555D8A1F2B5A3A931EC120AF0753E4853A4C66053DB354F35BF9AB563">>}, {<<"sext">>, <<"BC6016CB8690BAF677EACACFE6E7CADFEC8DC7E286CBBED762F6CD55B0678E73">>}]} ]. diff --git a/src/mnesia_rocksdb.erl b/src/mnesia_rocksdb.erl index 9fc391c..80ce80e 100644 --- a/src/mnesia_rocksdb.erl +++ b/src/mnesia_rocksdb.erl @@ -17,6 +17,12 @@ %%---------------------------------------------------------------- %% @doc rocksdb storage backend for Mnesia. +%% +%% This module implements a mnesia backend callback plugin. +%% It's specifically documented to try to explain the workings of +%% backend plugins. +%% +%% @end %% Initialization: register() or register(Alias) %% Usage: mnesia:create_table(Tab, [{rocksdb_copies, Nodes}, ...]). @@ -49,9 +55,7 @@ %% -export([show_table/1, - show_table/2, - show_table/3, - fold/6]). + show_table/2]). %% %% BACKEND CALLBACKS @@ -62,39 +66,53 @@ add_aliases/1, remove_aliases/1]). +%% convenience +-export([ create_schema/1 + , create_schema/2 ]). + %% schema level callbacks --export([semantics/2, - check_definition/4, - create_table/3, - load_table/4, - close_table/2, - sync_close_table/2, - delete_table/2, - info/3]). +-export([ semantics/2 + , check_definition/4 + , create_table/3 + , load_table/4 + , close_table/2 + , sync_close_table/2 + , delete_table/2 + , info/3 ]). %% table synch calls --export([sender_init/4, - sender_handle_info/5, - receiver_first_message/4, - receive_data/5, - receive_done/4]). +-export([ sender_init/4 + , sender_handle_info/5 + , receiver_first_message/4 + , receive_data/5 + , receive_done/4 ]). 
%% low-level accessor callbacks. --export([delete/3, - first/2, - fixtable/3, - insert/3, - last/2, - lookup/3, - match_delete/3, - next/3, - prev/3, - repair_continuation/2, - select/1, - select/3, - select/4, - slot/3, - update_counter/4]). +-export([ delete/3 + , first/2 + , fixtable/3 + , insert/3 + , last/2 + , lookup/3 + , match_delete/3 + , next/3 + , prev/3 + , repair_continuation/2 + , select/1 + , select/3 + , select/4 + , slot/3 + , update_counter/4 ]). + +-export([ encode_key/1 %% (term()) + , encode_key/2 %% (term(), Meta::map()) + , encode_val/1 %% (term()) + , encode_val/2 %% (term(), Meta::map()) + , decode_key/1 %% (binary()) + , decode_key/2 %% (binary(), Meta::map()) + , decode_val/1 %% (binary()) + , decode_val/3 %% (binary(), Key::term(), Meta::map()) + ]). %% Index consistency -export([index_is_consistent/3, @@ -112,7 +130,7 @@ %% GEN SERVER CALLBACKS AND CALLS %% --export([start_proc/4, +-export([start_proc/6, init/1, handle_call/3, handle_info/2, @@ -120,83 +138,36 @@ terminate/2, code_change/3]). --export([ix_prefixes/3]). +-export([ ix_prefixes/3 + , ix_listvals/3 ]). -%% Exposed low-level helpers --export([get_ref/2, %% (Alias, Tab) -> {RocksDbHandle, TabType} - encode_key/1, %% (Key) -> EncodedKey - decode_key/1, %% (EncodedKey) -> Key - encode_val/1, %% (Value) -> EncodedValue - decode_val/1]). %% (EncodedValue) -> Value +-import(mrdb, [ with_iterator/2 + ]). + +-import(mnesia_rocksdb_admin, [ get_ref/1 ]). + +%% -import(mnesia_rocksdb_lib, [ encode_key/2 +%% , encode_val/3 +%% , decode_key/2 +%% , decode_val/3 +%% ]). + +-include("mnesia_rocksdb.hrl"). +-include("mnesia_rocksdb_int.hrl"). %% ---------------------------------------------------------------------------- %% DEFINES %% ---------------------------------------------------------------------------- -%% Name of the Rocksdb interface module; defaults to rocksdb but can be -%% configured by passing -DROCKSDB_MODULE= to erlc. --ifdef(ROCKSDB_MODULE). 
--define(rocksdb, ?ROCKSDB_MODULE). --else. --define(rocksdb, rocksdb). %% Name of the Rocksdb interface module --endif. - -%% Data and meta data (a.k.a. info) are stored in the same table. -%% This is a table of the first byte in data -%% 0 = before meta data -%% 1 = meta data -%% 2 = before data -%% >= 8 = data - --define(INFO_START, 0). --define(INFO_TAG, 1). --define(DATA_START, 2). --define(BAG_CNT, 32). % Number of bits used for bag object counter --define(MAX_BAG, 16#FFFFFFFF). - -%% enable debugging messages through mnesia:set_debug_level(debug) --ifndef(MNESIA_ROCKSDB_NO_DBG). --define(dbg(Fmt, Args), - %% avoid evaluating Args if the message will be dropped anyway - case mnesia_monitor:get_env(debug) of - none -> ok; - verbose -> ok; - _ -> mnesia_lib:dbg_out("~p:~p: "++(Fmt),[?MODULE,?LINE|Args]) - end). --else. --define(dbg(Fmt, Args), ok). --endif. %% ---------------------------------------------------------------------------- %% RECORDS %% ---------------------------------------------------------------------------- --record(sel, { alias % TODO: not used - , tab - , ref - , keypat - , ms % TODO: not used - , compiled_ms - , limit - , key_only = false % TODO: not used - , direction = forward % TODO: not used - }). - --type on_write_error() :: debug | verbose | warning | error | fatal. --type on_write_error_store() :: atom() | undefined. - --define(WRITE_ERR_DEFAULT, verbose). --define(WRITE_ERR_STORE_DEFAULT, undefined). - --record(st, { ets - , ref +-record(st, { ref , alias , tab , type - , size_warnings :: integer() - , maintain_size :: boolean() - , on_write_error = ?WRITE_ERR_DEFAULT :: on_write_error() - , on_write_error_store = ?WRITE_ERR_STORE_DEFAULT :: on_write_error_store() }). -type data_tab() :: atom(). @@ -211,6 +182,8 @@ -type table_type() :: set | ordered_set | bag. -type table() :: data_tab() | index_tab() | retainer_tab(). +-type error() :: {error, any()}. + -export_type([alias/0, table/0, table_type/0]). 
@@ -219,9 +192,18 @@ %% CONVENIENCE API %% ---------------------------------------------------------------------------- +-spec register() -> {ok, alias()} | {error, _}. +%% @equiv register(rocksdb_copies) register() -> register(default_alias()). +%% @doc Convenience function for registering a mnesia_rocksdb backend plugin +%% +%% The function used to register a plugin is `mnesia_schema:add_backend_type(Alias, Module)' +%% where `Module' implements a backend_type behavior. `Alias' is an atom, and is used +%% in the same way as `ram_copies' etc. The default alias is `rocksdb_copies'. +%% @end +-spec register(alias()) -> {ok, alias()} | error(). register(Alias) -> Module = ?MODULE, case mnesia:add_backend_type(Alias, Module) of @@ -236,40 +218,63 @@ register(Alias) -> default_alias() -> rocksdb_copies. + +encode_key(Key) -> + mnesia_rocksdb_lib:encode_key(Key, sext). + +encode_key(Key, Metadata) when is_map(Metadata) -> + mnesia_rocksdb_lib:encode_key(Key, Metadata). + +encode_val(Val) -> + mnesia_rocksdb_lib:encode_val(Val). + +encode_val(Val, Metadata) when is_map(Metadata) -> + mnesia_rocksdb_lib:encode_val(Val, Metadata). + +decode_key(Key) -> + mnesia_rocksdb_lib:decode_key(Key, sext). + +decode_key(Key, Metadata) when is_map(Metadata) -> + mnesia_rocksdb_lib:decode_key(Key, Metadata). + +decode_val(Val) -> + mnesia_rocksdb_lib:decode_val(Val). + +decode_val(Val, Key, Metadata) when is_map(Metadata); is_reference(Metadata) -> + mnesia_rocksdb_lib:decode_val(Val, Key, Metadata). + %% ---------------------------------------------------------------------------- %% DEBUG API %% ---------------------------------------------------------------------------- -%% A debug function that shows the rocksdb table content +%% @doc A debug function that shows the rocksdb table content show_table(Tab) -> - show_table(default_alias(), Tab). + show_table(Tab, 100). -show_table(Alias, Tab) -> - show_table(Alias, Tab, 100). 
+show_table(Tab, Limit) -> + Ref = get_ref(Tab), + mrdb:with_rdb_iterator(Ref, fun(I) -> + i_show_table(I, first, Limit, Ref) + end). -show_table(Alias, Tab, Limit) -> - {Ref, _Type} = get_ref(Alias, Tab), - with_iterator(Ref, fun(I) -> i_show_table(I, first, Limit) end). - -%% PRIVATE - -i_show_table(_, _, 0) -> +i_show_table(_, _, 0, _) -> {error, skipped_some}; -i_show_table(I, Move, Limit) -> - case ?rocksdb:iterator_move(I, Move) of +i_show_table(I, Move, Limit, Ref) -> + case rocksdb:iterator_move(I, Move) of {ok, EncKey, EncVal} -> {Type,Val} = case EncKey of << ?INFO_TAG, K/binary >> -> - {info,{decode_key(K),decode_val(EncVal)}}; + K1 = decode_key(K, Ref), + V = decode_val(EncVal, K1, Ref), + {info,V}; _ -> - K = decode_key(EncKey), - V = decode_val(EncVal), - V2 = setelement(2,V,K), - {data,V2} + K = decode_key(EncKey, Ref), + V = decode_val(EncVal, K, Ref), + {data,V} end, io:fwrite("~p: ~p~n", [Type, Val]), - i_show_table(I, next, Limit-1); + i_show_table(I, next, Limit-1, Ref); _ -> ok end. @@ -281,34 +286,46 @@ i_show_table(I, Move, Limit) -> %% backend management +%% @doc Called by mnesia_schema in order to intialize the backend +%% +%% This is called when the backend is registered with the first alias, or ... +%% +%% See OTP issue #425 (16 Feb 2021). This callback is supposed to be called +%% before first use of the backend, but unfortunately, it is only called at +%% mnesia startup and when a backend module is registered MORE THAN ONCE. +%% This means we need to handle this function being called multiple times. 
+%% +%% The bug has been fixed as of OTP 24.0-rc3 +%% +%% If processes need to be started, this can be done using +%% `mnesia_ext_sup:start_proc(Name, Mod, F, Args [, Opts])' +%% where Opts are parameters for the supervised child: +%% +%% * `restart' (default: `transient') +%% * `shutdown' (default: `120000') +%% * `type' (default: `worker') +%% * `modules' (default: `[Mod]') +%% @end init_backend() -> - stick_rocksdb_dir(), - application:ensure_all_started(mnesia_rocksdb), - ok. + mnesia_rocksdb_admin:ensure_started(). -%% Prevent reloading of modules in rocksdb itself during runtime, since it -%% can lead to inconsistent state in rocksdb and silent data corruption. -stick_rocksdb_dir() -> - case code:which(rocksdb) of - BeamPath when is_list(BeamPath), BeamPath =/= "" -> - Dir = filename:dirname(BeamPath), - case code:stick_dir(Dir) of - ok -> ok; - error -> warn_stick_dir({error, Dir}) - end; - Other -> - warn_stick_dir({not_found, Other}) - end. +add_aliases(Aliases) -> + %% Since we can't be sure that init_backend() has been called (see above), + %% and we know that it can be called repeatedly anyway, let's call it here. + init_backend(), + mnesia_rocksdb_admin:add_aliases(Aliases). -warn_stick_dir(Reason) -> - mnesia_lib:warning("cannot make rocksdb directory sticky:~n~p~n", - [Reason]). +remove_aliases(Aliases) -> + mnesia_rocksdb_admin:remove_aliases(Aliases). -add_aliases(_Aliases) -> - ok. +%% Convenience function for creating a schema with this plugin +%% already registered (default aliases: [rocksdb_copies]). +create_schema(Nodes) -> + create_schema(Nodes, [rocksdb_copies]). -remove_aliases(_Aliases) -> - ok. +create_schema(Nodes, Aliases) when is_list(Nodes), is_list(Aliases) -> + mnesia:create_schema(Nodes, [{backend_types, + [{A, ?MODULE} || A <- Aliases]}]). 
%% schema level callbacks @@ -324,7 +341,7 @@ remove_aliases(_Aliases) -> %% semantics(_Alias, storage) -> disc_only_copies; semantics(_Alias, types ) -> [set, ordered_set, bag]; -semantics(_Alias, index_types) -> [ordered]; +semantics(_Alias, index_types) -> [ordered, bag]; % treat bag as ordered semantics(_Alias, index_fun) -> fun index_f/4; semantics(_Alias, _) -> undefined. @@ -334,9 +351,9 @@ is_index_consistent(Alias, {Tab, index, PosInfo}) -> _ -> false end. -index_is_consistent(Alias, {Tab, index, PosInfo}, Bool) +index_is_consistent(_Alias, {Tab, index, PosInfo}, Bool) when is_boolean(Bool) -> - write_info(Alias, Tab, {index_consistent, PosInfo}, Bool). + mrdb:write_info(Tab, {index_consistent, PosInfo}, Bool). %% PRIVATE FUN @@ -364,108 +381,86 @@ prefixes(<>) -> prefixes(_) -> []. +ix_listvals(_Tab, _Pos, Obj) -> + lists:foldl( + fun(V, Acc) when is_list(V) -> + V ++ Acc; + (_, Acc) -> + Acc + end, [], tl(tuple_to_list(Obj))). + %% For now, only verify that the type is set or ordered_set. %% set is OK as ordered_set is a kind of set. check_definition(Alias, Tab, Nodes, Props) -> Id = {Alias, Nodes}, try Props1 = lists:map(fun(E) -> check_definition_entry(Tab, Id, E) end, Props), - {ok, Props1} + Props2 = check_encoding(Tab, Props1), + {ok, Props2} catch throw:Error -> Error end. +check_encoding(Tab, Props) -> + {_, Type} = lists:keyfind(type, 1, Props), + {_, Attrs} = lists:keyfind(attributes, 1, Props), + {_, UserProps} = lists:keyfind(user_properties, 1, Props), + Enc = proplists:get_value( + mrdb_encoding, UserProps, + mnesia_rocksdb_lib:default_encoding(Tab, Type, Attrs)), + + Enc1 = case mnesia_rocksdb_lib:check_encoding(Enc, Attrs) of + {ok, Res} -> Res; + EncErr -> throw(EncErr) + end, + UserProps1 = lists:keystore(mrdb_encoding, 1, UserProps, + {mrdb_encoding, Enc1}), + lists:keyreplace(user_properties, 1, Props, {user_properties, UserProps1}). 
+ check_definition_entry(_Tab, _Id, {type, T} = P) when T==set; T==ordered_set; T==bag -> P; check_definition_entry(Tab, Id, {type, T}) -> mnesia:abort({combine_error, Tab, [Id, {type, T}]}); -check_definition_entry(_Tab, _Id, {user_properties, UPs} = P) -> - RdbOpts = proplists:get_value(rocksdb_opts, UPs, []), - OWE = proplists:get_value(on_write_error, RdbOpts, ?WRITE_ERR_DEFAULT), - OWEStore = proplists:get_value(on_write_error_store, RdbOpts, ?WRITE_ERR_STORE_DEFAULT), - case valid_mnesia_op(OWE) of - true -> - case OWEStore of - undefined -> - P; - V when is_atom(V) -> - P; - V -> - throw({error, {invalid_configuration, {on_write_error_store, V}}}) - end; - false -> - throw({error, {invalid_configuration, {on_write_error, OWE}}}) +check_definition_entry(Tab, Id, {index, Ixs} = P) -> + case [true || {_, bag} <- Ixs] of + [] -> + P; + [_|_] -> + %% Let's not pretend-support bag indexes + mnesia:abort({combine_error, Tab, [Id, P]}) end; +check_definition_entry(_Tab, _Id, {user_properties, UPs} = P) -> + case lists:keyfind(rocksdb_standalone, 1, UPs) of + false -> ok; + {_, Bool} when is_boolean(Bool) -> ok; + Other -> + throw({error, {invalid_configuration, Other}}) + end, + P; check_definition_entry(_Tab, _Id, P) -> P. -%% -> ok | {error, exists} -create_table(_Alias, Tab, _Props) -> - create_mountpoint(Tab). +create_table(Alias, Tab, Props) -> + {ok, Pid} = maybe_start_proc(Alias, Tab, Props), + do_call(Pid, {create_table, Tab, Props}). 
-load_table(Alias, Tab, _LoadReason, Opts) -> - Type = proplists:get_value(type, Opts), - RdbUserProps = proplists:get_value( - rocksdb_opts, proplists:get_value( - user_properties, Opts, []), []), - StorageProps = proplists:get_value( - rocksdb, proplists:get_value( - storage_properties, Opts, []), RdbUserProps), - RdbOpts = mnesia_rocksdb_params:lookup(Tab, StorageProps), - ProcName = proc_name(Alias, Tab), - case whereis(ProcName) of - undefined -> - load_table_(Alias, Tab, Type, RdbOpts); - Pid -> - gen_server:call(Pid, {load, Alias, Tab, Type, RdbOpts}, infinity) - end. - -load_table_(Alias, Tab, Type, RdbOpts) -> - ShutdownTime = proplists:get_value( - owner_shutdown_time, RdbOpts, 120000), - case mnesia_ext_sup:start_proc( - Tab, ?MODULE, start_proc, [Alias,Tab,Type, RdbOpts], - [{shutdown, ShutdownTime}]) of - {ok, _Pid} -> - ok; - - %% TODO: This reply is according to the manual, but we dont get it. - {error, {already_started, _Pid}} -> - %% TODO: Is it an error if the table already is - %% loaded. This printout is triggered when running - %% transform_table on a rocksdb_table that has indexing. - ?dbg("ERR: table:~p already loaded pid:~p~n", - [Tab, _Pid]), - ok; - - %% TODO: This reply is not according to the manual, but we get it. - {error, {{already_started, _Pid}, _Stack}} -> - %% TODO: Is it an error if the table already is - %% loaded. This printout is triggered when running - %% transform_table on a rocksdb_table that has indexing. - ?dbg("ERR: table:~p already loaded pid:~p stack:~p~n", - [Tab, _Pid, _Stack]), - ok; - {error, Other} -> - mnesia:abort(Other) - end. +load_table(Alias, Tab, LoadReason, Opts) -> + call(Alias, Tab, {load_table, LoadReason, Opts}). close_table(Alias, Tab) -> - ?dbg("~p: close_table(~p, ~p);~n Trace: ~s~n", - [self(), Alias, Tab, pp_stack()]), - if is_atom(Tab) -> - [close_table(Alias, R) - || {R, _} <- related_resources(Tab)]; - true -> - ok - end, - close_table_(Alias, Tab). 
+ case mnesia_rocksdb_admin:get_ref(Tab, error) of + error -> + ok; + _ -> + ok = mnesia_rocksdb_admin:prep_close(Alias, Tab), + close_table_(Alias, Tab) + end. close_table_(Alias, Tab) -> case opt_call(Alias, Tab, close_table) of {error, noproc} -> - ?dbg("~p: close_table_(~p) -> noproc~n", + ?log(debug, "~p: close_table_(~p) -> noproc~n", [self(), Tab]), ok; {ok, _} -> @@ -498,75 +493,20 @@ pp_pos([{file,_},{line,L}]) -> -endif. sync_close_table(Alias, Tab) -> - ?dbg("~p: sync_close_table(~p, ~p);~n Trace: ~s~n", + ?log(debug, "~p: sync_close_table(~p, ~p);~n Trace: ~s~n", [self(), Alias, Tab, pp_stack()]), close_table(Alias, Tab). delete_table(Alias, Tab) -> - ?dbg("~p: delete_table(~p, ~p);~n Trace: ~s~n", - [self(), Alias, Tab, pp_stack()]), - delete_table(Alias, Tab, data_mountpoint(Tab)). - -delete_table(Alias, Tab, MP) -> - if is_atom(Tab) -> - [delete_table(Alias, T, M) || {T,M} <- related_resources(Tab)]; - true -> - ok - end, - case opt_call(Alias, Tab, delete_table) of - {error, noproc} -> - do_delete_table(Tab, MP); - {ok, _} -> - ok + case whereis_proc(Alias, Tab) of + undefined -> + ok; + Pid when is_pid(Pid) -> + call(Alias, Tab, delete_table) end. -do_delete_table(Tab, MP) -> - assert_proper_mountpoint(Tab, MP), - destroy_db(MP, []). - - -info(_Alias, Tab, memory) -> - try ets:info(tab_name(icache, Tab), memory) - catch - error:_ -> - 0 - end; -info(Alias, Tab, size) -> - case retrieve_size(Alias, Tab) of - {ok, Size} -> - if Size < 10000 -> ok; - true -> size_warning(Alias, Tab) - end, - Size; - Error -> - Error - end; info(_Alias, Tab, Item) -> - case try_read_info(Tab, Item, undefined) of - {ok, Value} -> - Value; - Error -> - Error - end. - -retrieve_size(_Alias, Tab) -> - case try_read_info(Tab, size, 0) of - {ok, Size} -> - {ok, Size}; - Error -> - Error - end. - -try_read_info(Tab, Item, Default) -> - try - {ok, read_info(Item, Default, tab_name(icache, Tab))} - catch - error:Reason -> - {error, Reason} - end. 
- -write_info(Alias, Tab, Key, Value) -> - call(Alias, Tab, {write_info, Key, Value}). + mrdb:read_info(Tab, Item). %% table synch calls @@ -640,190 +580,95 @@ chunk_fun() -> %% low-level accessor callbacks. -delete(Alias, Tab, Key) -> - opt_call(Alias, Tab, {delete, encode_key(Key)}), - ok. - -first(Alias, Tab) -> - {Ref, _Type} = get_ref(Alias, Tab), - with_iterator(Ref, fun i_first/1). - -%% PRIVATE ITERATOR -i_first(I) -> - case ?rocksdb:iterator_move(I, <>) of - {ok, First, _} -> - decode_key(First); - _ -> - '$end_of_table' +%% Whereas the return type, legacy | Ref, seems odd, it's a shortcut for +%% performance reasons. +access_type(Tab) -> + case get_ref(Tab) of + #{semantics := bag, vsn := 1} -> legacy; + R -> + R#{mode => mnesia} end. +delete(Alias, Tab, Key) -> + case access_type(Tab) of + legacy -> call(Alias, Tab, {delete, Key}); + R -> db_delete(R, Key, [], R) + end. + %% call_if_legacy(Alias, Tab, {delete, Key}, fun() -> mrdb + %% mrdb:delete(Tab, Key). + %% %% opt_call(Alias, Tab, {delete, Key}), + %% %% ok. + +first(_Alias, Tab) -> + mrdb:first(Tab). + %% Not relevant for an ordered_set fixtable(_Alias, _Tab, _Bool) -> true. -%% To save storage space, we avoid storing the key twice. We replace the key -%% in the record with []. It has to be put back in lookup/3. insert(Alias, Tab, Obj) -> - Pos = keypos(Tab), - EncKey = encode_key(element(Pos, Obj)), - EncVal = encode_val(setelement(Pos, Obj, [])), - call(Alias, Tab, {insert, EncKey, EncVal}). - -last(Alias, Tab) -> - {Ref, _Type} = get_ref(Alias, Tab), - with_iterator(Ref, fun i_last/1). - -%% PRIVATE ITERATOR -i_last(I) -> - case ?rocksdb:iterator_move(I, last) of - {ok, << ?INFO_TAG, _/binary >>, _} -> - '$end_of_table'; - {ok, Last, _} -> - decode_key(Last); - _ -> - '$end_of_table' + case access_type(Tab) of + legacy -> call(Alias, Tab, {insert, Obj}); + R -> db_insert(R, Obj, [], R) end. +last(_Alias, Tab) -> + mrdb:last(Tab). 
+ %% Since we replace the key with [] in the record, we have to put it back %% into the found record. -lookup(Alias, Tab, Key) -> - Enc = encode_key(Key), - {Ref, Type} = call(Alias, Tab, get_ref), - case Type of - bag -> - lookup_bag(Ref, Key, Enc, keypos(Tab)); - _ -> - case ?rocksdb:get(Ref, Enc, []) of - {ok, EncVal} -> - [setelement(keypos(Tab), decode_val(EncVal), Key)]; - _ -> - [] - end +lookup(_Alias, Tab, Key) -> + mrdb:read(Tab, Key). + +match_delete(Alias, Tab, Pat) -> + case access_type(Tab) of + legacy -> call(Alias, Tab, {match_delete, Pat}); + R -> match_delete_(R, Pat) end. -lookup_bag(Ref, K, Enc, KP) -> - Sz = byte_size(Enc), - with_iterator( - Ref, fun(I) -> - lookup_bag_(Sz, Enc, ?rocksdb:iterator_move(I, Enc), - K, I, KP) - end). - -lookup_bag_(Sz, Enc, {ok, Enc, _}, K, I, KP) -> - lookup_bag_(Sz, Enc, ?rocksdb:iterator_move(I, next), K, I, KP); -lookup_bag_(Sz, Enc, Res, K, I, KP) -> - case Res of - {ok, <>, V} -> - [setelement(KP, decode_val(V), K)| - lookup_bag_(Sz, Enc, ?rocksdb:iterator_move(I, next), K, I, KP)]; - _ -> - [] - end. - -match_delete(Alias, Tab, Pat) when is_atom(Pat) -> - %do_match_delete(Alias, Tab, '_'), - case is_wild(Pat) of - true -> - call(Alias, Tab, clear_table), - ok; - false -> - %% can this happen?? - error(badarg) +match_delete_(#{name := {_, index, {_,bag}}, semantics := set} = R, Pat) -> + case Pat of + '_' -> + mrdb:match_delete(R, Pat); + {V, Key} -> + db_delete(R, {V, Key}, [], R) end; -match_delete(Alias, Tab, Pat) when is_tuple(Pat) -> - KP = keypos(Tab), - Key = element(KP, Pat), - case is_wild(Key) of - true -> - call(Alias, Tab, clear_table); - false -> - call(Alias, Tab, {match_delete, Pat}) - end, - ok. +match_delete_(R, Pat) -> + mrdb:match_delete(R, Pat). +next(_Alias, Tab, Key) -> + mrdb:next(Tab, Key). -next(Alias, Tab, Key) -> - {Ref, _Type} = get_ref(Alias, Tab), - EncKey = encode_key(Key), - with_iterator(Ref, fun(I) -> i_next(I, EncKey, Key) end). 
- -%% PRIVATE ITERATOR -i_next(I, EncKey, Key) -> - case ?rocksdb:iterator_move(I, EncKey) of - {ok, EncKey, _} -> - i_next_loop(?rocksdb:iterator_move(I, next), I, Key); - Other -> - i_next_loop(Other, I, Key) - end. - -i_next_loop({ok, EncKey, _}, I, Key) -> - case decode_key(EncKey) of - Key -> - i_next_loop(?rocksdb:iterator_move(I, next), I, Key); - NextKey -> - NextKey - end; -i_next_loop(_, _I, _Key) -> - '$end_of_table'. - -prev(Alias, Tab, Key0) -> - {Ref, _Type} = call(Alias, Tab, get_ref), - Key = encode_key(Key0), - with_iterator(Ref, fun(I) -> i_prev(I, Key) end). - -%% PRIVATE ITERATOR -i_prev(I, Key) -> - case ?rocksdb:iterator_move(I, Key) of - {ok, _, _} -> - i_move_to_prev(I, Key); - {error, invalid_iterator} -> - i_last(I) - end. - -%% PRIVATE ITERATOR -i_move_to_prev(I, Key) -> - case ?rocksdb:iterator_move(I, prev) of - {ok, << ?INFO_TAG, _/binary >>, _} -> - '$end_of_table'; - {ok, Prev, _} when Prev < Key -> - decode_key(Prev); - {ok, _, _} -> - i_move_to_prev(I, Key); - _ -> - '$end_of_table' - end. +prev(_Alias, Tab, Key) -> + mrdb:prev(Tab, Key). repair_continuation(Cont, _Ms) -> Cont. select(Cont) -> - %% Handle {ModOrAlias, Cont} wrappers for backwards compatibility with - %% older versions of mnesia_ext (before OTP 20). - case Cont of - {_, '$end_of_table'} -> '$end_of_table'; - {_, Cont1} -> Cont1(); - '$end_of_table' -> '$end_of_table'; - _ -> Cont() - end. + mrdb:select(Cont). select(Alias, Tab, Ms) -> - case select(Alias, Tab, Ms, infinity) of - {Res, '$end_of_table'} -> - Res; - '$end_of_table' -> - '$end_of_table' - end. + select(Alias, Tab, Ms, infinity). -select(Alias, Tab, Ms, Limit) when Limit==infinity; is_integer(Limit) -> - {Ref, Type} = get_ref(Alias, Tab), - do_select(Ref, Tab, Type, Ms, Limit). +select(_Alias, {_,index,{_,bag}} = IxTab, Ms, Limit) -> + %% We at mnesia_rocksdb do not support bag indexes, but ordered indexes + %% have the same outward semantics (more or less). Reshape the match pattern. 
+ [{{IxKey,'$1'}, [], ['$1']}] = Ms, + mrdb:select(IxTab, [{{{IxKey,'$1'}}, [], [{element, 1, '$_'}]}], Limit); +select(_Alias, Tab, Ms, Limit) when Limit==infinity; is_integer(Limit) -> + mrdb:select(Tab, Ms, Limit). -slot(Alias, Tab, Pos) when is_integer(Pos), Pos >= 0 -> - {Ref, Type} = get_ref(Alias, Tab), - First = fun(I) -> ?rocksdb:iterator_move(I, <>) end, - F = case Type of - bag -> fun(I) -> slot_iter_set(First(I), I, 0, Pos) end; - _ -> fun(I) -> slot_iter_set(First(I), I, 0, Pos) end +slot(_Alias, Tab, Pos) when is_integer(Pos), Pos >= 0 -> + #{semantics := Sem} = Ref = get_ref(Tab), + Start = case Ref of + #{type := standalone, vsn := 1} -> <>; + _ -> first + end, + First = fun(I) -> rocksdb:iterator_move(I, Start) end, + F = case Sem of + bag -> fun(I) -> slot_iter_set(First(I), I, 0, Pos, Ref) end; + _ -> fun(I) -> slot_iter_set(First(I), I, 0, Pos, Ref) end end, with_iterator(Ref, F); slot(_, _, _) -> @@ -832,64 +677,46 @@ slot(_, _, _) -> %% Exactly which objects Mod:slot/2 is supposed to return is not defined, %% so let's just use the same version for both set and bag. No one should %% use this function anyway, as it is ridiculously inefficient. -slot_iter_set({ok, K, V}, _I, P, P) -> - [setelement(2, decode_val(V), decode_key(K))]; -slot_iter_set({ok, _, _}, I, P1, P) when P1 < P -> - slot_iter_set(?rocksdb:iterator_move(I, next), I, P1+1, P); -slot_iter_set(Res, _, _, _) when element(1, Res) =/= ok -> +slot_iter_set({ok, K, V}, _I, P, P, R) -> + Kd = decode_key(K, R), + [setelement(2, decode_val(V, Kd, R), Kd)]; +slot_iter_set({ok, _, _}, I, P1, P, R) when P1 < P -> + slot_iter_set(rocksdb:iterator_move(I, next), I, P1+1, P, R); +slot_iter_set(Res, _, _, _, _) when element(1, Res) =/= ok -> '$end_of_table'. update_counter(Alias, Tab, C, Val) when is_integer(Val) -> - call(Alias, Tab, {update_counter, C, Val}). 
- -%% server-side part -do_update_counter(C, Val, Ref, St) -> - Enc = encode_key(C), - case ?rocksdb:get(Ref, Enc, [{fill_cache, true}]) of - {ok, EncVal} -> - case decode_val(EncVal) of - {_, _, Old} = Rec when is_integer(Old) -> - Res = Old+Val, - return_catch( - fun() -> - db_put(Ref, Enc, - encode_val( - setelement(3, Rec, Res)), - [], St) - end); - _ -> - badarg - end; - _ -> - badarg + case access_type(Tab) of + legacy -> call(Alias, Tab, {update_counter, C, Val}); + R -> mrdb:update_counter(R, C, Val) end. %% PRIVATE -%% key+data iterator: iterator_move/2 returns {ok, EncKey, EncVal} -with_iterator(Ref, F) -> - {ok, I} = ?rocksdb:iterator(Ref, []), - try F(I) - after - ?rocksdb:iterator_close(I) - end. - %% keys_only iterator: iterator_move/2 returns {ok, EncKey} %% with_keys_only_iterator(Ref, F) -> -%% {ok, I} = ?rocksdb:iterator(Ref, [], keys_only), +%% {ok, I} = rocksdb:iterator(Ref, [], keys_only), %% try F(I) %% after -%% ?rocksdb:iterator_close(I) +%% rocksdb:iterator_close(I) %% end. %% TODO - use with_keys_only_iterator for match_delete %% record and key validation -validate_key(_Alias, _Tab, RecName, Arity, Type, _Key) -> +validate_key(_Alias, Tab, RecName, Arity, Type, Key) -> + case mnesia_rocksdb_lib:valid_key_type(get_ref(Tab), Key) of + true -> ok; + false -> mnesia:abort({bad_type, Key}) + end, {RecName, Arity, Type}. -validate_record(_Alias, _Tab, RecName, Arity, Type, _Obj) -> +validate_record(_Alias, Tab, RecName, Arity, Type, Obj) -> + case mnesia_rocksdb_lib:valid_obj_type(get_ref(Tab), Obj) of + true -> ok; + false -> mnesia:abort({bad_type, Obj}) + end, {RecName, Arity, Type}. 
%% file extension callbacks @@ -909,104 +736,95 @@ tmp_suffixes() -> %% GEN SERVER CALLBACKS AND CALLS %% ---------------------------------------------------------------------------- -start_proc(Alias, Tab, Type, RdbOpts) -> +maybe_start_proc(Alias, Tab, Props) -> ProcName = proc_name(Alias, Tab), - gen_server:start_link({local, ProcName}, ?MODULE, - {Alias, Tab, Type, RdbOpts}, []). - -init({Alias, Tab, Type, RdbOpts}) -> - process_flag(trap_exit, true), - try - {ok, Ref, Ets} = do_load_table(Tab, RdbOpts), - OWE = proplists:get_value(on_write_error, RdbOpts, ?WRITE_ERR_DEFAULT), - OWEStore = proplists:get_value(on_write_error_store, RdbOpts, ?WRITE_ERR_STORE_DEFAULT), - St = #st{ ets = Ets - , ref = Ref - , alias = Alias - , tab = Tab - , type = Type - , size_warnings = 0 - , maintain_size = should_maintain_size(Tab) - , on_write_error = OWE - , on_write_error_store = OWEStore - }, - {ok, recover_size_info(St)} - catch - throw:badarg -> - {error, write_error} + case whereis(ProcName) of + undefined -> + Type = proplists:get_value(type, Props), + RdbUserProps = proplists:get_value( + rocksdb_opts, proplists:get_value( + user_properties, Props, []), []), + StorageProps = proplists:get_value( + rocksdb, proplists:get_value( + storage_properties, Props, []), RdbUserProps), + RdbOpts = mnesia_rocksdb_params:lookup(Tab, StorageProps), + ShutdownTime = proplists:get_value( + owner_shutdown_time, RdbOpts, 120000), + mnesia_ext_sup:start_proc( + Tab, ?MODULE, start_proc, [Alias, Tab, Type, ProcName, Props, RdbOpts], + [{shutdown, ShutdownTime}]); + Pid when is_pid(Pid) -> + {ok, Pid} end. -do_load_table(Tab, RdbOpts) -> - MPd = data_mountpoint(Tab), - ?dbg("** Mountpoint: ~p~n ~s~n", [MPd, os:cmd("ls " ++ MPd)]), - Ets = ets:new(tab_name(icache,Tab), [set, protected, named_table]), - {ok, Ref} = open_rocksdb(MPd, RdbOpts), - rocksdb_to_ets(Ref, Ets), - {ok, Ref, Ets}. 
+%% Exported callback +start_proc(Alias, Tab, Type, ProcName, Props, RdbOpts) -> + gen_server:start_link({local, ProcName}, ?MODULE, + {Alias, Tab, Type, Props, RdbOpts}, []). -handle_call({load, Alias, Tab, Type, RdbOpts}, _From, - #st{type = Type, alias = Alias, tab = Tab} = St) -> - {ok, Ref, Ets} = do_load_table(Tab, RdbOpts), - {reply, ok, St#st{ref = Ref, ets = Ets}}; -handle_call(get_ref, _From, #st{ref = Ref, type = Type} = St) -> - {reply, {Ref, Type}, St}; -handle_call({write_info, Key, Value}, _From, #st{} = St) -> - _ = write_info_(Key, Value, St), +init({Alias, Tab, Type, _Props, RdbOpts}) -> + process_flag(trap_exit, true), + %% In case of a process restart, we try to rebuild the state + %% from the cf info held by the admin process. + Ref = case mnesia_rocksdb_admin:request_ref(Alias, Tab) of + {ok, Ref1} -> Ref1; + {error, _} -> undefined + end, + {ok, update_state(Ref, Alias, Tab, Type, RdbOpts, #st{})}. + +update_state(Ref, Alias, Tab, Type, _RdbOpts, St) -> + St#st{ tab = Tab + , alias = Alias + , type = Type + , ref = maybe_set_ref_mode(Ref) + }. + +maybe_set_ref_mode(Ref) when is_map(Ref) -> + Ref#{mode => mnesia}; +maybe_set_ref_mode(Ref) -> + Ref. 
+ +handle_call({create_table, Tab, Props}, _From, + #st{alias = Alias, tab = Tab} = St) -> + try mnesia_rocksdb_admin:create_table(Alias, Tab, Props) of + {ok, Ref} -> + {reply, ok, St#st{ref = maybe_set_ref_mode(Ref)}}; + Other -> + {reply, Other, St} + catch + exit:{aborted, Error} -> + {reply, {aborted, Error}, St} + end; +handle_call({load_table, _LoadReason, _Opts}, _From, + #st{alias = Alias, tab = Tab} = St) -> + ok = mnesia_rocksdb_admin:load_table(Alias, Tab), + {reply, ok, St}; +handle_call({write_info, Key, Value}, _From, #st{ref = Ref} = St) -> + mrdb:write_info(Ref, Key, Value), {reply, ok, St}; handle_call({update_counter, C, Incr}, _From, #st{ref = Ref} = St) -> - {reply, do_update_counter(C, Incr, Ref, St), St}; -handle_call({insert, Key, Val}, _From, St) -> - Res = do_insert(Key, Val, St), + {reply, mrdb:update_counter(Ref, C, Incr), St}; +handle_call({insert, Obj}, _From, St) -> + Res = do_insert(Obj, St), {reply, Res, St}; handle_call({delete, Key}, _From, St) -> Res = do_delete(Key, St), {reply, Res, St}; -handle_call(clear_table, _From, #st{ets = Ets, tab = Tab, ref = Ref} = St) -> - MPd = data_mountpoint(Tab), - ?dbg("Attempting clear_table(~p)~n", [Tab]), - _ = rocksdb_close(Ref), - {ok, NewRef} = destroy_recreate(MPd, rocksdb_open_opts(Tab)), - ets:delete_all_objects(Ets), - rocksdb_to_ets(NewRef, Ets), - {reply, ok, St#st{ref = NewRef}}; -handle_call({match_delete, Pat}, _From, #st{} = St) -> - Res = do_match_delete(Pat, St), +handle_call({match_delete, Pat}, _From, #st{ref = Ref} = St) -> + Res = mrdb:match_delete(Ref, Pat), {reply, Res, St}; -handle_call(close_table, _From, #st{ref = Ref, ets = Ets} = St) -> - _ = rocksdb_close(Ref), - ets:delete(Ets), +handle_call(close_table, _From, #st{alias = Alias, tab = Tab} = St) -> + _ = mnesia_rocksdb_admin:close_table(Alias, Tab), {reply, ok, St#st{ref = undefined}}; -handle_call(delete_table, _From, #st{tab = T, ref = Ref, ets = Ets} = St) -> - _ = (catch rocksdb_close(Ref)), - _ = (catch 
ets:delete(Ets)), - do_delete_table(T, data_mountpoint(T)), +handle_call(delete_table, _From, #st{alias = Alias, tab = Tab} = St) -> + ok = mnesia_rocksdb_admin:delete_table(Alias, Tab), {stop, normal, ok, St#st{ref = undefined}}. -handle_cast(size_warning, #st{tab = T, size_warnings = W} = St) when W < 10 -> - mnesia_lib:warning("large size retrieved from table: ~p~n", [T]), - if W =:= 9 -> - OneHrMs = 60 * 60 * 1000, - erlang:send_after(OneHrMs, self(), unmute_size_warnings); - true -> - ok - end, - {noreply, St#st{size_warnings = W + 1}}; -handle_cast(size_warning, #st{size_warnings = W} = St) when W >= 10 -> - {noreply, St#st{size_warnings = W + 1}}; handle_cast(_, St) -> {noreply, St}. -handle_info(unmute_size_warnings, #st{tab = T, size_warnings = W} = St) -> - C = W - 10, - if C > 0 -> - mnesia_lib:warning("warnings suppressed~ntable: ~p, count: ~p~n", - [T, C]); - true -> - ok - end, - {noreply, St#st{size_warnings = 0}}; handle_info({'EXIT', _, _} = _EXIT, St) -> - ?dbg("rocksdb owner received ~p~n", [_EXIT]), + ?log(debug, "rocksdb owner received ~p~n", [_EXIT]), {noreply, St}; handle_info(_, St) -> {noreply, St}. @@ -1014,11 +832,7 @@ handle_info(_, St) -> code_change(_FromVsn, St, _Extra) -> {ok, St}. -terminate(_Reason, #st{ref = Ref}) -> - if Ref =/= undefined -> - ?rocksdb:close(Ref); - true -> ok - end, +terminate(_Reason, _St) -> ok. @@ -1026,177 +840,25 @@ terminate(_Reason, #st{ref = Ref}) -> %% GEN SERVER PRIVATE %% ---------------------------------------------------------------------------- -get_env_default(Key, Default) -> - case os:getenv(Key) of - false -> - Default; - Value -> - Value - end. 
- -rocksdb_open_opts({Tab, index, {Pos,_}}) -> - UserProps = mnesia_lib:val({Tab, user_properties}), - IxOpts = proplists:get_value(rocksdb_index_opts, UserProps, []), - PosOpts = proplists:get_value(Pos, IxOpts, []), - rocksdb_open_opts_(PosOpts); -rocksdb_open_opts(Tab) -> - UserProps = mnesia_lib:val({Tab, user_properties}), - RdbOpts = proplists:get_value(rocksdb_opts, UserProps, []), - rocksdb_open_opts_(RdbOpts). - -rocksdb_open_opts_(RdbOpts) -> - lists:foldl( - fun({K,_} = Item, Acc) -> - lists:keystore(K, 1, Acc, Item) - end, default_open_opts(), RdbOpts). - -default_open_opts() -> - [ {create_if_missing, true} - , {cache_size, - list_to_integer(get_env_default("ROCKSDB_CACHE_SIZE", "32212254"))} - , {block_size, 1024} - , {max_open_files, 100} - , {write_buffer_size, - list_to_integer(get_env_default( - "ROCKSDB_WRITE_BUFFER_SIZE", "4194304"))} - , {compression, - list_to_atom(get_env_default("ROCKSDB_COMPRESSION", "true"))} - , {use_bloomfilter, true} - ]. - -destroy_recreate(MPd, RdbOpts) -> - ok = destroy_db(MPd, []), - open_rocksdb(MPd, RdbOpts). - -open_rocksdb(MPd, RdbOpts) -> - open_rocksdb(MPd, rocksdb_open_opts_(RdbOpts), get_retries()). - -%% Code adapted from basho/riak_kv_eleveldb_backend.erl -open_rocksdb(MPd, Opts, Retries) -> - open_db(MPd, Opts, max(1, Retries), undefined). - -open_db(_, _, 0, LastError) -> - {error, LastError}; -open_db(MPd, Opts, RetriesLeft, _) -> - case ?rocksdb:open(MPd, Opts) of - {ok, Ref} -> - ?dbg("~p: Open - Rocksdb: ~s~n -> {ok, ~p}~n", - [self(), MPd, Ref]), - {ok, Ref}; - %% Check specifically for lock error, this can be caused if - %% a crashed mnesia takes some time to flush rocksdb information - %% out to disk. The process is gone, but the NIF resource cleanup - %% may not have completed. 
- {error, {db_open, OpenErr}=Reason} -> - case lists:prefix("IO error: lock ", OpenErr) of - true -> - SleepFor = get_retry_delay(), - ?dbg("~p: Open - Rocksdb backend retrying ~p in ~p ms" - " after error ~s\n", - [self(), MPd, SleepFor, OpenErr]), - timer:sleep(SleepFor), - open_db(MPd, Opts, RetriesLeft - 1, Reason); - false -> - {error, Reason} - end; - {error, Reason} -> - {error, Reason} - end. - -%% await_db_closed(Tab) -> -%% MPd = data_mountpoint(Tab), -%% await_db_closed_(MPd). - -%% await_db_closed_(MPd) -> -%% case filelib:is_file(filename:join(MPd, "LOCK")) of -%% true -> -%% SleepFor = get_retry_delay(), -%% timer:sleep(SleepFor), -%% await_db_closed_(MPd); -%% false -> -%% ok -%% end. - -rocksdb_close(undefined) -> - ok; -rocksdb_close(Ref) -> - Res = ?rocksdb:close(Ref), - erlang:garbage_collect(), - Res. - -destroy_db(MPd, Opts) -> - destroy_db(MPd, Opts, get_retries()). - -%% Essentially same code as above. -destroy_db(MPd, Opts, Retries) -> - _DRes = destroy_db(MPd, Opts, max(1, Retries), undefined), - ?dbg("~p: Destroy ~s -> ~p~n", [self(), MPd, _DRes]), - [_|_] = MPd, % ensure MPd is non-empty - _RmRes = os:cmd("rm -rf " ++ MPd ++ "/*"), - ?dbg("~p: RmRes = '~s'~n", [self(), _RmRes]), - ok. - -destroy_db(_, _, 0, LastError) -> - {error, LastError}; -destroy_db(MPd, Opts, RetriesLeft, _) -> - case ?rocksdb:destroy(MPd, Opts) of - ok -> - ok; - %% Check specifically for lock error, this can be caused if - %% destroy follows quickly after close. - {error, {error_db_destroy, Err}=Reason} -> - case lists:prefix("IO error: lock ", Err) of - true -> - SleepFor = get_retry_delay(), - ?dbg("~p: Destroy - Rocksdb backend retrying ~p in ~p ms" - " after error ~s\n" - " children = ~p~n", - [self(), MPd, SleepFor, Err, - supervisor:which_children(mnesia_ext_sup)]), - timer:sleep(SleepFor), - destroy_db(MPd, Opts, RetriesLeft - 1, Reason); - false -> - {error, Reason} - end; - {error, Reason} -> - {error, Reason} - end. - -get_retries() -> 30. 
-get_retry_delay() -> 10000. - -rocksdb_to_ets(Ref, Ets) -> - with_iterator(Ref, fun(I) -> - i_rocksdb_to_ets(I, Ets, <>) - end). - -i_rocksdb_to_ets(I, Ets, Move) -> - case ?rocksdb:iterator_move(I, Move) of - {ok, << ?INFO_TAG, EncKey/binary >>, EncVal} -> - Item = decode_key(EncKey), - Val = decode_val(EncVal), - ets:insert(Ets, {{info,Item}, Val}), - i_rocksdb_to_ets(I, Ets, next); - _ -> - '$end_of_table' - end. - opt_call(Alias, Tab, Req) -> ProcName = proc_name(Alias, Tab), case whereis(ProcName) of undefined -> - ?dbg("proc_name(~p, ~p): ~p; NO PROCESS~n", + ?log(debug, "proc_name(~p, ~p): ~p; NO PROCESS~n", [Alias, Tab, ProcName]), {error, noproc}; Pid when is_pid(Pid) -> - ?dbg("proc_name(~p, ~p): ~p; Pid = ~p~n", + ?log(debug, "proc_name(~p, ~p): ~p; Pid = ~p~n", [Alias, Tab, ProcName, Pid]), - {ok, gen_server:call(Pid, Req, infinity)} + {ok, do_call(Pid, Req)} end. call(Alias, Tab, Req) -> ProcName = proc_name(Alias, Tab), - case gen_server:call(ProcName, Req, infinity) of + do_call(ProcName, Req). + +do_call(P, Req) -> + case gen_server:call(P, Req, infinity) of badarg -> mnesia:abort(badarg); {abort, _} = Err -> @@ -1205,409 +867,19 @@ call(Alias, Tab, Req) -> Reply end. -size_warning(Alias, Tab) -> - ProcName = proc_name(Alias, Tab), - gen_server:cast(ProcName, size_warning). - %% server-side end of insert/3. 
-do_insert(K, V, #st{ref = Ref, type = bag, maintain_size = false} = St) -> - return_catch(fun() -> do_insert_bag(Ref, K, V, false, St) end); -do_insert(K, V, #st{ets = Ets, ref = Ref, type = bag, maintain_size = true} = St) -> - return_catch( - fun() -> - CurSz = read_info(size, 0, Ets), - NewSz = do_insert_bag(Ref, K, V, CurSz, St), - ets_insert_info(Ets, size, NewSz), - ok - end); -do_insert(K, V, #st{ref = Ref, maintain_size = false} = St) -> - return_catch(fun() -> db_put(Ref, K, V, [], St) end); -do_insert(K, V, #st{ets = Ets, ref = Ref, maintain_size = true} = St) -> - IsNew = case ?rocksdb:get(Ref, K, []) of - {ok, _} -> - false; - _ -> - true - end, - case IsNew of - true -> - return_catch( - fun() -> - NewSz = read_info(size, 0, Ets) + 1, - {Ki, Vi} = info_obj(size, NewSz), - L = [{put, Ki, Vi}, {put, K, V}], - write_result(mnesia_rocksdb_lib:write(Ref, L, []), - write, [Ref, L, []], St), % may throw - ets_insert_info(Ets, size, NewSz) - end); - false -> - return_catch(fun() -> db_put(Ref, K, V, [], St) end) - end, - ok. - -do_insert_bag(Ref, K, V, CurSz, St) -> - KSz = byte_size(K), - with_iterator( - Ref, fun(I) -> - do_insert_bag_( - KSz, K, ?rocksdb:iterator_move(I, K), I, V, 0, Ref, CurSz, St) - end). - - -%% There's a potential access pattern that would force counters to -%% creep upwards and eventually hit the limit. This could be addressed, -%% with compaction. TODO. -do_insert_bag_(Sz, K, Res, I, V, Prev, Ref, TSz, St) when Prev < ?MAX_BAG -> - case Res of - {ok, <>, V} -> - %% object exists - TSz; - {ok, <>, _} -> - do_insert_bag_( - Sz, K, ?rocksdb:iterator_move(I, next), I, V, N, Ref, TSz, St); - _ when TSz =:= false -> - Key = <>, - db_put(Ref, Key, V, [], St); - _ -> - NewSz = TSz + 1, - {Ki, Vi} = info_obj(size, NewSz), - Key = <>, - db_write(Ref, [{put, Ki, Vi}, {put, Key, V}], [], St), - NewSz - end. +do_insert(Obj, #st{ref = Ref} = St) -> + return_catch(fun() -> db_insert(Ref, Obj, [], St) end). 
%% server-side part -do_delete(Key, #st{ref = Ref, type = bag, maintain_size = false} = St) -> - return_catch(fun() -> do_delete_bag(byte_size(Key), Key, Ref, false, St) end); -do_delete(Key, #st{ets = Ets, ref = Ref, type = bag, maintain_size = true} = St) -> - return_catch( - fun() -> - Sz = byte_size(Key), - CurSz = read_info(size, 0, Ets), - NewSz = do_delete_bag(Sz, Key, Ref, CurSz, St), - ets_insert_info(Ets, size, NewSz), - ok - end); -do_delete(Key, #st{ref = Ref, maintain_size = false} = St) -> - return_catch(fun() -> db_delete(Ref, Key, [], St) end); -do_delete(Key, #st{ets = Ets, ref = Ref, maintain_size = true} = St) -> - CurSz = read_info(size, 0, Ets), - case ?rocksdb:get(Ref, Key, [{fill_cache,true}]) of - {ok, _} -> - return_catch( - fun() -> - NewSz = CurSz -1, - {Ki, Vi} = info_obj(size, NewSz), - ok = db_write(Ref, [{delete, Key}, {put, Ki, Vi}], [], St), - ets_insert_info(Ets, size, NewSz) - end); - not_found -> - false - end. - -do_delete_bag(Sz, Key, Ref, TSz, St) -> - Found = with_iterator( - Ref, fun(I) -> - do_delete_bag_(Sz, Key, ?rocksdb:iterator_move(I, Key), - Ref, I) - end), - case {Found, TSz} of - {[], _} -> - TSz; - {_, false} -> - db_write(Ref, [{delete, K} || K <- Found], [], St); - {_, _} -> - N = length(Found), - NewSz = TSz - N, - {Ki, Vi} = info_obj(size, NewSz), - db_write(Ref, [{put, Ki, Vi} | - [{delete, K} || K <- Found]], [], St), - NewSz - end. - -do_delete_bag_(Sz, K, Res, Ref, I) -> - case Res of - {ok, K, _} -> - do_delete_bag_(Sz, K, ?rocksdb:iterator_move(I, next), - Ref, I); - {ok, <> = Key, _} -> - [Key | - do_delete_bag_(Sz, K, ?rocksdb:iterator_move(I, next), - Ref, I)]; - _ -> - [] - end. 
- -do_match_delete(Pat, #st{ets = Ets, ref = Ref, tab = Tab, type = Type, - maintain_size = MaintainSize} = St) -> - Fun = fun(_, Key, Acc) -> [Key|Acc] end, - Keys = do_fold(Ref, Tab, Type, Fun, [], [{Pat,[],['$_']}], 30), - case {Keys, MaintainSize} of - {[], _} -> - ok; - {_, false} -> - db_write(Ref, [{delete, K} || K <- Keys], [], St), - ok; - {_, true} -> - CurSz = read_info(size, 0, Ets), - NewSz = max(CurSz - length(Keys), 0), - {Ki, Vi} = info_obj(size, NewSz), - db_write(Ref, [{put, Ki, Vi} | - [{delete, K} || K <- Keys]], [], St), - ets_insert_info(Ets, size, NewSz), - ok - end. - -recover_size_info(#st{ ref = Ref - , tab = Tab - , type = Type - , maintain_size = MaintainSize - } = St) -> - %% TODO: shall_update_size_info is obsolete, remove - case shall_update_size_info(Tab) of - true -> - Sz = do_fold(Ref, Tab, Type, fun(_, Acc) -> Acc+1 end, - 0, [{'_',[],['$_']}], 3), - write_info_(size, Sz, St); - false -> - case MaintainSize of - true -> - %% info initialized by rocksdb_to_ets/2 - %% TODO: if there is no stored size, recompute it - ignore; - false -> - %% size is not maintained, ensure it's marked accordingly - delete_info_(size, St) - end - end, - St. - -shall_update_size_info({_, index, _}) -> - false; -shall_update_size_info(Tab) -> - property(Tab, update_size_info, false). - -should_maintain_size(Tab) -> - property(Tab, maintain_size, false). - -property(Tab, Prop, Default) -> - try mnesia:read_table_property(Tab, Prop) of - {Prop, P} -> - P - catch - error:_ -> Default; - exit:_ -> Default - end. - -write_info_(Item, Val, #st{ets = Ets, ref = Ref} = St) -> - rocksdb_insert_info(Ref, Item, Val, St), - ets_insert_info(Ets, Item, Val). - -ets_insert_info(Ets, Item, Val) -> - ets:insert(Ets, {{info, Item}, Val}). - -ets_delete_info(Ets, Item) -> - ets:delete(Ets, {info, Item}). - -rocksdb_insert_info(Ref, Item, Val, St) -> - EncKey = info_key(Item), - EncVal = encode_val(Val), - db_put(Ref, EncKey, EncVal, [], St). 
- -rocksdb_delete_info(Ref, Item, St) -> - EncKey = info_key(Item), - db_delete(Ref, EncKey, [], St). - -info_obj(Item, Val) -> - {info_key(Item), encode_val(Val)}. - -info_key(Item) -> - <>. - -delete_info_(Item, #st{ets = Ets, ref = Ref} = St) -> - rocksdb_delete_info(Ref, Item, St), - ets_delete_info(Ets, Item). - -read_info(Item, Default, Ets) -> - case ets:lookup(Ets, {info,Item}) of - [] -> - Default; - [{_,Val}] -> - Val - end. - -tab_name(icache, Tab) -> - list_to_atom("mnesia_ext_icache_" ++ tabname(Tab)). +do_delete(Key, #st{ref = Ref} = St) -> + return_catch(fun() -> db_delete(Ref, Key, [], St) end). proc_name(_Alias, Tab) -> - list_to_atom("mnesia_ext_proc_" ++ tabname(Tab)). - - -%% ---------------------------------------------------------------------------- -%% PRIVATE SELECT MACHINERY -%% ---------------------------------------------------------------------------- - -do_select(Ref, Tab, Type, MS, Limit) -> - do_select(Ref, Tab, Type, MS, false, Limit). - -do_select(Ref, Tab, _Type, MS, AccKeys, Limit) when is_boolean(AccKeys) -> - Keypat = keypat(MS, keypos(Tab)), - Sel = #sel{tab = Tab, - ref = Ref, - keypat = Keypat, - ms = MS, - compiled_ms = ets:match_spec_compile(MS), - key_only = needs_key_only(MS), - limit = Limit}, - with_iterator(Ref, fun(I) -> i_do_select(I, Sel, AccKeys, []) end). - -i_do_select(I, #sel{keypat = Pfx, - compiled_ms = MS, - limit = Limit} = Sel, AccKeys, Acc) -> - StartKey = case Pfx of - <<>> -> - <>; - _ -> - Pfx - end, - select_traverse(?rocksdb:iterator_move(I, StartKey), Limit, - Pfx, MS, I, Sel, AccKeys, Acc). - -needs_key_only([{HP,_,Body}]) -> - BodyVars = lists:flatmap(fun extract_vars/1, Body), - %% Note that we express the conditions for "needs more than key" and negate. - not(wild_in_body(BodyVars) orelse - case bound_in_headpat(HP) of - {all,V} -> lists:member(V, BodyVars); - Vars when is_list(Vars) -> any_in_body(lists:keydelete(2,1,Vars), BodyVars) - end); -needs_key_only(_) -> - %% don't know - false. 
- -extract_vars([H|T]) -> - extract_vars(H) ++ extract_vars(T); -extract_vars(T) when is_tuple(T) -> - extract_vars(tuple_to_list(T)); -extract_vars(T) when T=='$$'; T=='$_' -> - [T]; -extract_vars(T) when is_atom(T) -> - case is_wild(T) of - true -> - [T]; - false -> - [] - end; -extract_vars(_) -> - []. - -any_in_body(Vars, BodyVars) -> - lists:any(fun({_,Vs}) -> - intersection(Vs, BodyVars) =/= [] - end, Vars). - -intersection(A,B) when is_list(A), is_list(B) -> - A -- (A -- B). - -wild_in_body(BodyVars) -> - intersection(BodyVars, ['$$','$_']) =/= []. - -bound_in_headpat(HP) when is_atom(HP) -> - {all, HP}; -bound_in_headpat(HP) when is_tuple(HP) -> - [_|T] = tuple_to_list(HP), - map_vars(T, 2). - -map_vars([H|T], P) -> - case extract_vars(H) of - [] -> - map_vars(T, P+1); - Vs -> - [{P, Vs}|map_vars(T, P+1)] - end; -map_vars([], _) -> - []. - -select_traverse({ok, K, V}, Limit, Pfx, MS, I, #sel{tab = Tab} = Sel, - AccKeys, Acc) -> - case is_prefix(Pfx, K) of - true -> - Rec = setelement(keypos(Tab), decode_val(V), decode_key(K)), - case ets:match_spec_run([Rec], MS) of - [] -> - select_traverse( - ?rocksdb:iterator_move(I, next), Limit, Pfx, MS, - I, Sel, AccKeys, Acc); - [Match] -> - Acc1 = if AccKeys -> - [{K, Match}|Acc]; - true -> - [Match|Acc] - end, - traverse_continue(K, decr(Limit), Pfx, MS, I, Sel, AccKeys, Acc1) - end; - false -> - {lists:reverse(Acc), '$end_of_table'} - end; -select_traverse({error, _}, _, _, _, _, _, _, Acc) -> - {lists:reverse(Acc), '$end_of_table'}. - -is_prefix(A, B) when is_binary(A), is_binary(B) -> - Sa = byte_size(A), - case B of - <> -> - true; - _ -> - false - end. - -decr(I) when is_integer(I) -> - I-1; -decr(infinity) -> - infinity. 
- -traverse_continue(K, 0, Pfx, MS, _I, #sel{limit = Limit, ref = Ref} = Sel, AccKeys, Acc) -> - {lists:reverse(Acc), - fun() -> - with_iterator(Ref, - fun(NewI) -> - select_traverse(iterator_next(NewI, K), - Limit, Pfx, MS, NewI, Sel, - AccKeys, []) - end) - end}; -traverse_continue(_K, Limit, Pfx, MS, I, Sel, AccKeys, Acc) -> - select_traverse(?rocksdb:iterator_move(I, next), Limit, Pfx, MS, I, Sel, AccKeys, Acc). - -iterator_next(I, K) -> - case ?rocksdb:iterator_move(I, K) of - {ok, K, _} -> - ?rocksdb:iterator_move(I, next); - Other -> - Other - end. - -keypat([H|T], KeyPos) -> - keypat(T, KeyPos, keypat_pfx(H, KeyPos)). - -keypat(_, _, <<>>) -> <<>>; -keypat([H|T], KeyPos, Pfx0) -> - Pfx = keypat_pfx(H, KeyPos), - keypat(T, KeyPos, common_prefix(Pfx, Pfx0)); -keypat([], _, Pfx) -> - Pfx. - -common_prefix(<>, <>) -> - <>; -common_prefix(_, _) -> - <<>>. - -keypat_pfx({HeadPat,_Gs,_}, KeyPos) when is_tuple(HeadPat) -> - KP = element(KeyPos, HeadPat), - sext:prefix(KP); -keypat_pfx(_, _) -> - <<>>. + list_to_atom("mnesia_ext_proc_" ++ mnesia_rocksdb_lib:tabname(Tab)). +whereis_proc(Alias, Tab) -> + whereis(proc_name(Alias, Tab)). %% ---------------------------------------------------------------------------- %% Db wrappers @@ -1620,236 +892,15 @@ return_catch(F) when is_function(F, 0) -> badarg end. -db_put(Ref, K, V, Opts, St) -> - write_result(mnesia_rocksdb_lib:put(Ref, K, V, Opts), put, [Ref, K, V, Opts], St). - -db_write(Ref, List, Opts, St) -> - write_result(mnesia_rocksdb_lib:write(Ref, List, Opts), write, [Ref, List, Opts], St). +db_insert(Ref, Obj, Opts, St) -> + write_result(mrdb:insert(Ref, Obj, Opts), insert, [Ref, Obj, Opts], St). db_delete(Ref, K, Opts, St) -> - write_result(mnesia_rocksdb_lib:delete(Ref, K, Opts), delete, [Ref, K, Opts], St). + write_result(mrdb:delete(Ref, K, Opts), delete, [Ref, K, Opts], St). 
write_result(ok, _, _, _) -> - ok; -write_result(Res, Op, Args, #st{tab = Tab, on_write_error = Rpt, on_write_error_store = OWEStore}) -> - RptOp = rpt_op(Rpt), - maybe_store_error(OWEStore, Res, Tab, Op, Args, erlang:system_time(millisecond)), - mnesia_lib:RptOp("FAILED rocksdb:~p(" ++ rpt_fmt(Args) ++ ") -> ~p~n", - [Op | Args] ++ [Res]), - if Rpt == fatal; Rpt == error -> - throw(badarg); - true -> - ok - end. - -maybe_store_error(undefined, _, _, _, _, _) -> - ok; -maybe_store_error(Table, Err, IntTable, put, [_, K, _, _], Time) -> - insert_error(Table, IntTable, K, Err, Time); -maybe_store_error(Table, Err, IntTable, delete, [_, K, _], Time) -> - insert_error(Table, IntTable, K, Err, Time); -maybe_store_error(Table, Err, IntTable, write, [_, List, _], Time) -> - lists:map(fun - ({put, K, _}) -> - insert_error(Table, IntTable, K, Err, Time); - ({delete, K}) -> - insert_error(Table, IntTable, K, Err, Time) - end, List). - -insert_error(Table, {Type, _, _}, K, Err, Time) -> - {_, K1} = decode_key(K), - ets:insert(Table, {{Type, K1}, Err, Time}); -insert_error(Table, Type, K, Err, Time) when is_atom(Type) -> - ets:insert(Table, {{Type, K}, Err, Time}). - -rpt_fmt([_|T]) -> - lists:append(["~p" | [", ~p" || _ <- T]]). - -rpt_op(debug) -> - dbg_out; -rpt_op(Op) -> - Op. - -valid_mnesia_op(Op) -> - if Op==debug - ; Op==verbose - ; Op==warning - ; Op==error - ; Op==fatal -> - true; - true -> - false - end. + ok. %% ---------------------------------------------------------------------------- %% COMMON PRIVATE %% ---------------------------------------------------------------------------- - -%% Note that since a callback can be used as an indexing backend, we -%% cannot assume that keypos will always be 2. For indexes, the tab -%% name will be {Tab, index, Pos}, and The object structure will be -%% {{IxKey,Key}} for an ordered_set index, and {IxKey,Key} for a bag -%% index. 
-%% -keypos({_, index, _}) -> - 1; -keypos({_, retainer, _}) -> - 2; -keypos(Tab) when is_atom(Tab) -> - 2. - --spec encode_key(any()) -> binary(). -encode_key(Key) -> - sext:encode(Key). - --spec decode_key(binary()) -> any(). -decode_key(CodedKey) -> - case sext:partial_decode(CodedKey) of - {full, Result, _} -> - Result; - _ -> - error(badarg, CodedKey) - end. - --spec encode_val(any()) -> binary(). -encode_val(Val) -> - term_to_binary(Val). - --spec decode_val(binary()) -> any(). -decode_val(CodedVal) -> - binary_to_term(CodedVal). - -create_mountpoint(Tab) -> - MPd = data_mountpoint(Tab), - case filelib:is_dir(MPd) of - false -> - file:make_dir(MPd), - ok; - true -> - Dir = mnesia_lib:dir(), - case lists:prefix(Dir, MPd) of - true -> - ok; - false -> - {error, exists} - end - end. - -%% delete_mountpoint(Tab) -> -%% MPd = data_mountpoint(Tab), -%% assert_proper_mountpoint(Tab, MPd), -%% ok = destroy_db(MPd, []). - -assert_proper_mountpoint(_Tab, _MPd) -> - %% TODO: not yet implemented. How to verify that the MPd var points - %% to the directory we actually want deleted? - ok. - -data_mountpoint(Tab) -> - Dir = mnesia_monitor:get_env(dir), - filename:join(Dir, tabname(Tab) ++ ".extrdb"). - -tabname({Tab, index, {{Pos},_}}) -> - atom_to_list(Tab) ++ "-=" ++ atom_to_list(Pos) ++ "=-_ix"; -tabname({Tab, index, {Pos,_}}) -> - atom_to_list(Tab) ++ "-" ++ integer_to_list(Pos) ++ "-_ix"; -tabname({Tab, retainer, Name}) -> - atom_to_list(Tab) ++ "-" ++ retainername(Name) ++ "-_RET"; -tabname(Tab) when is_atom(Tab) -> - atom_to_list(Tab) ++ "-_tab". - -retainername(Name) when is_atom(Name) -> - atom_to_list(Name); -retainername(Name) when is_list(Name) -> - try binary_to_list(list_to_binary(Name)) - catch - error:_ -> - lists:flatten(io_lib:write(Name)) - end; -retainername(Name) -> - lists:flatten(io_lib:write(Name)). 
- -related_resources(Tab) -> - TabS = atom_to_list(Tab), - Dir = mnesia_monitor:get_env(dir), - case file:list_dir(Dir) of - {ok, Files} -> - lists:flatmap( - fun(F) -> - Full = filename:join(Dir, F), - case is_index_dir(F, TabS) of - false -> - case is_retainer_dir(F, TabS) of - false -> - []; - {true, Name} -> - [{{Tab, retainer, Name}, Full}] - end; - {true, Pos} -> - [{{Tab, index, {Pos,ordered}}, Full}] - end - end, Files); - _ -> - [] - end. - -is_index_dir(F, TabS) -> - case re:run(F, TabS ++ "-([0-9]+)-_ix.extrdb", [{capture, [1], list}]) of - nomatch -> - false; - {match, [P]} -> - {true, list_to_integer(P)} - end. - -is_retainer_dir(F, TabS) -> - case re:run(F, TabS ++ "-(.+)-_RET", [{capture, [1], list}]) of - nomatch -> - false; - {match, [Name]} -> - {true, Name} - end. - --spec get_ref(alias(), table()) -> {rocksdb:db_handle(), table_type()}. -get_ref(Alias, Tab) -> - call(Alias, Tab, get_ref). - -fold(Alias, Tab, Fun, Acc, MS, N) -> - {Ref, Type} = get_ref(Alias, Tab), - do_fold(Ref, Tab, Type, Fun, Acc, MS, N). - -%% can be run on the server side. -do_fold(Ref, Tab, Type, Fun, Acc, MS, N) -> - {AccKeys, F} = - if is_function(Fun, 3) -> - {true, fun({K,Obj}, Acc1) -> - Fun(Obj, K, Acc1) - end}; - is_function(Fun, 2) -> - {false, Fun} - end, - do_fold1(do_select(Ref, Tab, Type, MS, AccKeys, N), F, Acc). - -do_fold1('$end_of_table', _, Acc) -> - Acc; -do_fold1({L, Cont}, Fun, Acc) -> - Acc1 = lists:foldl(Fun, Acc, L), - do_fold1(select(Cont), Fun, Acc1). - -is_wild('_') -> - true; -is_wild(A) when is_atom(A) -> - case atom_to_list(A) of - "\$" ++ S -> - try begin - _ = list_to_integer(S), - true - end - catch - error:_ -> - false - end; - _ -> - false - end; -is_wild(_) -> - false. diff --git a/src/mnesia_rocksdb_admin.erl b/src/mnesia_rocksdb_admin.erl new file mode 100644 index 0000000..a3b3b57 --- /dev/null +++ b/src/mnesia_rocksdb_admin.erl @@ -0,0 +1,1424 @@ +-module(mnesia_rocksdb_admin). + +-behaviour(gen_server). 
+ +-export([ ensure_started/0 + , add_aliases/1 + , remove_aliases/1 + , create_table/3 %% (Alias, Name, Props) -> {ok, Ref} | error() + , delete_table/2 %% (Alias, Name) -> ok + , load_table/2 %% (Alias, Name) -> ok + , related_resources/2 %% (Alias, Name) -> [RelatedTab] + , prep_close/2 %% (Alias, Tab) -> ok + , get_ref/1 %% (Name) -> Ref | abort() + , get_ref/2 %% (Name, Default -> Ref | Default + , request_ref/2 %% (Alias, Name) -> Ref + , close_table/2 + ]). + +-export([ migrate_standalone/2 ]). + +-export([ start_link/0 + , init/1 + , handle_info/2 + , handle_call/3 + , handle_cast/2 + , terminate/2 + , code_change/3 ]). + +-export([ read_info/1 %% (TRec) + , read_info/2 %% (Alias, Tab) + , read_info/4 %% (Alias, Tab, Key, Default) + , write_info/4 %% (Alias, Tab, Key, Value) + , write_table_property/3 %% (Alias, Tab, Property) + ]). + +-export([meta/0]). + +-include("mnesia_rocksdb.hrl"). +-include("mnesia_rocksdb_int.hrl"). +-include_lib("hut/include/hut.hrl"). + +-record(st, { + backends = #{} :: #{ alias() => backend() } + , standalone = #{} :: #{{alias(), table()} := cf() } + , default_opts = [] :: [{atom(), _}] + }). + +-type st() :: #st{}. + +-type alias() :: atom(). +-type tabname() :: atom(). +-type table() :: tabname() + | {admin, alias()} + | {tabname(), index, any()} + | {tabname(), retainer, any()}. + +-type backend() :: #{ db_ref := db_ref() + , cf_info := #{ table() := cf() } + }. +-type db_ref() :: rocksdb:db_handle(). +-type properties() :: [{atom(), any()}]. + +-type cf() :: mrdb:db_ref(). + +-type req() :: {create_table, table(), properties()} + | {delete_table, table()} + | {load_table, table()} + | {related_resources, table()} + | {get_ref, table()} + | {add_aliases, [alias()]} + | {write_table_property, tabname(), tuple()} + | {remove_aliases, [alias()]} + | {migrate, [{tabname(), map()}]} + | {prep_close, table()} + | {close_table, table()}. + +-type reason() :: any(). +-type reply() :: any(). 
+-type gen_server_reply() :: {reply, reply(), st()} + | {stop, reason(), reply(), st()}. + +-type gen_server_noreply() :: {noreply, st()} + | {stop, reason(), st()}. + +-define(PT_KEY, {mnesia_rocksdb, meta}). + +-spec ensure_started() -> ok. +ensure_started() -> + case whereis(?MODULE) of + undefined -> + do_start(); + _ -> + ok + end. + +do_start() -> + stick_rocksdb_dir(), + application:ensure_all_started(mnesia_rocksdb), + case mnesia_ext_sup:start_proc(?MODULE, ?MODULE, start_link, [], + [ {restart, permanent} + , {shutdown, 10000} + , {type, worker} + , {modules, [?MODULE]} ]) of + {ok, _Pid} -> + ok; + {error, {already_started, _Pid}} -> + ok + end. + +put_pt(Name, Value) -> + Meta = meta(), + persistent_term:put(?PT_KEY, Meta#{Name => Value}). + +put_pts_map(PTs) -> + Meta = maps:merge(meta(), PTs), + persistent_term:put(?PT_KEY, Meta). + +erase_pt(Name) -> + Meta = meta(), + persistent_term:put(?PT_KEY, maps:remove(Name, Meta)). + +%% Avoid multiple updates to persistent terms, since each will trigger +%% a gc. +erase_pt_list(Names) -> + Meta = meta(), + persistent_term:put(?PT_KEY, maps:without(Names, Meta)). + +meta() -> + persistent_term:get(?PT_KEY, #{}). + +prep_close(Alias, Tab) when is_atom(Tab) -> + call(Alias, {prep_close, Tab}); +prep_close(_, _) -> + ok. + +get_pt(Name, Default) -> + maps:get(Name, meta(), Default). + +create_table(Alias, Name, Props) -> + call(Alias, {create_table, Name, Props}). + +-spec delete_table(alias(), tabname()) -> ok. +delete_table(Alias, Name) -> + call(Alias, {delete_table, Name}). + +load_table(Alias, Name) -> + call(Alias, {load_table, Name}). + +related_resources(Alias, Name) -> + if is_atom(Name) -> + call(Alias, {related_resources, Name}); + true -> + [] + end. + +get_ref(Name) -> + case get_ref(Name, error) of + error -> + mnesia:abort({bad_type, Name}); + Other -> + Other + end. + +get_ref(Name, Default) -> + get_pt(Name, Default). + +request_ref(Alias, Name) -> + call(Alias, {get_ref, Name}). 
+ +close_table(Alias, Name) -> + call(Alias, {close_table, Name}). + +add_aliases(Aliases) -> + call([], {add_aliases, Aliases}). + +remove_aliases(Aliases) -> + call([], {remove_aliases, Aliases}). + +read_info(Alias, Tab, K, Default) -> + read_info_(get_ref({admin, Alias}), Tab, K, Default). + +read_info(Alias, Tab) -> + read_all_info_(get_ref({admin, Alias}), Tab). + +read_info(#{alias := _, name := Tab} = TRec) -> + read_all_info_(TRec, Tab). + +read_all_info_(ARef, Tab) -> + Pat = [{ {{info,Tab,'$1'},'$2'}, [], [{{'$1','$2'}}] }], + mrdb_select:select(ARef, Pat, infinity). + +read_info_(ARef, Tab, K, Default) -> + EncK = mnesia_rocksdb_lib:encode_key({info, Tab, K}, sext), + get_info_res(mrdb:rdb_get(ARef, EncK, []), Default). + +get_info_res(Res, Default) -> + case Res of + not_found -> + Default; + {ok, Bin} -> + %% no fancy tricks when encoding/decoding info values + binary_to_term(Bin); + {error, E} -> + error(E) + end. + +%% Admin info: metadata written by the admin proc to keep track of +%% the derived status of tables (such as detected version and encoding +%% of existing standalone tables.) +%% +write_admin_info(K, V, Alias, Name) -> + mrdb:rdb_put(get_ref({admin, Alias}), + admin_info_key(K, Name), + term_to_binary(V)). + +read_admin_info(K, Alias, Name) -> + EncK = admin_info_key(K, Name), + case mrdb:rdb_get(get_ref({admin,Alias}), EncK) of + {ok, Bin} -> + {ok, binary_to_term(Bin)}; + _ -> + error + end. + +delete_admin_info(K, Alias, Name) -> + EncK = admin_info_key(K, Name), + mrdb:rdb_delete(get_ref({admin, Alias}), EncK). + +admin_info_key(K, Name) -> + mnesia_rocksdb_lib:encode_key({admin_info, Name, K}, sext). + +%% Table metadata info maintained by users +%% +write_info(Alias, Tab, K, V) -> + write_info_(get_ref({admin, Alias}), Tab, K, V). + +write_info_(Ref, Tab, K, V) -> + EncK = mnesia_rocksdb_lib:encode_key({info,Tab,K}, sext), + maybe_write_standalone_info(Ref, K, V), + mrdb:rdb_put(Ref, EncK, term_to_binary(V), []). 
+ +maybe_write_standalone_info(Ref, K, V) -> + case Ref of + #{type := standalone, vsn := 1, db_ref := DbRef} -> + EncK = mnesia_rocksdb_lib:encode_key(K, sext), + Key = <>, + EncV = mnesia_rocksdb_lib:encode_val(V, term), + rocksdb:put(DbRef, Key, EncV, []); + _ -> + ok + end. + +write_table_property(Alias, Tab, Prop) when is_tuple(Prop), size(Prop) >= 1 -> + call(Alias, {write_table_property, Tab, Prop}). + +migrate_standalone(Alias, Tabs) -> + call(Alias, {migrate, Tabs}). + +-spec call(alias() | [], req()) -> no_return() | any(). +call(Alias, Req) -> + call(Alias, Req, infinity). + +call(Alias, Req, Timeout) -> + case gen_server:call(?MODULE, {Alias, Req}, Timeout) of + {abort, Reason} -> + mnesia:abort(Reason); + {error, {mrdb_abort, Reason}} -> + mnesia:abort(Reason); + Reply -> + Reply + end. + +start_link() -> + mrdb_mutex:ensure_tab(), + gen_server:start_link({local, ?MODULE}, ?MODULE, [], []). + +init([]) -> + Opts = default_opts(), + process_flag(trap_exit, true), + mnesia:subscribe({table, schema, simple}), + {ok, recover_state(#st{default_opts = Opts})}. + +recover_state(St0) -> + Meta = maps:to_list(meta()), + {Admins, Tabs} = lists:partition(fun is_admin/1, Meta), + recover_tabs(Tabs, recover_admins(Admins, St0)). + +is_admin({{admin,_},_}) -> true; +is_admin(_ ) -> false. + +recover_admins(Admins, St) -> + lists:foldl(fun recover_admin/2, St, Admins). + +recover_admin({{admin,Alias} = T, #{db_ref := DbRef, + cf_handle := CfH, + mountpoint := MP} = R}, + #st{backends = Backends} = St) -> + case cf_is_accessible(DbRef, CfH) of + true -> + B = #{cf_info => #{T => R}, + db_ref => DbRef, + mountpoint => MP}, + St#st{backends = Backends#{Alias => B}}; + false -> + error({cannot_access_alias_db, Alias}) + end. + +recover_tabs(Tabs, St) -> + lists:foldl(fun recover_tab/2, St, Tabs). 
+ +recover_tab({T, #{db_ref := DbRef, + cf_handle := CfH, + alias := Alias} = R}, St) -> + case cf_is_accessible(DbRef, CfH) of + true -> + update_cf(Alias, T, R, St); + false -> + error({cannot_access_table, T}) + end. + +%% TODO: generalize +get_aliases() -> + %% Return a list of registered aliases paired with alias-specific options + [{rocksdb_copies, []}]. + +default_opts() -> + %% TODO: make this configurable + []. + +alias_opts(Alias) -> + %% TODO: User should have a way to configure rocksdb options for the admin db + %% of a user-defined alias. + proplists:get_value(Alias, get_aliases(), []). + +maybe_load_admin_db({Alias, Opts}, #st{backends = Bs} = St) -> + case maps:find(Alias, Bs) of + {ok, #{db_ref := _}} -> + %% already open + St; + error -> + try_load_admin_db(Alias, Opts, St) + end. + +try_load_admin_db(Alias, AliasOpts, #st{ backends = Bs + , default_opts = DefaultOpts} = St) -> + case load_admin_db(Alias, AliasOpts ++ DefaultOpts) of + {ok, #{cf_info := CfI0, mountpoint := MP} = AdminDb} -> + %% We need to store the persistent ref explicitly here, + %% since mnesia knows nothing of our admin table. + AdminTab = {admin, Alias}, + CfI = update_cf_info(AdminTab, #{ status => open + , name => AdminTab + , vsn => ?VSN + , encoding => {sext,{value,term}} + , attr_pos => #{key => 1, + value => 2} + , mountpoint => MP + , properties => + #{ attributes => [key, val] + }}, CfI0), + PTs = maps:filter( + fun(T, _) -> + case T of + {admin,_} -> true; + {ext, _, _} -> true; + _ -> false + end + end, CfI), + put_pts_map(PTs), + Bs1 = Bs#{Alias => AdminDb#{cf_info => CfI}}, + St#st{backends = Bs1}; + {error, _} = Error -> + mnesia_lib:fatal("Cannot load admin db for alias ~p: ~p~n", + [Alias, Error]) + end. + +-spec handle_call({alias(), req()}, any(), st()) -> gen_server_reply(). 
+handle_call({[], {add_aliases, Aliases}}, _From, St) -> + St1 = do_add_aliases(Aliases, St), + {reply, ok, St1}; +handle_call({[], {remove_aliases, Aliases}}, _From, St) -> + St1 = do_remove_aliases(Aliases, St), + {reply, ok, St1}; +handle_call({Alias, Req}, _From, St) -> + handle_call_for_alias(Alias, Req, St); +handle_call(_Req, _From, St) -> + {reply, {error, unknown_request}, St}. + +-spec handle_cast(any(), st()) -> gen_server_noreply(). +handle_cast(_Msg, St) -> + {noreply, St}. + +-spec handle_info(any(), st()) -> gen_server_noreply(). +handle_info({mnesia_table_event, Event}, St) -> + ?log(debug, "Table event: ~p", [Event]), + case Event of + {write, {schema, Tab, Props}, _} -> + case find_cf(Tab, St) of + error -> + ?log(debug, "No Cf found (~p)", [Tab]), + {noreply, St}; + #{} = Cf -> + ?log(debug, "Located Cf: ~p", [Cf]), + case try_refresh_cf(Cf, Props, St) of + false -> + ?log(debug, "Nothing changed (~p)", [Tab]), + {noreply, St}; + {true, NewCf, St1} -> + ?log(debug, "NewCf = ~p", [NewCf]), + maybe_update_pt(Tab, NewCf), + {noreply, St1} + end + end; + _ -> + {noreply, St} + end; +handle_info(_Msg, St) -> + {noreply, St}. + +terminate(shutdown, St) -> + close_all(St), + ok; +terminate(_, _) -> + ok. + +code_change(_FromVsn, St, _Extra) -> + {ok, St}. + +-spec handle_call_for_alias(alias(), req(), st()) -> gen_server_reply(). +handle_call_for_alias(Alias, Req, #st{backends = Backends} = St) -> + case maps:find(Alias, Backends) of + {ok, Backend} -> + try handle_req(Alias, Req, Backend, St) + catch + error:E:ST -> + io:fwrite(standard_io, "CAUGHT error:~p / ~p~n", + [E, ST]), + {reply, {error, E}, St} + end; + error -> + {reply, {error, unknown_alias}, St} + end. + +do_add_aliases(Aliases, St) -> + New = [{A, alias_opts(A)} || A <- Aliases, + not is_open(A, St)], + lists:foldl(fun maybe_load_admin_db/2, St, New). 
+ +do_remove_aliases(Aliases, #st{backends = Bs} = St) -> + Known = intersection(Aliases, maps:keys(Bs)), + lists:foldl(fun remove_admin_db/2, St, Known). + +intersection(A, B) -> + A -- (A -- B). + +-spec handle_req(alias(), req(), backend(), st()) -> gen_server_reply(). +handle_req(Alias, {create_table, Name, Props}, Backend, St) -> + case create_trec(Alias, Name, Props, Backend, St) of + {ok, NewCf} -> + ?log(debug, "NewCf = ~p", [NewCf]), + St1 = update_cf(Alias, Name, NewCf, St), + {reply, {ok, NewCf}, St1}; + {error, _} = Error -> + {reply, Error, St} + end; +handle_req(Alias, {load_table, Name}, Backend, St) -> + case find_cf(Alias, Name, Backend, St) of + {ok, #{status := open}} -> + ?log(info, "load_table(~p) when table already loaded", [Name]), + {reply, ok, St}; + {ok, TRec} -> + case create_table_from_trec(Alias, Name, TRec, Backend, St) of + {ok, TRec1, St1} -> + TRec2 = TRec1#{status => open}, + St2 = update_cf(Alias, Name, TRec2, St1), + ?log(debug, "Table loaded ~p", [Name]), + put_pt(Name, TRec2), + {reply, ok, St2}; + {error, _} = Error -> + {reply, Error, St} + end; + error -> + {reply, {abort, {bad_type, Name}}, St} + end; +handle_req(_Alias, {prep_close, Name}, Backend, St) -> + ok_reply(do_prep_close(Name, Backend, St), St); +handle_req(Alias, {close_table, Name}, Backend, St) -> + ok_reply(do_close_table(Alias, Name, Backend, St), St); +handle_req(Alias, {delete_table, Name}, Backend, St) -> + case do_delete_table(Alias, Name, Backend, St) of + {ok, St1} -> + {reply, ok, maybe_update_main(Alias, Name, delete, St1)}; + {error, not_found} -> + {reply, ok, St} + end; +handle_req(Alias, {get_ref, Name}, Backend, #st{} = St) -> + case find_cf(Alias, Name, Backend, St) of + {ok, #{status := open} = Ref} -> + {reply, {ok, Ref}, St}; + {ok, _} -> % not open - treat as not_found + {reply, {error, not_found}, St}; + error -> + {reply, {error, not_found}, St} + end; +handle_req(_Alias, {related_resources, Tab}, Backend, St) -> + Res = 
get_related_resources(Tab, Backend), + {reply, Res, St}; +handle_req(Alias, {write_table_property, Tab, Prop}, Backend, St) -> + case find_cf(Alias, Tab, Backend, St) of + {ok, #{status := opens} = Cf0} -> + case mnesia_schema:schema_transaction( + fun() -> + erase_pt(Tab), + Cf = update_user_properties(Prop, Cf0), + St1 = update_cf(Alias, Tab, Cf, St), + put_pt(Tab, Cf), + St1 + end) of + {atomic, NewSt} -> + {reply, ok, NewSt}; + {aborted, _} -> + {reply, {error, badarg}, St} + end; + _ -> + {reply, {error, not_found}, St} + end; +handle_req(Alias, {migrate, Tabs0}, Backend, St) -> + case prepare_migration(Alias, Tabs0, St) of + {ok, Tabs} -> + {Res, St1} = do_migrate_tabs(Alias, Tabs, Backend, St), + {reply, Res, St1}; + {error, _} = Error -> + {reply, Error, St} + end. + + +%% if an index table has been created or deleted, make sure the main +%% ref reflects it. +maybe_update_main(Alias, {Main, index, I}, Op, St) -> + case find_cf_from_state(Alias, Main, St) of + {ok, #{properties := #{index := Index} = Props} = CfM} -> + case {Op, lists:member(I, Index)} of + {delete, true} -> + CfM1 = CfM#{properties => Props#{index => Index -- [I]}}, + maybe_update_pt(Main, CfM1), + update_cf(Alias, Main, CfM1, St); + _ -> + St + end; + _ -> + %% Might happen, perhaps. Don't worry about it here + St + end; +maybe_update_main(_, _, _, St) -> + St. + +%% The pt may not have been created yet. If so, don't do it here. +maybe_update_pt(Name, Ref) -> + case get_pt(Name, error) of + error -> + ok; + _Old -> + put_pt(Name, Ref) + end. + +ok_reply({ok, St}, _) -> + {reply, ok, St}; +ok_reply({error, _} = Error, St) -> + {reply, {abort, Error}, St}. + +get_related_resources(Tab, #{cf_info := CfInfo}) -> + F = fun(K, _, Acc) -> + acc_related_to(Tab, K, Acc) + end, + maps:fold(F, [], CfInfo). + +acc_related_to(T, {T, _, _} = Tab, Acc) -> + [Tab | Acc]; +acc_related_to(_, _, Acc) -> + Acc. 
+ +update_cf(Alias, Name, Cf, #st{backends = Bs} = St) -> + #{cf_info := CfI} = B = maps:get(Alias, Bs), + CfI1 = update_cf_info(Name, Cf, CfI), + St#st{backends = Bs#{Alias => B#{cf_info => CfI1}}}. + +update_cf_info(Name, Cf, CfI) -> + Cf1 = case maps:find(Name, CfI) of + {ok, Cf0} -> + maps:merge(Cf0, Cf); + error -> + Cf + end, + CfI#{Name => Cf1}. + +delete_cf(Alias, Name, #st{backends = Bs} = St) -> + #{cf_info := CfI} = B = maps:get(Alias, Bs), + CfI1 = maps:remove(Name, CfI), + St#st{backends = Bs#{Alias => B#{cf_info => CfI1}}}. + +find_cf_from_state(Alias, Name, #st{backends = Backends} = St) -> + case maps:find(Alias, Backends) of + {ok, Backend} -> + find_cf(Alias, Name, Backend, St); + error -> + {error, not_found} + end. + +find_cf(Name, #st{backends = Backends}) -> + maps:fold( + fun(_Alias, #{cf_info := CfI}, Acc) -> + case maps:find(Name, CfI) of + {ok, Cf} -> Cf; + error -> Acc + end + end, error, Backends). + +find_cf(Alias, Name, #{cf_info := CfI}, #st{standalone = Ts}) -> + case maps:find(Name, CfI) of + {ok, _} = Ok -> + Ok; + error -> + maps:find({Alias, Name}, Ts) + end. + +get_table_mountpoint(Alias, Name, #st{standalone = Ts}) -> + case maps:find({Alias, Name}, Ts) of + {ok, #{mountpoint := MP}} -> + {ok, MP}; + _ -> + error + end. + +cf_is_accessible(DbRef, CfH) -> + try _ = estimated_num_keys(DbRef, CfH), + true + catch + error:_ -> false + end. + +-dialyzer({nowarn_function, estimated_num_keys/2}). +estimated_num_keys(DbRef, CfH) -> + case rocksdb:get_property(DbRef, CfH, <<"rocksdb.estimate-num-keys">>) of + {error, _} -> 0; + {ok, Bin} -> + %% return value mis-typed in rocksdb as string() + binary_to_integer(Bin) + end. 
+ +create_trec(Alias, Name, Props, Backend, St) -> + %% io:fwrite("do_create_table(~p, ~p, ~p, ~p)~n", [Alias, Name, Backend, St]), + %% TODO: we're doing double-checking here + case find_cf(Alias, Name, Backend, St) of + {ok, #{status := open}} -> + {error, exists}; + {ok, TRec0} -> + do_create_trec(Alias, Name, Props, TRec0, St); + _Other -> + do_create_trec(Alias, Name, Props, #{}, St) + end. + +do_create_trec(Alias, Name, Props, TRec0, #st{} = St) -> + Type = case rdb_opt_standalone(Props) of + true -> standalone; + false -> column_family + end, + PMap = props_to_map(Name, Props), + {ok, maybe_map_retainer( + Alias, Name, + maybe_map_index( + Alias, Name, + maybe_map_attrs( + TRec0#{ semantics => semantics(Name, PMap) + , name => Name + , type => Type + , alias => Alias + , properties => PMap + , status => created })), St)}. + +create_table_from_trec(Alias, Name, #{cf_handle := CfH, db_ref := DbRef} = R, + _Backend, St) -> + case cf_is_accessible(DbRef, CfH) of + true -> + R1 = check_version_and_encoding(R), + {ok, R1, update_cf(Alias, Name, R1, St)}; + false -> + {stop, {cf_not_accessible, {Alias, Name}}, St} + end; +create_table_from_trec(Alias, Name, #{type := column_family} = TRec, + #{db_ref := DbRef} = Backend, St) -> + case should_we_migrate_standalone(TRec) of + false -> + create_table_as_cf(Alias, Name, TRec#{db_ref => DbRef}, St); + {false, MP} -> + ?log(debug, "will create ~p as standalone (no migrate)", [Name]), + create_table_as_standalone(Alias, Name, true, MP, TRec, St); + {true, MP} -> + ?log(debug, "will create ~p as standalone and migrate", [Name]), + case create_table_as_standalone(Alias, Name, false, MP, TRec, St) of + {ok, OldTRec, _} -> + create_cf_and_migrate(Alias, Name, OldTRec, TRec, Backend, St); + Other -> + ?log(info, "Couldn't open what seems to be a standalone table" + " (~p): ~p", [Name, Other]), + create_table_as_cf(Alias, Name, TRec#{db_ref => DbRef}, St) + end + end; +create_table_from_trec(Alias, Name, #{type := standalone} = 
TRec, _, St) -> + {Exists, MP} = table_exists_as_standalone(Name), + create_table_as_standalone(Alias, Name, Exists, MP, TRec, St). + +create_cf_and_migrate(Alias, Name, OldTRec, TRec, #{db_ref := DbRef}, St) -> + ?log(debug, "Migrate to cf (~p)", [Name]), + {ok, NewCf, St1} = create_table_as_cf( + Alias, Name, TRec#{db_ref => DbRef}, St), + {ok, St2} = migrate_standalone_to_cf(OldTRec, NewCf, St1), + {ok, NewCf, St2}. + +%% Return {Migrate, MP} iff table exists standalone; just false if it doesn't +should_we_migrate_standalone(#{name := Name}) -> + case table_exists_as_standalone(Name) of + {true, MP} -> + ?log(debug, "table ~p exists as standalone: ~p", [Name, MP]), + case auto_migrate_to_cf(Name) of + true -> + ?log(debug, "auto_migrate(~p): true", [Name]), + {true, MP}; + false -> + ?log(debug, "auto_migrate(~p): false", [Name]), + {false, MP} + end; + {false, _} -> + false + end. + +prepare_migration(Alias, Tabs, St) -> + Res = lists:map(fun(T) -> + prepare_migration_(Alias, T, St) + end, Tabs), + Res1 = add_related_tabs(Res, maps:get(Alias, St#st.backends), Alias, St), + case [E || {error, _} = E <- Res1] of + [] -> {ok, Res1}; + [_|_] = Errors -> + {error, Errors} + end. + +add_related_tabs(Ts, Backend, Alias, St) -> + lists:flatmap( + fun({error,_} = E) -> [E]; + ({T, _, _} = TI) -> + [TI | [prepare_migration_(Alias, Rel, St) + || Rel <- get_related_resources(T, Backend)]] + end, Ts). + +prepare_migration_(Alias, T, #st{} = St) -> + {TName, Opts} = case T of + {_, Map} when is_map(Map) -> T; + _ -> {T, #{}} + end, + case find_cf_from_state(Alias, TName, St) of + {ok, #{type := standalone} = TRec} -> + TRec1 = apply_migration_opts( + Opts, + maps:without([db_ref, type, cf_info, cf_handle, + encoding, mountpoint, status], TRec)), + {TName, TRec, TRec1}; + {ok, _} -> + {error, {not_standalone, TName}}; + error -> + {error, {no_such_table, TName}} + end. 
+ +do_migrate_tabs(Alias, Tabs, Backend, St) -> + lists:mapfoldl(fun(T, St1) -> + do_migrate_table(Alias, T, Backend, St1) + end, St, Tabs). + +do_migrate_table(Alias, {Name, OldTRec, TRec0}, Backend, St) when is_map(TRec0) -> + T0 = erlang:system_time(millisecond), + TRec = maps:without([encoding, vsn], TRec0), + maybe_write_user_props(TRec), + {ok, CF, St1} = create_cf_and_migrate(Alias, Name, OldTRec, + TRec, Backend, St), + put_pt(Name, CF), + T1 = erlang:system_time(millisecond), + Time = T1 - T0, + io:fwrite("~p migrated, ~p ms~n", [Name, Time]), + {{Name, {ok, Time}}, St1}. + +migrate_standalone_to_cf(OldTRec, #{name := T, alias := Alias} = TRec, + #st{standalone = Ts} = St) -> + ChunkSz = chunk_size(TRec), + KeyPos = mnesia_rocksdb_lib:keypos(T), + migrate_to_cf(mrdb:select(OldTRec, [{'_',[],['$_']}], ChunkSz), + TRec, OldTRec, KeyPos), + case maps:is_key({Alias,T}, Ts) + andalso table_is_empty(OldTRec) of + true -> + St1 = close_and_delete_standalone(OldTRec, St), + {ok, St1}; + false -> + {ok, St} + end. + +migrate_to_cf({L, Cont}, Cf, DbRec, KeyPos) -> + mrdb:as_batch( + Cf, + fun(New) -> + mrdb:as_batch( + DbRec, + fun(Old) -> + lists:foreach( + fun(Obj) -> + mrdb:insert(New, Obj), + mrdb:delete(Old, element(KeyPos,Obj)) + end, L) + end) + end), + migrate_to_cf(cont(Cont), Cf, DbRec, KeyPos); +migrate_to_cf('$end_of_table', _, _, _) -> + ok. + +cont('$end_of_table' = E) -> E; +cont(F) when is_function(F,0) -> + F(). + +chunk_size(_) -> + 300. 
+ +maybe_write_user_props(#{name := T, properties := #{user_properties := UPMap}}) -> + %% The UP map is #{Key => Prop}, where element(1, Prop) == Key + UPs = maps:values(UPMap), + SchemaProps = mnesia:table_info(T, user_properties), + WritePs = props_to_write(UPs, SchemaProps), + DelPs = props_to_delete(UPs, SchemaProps), + case {WritePs, DelPs} of + {[], []} -> ok; + _ -> + mnesia_schema:schema_transaction( + fun() -> + [mnesia_schema:do_write_table_property(T, P) + || P <- WritePs], + [mnesia_schema:do_delete_table_property(T, K) + || K <- DelPs] + end) + end; +maybe_write_user_props(#{} = TRec) -> + TRec. + +props_to_write(UPs, SchemaProps) -> + %% Include both new and modified + [P || P <- UPs, + not lists:member(P, SchemaProps)]. + +props_to_delete(UPs, SchemaProps) -> + lists:filtermap( + fun(P) -> + K = element(1, P), + case lists:keymember(K, 1, UPs) of + false -> {true, K}; + true -> false + end + end, SchemaProps). + +apply_migration_opts(Opts, TRec) -> + TRec1 = trec_without_user_prop(rocksdb_standalone, TRec), + try maps:fold(fun apply_migration_opt/3, TRec1, Opts) + catch + throw:Error -> + Error + end. + +apply_migration_opt(user_properties, UPs, #{properties := Props} = TRec) -> + lists:foreach( + fun(P) when is_tuple(P), size(P) >= 1 -> ok; + (P) -> + throw({error, {invalid_user_property, {tname(TRec), P}}}) + end, UPs), + TRec#{properties => Props#{user_properties => UPs}}; +apply_migration_opt(encoding, Enc0, #{properties := Props} = TRec) -> + case mnesia_rocksdb_lib:check_encoding(Enc0, maps:get(attributes, Props)) of + {ok, Enc} -> + update_user_properties({mrdb_encoding, Enc}, TRec); + {error, _} -> + throw({error, {invalid_encoding, {tname(TRec), Enc0}}}) + end. + +trec_without_user_prop(P, #{properties := #{user_properties := UPs} = Ps} = T) -> + T#{properties := Ps#{user_properties := maps:remove(P, UPs)}}; +trec_without_user_prop(_, TRec) -> + TRec. 
+ +maybe_map_retainer(Alias, {MainTab, retainer, _}, #{properties := Ps0} = Map, St) -> + {ok, #{properties := #{record_name := RecName}}} = + find_cf_from_state(Alias, MainTab, St), + Map#{properties => Ps0#{record_name => RecName}}; +maybe_map_retainer(_, _, Map, _) -> + Map. + +maybe_map_index(Alias, {MainTab, index, {Pos, _IxType}}, Map) -> + Storage = {ext, Alias, mnesia_rocksdb}, + Map#{ ix_vals_f => mnesia_index:index_vals_f(Storage, MainTab, Pos) }; +maybe_map_index(_, _, Map) -> + Map. + +maybe_map_attrs(#{name := {_,retainer,_}} = R) -> + R#{attr_pos => #{key => 2, val => 3}}; +maybe_map_attrs(#{name := Name, properties := #{attributes := Attrs}} = R) + when is_atom(Name) -> + {AMap, _} = lists:foldl( + fun(A, {M, P}) -> + {M#{A => P}, P+1} + end, {#{}, 2}, Attrs), + R#{attr_pos => AMap}; +maybe_map_attrs(R) -> + R. + +rdb_opt_standalone(Props) -> + (os:getenv("MRDB_LEGACY") == "true") orelse + proplists:get_bool( + rocksdb_standalone, proplists:get_value(user_properties, Props, [])). + +auto_migrate_to_cf(Name) -> + Tabs = application:get_env(mnesia_rocksdb, auto_migrate_to_cf, []), + lists:member(Name, Tabs). + +props_to_map(TabName, Props) when is_atom(TabName) -> + #{user_properties := UPs} = PMap = maps:without([name, cookie, version], + maps:from_list(Props)), + %% Note that user properties can have arity >= 1 + PMap#{user_properties => maps:from_list([{element(1,P), P} || P <- UPs])}; +props_to_map({Tab,_,_}, _) -> + #{main_table => Tab, attributes => [key, val]}. + +try_refresh_cf(#{alias := Alias, name := Name, properties := Ps} = Cf, Props, St) -> + PMap = props_to_map(Name, Props), + ?log(debug, "PMap = ~p", [PMap]), + case PMap =:= Ps of + true -> false; + false -> + NewCf = maybe_map_attrs(Cf#{properties => PMap}), + {true, NewCf, update_cf(Alias, Name, NewCf, St)} + end; +try_refresh_cf(_, _, _) -> + false. 
+ +update_user_properties(Prop, #{properties := Ps} = Cf) -> + Key = element(1, Prop), + UserProps = case maps:find(user_properties, Ps) of + {ok, UPs} -> UPs#{Key => Prop}; + error -> #{Key => Prop} + end, + Cf#{properties => Ps#{user_properties => UserProps}}. + +semantics({_,index,_} , _) -> ordered_set; +semantics({_,retainer,_}, _) -> set; +semantics(T, #{type := Type}) when is_atom(T) -> Type. + +table_exists_as_standalone(Name) -> + MP = mnesia_rocksdb_lib:data_mountpoint(Name), + Exists = case file:read_link_info(MP) of + {ok, _} -> true; + {error, _} -> false + end, + {Exists, MP}. + +create_table_as_standalone(Alias, Name, Exists, MP, TRec, St) -> + case create_table_as_standalone_(Alias, Name, Exists, MP, TRec, St) of + {ok, #{type := standalone, vsn := Vsn1, + encoding := Enc1} = Cf, _St1} = Ok -> + write_admin_info(standalone_vsn_and_enc, {Vsn1, Enc1}, + Alias, Name), + case Vsn1 of + 1 -> + load_info(Alias, Name, Cf); + _ -> + skip + end, + Ok; + Other -> + Other + end. + +create_table_as_standalone_(Alias, Name, Exists, MP, TRec, St) -> + Vsn = check_version(TRec), + TRec1 = TRec#{vsn => Vsn, encoding => get_encoding(Vsn, TRec)}, + do_open_standalone(true, Alias, Name, Exists, MP, TRec1, St). + +do_open_standalone(CreateIfMissing, Alias, Name, Exists, MP, TRec0, + #st{standalone = Ts} = St) -> + Opts = rocksdb_opts_from_trec(TRec0), + case open_db_(MP, Alias, Opts, [], CreateIfMissing) of + {ok, #{ cf_info := CfI }} -> + DbRec = maps:get({ext,Alias,"default"}, CfI), + ?log(debug, "successfully opened db ~p", [Name]), + CfNames = maps:keys(CfI), + DbRec1 = DbRec#{ cfs => CfNames, + mountpoint => MP }, + TRec = maps:merge(TRec0, DbRec#{type => standalone}), + TRec1 = guess_table_vsn_and_encoding(Exists, TRec), + ?log(debug, "TRec1 = ~p", [TRec1]), + {ok, TRec1, St#st{standalone = Ts#{{Alias, Name} => DbRec1}}}; + {error, _} = Err -> + ?log(debug, "open_db error: ~p", [Err]), + Err + end. 
+ +%% When opening a standalone table, chances are it's a legacy table +%% where legacy encoding is already in place. We try to read the +%% first object and apply legacy encoding. If successful, we set +%% legacy encoding in the TRec. If we migrate the data to a column +%% family, we should apply the defined encoding for the cf. +%% +%% The first object can either be an info object (in the legacy case) +%% or a data object, with a sext-encoded key, and a term_to_binary- +%% encoded object as value, where the key position is set to []. +%% The info objects would be sext-encoded key + term-encoded value. +guess_table_vsn_and_encoding(false, TRec) -> + TRec; +guess_table_vsn_and_encoding(true, #{properties := #{attributes := As}, + alias := Alias, name := Name} = R) -> + ?log(debug, "guess_vsn_and_encoding(R = ~p)", [R]), + case read_admin_info(standalone_vsn_and_enc, Alias, Name) of + {ok, {V, E}} -> + ?log(debug, "admin_info exists: ~p", [{V,E}]), + R#{vsn => V, encoding => E}; + error -> + ?log(debug, "no admin_info; will iterate", []), + R1 = set_default_guess(R), + mrdb:with_rdb_iterator( + R1, fun(I) -> + guess_table_vsn_and_encoding_( + mrdb:rdb_iterator_move(I, first), I, As, R1) + end) + end. + +set_default_guess(#{type := standalone} = R) -> + case application:get_env(mnesia_rocksdb, standalone_default_vsn, ?VSN) of + 1 -> + R#{vsn => 1, encoding => {sext, {object, term}}}; + V -> + R#{vsn => V} + end. 
+ +guess_table_vsn_and_encoding_({ok, K, V}, _I, As, R) -> + Arity = length(As) + 1, + case K of + <> -> + try _ = {mnesia_rocksdb_lib:decode(EncK, sext), + mnesia_rocksdb_lib:decode(V, term)}, + %% This is a vsn 1 standalone table + ?log(debug, "Found info tag; this is a vsn 1", []), + R#{vsn => 1, encoding => {sext, {object, term}}} + catch + error:_ -> + ?log(debug, "caught bad guess K=~p, V=~p", [K,V]), + R + end; + _ -> + ?log(debug, "not info obj K=~p", [K]), + Enc = guess_obj_encoding(K, V, Arity), + ?log(debug, "guessed Enc = ~p", [Enc]), + R#{encoding => Enc} + end; +guess_table_vsn_and_encoding_(Other, _, _, R) -> + ?log(debug, "Iter Other=~p", [Other]), + R. + +guess_obj_encoding(K, V, Arity) -> + {guess_key_encoding(K), guess_val_encoding(V, Arity)}. + +guess_encoding(Bin) -> + try {sext, sext:decode(Bin)} + catch + error:_ -> + try {term, binary_to_term(Bin)} + catch + error:_ -> raw + end + end. + +guess_key_encoding(Bin) -> + case guess_encoding(Bin) of + raw -> raw; + {Enc, _} -> Enc + end. + +guess_val_encoding(Bin, Arity) -> + case guess_encoding(Bin) of + raw -> {value, raw}; + {Enc, Term} -> + if is_tuple(Term), size(Term) == Arity, + element(2, Term) == [] -> + {object, Enc}; + true -> + {value, Enc} + end + end. + +%% This is slightly different from `rocksdb:is_empty/1`, since it allows +%% for the presence of some metadata, and still considers it empty if there +%% is no user data. +table_is_empty(#{} = DbRec) -> + Start = iterator_data_start(DbRec), + mrdb:with_rdb_iterator( + DbRec, fun(I) -> + case mrdb:rdb_iterator_move(I, Start) of + {ok, _, _} -> false; + _ -> true + end + end). + +iterator_data_start(#{vsn := 1}) -> <>; +iterator_data_start(_) -> first. + +load_info(Alias, Name, Cf) -> + ARef = get_ref({admin, Alias}), + mrdb:with_rdb_iterator( + Cf, fun(I) -> + load_info_(rocksdb:iterator_move(I, first), I, ARef, Name) + end). 
+ +load_info_(Res, I, ARef, Tab) -> + case Res of + {ok, << ?INFO_TAG, K/binary >>, V} -> + DecK = mnesia_rocksdb_lib:decode_key(K), + case read_info_(ARef, Tab, DecK, undefined) of + undefined -> + write_info_(ARef, Tab, DecK, V); + _ -> + skip + end, + load_info_(rocksdb:iterator_move(I, next), I, ARef, Tab); + _ -> + ok + end. + +check_version(TRec) -> + user_property(mrdb_version, TRec, ?VSN). + +check_version_and_encoding(#{} = TRec) -> + Vsn = check_version(TRec), + Encoding = get_encoding(Vsn, TRec), + TRec#{vsn => Vsn, encoding => Encoding}. + +%% This access function assumes that the requested user property is +%% a 2-tuple. Mnesia allows user properties to be any non-empty tuple. +user_property(P, #{properties := #{user_properties := UPs}}, Default) -> + case maps:find(P, UPs) of + {ok, {_, V}} -> V; + error -> Default + end; +user_property(_, _, Default) -> + Default. + +tname(#{name := Name}) -> Name. + +get_encoding(1, _) -> {sext, {object, term}}; +get_encoding(?VSN, TRec) -> + case user_property(mrdb_encoding, TRec, undefined) of + undefined -> + default_encoding(TRec); + E -> + check_encoding(E, TRec) + end. + +default_encoding(#{name := Name, semantics := Sem, + properties := #{attributes := As}}) -> + mnesia_rocksdb_lib:default_encoding(Name, Sem, As). + +check_encoding(E, #{properties := #{attributes := As}}) -> + case mnesia_rocksdb_lib:check_encoding(E, As) of + {ok, Encoding} -> Encoding; + _Error -> + mrdb:abort(invalid_encoding) + end. + +rocksdb_opts_from_trec(TRec) -> + user_property(rocksdb_opts, TRec, []). + +create_table_as_cf(Alias, Name, #{db_ref := DbRef} = R, St) -> + CfName = tab_to_cf_name(Name), + case rocksdb:create_column_family(DbRef, CfName, cfopts()) of + {ok, CfH} -> + R1 = check_version_and_encoding(R#{ cf_handle => CfH + , type => column_family }), + {ok, R1, update_cf(Alias, Name, R1, St)}; + {error, _} = Error -> + Error + end. 
+ +do_prep_close(Name, Backend, St) -> + RelTabs = get_related_resources(Name, Backend), + erase_pt_list([Name | RelTabs]), + {ok, St}. + +close_all(#st{backends = Bs, standalone = Ts}) -> + persistent_term:erase(?PT_KEY), + maps:fold(fun close_backend/3, ok, Bs), + maps:fold(fun close_standalone/3, ok, Ts). + +close_backend(_Alias, #{db_ref := DbRef}, _) -> + _ = rocksdb_close(DbRef), + ok. + +close_standalone({_Alias, _Name}, #{db_ref := DbRef}, _) -> + _ = rocksdb_close(DbRef), + ok. + +do_close_table(Alias, Name, Backend, #st{standalone = Ts} = St) -> + case find_cf(Alias, Name, Backend, St) of + {ok, #{type := column_family} = Cf} -> + %% We don't actually close column families + erase_pt(Name), + {ok, update_cf(Alias, Name, Cf#{status => closed}, St)}; + {ok, #{type := standalone, db_ref := DbRef}} -> + T = {Alias, Name}, + TRec = maps:get(T, Ts), + erase_pt(Name), + St1 = St#st{standalone = Ts#{T => TRec#{status => closed}}}, + _ = rocksdb_close(DbRef), + {ok, St1}; + error -> + {error, not_found} + end. + +do_delete_table(Alias, Name, Backend, #st{} = St) -> + case find_cf(Alias, Name, Backend, St) of + {ok, Where} -> + erase_pt(Name), + case Where of + #{db_ref := DbRef, cf_handle := CfH, type := column_family} -> + rocksdb:drop_column_family(DbRef, CfH), + rocksdb:destroy_column_family(DbRef, CfH), + {ok, delete_cf(Alias, Name, St)}; + #{type := standalone} = R -> + St1 = close_and_delete_standalone(R, St), + {ok, St1} + end; + error -> + {error, not_found} + end. + +load_admin_db(Alias, Opts) -> + DbName = {admin, Alias}, + open_db(DbName, Alias, Opts, [DbName], true). + +open_db(DbName, Alias, Opts, CFs, CreateIfMissing) -> + MP = mnesia_rocksdb_lib:data_mountpoint(DbName), + open_db_(MP, Alias, Opts, CFs, CreateIfMissing). 
+ +open_db_(MP, Alias, Opts, CFs0, CreateIfMissing) -> + Acc0 = #{ mountpoint => MP }, + case filelib:is_dir(MP) of + false when CreateIfMissing -> + %% not yet created + CFs = cfs(CFs0), + file:make_dir(MP), + OpenOpts = [ {create_if_missing, true} + , {create_missing_column_families, true} + , {merge_operator, erlang_merge_operator} + | Opts ], + OpenRes = mnesia_rocksdb_lib:open_rocksdb(MP, OpenOpts, CFs), + map_cfs(OpenRes, CFs, Alias, Acc0); + false -> + {error, enoent}; + true -> + %% Assumption: even an old rocksdb database file will have at least "default" + {ok,CFs} = rocksdb:list_column_families(MP, Opts), + CFs1 = [{CF,[]} || CF <- CFs], %% TODO: this really needs more checking + map_cfs(rocksdb_open(MP, Opts, CFs1), CFs1, Alias, Acc0) + end. + +rocksdb_open(MP, Opts, CFs) -> + %% rocksdb:open(MP, Opts, CFs), + mnesia_rocksdb_lib:open_rocksdb(MP, Opts, CFs). + +is_open(Alias, #st{backends = Bs}) -> + case maps:find(Alias, Bs) of + {ok, #{db_ref := _}} -> + true; + _ -> + false + end. + +remove_admin_db(Alias, #st{backends = Bs} = St) -> + case maps:find(Alias, Bs) of + {ok, #{db_ref := DbRef, mountpoint := MP}} -> + close_and_delete(DbRef, MP), + St#st{backends = maps:remove(Alias, Bs)}; + error -> + St + end. + +%% TODO: Support user provision of cf-specific options +cfs(CFs) -> + [{"default", cfopts()}] ++ lists:flatmap(fun admin_cfs/1, CFs). + +cfopts() -> + [{merge_operator, erlang_merge_operator}]. + +admin_cfs(Tab) when is_atom(Tab) -> [ {tab_to_cf_name(Tab), cfopts()} ]; +admin_cfs({_, _, _} = T) -> [ {tab_to_cf_name(T), cfopts()} ]; +admin_cfs({admin, _Alias} = A) -> [ {tab_to_cf_name(A), cfopts()} ]; +admin_cfs({ext, CF}) -> [ {CF, cfopts()} ]; +admin_cfs({info, _} = I) -> [ {tab_to_cf_name(I), cfopts()} ]. 
+ + +map_cfs({ok, Ref, CfHandles}, CFs, Alias, Acc) -> + ZippedCFs = lists:zip(CFs, CfHandles), + %% io:fwrite("ZippedCFs = ~p~n", [ZippedCFs]), + CfInfo = maps:from_list( + [{cf_name_to_tab(N, Alias), #{ db_ref => Ref + , cf_handle => H + , alias => Alias + , status => pre_existing + , type => column_family }} + || {{N,_}, H} <- ZippedCFs]), + {ok, Acc#{ db_ref => Ref + , cf_info => CfInfo }}. + +tab_to_cf_name(Tab) when is_atom(Tab) -> write_term({d, Tab}); +tab_to_cf_name({admin, Alias}) -> write_term({a, Alias}); +tab_to_cf_name({info, Tab}) -> write_term({n, Tab}); +tab_to_cf_name({Tab, index, I}) -> write_term({i, Tab, I}); +tab_to_cf_name({Tab, retainer, R}) -> write_term({r, Tab, R}). + +write_term(T) -> + lists:flatten(io_lib:fwrite("~w", [T])). + +cf_name_to_tab(Cf, Alias) -> + case read_term(Cf) of + {ok, {d, Table}} -> Table; + {ok, {i, Table, I}} -> {Table, index, I}; + {ok, {r, Table, R}} -> {Table, retainer, R}; + {ok, {n, Table}} -> {info, Table}; + {ok, {a, Alias}} -> {admin, Alias}; + _ -> + {ext, Alias, Cf} + end. + +read_term(Str) -> + case erl_scan:string(Str) of + {ok, Tokens, _} -> + erl_parse:parse_term(Tokens ++ [{dot,1}]); + Other -> + Other + end. + +%% Prevent reloading of modules in rocksdb itself during runtime, since it +%% can lead to inconsistent state in rocksdb and silent data corruption. +stick_rocksdb_dir() -> + case code:which(rocksdb) of + BeamPath when is_list(BeamPath), BeamPath =/= "" -> + Dir = filename:dirname(BeamPath), + case code:stick_dir(Dir) of + ok -> ok; + error -> warn_stick_dir({error, Dir}) + end; + Other -> + warn_stick_dir({not_found, Other}) + end. + +warn_stick_dir(Reason) -> + mnesia_lib:warning("cannot make rocksdb directory sticky:~n~p~n", + [Reason]). 
+ +close_and_delete_standalone(#{alias := Alias, + name := Name, + type := standalone, + db_ref := DbRef}, St) -> + case get_table_mountpoint(Alias, Name, St) of + {ok, MP} -> + close_and_delete(DbRef, MP), + delete_admin_info(standalone_vsn_and_enc, Alias, Name), + St#st{standalone = maps:remove({Alias,Name}, St#st.standalone)}; + error -> + St + end. + +close_and_delete(DbRef, MP) -> + try rocksdb_close(DbRef) catch error:_ -> ok end, + destroy_db(MP, []). + +rocksdb_close(undefined) -> + ok; +rocksdb_close(Ref) -> + Res = rocksdb:close(Ref), + erlang:garbage_collect(), + Res. + +destroy_db(MPd, Opts) -> + destroy_db(MPd, Opts, get_retries()). + +%% Essentially same code as above. +destroy_db(MPd, Opts, Retries) -> + _DRes = destroy_db(MPd, Opts, max(1, Retries), undefined), + [_|_] = MPd, % ensure MPd is non-empty + _RmRes = os:cmd("rm -rf " ++ MPd ++ "/*"), + ok. + +destroy_db(_, _, 0, LastError) -> + {error, LastError}; +destroy_db(MPd, Opts, RetriesLeft, _) -> + case rocksdb:destroy(MPd, Opts) of + ok -> + ok; + %% Check specifically for lock error, this can be caused if + %% destroy follows quickly after close. + {error, {error_db_destroy, Err}=Reason} -> + case lists:prefix("IO error: lock ", Err) of + true -> + SleepFor = get_retry_delay(), + timer:sleep(SleepFor), + destroy_db(MPd, Opts, RetriesLeft - 1, Reason); + false -> + {error, Reason} + end; + {error, Reason} -> + {error, Reason} + end. + +get_retries() -> 30. +get_retry_delay() -> 10000. diff --git a/src/mnesia_rocksdb_int.hrl b/src/mnesia_rocksdb_int.hrl new file mode 100644 index 0000000..3bfcfe7 --- /dev/null +++ b/src/mnesia_rocksdb_int.hrl @@ -0,0 +1,17 @@ +-include_lib("hut/include/hut.hrl"). + +%% enable debugging messages through mnesia:set_debug_level(debug) +-ifndef(MNESIA_ROCKSDB_NO_DBG). +-define(dbg(Fmt, Args), ?log(debug, Fmt, Args)). 
+%% -define(dbg(Fmt, Args), +%% %% avoid evaluating Args if the message will be dropped anyway +%% case mnesia_monitor:get_env(debug) of +%% none -> ok; +%% verbose -> ok; +%% _ -> mnesia_lib:dbg_out("~p:~p: "++(Fmt),[?MODULE,?LINE|Args]) +%% end). +-else. +-define(dbg(Fmt, Args), ok). +-endif. + +-define(DEFAULT_RETRIES, 1). diff --git a/src/mnesia_rocksdb_lib.erl b/src/mnesia_rocksdb_lib.erl index 0a4a463..d42b340 100644 --- a/src/mnesia_rocksdb_lib.erl +++ b/src/mnesia_rocksdb_lib.erl @@ -1,17 +1,341 @@ -%%% @doc RocksDB update wrappers, in separate module for easy tracing and mocking. +%%% @doc RocksDB update wrappers, in separate module for easy tracing and mocking. %%% -module(mnesia_rocksdb_lib). --export([put/4, - write/3, - delete/3]). +-export([ put/4 + , write/3 + , delete/3 + ]). +-export([ open_rocksdb/3 + , data_mountpoint/1 + , create_mountpoint/1 + , tabname/1 + ]). +-export([ default_encoding/3 + , check_encoding/2 + , valid_obj_type/2 + , valid_key_type/2 ]). + +-export([ keypos/1 + , encode_key/1, encode_key/2 + , decode_key/1, decode_key/2 + , encode_val/1, encode_val/2 + , decode_val/1, decode_val/3 + , encode/2 + , decode/2 + ]). + +-include("mnesia_rocksdb.hrl"). +-include_lib("hut/include/hut.hrl"). + +put(#{db := Ref, cf := CF}, K, V, Opts) -> + rocksdb:put(Ref, CF, K, V, Opts); put(Ref, K, V, Opts) -> rocksdb:put(Ref, K, V, Opts). -write(Ref, L, Opts) -> - rocksdb:write(Ref, L, Opts). +write(#{db := Ref, cf := CF}, L, Opts) -> + write_as_batch(L, Ref, CF, Opts). delete(Ref, K, Opts) -> rocksdb:delete(Ref, K, Opts). + + +write_as_batch(L, Ref, CF, Opts) -> + {ok, Batch} = rocksdb:batch(), + lists:foreach( + fun({put, K, V}) -> + ok = rocksdb:batch_put(Batch, CF, K, V); + ({delete, K}) -> + ok = rocksdb:batch_delete(Batch, CF, K) + end, L), + rocksdb:write_batch(Ref, Batch, Opts). 
+ +create_mountpoint(Tab) -> + MPd = data_mountpoint(Tab), + case filelib:is_dir(MPd) of + false -> + file:make_dir(MPd), + ok; + true -> + Dir = mnesia_lib:dir(), + case lists:prefix(Dir, MPd) of + true -> + ok; + false -> + {error, exists} + end + end. + +data_mountpoint(Tab) -> + Dir = mnesia_monitor:get_env(dir), + filename:join(Dir, tabname(Tab) ++ ".extrdb"). + +tabname({admin, Alias}) -> + "mnesia_rocksdb-" ++ atom_to_list(Alias) ++ "-_db"; +tabname({Tab, index, {{Pos},_}}) -> + atom_to_list(Tab) ++ "-=" ++ atom_to_list(Pos) ++ "=-_ix"; +tabname({Tab, index, {Pos,_}}) -> + atom_to_list(Tab) ++ "-" ++ integer_to_list(Pos) ++ "-_ix"; +tabname({Tab, retainer, Name}) -> + atom_to_list(Tab) ++ "-" ++ retainername(Name) ++ "-_RET"; +tabname(Tab) when is_atom(Tab) -> + atom_to_list(Tab) ++ "-_tab". + +default_encoding({_, index, _}, _, _) -> + {sext, {value, raw}}; +default_encoding({_, retainer, _}, _, _) -> + {term, {value, term}}; +default_encoding(_, Type, As) -> + KeyEnc = case Type of + ordered_set -> sext; + set -> term; + bag -> sext + end, + ValEnc = case As of + [_, _] -> + {value, term}; + [_, _ | _] -> + {object, term} + end, + {KeyEnc, ValEnc}. + +check_encoding(Encoding, Attributes) -> + try check_encoding_(Encoding, Attributes) + catch + throw:Error -> + Error + end. + +check_encoding_({Key, Val}, As) -> + Key1 = check_key_encoding(Key), + Val1 = check_value_encoding(Val, As), + {ok, {Key1, Val1}}; +check_encoding_(E, _) -> + throw({error, {invalid_encoding, E}}). + +check_key_encoding(E) when E==sext; E==term; E==raw -> + E; +check_key_encoding(E) -> + throw({error, {invalid_key_encoding, E}}). 
+ +check_value_encoding(raw, [_, _]) -> {value, raw}; +check_value_encoding({value, E} = V, [_, _]) when E==term; E==raw; E==sext -> V; +check_value_encoding({object, E} = V, _) when E==term; E==raw; E==sext -> V; +check_value_encoding(term, As) -> {val_encoding_type(As), term}; +check_value_encoding(sext, As) -> {val_encoding_type(As), sext}; +check_value_encoding(E, _) -> + throw({error, {invalid_value_encoding, E}}). + +val_encoding_type(Attrs) -> + case Attrs of + [_, _] -> value; + [_, _|_] -> object + end. + +valid_obj_type(#{encoding := Enc}, Obj) -> + case {Enc, Obj} of + {{binary, {value, binary}}, {_, K, V}} -> + is_binary(K) andalso is_binary(V); + {{binary, _}, _} -> + is_binary(element(2, Obj)); + {{_, {value, binary}}, {_, _, V}} -> + is_binary(V); + _ -> + %% No restrictions on object type + %% unless key and/or value typed to binary + true + end. + +valid_key_type(#{encoding := Enc}, Key) -> + case Enc of + {binary, _} when is_binary(Key) -> + true; + {binary, _} -> + false; + _ -> + true + end. + + +-spec encode_key(any()) -> binary(). +encode_key(Key) -> + encode(Key, sext). + +encode(Value, sext) -> + sext:encode(Value); +encode(Value, raw) when is_binary(Value) -> + Value; +encode(Value, term) -> + term_to_binary(Value). + + +encode_key(Key, #{encoding := {Enc,_}}) -> + encode(Key, Enc); +encode_key(Key, _) -> + encode(Key, sext). + +-spec decode_key(binary()) -> any(). +decode_key(CodedKey) -> + decode(CodedKey, sext). + +decode_key(CodedKey, #{encoding := {Enc, _}}) -> + decode(CodedKey, Enc); +decode_key(CodedKey, Enc) -> + decode(CodedKey, Enc). + +decode(Val, sext) -> + case sext:partial_decode(Val) of + {full, Result, _} -> + Result; + _ -> + error(badarg, Val) + end; +decode(Val, raw) -> + Val; +decode(Val, term) -> + binary_to_term(Val). + +-spec encode_val(any()) -> binary(). +encode_val(Val) -> + encode(Val, term). 
+ +encode_val(Val, Enc) when is_atom(Enc) -> + encode(Val, Enc); +encode_val(_, #{name := {_,index,_}}) -> + <<>>; +encode_val(Val, #{encoding := {_, Enc0}, attr_pos := AP}) -> + {Type, Enc} = enc_type(Enc0), + case {map_size(AP), Type} of + {2, value} -> + encode(element(3, Val), Enc); + {_, object} -> + encode(setelement(2, Val, []), Enc) + end. + +enc_type({T, _} = E) when T==value; T==object -> + E; +enc_type(E) when is_atom(E) -> + {object, E}. + +-spec decode_val(binary()) -> any(). +decode_val(CodedVal) -> + binary_to_term(CodedVal). + +decode_val(<<>>, K, #{name := {_,index,_}}) -> + {K}; +decode_val(CodedVal, Key, Ref) -> + {Type, Enc} = value_encoding(Ref), + case Type of + object -> + setelement(2, decode(CodedVal, Enc), Key); + value -> + make_rec(Key, decode(CodedVal, Enc), Ref) + end. + +make_rec(Key, _Val, #{name := {_, index, {_,ordered}}}) -> + {Key}; +make_rec(Key, Val, #{properties := #{record_name := Tag}}) -> + {Tag, Key, Val}; +make_rec(Key, Val, #{attr_pos := AP}) -> + %% no record name + case AP of + #{key := 1} -> {Key, Val}; + #{key := 2} -> {Val, Key} %% Yeah, right, but people are weird + end. + +value_encoding(#{encoding := {_, Enc}}) -> + enc_type(Enc); +value_encoding(#{}) -> + {object, term}; +value_encoding({Type, Enc} = E) when is_atom(Type), is_atom(Enc) -> + E. + +keypos({admin, _}) -> + 1; +keypos({_, index, _}) -> + 1; +keypos({_, retainer, _}) -> + 2; +keypos(Tab) when is_atom(Tab) -> + 2. + +%% ====================================================================== +%% Private functions +%% ====================================================================== + +retainername(Name) when is_atom(Name) -> + atom_to_list(Name); +retainername(Name) when is_list(Name) -> + try binary_to_list(list_to_binary(Name)) + catch + error:_ -> + lists:flatten(io_lib:write(Name)) + end; +retainername(Name) -> + lists:flatten(io_lib:write(Name)). 
+ +open_rocksdb(MPd, RdbOpts, CFs) -> + open_rocksdb(MPd, rocksdb_open_opts_(RdbOpts), CFs, get_retries()). + +%% Code adapted from basho/riak_kv_eleveldb_backend.erl +open_rocksdb(MPd, Opts, CFs, Retries) -> + open_db(MPd, Opts, CFs, max(1, Retries), undefined). + +open_db(_, _, _, 0, LastError) -> + {error, LastError}; +open_db(MPd, Opts, CFs, RetriesLeft, _) -> + case rocksdb:open_optimistic_transaction_db(MPd, Opts, CFs) of + {ok, _Ref, _CFRefs} = Ok -> + ?log(debug, "Open - Rocksdb: ~s (~p) -> ~p", [MPd, Opts, Ok]), + Ok; + %% Check specifically for lock error, this can be caused if + %% a crashed mnesia takes some time to flush rocksdb information + %% out to disk. The process is gone, but the NIF resource cleanup + %% may not have completed. + {error, {db_open, OpenErr}=Reason} -> + case lists:prefix("IO error: lock ", OpenErr) of + true -> + SleepFor = get_retry_delay(), + ?log(debug, ("Open - Rocksdb backend retrying ~p in ~p ms" + " after error ~s"), [MPd, SleepFor, OpenErr]), + timer:sleep(SleepFor), + open_db(MPd, Opts, CFs, RetriesLeft - 1, Reason); + false -> + {error, Reason} + end; + {error, Reason} -> + {error, Reason} + end. + +get_retries() -> 30. +get_retry_delay() -> 100. + +rocksdb_open_opts_(RdbOpts) -> + lists:foldl( + fun({K,_} = Item, Acc) -> + lists:keystore(K, 1, Acc, Item) + end, default_open_opts(), RdbOpts). + +default_open_opts() -> + [ {create_if_missing, true} + , {cache_size, + list_to_integer(get_env_default("ROCKSDB_CACHE_SIZE", "32212254"))} + , {block_size, 1024} + , {max_open_files, 30} + , {write_buffer_size, + list_to_integer(get_env_default( + "ROCKSDB_WRITE_BUFFER_SIZE", "4194304"))} + , {compression, + list_to_atom(get_env_default("ROCKSDB_COMPRESSION", "true"))} + , {use_bloomfilter, true} + ]. + +get_env_default(Key, Default) -> + case os:getenv(Key) of + false -> + Default; + Value -> + Value + end. 
diff --git a/src/mnesia_rocksdb_params.erl b/src/mnesia_rocksdb_params.erl index 19ade6e..1730b04 100644 --- a/src/mnesia_rocksdb_params.erl +++ b/src/mnesia_rocksdb_params.erl @@ -33,17 +33,12 @@ code_change/3]). -include("mnesia_rocksdb_tuning.hrl"). +-include("mnesia_rocksdb_int.hrl"). -define(KB, 1024). -define(MB, 1024 * 1024). -define(GB, 1024 * 1024 * 1024). --ifdef(DEBUG). --define(dbg(Fmt, Args), io:fwrite(user,"~p:~p: "++(Fmt),[?MODULE,?LINE|Args])). --else. --define(dbg(Fmt, Args), ok). --endif. - lookup(Tab, Default) -> try ets:lookup(?MODULE, Tab) of [{_, Params}] -> @@ -113,21 +108,21 @@ store_params(Params) -> NTabs = length(Params), Env0= mnesia_rocksdb_tuning:describe_env(), Env = Env0#tuning{n_tabs = NTabs}, - ?dbg("Env = ~p~n", [Env]), + ?log(debug, "Env = ~p~n", [Env]), TotalFiles = lists:sum([mnesia_rocksdb_tuning:max_files(Sz) || {_, Sz} <- Params]), - ?dbg("TotalFiles = ~p~n", [TotalFiles]), + ?log(debug, "TotalFiles = ~p~n", [TotalFiles]), MaxFs = Env#tuning.max_files, - ?dbg("MaxFs = ~p~n", [MaxFs]), + ?log(debug, "MaxFs = ~p~n", [MaxFs]), FsHeadroom = MaxFs * 0.6, - ?dbg("FsHeadroom = ~p~n", [FsHeadroom]), + ?log(debug, "FsHeadroom = ~p~n", [FsHeadroom]), FilesFactor = if TotalFiles =< FsHeadroom -> 1; % don't have to scale down true -> FsHeadroom / TotalFiles end, Env1 = Env#tuning{files_factor = FilesFactor}, - ?dbg("Env1 = ~p~n", [Env1]), + ?log(debug, "Env1 = ~p~n", [Env1]), lists:foreach( fun({Tab, Sz}) when is_atom(Tab); is_atom(element(1,Tab)), diff --git a/src/mrdb.erl b/src/mrdb.erl new file mode 100644 index 0000000..c390b5d --- /dev/null +++ b/src/mrdb.erl @@ -0,0 +1,1452 @@ +%% @doc Mid-level access API for Mnesia-managed rocksdb tables +%% +%% This module implements access functions for the mnesia_rocksdb +%% backend plugin. The functions are designed to also support +%% direct access to rocksdb with little overhead. Such direct +%% access will maintain existing indexes, but not support +%% replication. 
+%% +%% Each table has a metadata structure stored as a persistent +%% term for fast access. The structure of the metadata is as +%% follows: +%% +%% ``` +%% #{ name := +%% , db_ref := +%% , cf_handle := +%% , batch := +%% , tx_handle := +%% , attr_pos := #{AttrName := Pos} +%% , mode := +%% , properties := +%% , type := column_family | standalone +%% } +%% ''' +%% +%% Helper functions like `as_batch(Ref, fun(R) -> ... end)' and +%% `with_iterator(Ref, fun(I) -> ... end)' add some extra +%% convenience on top of the `rocksdb' API. +%% +%% Note that no automatic provision exists to manage concurrent +%% updates via mnesia AND direct access to this API. It's advisable +%% to use ONE primary mode of access. If replication is used, +%% the mnesia API will support this, but direct `mrdb' updates will +%% not be replicated. + +-module(mrdb). + +-export([ get_ref/1 + , ensure_ref/1 , ensure_ref/2 + , alias_of/1 + , new_tx/1 , new_tx/2 + , tx_ref/2 + , tx_commit/1 + , with_iterator/2, with_iterator/3 + , with_rdb_iterator/2, with_rdb_iterator/3 + , iterator_move/2 + , rdb_iterator_move/2 + , iterator/1 , iterator/2 + , iterator_close/1 + , read/2 , read/3 + , index_read/3 + , insert/2 , insert/3 + , delete/2 , delete/3 + , delete_object/2, delete_object/3 + , match_delete/2 + , batch_write/2 , batch_write/3 + , update_counter/3, update_counter/4 + , as_batch/2 , as_batch/3 + , get_batch/1 + , snapshot/1 + , release_snapshot/1 + , first/1 , first/2 + , next/2 , next/3 + , prev/2 , prev/3 + , last/1 , last/2 + , select/2 , select/3 + , select/1 + , fold/3 , fold/4 , fold/5 + , rdb_fold/4 , rdb_fold/5 + , write_info/3 + , read_info/2 + , read_info/1 + ]). + +-export([ activity/3 + , current_context/0]). + +-export([abort/1]). + +%% Low-level access wrappers. +-export([ rdb_put/3, rdb_put/4 + , rdb_get/2, rdb_get/3 + , rdb_delete/2, rdb_delete/3 + , rdb_iterator/1, rdb_iterator/2 ]). + +%% For use of trace_runner +-export([ patterns/0 ]). 
+ +-import(mnesia_rocksdb_lib, [ keypos/1 + , encode_key/1 + , encode_key/2 + , decode_key/2 + , encode_val/2 + , decode_val/3 ]). + +-export_type( [ mrdb_iterator/0 + , itr_handle/0 + , iterator_action/0 + , db_ref/0 + , ref_or_tab/0 + , index_position/0 + ]). + +-include("mnesia_rocksdb.hrl"). +-include("mnesia_rocksdb_int.hrl"). + +-type tab_name() :: atom(). +-type alias() :: atom(). +-type admin_tab() :: {admin, alias()}. +-type retainer() :: {tab_name(), retainer, any()}. +-type index() :: {tab_name(), index, any()}. +-type table() :: atom() + | admin_tab() + | index() + | retainer(). + +-type retries() :: non_neg_integer(). + +%% activity type 'ets' makes no sense in this context +-type mnesia_activity_type() :: transaction + | sync_transaction + | async_dirty + | sync_dirty. + +-type tx_options() :: #{ retries => retries() + , no_snapshot => boolean() }. +-type mrdb_activity_type() :: tx | {tx, tx_options()} | batch. + +-type activity_type() :: mrdb_activity_type() | mnesia_activity_type(). + +-type key() :: any(). +-type obj() :: tuple(). +-type index_position() :: atom() | pos(). + +-type db_handle() :: rocksdb:db_handle(). +-type cf_handle() :: rocksdb:cf_handle(). +-type tx_handle() :: rocksdb:transaction_handle(). +-type itr_handle() :: rocksdb:itr_handle(). +-type batch_handle() :: rocksdb:batch_handle(). + +-type pos() :: non_neg_integer(). + +-type properties() :: #{ record_name := atom() + , attributes := [atom()] + , index := [{pos(), bag | ordered}] + }. +-type semantics() :: bag | set. +-type key_encoding() :: 'raw' | 'sext' | 'term'. +-type val_encoding() :: {'value' | 'object', 'term' | 'raw'} + | 'raw'. +-type encoding() :: 'raw' | 'sext' | 'term' + | {key_encoding(), val_encoding()}. +-type attr_pos() :: #{atom() := pos()}. 
+ +-type db_ref() :: #{ name => table() + , alias => atom() + , vsn => non_neg_integer() + , db_ref := db_handle() + , cf_handle := cf_handle() + , semantics := semantics() + , encoding := encoding() + , attr_pos := attr_pos() + , type := column_family | standalone + , status := open | closed | pre_existing + , properties := properties() + , mode => mnesia + , ix_vals_f => fun( (tuple()) -> [any()] ) + , batch => batch_handle() + , tx_handle => tx_handle() + , _ => _}. + +-type error() :: {error, any()}. + +-type ref_or_tab() :: table() | db_ref(). + +%% ============================================================ +%% these types should be exported from rocksdb.erl +-type snapshot_handle() :: rocksdb:snapshot_handle(). +-type read_options() :: [{verify_checksums, boolean()} | + {fill_cache, boolean()} | + {iterate_upper_bound, binary()} | + {iterate_lower_bound, binary()} | + {tailing, boolean()} | + {total_order_seek, boolean()} | + {prefix_same_as_start, boolean()} | + {snapshot, snapshot_handle()}]. + +-type write_options() :: [{sync, boolean()} | + {disable_wal, boolean()} | + {ignore_missing_column_families, boolean()} | + {no_slowdown, boolean()} | + {low_pri, boolean()}]. + +-type iterator_action() :: first | last | next | prev | binary() + | {seek, binary()} | {seek_for_prev, binary()}. + +%% ============================================================ + +-record(mrdb_iter, { i :: itr_handle() + , ref :: db_ref() }). + +-type mrdb_iterator() :: #mrdb_iter{}. +%% @private +%% Used by `trace_runner' to set up trace patterns. +%% +patterns() -> + [{?MODULE, F, A, []} || {F, A} <- ?MODULE:module_info(exports), + F =/= module_info andalso + F =/= patterns]. + +%% @doc Create a snapshot of the database instance associated with the +%% table reference, table name or alias. +%% +%% Snapshots provide consistent read-only views over the entire state of the key-value store. +%% @end +-spec snapshot(alias() | ref_or_tab()) -> {ok, snapshot_handle()} | error(). 
+snapshot(Name) when is_atom(Name) -> + case mnesia_rocksdb_admin:get_ref(Name, error) of + error -> + snapshot(get_ref({admin, Name})); + Ref -> + snapshot(Ref) + end; +snapshot(#{db_ref := DbRef}) -> + rocksdb:snapshot(DbRef); +snapshot(_) -> + {error, unknown}. + +%% @doc release a snapshot created by {@link snapshot/1}. +-spec release_snapshot(snapshot_handle()) -> ok | error(). +release_snapshot(SHandle) -> + rocksdb:release_snapshot(SHandle). + +%% @doc Run an activity (similar to {@link //mnesia/mnesia:activity/2}). +%% +%% Supported activity types are: +%%
+%% <ul>
+%% <li>`transaction' - An optimistic `rocksdb' transaction</li>
+%% <li>`{tx, TxOpts}' - A `rocksdb' transaction with slight modifications</li>
+%% <li>`batch' - A `rocksdb' batch operation</li>
+%% </ul>
+%% +%% By default, transactions are combined with a snapshot with 1 retry. +%% The snapshot ensures that writes from concurrent transactions don't leak into the transaction context. +%% A transaction will be retried if it detects that the commit set conflicts with recent changes. +%% A mutex is used to ensure that only one of potentially conflicting `mrdb' transactions is run at a time. +%% The re-run transaction may still fail, if new transactions, or non-transaction writes interfere with +%% the commit set. It will then be re-run again, until the retry count is exhausted. +%% +%% Valid `TxOpts' are `#{no_snapshot => boolean(), retries => retries()}'. +%% +%% To simplify code adaptation, `tx | transaction | sync_transaction' are synonyms, and +%% `batch | async_dirty | sync_dirty' are synonyms. +%% @end +-spec activity(activity_type(), alias(), fun( () -> Res )) -> Res. +activity(Type, Alias, F) -> + #{db_ref := DbRef} = ensure_ref({admin, Alias}), + Ctxt = case tx_type(Type) of + {tx, TxOpts} -> + TxCtxt = new_tx_context(TxOpts, DbRef), + maps:merge( + #{ type => tx + , alias => Alias + , db_ref => DbRef }, TxCtxt); + batch -> + {ok, Batch} = rdb_batch(), + #{ type => batch + , alias => Alias + , db_ref => DbRef + , handle => Batch } + end, + do_activity(F, Alias, Ctxt, false). + +do_activity(F, Alias, Ctxt, WithLock) -> + try run_f(F, Ctxt, WithLock, Alias) of + Res -> + try commit_and_pop(Res) + catch + throw:{?MODULE, busy} -> + do_activity(F, Alias, Ctxt, true) + end + catch + Cat:Err when Cat==error; Cat==exit -> + abort_and_pop(Cat, Err) + end. + +run_f(F, Ctxt, false, _) -> + push_ctxt(Ctxt), + F(); +run_f(F, Ctxt, true, Alias) -> + mrdb_mutex:do( + Alias, + fun() -> + push_ctxt(incr_attempt(Ctxt)), + F() + end). 
+ +incr_attempt(#{ type := tx, db_ref := DbRef, attempt := A } = C) -> + {ok, TxH} = rdb_transaction(DbRef, []), + C1 = C#{ attempt := A+1, handle := TxH }, + case maps:is_key(snapshot, C) of + true -> + {ok, SH} = rocksdb:snapshot(DbRef), + C1#{snapshot := SH}; + false -> + C1 + end. + +ctxt() -> {?MODULE, ctxt}. + +push_ctxt(C) -> + K = ctxt(), + C1 = case get(K) of + undefined -> [C]; + C0 -> [C|C0] + end, + put(K, C1), + ok. + +pop_ctxt() -> + K = ctxt(), + case get(K) of + undefined -> error(no_ctxt); + [C] -> erase(K) , maybe_release_snapshot(C); + [H|T] -> put(K, T), maybe_release_snapshot(H) + end. + +maybe_release_snapshot(#{snapshot := SH} = C) -> + try rocksdb:release_snapshot(SH) + catch + error:_ -> + ok + end, + C; +maybe_release_snapshot(C) -> + C. + +current_context() -> + case get(ctxt()) of + [C|_] -> + C; + undefined -> + undefined + end. + +tx_type(T) -> + case T of + _ when T==batch; + T==async_dirty; + T==sync_dirty -> batch; + _ when T==tx; + T==transaction; + T==sync_transaction -> {tx, apply_tx_opts(#{})}; + {tx, Opts} when is_map(Opts) -> {tx, apply_tx_opts(Opts)}; + _ -> abort(invalid_activity_type) + end. + +default_tx_opts() -> + #{ retries => ?DEFAULT_RETRIES + , no_snapshot => false }. + +apply_tx_opts(Opts0) when is_map(Opts0) -> + check_tx_opts(maps:merge(default_tx_opts(), Opts0)). + +check_tx_opts(Opts) -> + check_retries(check_nosnap(Opts)). + +check_retries(#{retries := Retries} = Opts) -> + if is_integer(Retries), Retries >= 0 -> + Opts; + true -> + error({invalid_tx_option, {retries, Retries}}) + end. + +check_nosnap(#{no_snapshot := NoSnap} = Opts) -> + if is_boolean(NoSnap) -> Opts; + true -> error({invalid_tx_option, {no_snapshot, NoSnap}}) + end. + +new_tx_context(Opts, DbRef) -> + maybe_snapshot(create_tx(Opts, DbRef), DbRef). + +create_tx(Opts, DbRef) -> + {ok, TxH} = rdb_transaction(DbRef, []), + Opts#{handle => TxH, attempt => 1}. 
+ +maybe_snapshot(#{no_snapshot := NoSnap} = Opts, DbRef) -> + case NoSnap of + false -> + {ok, SH} = rocksdb:snapshot(DbRef), + Opts#{snapshot => SH}; + _ -> + Opts + end. + +commit_and_pop(Res) -> + #{type := Type, handle := H, db_ref := DbRef} = Ctxt = current_context(), + case Type of + tx -> + case rdb_transaction_commit_and_pop(H) of + ok -> + Res; + {error, {error, "Resource busy" ++ _ = Busy}} -> + case Ctxt of + #{retries := Retries, attempt := Att} + when Att =< Retries -> + throw({?MODULE, busy}); + _ -> + error({error, Busy}) + end; + {error, Reason} -> + error(Reason) + end; + batch -> + case rdb_write_batch_and_pop(DbRef, H) of + ok -> Res; + Other -> + Other + end + end. + +abort_and_pop(Cat, Err) -> + %% We can pop the context right away, since there is no + %% complex failure handling (like retry-on-busy) for rollback. + #{type := Type, handle := H} = pop_ctxt(), + case Type of + tx -> ok = rdb_transaction_rollback(H); + batch -> ok = rdb_release_batch(H) + end, + case Cat of + error -> error(Err); + exit -> exit(Err) + %% throw -> throw(Err) + end. + +rdb_transaction(DbRef, Opts) -> + rocksdb:transaction(DbRef, Opts). + +rdb_transaction_commit_and_pop(H) -> + try rdb_transaction_commit(H) + after + pop_ctxt() + end. + +rdb_transaction_commit(H) -> + rocksdb:transaction_commit(H). + +rdb_transaction_rollback(H) -> + rocksdb:transaction_rollback(H). + +rdb_batch() -> + rocksdb:batch(). + +rdb_write_batch_and_pop(DbRef, H) -> + %% TODO: derive write_opts(R) + try rocksdb:write_batch(DbRef, H, []) + after + pop_ctxt() + end. + +rdb_release_batch(H) -> + rocksdb:release_batch(H). + +%% @doc Aborts an ongoing {@link activity/2} +abort(Reason) -> + erlang:error({mrdb_abort, Reason}). + +-spec new_tx(table() | db_ref()) -> db_ref(). +new_tx(Tab) -> + new_tx(Tab, []). + +-spec new_tx(ref_or_tab(), write_options()) -> db_ref(). 
+new_tx(Tab, Opts) -> + #{db_ref := DbRef} = R = ensure_ref(Tab), + {ok, TxH} = rdb_transaction(DbRef, write_opts(R, Opts)), + R#{tx_handle => TxH}. + +-spec tx_ref(ref_or_tab() | db_ref() | db_ref(), tx_handle()) -> db_ref(). +tx_ref(Tab, TxH) -> + case ensure_ref(Tab) of + #{tx_handle := TxH} = R -> + R; + #{tx_handle := OtherTxH} -> + error({tx_handle_conflict, OtherTxH}); + R -> + R#{tx_handle => TxH} + end. + +-spec tx_commit(tx_handle() | db_ref()) -> ok. +tx_commit(#{tx_handle := TxH}) -> + rdb_transaction_commit(TxH); +tx_commit(TxH) -> + rdb_transaction_commit(TxH). + +-spec get_ref(table()) -> db_ref(). +get_ref(Tab) -> + mnesia_rocksdb_admin:get_ref(Tab). + +-spec ensure_ref(ref_or_tab()) -> db_ref(). +ensure_ref(Ref) when is_map(Ref) -> + Ref; +ensure_ref(Other) -> + maybe_tx_ctxt(get(ctxt()), get_ref(Other)). + +ensure_ref(Ref, R) when is_map(Ref) -> + inherit_ctxt(Ref, R); +ensure_ref(Other, R) -> + inherit_ctxt(get_ref(Other), R). + +maybe_tx_ctxt(undefined, R) -> R; +maybe_tx_ctxt(_, #{batch := _} = R) -> R; +maybe_tx_ctxt(_, #{tx_handle := _} = R) -> R; +maybe_tx_ctxt([#{type := Type, handle := H} = C|_], R) -> + case Type of + tx -> + maps:merge(maps:with([snapshot], C), R#{tx_handle => H}); + batch -> + R#{batch => H} + end. + +inherit_ctxt(Ref, R) -> + maps:merge(Ref, maps:with([batch, tx_handle], R)). + +-spec with_iterator(ref_or_tab(), fun( (mrdb_iterator()) -> Res )) -> Res. +with_iterator(Tab, Fun) -> + with_iterator(Tab, Fun, []). + +-spec with_iterator(ref_or_tab(), fun( (mrdb_iterator()) -> Res ), read_options()) -> Res. +with_iterator(Tab, Fun, Opts) -> + R = ensure_ref(Tab), + with_iterator_(R, Fun, Opts). + +-spec with_rdb_iterator(ref_or_tab(), fun( (itr_handle()) -> Res )) -> Res. +with_rdb_iterator(Tab, Fun) -> + with_rdb_iterator(Tab, Fun, []). + +-spec with_rdb_iterator(ref_or_tab(), fun( (itr_handle()) -> Res ), read_options()) -> Res. 
+with_rdb_iterator(Tab, Fun, Opts) when is_function(Fun, 1) -> + R = ensure_ref(Tab), + with_rdb_iterator_(R, Fun, read_opts(R, Opts)). + +with_iterator_(R, Fun, Opts) -> + {ok, I} = rdb_iterator_(R, Opts), + try Fun(#mrdb_iter{ i = I + , ref = R }) + after + rocksdb:iterator_close(I) + end. + +with_rdb_iterator_(Ref, Fun, ROpts) -> + {ok, I} = rdb_iterator_(Ref, ROpts), + try Fun(I) + after + rocksdb:iterator_close(I) + end. + +-spec iterator_move(mrdb_iterator(), iterator_action()) -> + {ok, tuple()} | {error, any()}. +iterator_move(#mrdb_iter{i = I, ref = Ref}, Dir) -> + case i_move(I, Dir) of + {ok, EncK, EncV} -> + K = decode_key(EncK, Ref), + Obj = decode_val(EncV, K, Ref), + {ok, Obj}; + Other -> + Other + end. + +-spec iterator(ref_or_tab()) -> {ok, mrdb_iterator()} | {error, _}. +iterator(Tab) -> + iterator(Tab, []). + +-spec iterator(ref_or_tab(), read_options()) -> {ok, mrdb_iterator()} | {error, _}. +iterator(Tab, Opts) -> + Ref = ensure_ref(Tab), + case rdb_iterator(Ref, Opts) of + {ok, I} -> + {ok, #mrdb_iter{ i = I + , ref = Ref }}; + Other -> + Other + end. + +-spec iterator_close(mrdb_iterator()) -> ok. +iterator_close(#mrdb_iter{i = I}) -> + rocksdb:iterator_close(I). + +rdb_iterator_move(I, Dir) -> + i_move(I, Dir). + +-spec insert(ref_or_tab(), obj()) -> ok. +insert(Tab, Obj) -> + insert(Tab, Obj, []). + +-spec insert(ref_or_tab(), obj(), write_options()) -> ok. +insert(Tab, Obj0, Opts) -> + #{name := Name} = Ref = ensure_ref(Tab), + Obj = validate_obj(Obj0, Ref), + Pos = keypos(Name), + Key = element(Pos, Obj), + EncVal = encode_val(Obj, Ref), + insert_(Ref, Key, encode_key(Key, Ref), EncVal, Obj, Opts). 
+ +validate_obj(Obj, #{mode := mnesia}) -> + Obj; +validate_obj(Obj, #{attr_pos := AP, + properties := #{record_name := RN}} = Ref) + when is_tuple(Obj) -> + Arity = map_size(AP) + 1, + case {element(1, Obj), tuple_size(Obj)} of + {RN, Arity} -> + validate_obj_type(Obj, Ref); + _ -> + abort(badarg) + end; +validate_obj({{_,_}} = Obj, #{name := {_,index,{_,ordered}}}) -> + Obj; +validate_obj(_, _) -> + abort(badarg). + +validate_obj_type(Obj, Ref) -> + case mnesia_rocksdb_lib:valid_obj_type(Ref, Obj) of + true -> Obj; + false -> + abort({bad_type, Obj}) + end. + +insert_(#{semantics := bag} = Ref, Key, EncKey, EncVal, Obj, Opts) -> + batch_if_index(Ref, insert, bag, fun insert_bag/5, Key, EncKey, EncVal, Obj, Opts); + %% insert_bag(Ref, Obj, Opts); +insert_(Ref, Key, EncKey, EncVal, Obj, Opts) -> + batch_if_index(Ref, insert, set, fun insert_set/5, Key, EncKey, EncVal, Obj, Opts). + %% insert_set(Ref, Obj, Opts). + +insert_set(Ref, EncKey, EncVal, _, Opts) -> + rdb_put(Ref, EncKey, EncVal, Opts). + +insert_bag(Ref, EncKey, EncVal, _, Opts) -> + %% case Ref of + %% #{vsn := 1} -> + insert_bag_v1(Ref, EncKey, EncVal, Opts). + +batch_if_index(#{mode := mnesia} = Ref, _, _, F, _Key, EncKey, Data, _Obj, Opts) -> + F(Ref, EncKey, Data, undefined, Opts); +batch_if_index(#{name := Name, properties := #{index := [_|_] = Ixs}} = Ref, + Op, SoB, F, Key, EncKey, Data, Obj, Opts) when is_atom(Name) -> + IxF = fun(R) -> + IxRes = update_index(Ixs, Op, SoB, Name, R, Key, Obj, Opts), + F(R, EncKey, Data, IxRes, Opts) + end, + as_batch(Ref, IxF, Opts); +batch_if_index(Ref, _, _, F, _, EncKey, Data, _, Opts) -> + F(Ref, EncKey, Data, undefined, Opts). 
+ +update_index(Ixs, insert, SoB, Name, R, Key, Obj, Opts) -> + update_index_i(Ixs, SoB, Name, R, Key, Obj, Opts); +update_index(Ixs, delete, set, Name, R, Key, _Obj, Opts) -> + update_index_d(Ixs, Name, R, Key, Opts); +update_index(Ixs, delete_obj, set, Name, R, Key, Obj, Opts) -> + update_index_do_set(Ixs, Name, R, Key, Obj, Opts); +update_index(Ixs, delete_obj, bag, Name, R, Key, Obj, Opts) -> + update_index_do_bag(Ixs, Name, R, Key, Obj, Opts). + +update_index_i([{_Pos,ordered} = I|Ixs], + SoB, Name, R, Key, Obj, Opts) -> + Tab = {Name, index, I}, + #{ix_vals_f := IxValsF} = IRef = ensure_ref(Tab, R), + EncVal = <<>>, + NewVals = IxValsF(Obj), + case SoB of + set -> + OldObjs = read(R, Key, Opts), + {Del, Put} = ix_vals_to_delete(OldObjs, IxValsF, NewVals), + [rdb_delete(IRef, encode_key({IxVal, Key}, IRef), Opts) + || IxVal <- Del], + [rdb_put(IRef, encode_key({IxVal, Key}, IRef), EncVal, Opts) + || IxVal <- Put]; + bag -> + [rdb_put(IRef, encode_key({IxVal, Key}, IRef), EncVal, Opts) + || IxVal <- NewVals] + end, + update_index_i(Ixs, SoB, Name, R, Key, Obj, Opts); +update_index_i([], _, _, _, _, _, _) -> + ok. + +update_index_d(Ixs, Name, R, Key, Opts) -> + Found = read(R, Key, Opts), + update_index_d_(Ixs, Name, R, Key, Found, Opts). + +update_index_d_([{_Pos,ordered} = I|Ixs], Name, R, Key, Found, Opts) -> + Tab = {Name, index, I}, + #{ix_vals_f := IxValsF} = IRef = ensure_ref(Tab, R), + IxVals = + lists:foldl( + fun(Obj, Acc) -> + ordsets:union(Acc, ordsets:from_list(IxValsF(Obj))) + end, ordsets:new(), Found), + [rdb_delete(IRef, encode_key({IxVal, Key}, IRef), Opts) || IxVal <- IxVals], + update_index_d_(Ixs, Name, R, Key, Found, Opts); +update_index_d_([], _, _, _, _, _) -> + undefined. + +update_index_do_set(Ixs, Name, R, Key, Obj, Opts) -> + EncKey = encode_key(Key, R), + case read_raw(R, EncKey, Key, Opts) of + [Obj] -> + update_index_do_set_(Ixs, Name, R, Key, Obj, Opts), + EncKey; + _ -> + not_found + end. 
+ +update_index_do_set_([{_Pos,ordered} = I|Ixs], Name, R, Key, Obj, Opts) -> + Tab = {Name, index, I}, + #{ix_vals_f := IxValsF} = IRef = ensure_ref(Tab, R), + IxVals = IxValsF(Obj), + [rdb_delete(IRef, encode_key({IxVal, Key}, IRef), Opts) || IxVal <- IxVals], + update_index_do_set_(Ixs, Name, R, Key, Obj, Opts); +update_index_do_set_([], _, _, _, _, _) -> + ok. + +%% TODO: make IxRefs for all ix positions, traverse the main tab once +update_index_do_bag(Ixs, Name, R, Key, Obj, Opts) -> + case read_bag_ret_raw_keys(R, Key, Opts) of + [] -> + not_found; + Found -> + case lists:keytake(Obj, 2, Found) of + {value, {RawK, _}, Rest} -> + update_index_do(Ixs, Name, R, Key, Obj, Rest, Opts), + RawK; + false -> + not_found + end + end. + +update_index_do([{_Pos,ordered} = Ix|Ixs], Name, R, Key, Obj, Rest, Opts) -> + Tab = {Name, index, Ix}, + #{ix_vals_f := IxValsF} = IRef = ensure_ref(Tab, R), + IxVals = IxValsF(Obj), + IxVals1 = lists:foldl(fun({_,O}, Acc) -> Acc -- IxValsF(O) end, IxVals, Rest), + [rdb_delete(IRef, encode_key({IxVal, Key}, IRef), Opts) || IxVal <- IxVals1], + update_index_do(Ixs, Name, R, Key, Obj, Rest, Opts); +update_index_do([], _, _, _, _, _, _) -> + ok. + +ix_vals_to_delete(OldObjs, IxValsF, NewVals) -> + ix_vals_to_delete(OldObjs, IxValsF, NewVals, []). + +ix_vals_to_delete([H|T], IxValsF, New, Del) -> + Del1 = lists:foldl( + fun(V, D) -> + case not lists:member(V, New) + andalso not lists:member(V, Del) of + true -> + [V|D]; + false -> + D + end + end, Del, IxValsF(H)), + ix_vals_to_delete(T, IxValsF, New, Del1); +ix_vals_to_delete([], _, New, Del) -> + {Del, New -- Del}. + + +read(Tab, Key) -> + read(Tab, Key, []). + +read(Tab, Key, Opts) -> + read_(ensure_ref(Tab), Key, Opts). + +read_(#{semantics := bag} = Ref, Key, Opts) -> + read_bag_(Ref, Key, Opts); +read_(Ref, Key, Opts) -> + read_raw(Ref, encode_key(Key, Ref), Key, read_opts(Ref, Opts)). 
+
+%% Read by pre-encoded key. Returns a (possibly empty) list of objects,
+%% aborting the surrounding mnesia transaction on a rocksdb error.
+read_raw(Ref, EncKey, Key, Opts) ->
+    case rdb_get(Ref, EncKey, read_opts(Ref, Opts)) of
+        not_found ->
+            [];
+        {ok, Bin} ->
+            Obj = decode_val(Bin, Key, Ref),
+            [Obj];
+        {error, _} = Error ->
+            mnesia:abort(Error)
+    end.
+
+read_bag_(Ref, Key, Opts) ->
+    read_bag_(Ref, Key, false, Opts).
+
+%% As read_bag_/3, but each object is returned as {RawKey, Obj}.
+read_bag_ret_raw_keys(Ref, Key, Opts) ->
+    read_bag_(Ref, Key, true, Opts).
+
+%% Bag reads iterate over all stored keys sharing the encoded-key prefix.
+read_bag_(#{name := Name} = Ref, Key, RetRaw, Opts) ->
+    Pos = keypos(Name),
+    Enc = encode_key(Key, Ref),
+    Sz = byte_size(Enc),
+    with_rdb_iterator_(
+      Ref, fun(I) ->
+                   read_bag_i_(Sz, Enc, i_move(I, Enc), Key, I, Pos, Ref, RetRaw)
+           end, read_opts(Ref, Opts)).
+
+read_bag_i_(Sz, Enc, {ok, Enc, _}, K, I, KP, Ref, RetRaw) ->
+    %% When exactly can this happen, and why skip? (this is from the old code)
+    read_bag_i_(Sz, Enc, i_move(I, next), K, I, KP, Ref, RetRaw);
+read_bag_i_(Sz, Enc, Res, K, I, KP, Ref, RetRaw) ->
+    case Res of
+        %% NOTE(review): restored binary pattern; extraction had garbled
+        %% the `<<...>>' prefix match to `<>'.
+        {ok, <<Enc:Sz/binary, _/binary>> = RawK, V} ->
+            Obj = decode_val(V, K, Ref),
+            [maybe_wrap(RetRaw, Obj, RawK) |
+             read_bag_i_(Sz, Enc, i_move(I, next), K, I, KP, Ref, RetRaw)];
+        _ ->
+            []
+    end.
+
+maybe_wrap(false, Obj, _   ) -> Obj;
+maybe_wrap(true , Obj, RawK) -> {RawK, Obj}.
+
+index_read(Tab, Val, Ix) ->
+    index_read_(ensure_ref(Tab), Val, Ix).
+
+%% Normalize the index position spec, then walk the index table by
+%% sext prefix, reading the indexed objects from the main table.
+index_read_(#{name := Main, semantics := Sem} = Ref, Val, Ix) ->
+    I = case Ix of
+            _ when is_atom(Ix) ->
+                {attr_pos(Ix, Ref), ordered};
+            {_} ->
+                Ix;
+            _ when is_integer(Ix) ->
+                {Ix, ordered}
+        end,
+    #{ix_vals_f := IxValF} = IxRef = ensure_ref({Main, index, I}),
+    IxValPfx = sext:prefix({Val,'_'}),
+    Sz = byte_size(IxValPfx),
+    %% Bag tables must additionally filter on the index value, since
+    %% several objects may share a key.
+    Fun = case Sem of
+              bag -> fun(It) ->
+                             index_read_i_bag(rocksdb:iterator_move(It, IxValPfx),
+                                              It, IxValPfx, Sz, Val, IxValF, Ref)
+                     end;
+              _ -> fun(It) ->
+                           index_read_i(rocksdb:iterator_move(It, IxValPfx),
+                                        It, IxValPfx, Sz, Ref)
+                   end
+          end,
+    with_rdb_iterator(IxRef, Fun).
+
+attr_pos(A, #{attr_pos := AP}) ->
+    maps:get(A, AP).
+
+%% Walk index entries while the iterator key still carries the value prefix.
+%% NOTE(review): restored `<<Pfx:Sz/binary, _/binary>>' patterns below;
+%% extraction had garbled them to `<>'.
+index_read_i({ok, K, _}, I, Pfx, Sz, Ref) ->
+    case K of
+        <<Pfx:Sz/binary, _/binary>> ->
+            {_, Key} = sext:decode(K),
+            read(Ref, Key, []) ++
+                index_read_i(rocksdb:iterator_move(I, next),
+                             I, Pfx, Sz, Ref);
+        _ ->
+            []
+    end;
+index_read_i({error, invalid_iterator}, _, _, _, _) ->
+    [].
+
+index_read_i_bag({ok, K, _}, I, Pfx, Sz, Val, ValsF, Ref) ->
+    case K of
+        <<Pfx:Sz/binary, _/binary>> ->
+            {_, Key} = sext:decode(K),
+            filter_objs(read(Ref, Key, []), Val, ValsF) ++
+                index_read_i_bag(rocksdb:iterator_move(I, next),
+                                 I, Pfx, Sz, Val, ValsF, Ref);
+        _ ->
+            []
+    end;
+index_read_i_bag({error, invalid_iterator}, _, _, _, _, _, _) ->
+    [].
+
+%% Keep only objects whose extracted index values contain Val.
+filter_objs([], _, _) ->
+    [];
+filter_objs([H|T], Val, ValsF) ->
+    case lists:member(Val, ValsF(H)) of
+        true  -> [H | filter_objs(T, Val, ValsF)];
+        false -> filter_objs(T, Val, ValsF)
+    end.
+
+%% @doc Returns the alias of a given table or table reference.
+-spec alias_of(ref_or_tab()) -> alias().
+alias_of(Tab) ->
+    #{alias := Alias} = ensure_ref(Tab),
+    Alias.
+
+%% @doc Creates a `rocksdb' batch context and executes the fun `F' in it.
+%%
+%% Rocksdb batches aren't tied to a specific DbRef until written.
+%% This can cause surprising problems if we're juggling multiple
+%% rocksdb instances (as we do if we have standalone tables).
+%% At the time of writing, all objects end up in the DbRef the batch
+%% is written to, albeit not necessarily in the intended column family.
+%% This will probably change, but no failure mode is really acceptable.
+%% The code below ensures that separate batches are created for each
+%% DbRef, under a unique reference stored in the pdict. When writing,
+%% all batches are written separately to the corresponding DbRef,
+%% and when releasing, all batches are released. This will not ensure
+%% atomicity, but there is no way in rocksdb to achieve atomicity
+%% across db instances. At least, data should end up where you expect.
+%% +%% @end +-spec as_batch(ref_or_tab(), fun( (db_ref()) -> Res )) -> Res. +as_batch(Tab, F) -> + as_batch(Tab, F, []). + +%% @doc as {@link as_batch/2}, but with the ability to pass `Opts' to `rocksdb:write_batch/2' +as_batch(Tab, F, Opts) when is_function(F, 1), is_list(Opts) -> + as_batch_(ensure_ref(Tab), F, Opts). + +as_batch_(#{batch := _} = R, F, _) -> + %% If already inside a batch, add to that batch (batches don't seem to nest) + F(R); +as_batch_(#{db_ref := DbRef} = R, F, Opts) -> + BatchRef = get_batch_(DbRef), + try F(R#{batch => BatchRef}) of + Res -> + case write_batches(BatchRef, write_opts(R, Opts)) of + ok -> + Res; + {error, Reason} -> + abort(Reason) + end + after + release_batches(BatchRef) + end. + + +get_batch(#{db_ref := DbRef, batch := BatchRef}) -> + try {ok, get_batch_(DbRef, BatchRef)} + catch + error:Reason -> + {error, Reason} + end; +get_batch(_) -> + {error, badarg}. + +get_batch_(DbRef) -> + Ref = make_ref(), + {ok, Batch} = rocksdb:batch(), + put({mrdb_batch, Ref}, #{DbRef => Batch}), + Ref. + +get_batch_(DbRef, BatchRef) -> + Key = {mrdb_batch, BatchRef}, + case get(Key) of + undefined -> + error(stale_batch_ref); + #{DbRef := Batch} -> + Batch; + Map -> + {ok, Batch} = rocksdb:batch(), + put(Key, Map#{DbRef => Batch}), + Batch + end. + +write_batches(BatchRef, Opts) -> + case get({mrdb_batch, BatchRef}) of + undefined -> + error(stale_batch_ref); + Map -> + %% Some added complication since we deal with potentially + %% multiple DbRefs, and will want to return errors. + ret_batch_write_acc( + maps:fold( + fun(DbRef, Batch, Acc) -> + case rocksdb:write_batch(DbRef, Batch, Opts) of + ok -> + Acc; + {error,E} -> + acc_batch_write_error(E, DbRef, Acc) + end + end, ok, Map)) + end. + +ret_batch_write_acc(ok) -> + ok; +ret_batch_write_acc(Es) when is_list(Es) -> + {error, lists:reverse(Es)}. + +acc_batch_write_error(E, DbRef, ok) -> + [{DbRef, E}]; + +acc_batch_write_error(E, DbRef, Es) when is_list(Es) -> + [{DbRef, E}|Es]. 
+
+%% Release every per-DbRef batch registered under BatchRef and remove the
+%% pdict entry. Counterpart of get_batch_/1 / write_batches/2.
+release_batches(BatchRef) ->
+    case get({mrdb_batch, BatchRef}) of
+        undefined ->
+            error(stale_batch_ref);
+        Map ->
+            maps_foreach(
+              fun(_, Batch) ->
+                      rocksdb:release_batch(Batch)
+              end, Map),
+            %% BUGFIX: the entry is stored under {mrdb_batch, BatchRef}
+            %% (see get_batch_/1); `erase(BatchRef)' erased the wrong key
+            %% and leaked the batch map in the process dictionary.
+            erase({mrdb_batch, BatchRef}),
+            ok
+    end.
+
+%% maps:foreach/2 doesn't exist in OTP 22 ...
+maps_foreach(F, Map) ->
+    I = maps:iterator(Map),
+    maps_foreach_(F, maps:next(I)).
+
+maps_foreach_(F, {K, V, I}) ->
+    F(K, V),
+    maps_foreach_(F, maps:next(I));
+maps_foreach_(_, none) ->
+    ok.
+
+%% Corresponding to `rocksdb:write()`, renamed to avoid confusion with
+%% `mnesia:write()`, which has entirely different semantics.
+%%
+batch_write(Tab, L) when is_list(L) ->
+    batch_write(Tab, L, []).
+
+batch_write(Tab, L, Opts) ->
+    batch_write_(ensure_ref(Tab), L, Opts).
+
+batch_write_(Ref, L, Opts) ->
+    as_batch(Ref, fun(R) -> write_as_batch(R, L) end, Opts).
+
+%% L is a list of {put, K, V} | {delete, K} with pre-encoded keys/values.
+write_as_batch(Ref, L) ->
+    lists:foreach(
+      fun({put, K, V}) ->
+              rdb_put(Ref, K, V, []);
+         ({delete, K}) ->
+              rdb_delete(Ref, K, [])
+      end, L).
+
+update_counter(Tab, C, Val) ->
+    update_counter(Tab, C, Val, []).
+
+%% Counters are only supported for term-encoded single-value tables,
+%% implemented via a rocksdb merge operator.
+update_counter(Tab, C, Val, Opts) ->
+    Ref = ensure_ref(Tab),
+    case Ref of
+        #{encoding := {_, {value, term}}} ->
+            update_counter_(Ref, encode_key(C, Ref), Val, Opts);
+        _ ->
+            abort(badarg)
+    end.
+
+update_counter_(Ref, EncKey, Val, Opts) ->
+    rdb_merge(Ref, EncKey, {int_add, Val}, Opts).
+
+-spec delete(ref_or_tab(), key()) -> ok.
+delete(Tab, Key) ->
+    delete(Tab, Key, []).
+
+-spec delete(ref_or_tab(), key(), write_options()) -> ok.
+delete(Tab, Key, Opts) ->
+    Ref = ensure_ref(Tab),
+    delete_(Ref, Key, encode_key(Key, Ref), Opts).
+
+delete_(#{semantics := bag} = Ref, Key, EncKey, Opts) ->
+    batch_if_index(Ref, delete, bag, fun delete_bag/5, Key, EncKey, [], [], Opts);
+delete_(Ref, Key, EncKey, Opts) ->
+    batch_if_index(Ref, delete, set, fun delete_set/5, Key, EncKey, [], [], Opts).
+
+delete_object(Tab, Obj) ->
+    delete_object(Tab, Obj, []).
+ +delete_object(Tab, Obj, Opts) -> + #{name := Name} = Ref = ensure_ref(Tab), + Key = element(keypos(Name), Obj), + EncKey = encode_key(Key, Ref), + delete_object_(Ref, Key, EncKey, Obj, Opts). + +delete_object_(#{semantics := bag} = Ref, Key, EncKey, Obj, Opts) -> + batch_if_index(Ref, delete_obj, bag, fun delete_obj_bag/5, Key, + EncKey, Obj, Obj, Opts); +delete_object_(Ref, Key, EncKey, Obj, Opts) -> + batch_if_index(Ref, delete_obj, set, fun delete_obj_set/5, Key, + EncKey, Obj, Obj, Opts). + +-spec first(ref_or_tab()) -> key() | '$end_of_table'. +first(Tab) -> + first(Tab, []). + +-spec first(ref_or_tab(), read_options()) -> key() | '$end_of_table'. +first(Tab, Opts) -> + Ref = ensure_ref(Tab), + with_rdb_iterator(Ref, fun(I) -> i_first(I, Ref) end, Opts). + +-spec last(ref_or_tab()) -> key() | '$end_of_table'. +last(Tab) -> + last(Tab, []). + +-spec last(ref_or_tab(), read_options()) -> key() | '$end_of_table'. +last(Tab, Opts) -> + Ref = ensure_ref(Tab), + with_rdb_iterator(Ref, fun(I) -> i_last(I, Ref) end, Opts). + +-spec next(ref_or_tab(), key()) -> key() | '$end_of_table'. +next(Tab, K) -> + next(Tab, K, []). + +-spec next(ref_or_tab(), key(), read_options()) -> key() | '$end_of_table'. +next(Tab, K, Opts) -> + Ref = ensure_ref(Tab), + EncKey = encode_key(K, Ref), + with_rdb_iterator(Ref, fun(I) -> i_next(I, EncKey, Ref) end, Opts). + +-spec prev(ref_or_tab(), key()) -> key() | '$end_of_table'. +prev(Tab, K) -> + prev(Tab, K, []). + +-spec prev(ref_or_tab(), key(), read_options()) -> key() | '$end_of_table'. +prev(Tab, K, Opts) -> + Ref = ensure_ref(Tab), + EncKey = encode_key(K, Ref), + with_rdb_iterator(Ref, fun(I) -> i_prev(I, EncKey, Ref) end, Opts). + +select(Tab, Pat) -> + select(Tab, Pat, infinity). + +select(Tab, Pat, Limit) when Limit == infinity; is_integer(Limit), Limit > 0 -> + true = valid_limit(Limit), + mrdb_select:select(ensure_ref(Tab), Pat, Limit). + +select(Cont) -> + mrdb_select:select(Cont). 
+ +match_delete(Tab, Pat) -> + Ref = ensure_ref(Tab), + MatchSpec = [{Pat, [], [true]}], + as_batch(Ref, fun(R) -> + %% call select() with AccKeys=true, returning [{Key, _}] + match_delete_(mrdb_select:select(Ref, MatchSpec, true, 30), R) + end). + +match_delete_({L, Cont}, Ref) -> + [rdb_delete(Ref, K, []) || {K,_} <- L], + match_delete_(select(Cont), Ref); +match_delete_('$end_of_table', _) -> + ok. + +fold(Tab, Fun, Acc) -> + fold(Tab, Fun, Acc, [{'_', [], ['$_']}]). + +fold(Tab, Fun, Acc, MatchSpec) -> + fold(Tab, Fun, Acc, MatchSpec, infinity). + +fold(Tab, Fun, Acc, MatchSpec, Limit) -> + true = valid_limit(Limit), + mrdb_select:fold(ensure_ref(Tab), Fun, Acc, MatchSpec, Limit). + +rdb_fold(Tab, Fun, Acc, Prefix) when is_function(Fun, 3) + , is_binary(Prefix) -> + rdb_fold(Tab, Fun, Acc, Prefix, infinity). + +rdb_fold(Tab, Fun, Acc, Prefix, Limit) when is_function(Fun, 3) + , is_binary(Prefix) -> + true = valid_limit(Limit), + mrdb_select:rdb_fold(ensure_ref(Tab), Fun, Acc, Prefix, Limit). + +valid_limit(L) -> + case L of + infinity -> + true; + _ when is_integer(L), L > 0 -> + true; + _ -> + abort(badarg) + end. + +write_info(Tab, K, V) -> + R = ensure_ref(Tab), + Alias = case R of + #{type := standalone, vsn := 1, alias := A} = TRef -> + %% Also write on legacy info format + write_info_standalone(TRef, K, V), + A; + #{alias := A} -> + A + end, + write_info_(ensure_ref({admin, Alias}), Tab, K, V). + +write_info_(#{} = R, Tab, K, V) -> + EncK = encode_key({info,Tab,K}, sext), + EncV = term_to_binary(V), + rdb_put(R, EncK, EncV, write_opts(R, [])). + +read_info(Tab) -> + mnesia_rocksdb_admin:read_info(ensure_ref(Tab)). + +read_info(Tab, K) -> + read_info(Tab, K, undefined). 
+ +read_info(Tab, K, Default) when K==size; K==memory -> + read_direct_info_(ensure_ref(Tab), K, Default); +read_info(Tab, K, Default) -> + #{alias := Alias} = R = ensure_ref(Tab), + case R of + #{type := standalone, vsn := 1} = TRef -> + read_info_standalone(TRef, K, Default); + #{alias := Alias} -> + mnesia_rocksdb_admin:read_info(Alias, Tab, K, Default) + end. + +read_direct_info_(R, memory, _Def) -> + get_property(R, <<"rocksdb.total-sst-files-size">>, integer, 0); +read_direct_info_(R, size, _Def) -> + get_property(R, <<"rocksdb.estimate-num-keys">>, integer, 0). + +-dialyzer({nowarn_function, get_property/4}). +get_property(#{db_ref := R, cf_handle := CfH}, Prop, Type, Default) -> + case rocksdb:get_property(R, CfH, Prop) of + {error, _} -> + Default; + {ok, Res} -> + case Type of +%% boolean -> rocksdb_boolean(Res); +%% string -> Res; + %% get_property/3 is incorrectly typed as returning string() + integer -> binary_to_integer(Res) + end + end. + +%%rocksdb_boolean(<<"1">>) -> true; +%%rocksdb_boolean(<<"0">>) -> false. + +write_info_standalone(#{} = R, K, V) -> + EncK = <>, + EncV = term_to_binary(V), + rdb_put(R, EncK, EncV, write_opts(R, [])). + +read_info_standalone(#{} = R, K, Default) -> + EncK = <>, + get_info_res(rdb_get(R, EncK, read_opts(R, [])), Default). + +get_info_res(Res, Default) -> + case Res of + not_found -> + Default; + {ok, Bin} -> + %% no fancy tricks when encoding/decoding info values + binary_to_term(Bin); + {error, E} -> + error(E) + end. + +%% insert_bag_v2(Ref, K, V, Opts) -> +%% rdb_merge(Ref, K, {list_append, [V]} + +insert_bag_v1(Ref, K, V, Opts) -> + KSz = byte_size(K), + with_rdb_iterator( + Ref, fun(I) -> + do_insert_bag_v1(KSz, K, i_move(I, K), I, V, 0, Ref, Opts) + end). + +do_insert_bag_v1(Sz, K, Res, I, V, Prev, Ref, Opts) -> + case Res of + {ok, <>, V} -> + %% object exists + ok; + {ok, <>, _} -> + do_insert_bag_v1(Sz, K, i_move(I, next), I, V, N, Ref, Opts); + _ -> + Key = <>, + rdb_put(Ref, Key, V, Opts) + end. 
+ +delete_set(Ref, EncKey, _Data, _IxRes, Opts) -> + rdb_delete(Ref, EncKey, Opts). + +delete_bag(_, _, _, not_found, _) -> + %% Indexing function already tried reading the object, and didn't find it + ok; +delete_bag(Ref, _, _, RawKey, Opts) when is_binary(RawKey) -> + rdb_delete(Ref, RawKey, Opts); +delete_bag(Ref, EncKey, _, _, Opts) -> + Sz = byte_size(EncKey), + Found = with_rdb_iterator( + Ref, fun(I) -> + do_delete_bag_(Sz, EncKey, i_move(I, EncKey), Ref, I) + end), + case Found of + [] -> + ok; + _ -> + batch_write(Ref, [{delete, K} || K <- Found], Opts) + end. + +delete_obj_bag(_, _, _, not_found, _) -> + ok; +delete_obj_bag(Ref, _EncKey, _Obj, RawKey, Opts) when is_binary(RawKey) -> + rdb_delete(Ref, RawKey, Opts); +delete_obj_bag(Ref, EncKey, Obj, _, Opts) -> + Sz = byte_size(EncKey), + with_rdb_iterator( + Ref, fun(I) -> + do_del_obj_bag_(Sz, EncKey, i_move(I, EncKey), Obj, Ref, I, Opts) + end). + +do_delete_bag_(Sz, K, Res, Ref, I) -> + case Res of + {ok, K, _} -> + do_delete_bag_(Sz, K, i_move(I, next), Ref, I); + {ok, <> = Key, _} -> + [Key | do_delete_bag_(Sz, K, i_move(I, next), Ref, I)]; + _ -> + [] + end. + +do_del_obj_bag_(Sz, K, Res, Obj, #{name := Name} = Ref, I, Opts) -> + case Res of + {ok, <> = RawKey, V} -> + Key = element(keypos(Name), Obj), + case decode_val(V, Key, Ref) of + Obj -> + rdb_delete(Ref, RawKey, Opts); + _ -> + do_del_obj_bag_(Sz, K, i_move(I, next), Obj, Ref, I, Opts) + end; + _ -> + ok + end. + +delete_obj_set(_, _, _, not_found, _) -> + ok; +delete_obj_set(Ref, _, _, RawKey, Opts) when is_binary(RawKey) -> + rdb_delete(Ref, RawKey, Opts); +delete_obj_set(#{name := Name} = Ref, EncKey, Obj, _, Opts) -> + case rdb_get(Ref, EncKey, []) of + {ok, Bin} -> + Key = element(keypos(Name), Obj), + case decode_val(Bin, Key, Ref) of + Obj -> + rdb_delete(Ref, EncKey, Opts); + _ -> + ok + end; + _ -> + ok + end. + +rdb_put(R, K, V) -> rdb_put(R, K, V, []). +rdb_put(R, K, V, Opts) -> + rdb_put_(R, K, V, write_opts(R, Opts)). 
+ +rdb_put_(#{batch := BatchRef, + db_ref := DbRef, + cf_handle := CfH}, K, V, _Opts) -> + Batch = get_batch_(DbRef, BatchRef), + rocksdb:batch_put(Batch, CfH, K, V); +rdb_put_(#{tx_handle := TxH, cf_handle := CfH}, K, V, _Opts) -> + rocksdb:transaction_put(TxH, CfH, K, V); +rdb_put_(#{db_ref := DbRef, cf_handle := CfH}, K, V, WOpts) -> + rocksdb:put(DbRef, CfH, K, V, WOpts). + +rdb_get(R, K) -> rdb_get(R, K, []). +rdb_get(R, K, Opts) -> + rdb_get_(R, K, read_opts(R, Opts)). + +rdb_get_(#{tx_handle := TxH, cf_handle := CfH, snapshot := SH}, K, _Opts) -> + rocksdb:transaction_get(TxH, CfH, K, [{snapshot, SH}]); +rdb_get_(#{tx_handle := TxH, cf_handle := CfH}, K, _Opts) -> + rocksdb:transaction_get(TxH, CfH, K, []); +rdb_get_(#{db_ref := DbRef, cf_handle := CfH}, K, ROpts) -> + rocksdb:get(DbRef, CfH, K, ROpts). + +rdb_delete(R, K) -> rdb_delete(R, K, []). +rdb_delete(R, K, Opts) -> + rdb_delete_(R, K, write_opts(R, Opts)). + +rdb_delete_(#{batch := BatchRef, + db_ref := DbRef, + cf_handle := CfH}, K, _Opts) -> + Batch = get_batch_(DbRef, BatchRef), + rocksdb:batch_delete(Batch, CfH, K); +rdb_delete_(#{tx_handle := TxH, cf_handle := CfH}, K, _Opts) -> + rocksdb:transaction_delete(TxH, CfH, K); +rdb_delete_(#{db_ref := DbRef, cf_handle := CfH}, K, WOpts) -> + rocksdb:delete(DbRef, CfH, K, WOpts). + +rdb_iterator(R) -> rdb_iterator(R, []). +rdb_iterator(R, Opts) -> + rdb_iterator_(R, read_opts(R, Opts)). + +rdb_iterator_(#{db_ref := DbRef, tx_handle := TxH, cf_handle := CfH}, ROpts) -> + rocksdb:transaction_iterator(DbRef, TxH, CfH, ROpts); +rdb_iterator_(#{db_ref := DbRef, cf_handle := CfH}, ROpts) -> + rocksdb:iterator(DbRef, CfH, ROpts). + +rdb_merge(R, K, Op, Opts) -> + rdb_merge_(R, K, term_to_binary(Op), write_opts(R, Opts)). + +rdb_merge_(#{db_ref := DbRef, cf_handle := CfH}, K, Op, WOpts) -> + rocksdb:merge(DbRef, CfH, K, Op, WOpts). + +write_opts(#{write_opts := Os}, Opts) -> Os ++ Opts; +write_opts(_, Opts) -> + Opts. 
+ +read_opts(#{read_opts := Os}, Opts) -> Os ++ Opts; +read_opts(_, Opts) -> + Opts. + +-define(EOT, '$end_of_table'). + +i_first(I, Ref) -> + case i_move(I, first) of + {ok, First, _} -> + decode_key(First, Ref); + _ -> + ?EOT + end. + +i_last(I, Ref) -> + case i_move(I, last) of + {ok, Last, _} -> + decode_key(Last, Ref); + _ -> + ?EOT + end. + +i_move(I, Where) -> + rocksdb:iterator_move(I, Where). + +i_next(I, Key, Ref) -> + i_move_to_next(i_move(I, Key), I, Key, Ref). + +i_prev(I, Key, Ref) -> + case i_move(I, Key) of + {ok, _, _} -> + i_move_to_prev(i_move(I, prev), I, Key, Ref); + {error, invalid_iterator} -> + i_move_to_prev(i_move(I, last), I, Key, Ref) + end. + +i_move_to_next({ok, Key, _}, I, Key, Ref) -> + i_move_to_next(i_move(I, next), I, Key, Ref); +i_move_to_next({ok, NextKey, _}, _, _, Ref) -> + decode_key(NextKey, Ref); +i_move_to_next(_, _, _, _) -> + ?EOT. + +i_move_to_prev({ok, K, _}, _I, Key, Ref) when K < Key -> + decode_key(K, Ref); +i_move_to_prev({ok, _, _}, I, Key, Ref) -> + i_move_to_prev(i_move(I, prev), I, Key, Ref); +i_move_to_prev(_, _, _, _) -> + ?EOT. diff --git a/src/mrdb_index.erl b/src/mrdb_index.erl new file mode 100644 index 0000000..bf42842 --- /dev/null +++ b/src/mrdb_index.erl @@ -0,0 +1,191 @@ +-module(mrdb_index). + +-export([ + with_iterator/3 + , iterator_move/2 + , iterator/2 + , iterator_close/1 + ]). + +-record(mrdb_ix_iter, { i :: mrdb:iterator() + , type = set :: set | bag + , sub :: mrdb:ref() | pid() + }). + +-type ix_iterator() :: #mrdb_ix_iter{}. +-type index_value() :: any(). +-type iterator_action() :: mrdb:iterator_action(). + +-type object() :: tuple(). + +-record(subst, { i :: mrdb:iterator() + , vals_f + , cur + , mref }). + +-define(TIMEOUT, 5000). + +-import(mnesia_rocksdb_lib, [ encode_key/2 ]). + +-export_type([ ix_iterator/0 ]). + +-spec with_iterator(mrdb:ref_or_tab(), mrdb:index_position(), fun( (ix_iterator()) -> Res)) -> Res. 
+with_iterator(Tab, IxPos, Fun) when is_function(Fun, 1) -> + {ok, I} = iterator(Tab, IxPos), + try Fun(I) + after + iterator_close(I) + end. + +-spec iterator(mrdb:ref_or_tab(), mrdb:index_position()) -> {ok, ix_iterator()} + | {error, _}. +iterator(Tab, IxPos) -> + #{semantics := Sem} = R = mrdb:ensure_ref(Tab), + #{ix_vals_f := IxValsF} = IxR = ensure_index_ref(IxPos, R), + case mrdb:iterator(IxR, []) of + {ok, I} -> + case Sem of + bag -> + P = sub_new(R, IxValsF), + {ok, #mrdb_ix_iter{ i = I + , sub = P }}; + _ -> + {ok, #mrdb_ix_iter{i = I, sub = R}} + end; + Err -> + Err + end. + +-spec iterator_move(ix_iterator(), iterator_action()) -> {ok, index_value(), object()} + | {error, _}. +iterator_move(#mrdb_ix_iter{type = set} = IxI, Dir) -> iterator_move_set(IxI, Dir); +iterator_move(#mrdb_ix_iter{type = bag} = IxI, Dir) -> iterator_move_bag(IxI, Dir). + +iterator_move_set(#mrdb_ix_iter{i = I, sub = Sub}, Dir) -> + case mrdb:iterator_move(I, Dir) of + {ok, {{FKey, PKey}}} -> + {ok, FKey, opt_read(Sub, PKey)}; + Other -> + Other + end. + +iterator_move_bag(#mrdb_ix_iter{i = I, sub = Sub}, Dir) -> + case call_sub(Sub, {move_rel, Dir}) of + not_found -> + case mrdb:iterator_move(I, Dir) of + {ok, {FKey, PKey}} -> + call_sub(Sub, {move_abs, FKey, PKey}); + Other -> + Other + end; + Other -> + Other + end. + +opt_read(R, Key) -> + case mrdb:read(R, Key, []) of + [Obj] -> + Obj; + [] -> + [] + end. + +sub_new(R, ValsF) when is_function(ValsF, 1) -> + Me = self(), + {Pid, MRef} = spawn_monitor( + fun() -> + MRef = monitor(process, Me), + case mrdb:iterator(R) of + {ok, I} -> + Me ! {self(), ok}, + sub_loop(#subst{ mref = MRef + , i = I + , vals_f = ValsF + , cur = undefined}); + Error -> + Me ! {self(), Error} + end + end), + receive + {'DOWN', MRef, _, _, Crash} -> + mrdb:abort({error, Crash}); + {Pid, ok} -> + demonitor(MRef), + Pid; + {Pid, Error} -> + demonitor(MRef), + mrdb:abort(Error) + end. 
+ +sub_loop(#subst{i = I, mref = MRef} = St) -> + receive + {'DOWN', MRef, _, _, _} -> + mrdb:iterator_close(I); + {Pid, Ref, close} -> + mrdb:iterator_close(I), + Pid ! {Ref, ok}; + {Pid, Ref, cur} -> + Pid ! {Ref, St#subst.cur}, + sub_loop(St); + {Pid, Ref, {move, Cur, Dir}} when is_binary(Dir) -> + {Res, St1} = sub_abs_move(Cur, Dir, St), + Pid ! {Ref, Res}, + sub_loop(St1); + {Pid, Ref, {move_rel, Dir}} -> + {Res, St1} = sub_rel_move(Dir, St), + Pid ! {Ref, Res}, + sub_loop(St1) + end. + +sub_abs_move(Cur, Dir, #subst{i = I} = St) -> + case mrdb:iterator_move(I, Dir) of + {ok, _} = Ok -> + {Ok, St#subst{cur = Cur}}; + Other -> + {Other, St#subst{cur = undefined}} + end. + +sub_rel_move(Dir, #subst{i = I, vals_f = VF, cur = Prev} = St) -> + case mrdb:iterator_move(I, Dir) of + {ok, Obj} = Ok -> + case lists:member(Prev, VF(Obj)) of + true -> + {Ok, St}; + false -> + {not_found, St#subst{cur = undefined}} + end; + Other -> + {Other, St#subst{cur = undefined}} + end. + +call_sub(Pid, Req) -> + MRef = monitor(process, Pid), + Pid ! {self(), MRef, Req}, + receive + {MRef, Reply} -> + demonitor(MRef), + Reply; + {'DOWN', MRef, _, _, Reason} -> + error(Reason) + after ?TIMEOUT -> + error(timeout) + end. + +-spec iterator_close(ix_iterator()) -> ok. +iterator_close(#mrdb_ix_iter{i = I, sub = Sub}) -> + mrdb:iterator_close(I), + iterator_close_sub(Sub). + +iterator_close_sub(P) when is_pid(P) -> + call_sub(P, close); +iterator_close_sub(_) -> + ok. + +ensure_index_ref(IxPos, #{name := Name, attr_pos := AP, properties := #{index := Ixs}}) -> + {_,ordered} = Ix = lists:keyfind(index_pos(IxPos, AP), 1, Ixs), + mrdb:get_ref({Name, index, Ix}). + +index_pos(P, AP) when is_atom(P) -> + maps:get(P, AP); +index_pos(P, _) -> + P. diff --git a/src/mrdb_mutex.erl b/src/mrdb_mutex.erl new file mode 100644 index 0000000..98eae78 --- /dev/null +++ b/src/mrdb_mutex.erl @@ -0,0 +1,80 @@ +-module(mrdb_mutex). + +-export([ do/2 ]). + +-export([ ensure_tab/0 ]). 
+
+-define(LOCK_TAB, ?MODULE).
+
+%% We use a wrapping ets counter (default: 0) as a form of semaphore.
+%% The claim operation is done using an atomic list of two updates:
+%% first, incrementing with 0 - this returns the previous value
+%% then, incrementing with 1, but wrapping at 1, ensuring that we get 1 back,
+%% regardless of previous value. This means that if [0,1] is returned, the resource
+%% was not locked previously; if [1,1] is returned, it was.
+%%
+%% Releasing the resource is done by deleting the resource. If we just decrement,
+%% we will end up with lingering unlocked resources, so we might as well delete.
+%% Either operation is atomic, and the claim op creates the object if it's missing.
+
+do(Rsrc, F) when is_function(F, 0) ->
+    true = claim(Rsrc),
+    try F()
+    after
+        release(Rsrc)
+    end.
+
+claim(Rsrc) ->
+    case claim_(Rsrc) of
+        true -> true;
+        false -> busy_wait(Rsrc, 1000)
+    end.
+
+claim_(Rsrc) ->
+    case ets:update_counter(?LOCK_TAB, Rsrc, [{2, 0},
+                                              {2, 1, 1, 1}], {Rsrc, 0}) of
+        [0, 1] ->
+            %% have lock
+            true;
+        [1, 1] ->
+            false
+    end.
+
+%% The busy-wait function makes use of the fact that we can read a timer to find out
+%% if it still has time remaining. This reduces the need for selective receive, looking
+%% for a timeout message. We yield, then retry the claim op. Yielding at least used to
+%% also be necessary for the `read_timer/1` value to refresh.
+%%
+busy_wait(Rsrc, Timeout) ->
+    Ref = erlang:send_after(Timeout, self(), {claim, Rsrc}),
+    do_wait(Rsrc, Ref).
+
+do_wait(Rsrc, Ref) ->
+    erlang:yield(),
+    case erlang:read_timer(Ref) of
+        false ->
+            erlang:cancel_timer(Ref),
+            error(lock_wait_timeout);
+        _ ->
+            case claim_(Rsrc) of
+                true ->
+                    erlang:cancel_timer(Ref),
+                    ok;
+                false ->
+                    do_wait(Rsrc, Ref)
+            end
+    end.
+
+release(Rsrc) ->
+    ets:delete(?LOCK_TAB, Rsrc),
+    ok.
+
+
+%% Called by the process holding the ets table.
+ensure_tab() -> + case ets:info(?LOCK_TAB, name) of + undefined -> + ets:new(?LOCK_TAB, [set, public, named_table, {write_concurrency, true}]); + _ -> + true + end. diff --git a/src/mrdb_select.erl b/src/mrdb_select.erl new file mode 100644 index 0000000..30cb1eb --- /dev/null +++ b/src/mrdb_select.erl @@ -0,0 +1,270 @@ +-module(mrdb_select). + +-export([ select/3 %% (Ref, MatchSpec, Limit) + , select/4 %% (Ref, MatchSpec, AccKeys, Limit) + , select/1 %% (Cont) + , fold/5 %% (Ref, Fun, Acc, MatchSpec, Limit) + , rdb_fold/5 %% (Ref, Fun, Acc, Prefix, Limit) + ]). + +-import(mnesia_rocksdb_lib, [ keypos/1 + , decode_key/2 + , decode_val/3 + ]). + +-include("mnesia_rocksdb.hrl"). + +-record(sel, { alias % TODO: not used + , tab + , ref + , keypat + , ms % TODO: not used + , compiled_ms + , limit + , key_only = false % TODO: not used + , direction = forward % TODO: not used + }). + +select(Ref, MS, Limit) when is_map(Ref), is_list(MS) -> + select(Ref, MS, false, Limit). + +select(Ref, MS, AccKeys, Limit) + when is_map(Ref), is_list(MS), is_boolean(AccKeys) -> + Sel = mk_sel(Ref, MS, Limit), + mrdb:with_rdb_iterator(Ref, fun(I) -> i_select(I, Sel, AccKeys, []) end). + +mk_sel(#{name := Tab} = Ref, MS, Limit) -> + Keypat = keypat(MS, keypos(Tab), Ref), + #sel{tab = Tab, + ref = Ref, + keypat = Keypat, + ms = MS, + compiled_ms = ets:match_spec_compile(MS), + key_only = needs_key_only(MS), + limit = Limit}. + +select(Cont) -> + case Cont of + '$end_of_table' -> '$end_of_table'; + _ -> Cont() + end. + +fold(Ref, Fun, Acc, MS, Limit) -> + {AccKeys, F} = + if is_function(Fun, 3) -> + {true, fun({K, Obj}, Acc1) -> + Fun(Obj, K, Acc1) + end}; + is_function(Fun, 2) -> + {false, Fun}; + true -> + mrdb:abort(invalid_fold_fun) + end, + fold_(select(Ref, MS, AccKeys, Limit), F, Acc). + +fold_('$end_of_table', _, Acc) -> + Acc; +fold_(L, Fun, Acc) when is_list(L) -> + lists:foldl(Fun, Acc, L); +fold_({L, Cont}, Fun, Acc) -> + fold_(select(Cont), Fun, lists:foldl(Fun, Acc, L)). 
+ +rdb_fold(Ref, Fun, Acc, Prefix, Limit) -> + mrdb:with_rdb_iterator( + Ref, fun(I) -> + MovRes = rocksdb:iterator_move(I, first(Ref)), + i_rdb_fold(MovRes, I, Prefix, Fun, Acc, Limit) + end). + +first(#{vsn := 1}) -> <>; +first(_) -> first. + +i_rdb_fold({ok, K, V}, I, Pfx, Fun, Acc, Limit) when Limit > 0 -> + case is_prefix(Pfx, K) of + true -> + i_rdb_fold(rocksdb:iterator_move(I, next), I, Pfx, Fun, + Fun(K, V, Acc), decr(Limit)); + false -> + Acc + end; +i_rdb_fold(_, _, _, _, Acc, _) -> + Acc. + +i_select(I, #sel{ keypat = Pfx + , compiled_ms = MS + , limit = Limit + , ref = #{vsn := Vsn, encoding := Enc} } = Sel, AccKeys, Acc) -> + StartKey = case {Pfx, Vsn, Enc} of + {<<>>, 1, {sext, _}} -> + <>; + {_, _, {term, _}} -> + <<>>; + _ -> + Pfx + end, + select_traverse(rocksdb:iterator_move(I, StartKey), Limit, + Pfx, MS, I, Sel, AccKeys, Acc). + +needs_key_only([Pat]) -> + needs_key_only_(Pat); +needs_key_only([_|_] = Pats) -> + lists:all(fun needs_key_only_/1, Pats). + +needs_key_only_({HP, _, Body}) -> + BodyVars = lists:flatmap(fun extract_vars/1, Body), + %% Note that we express the conditions for "needs more than key" and negate. + not(wild_in_body(BodyVars) orelse + case bound_in_headpat(HP) of + {all,V} -> lists:member(V, BodyVars); + Vars when is_list(Vars) -> any_in_body(lists:keydelete(2,1,Vars), BodyVars) + end). + +extract_vars([H|T]) -> + extract_vars(H) ++ extract_vars(T); +extract_vars(T) when is_tuple(T) -> + extract_vars(tuple_to_list(T)); +extract_vars(T) when T=='$$'; T=='$_' -> + [T]; +extract_vars(T) when is_atom(T) -> + case is_wild(T) of + true -> + [T]; + false -> + [] + end; +extract_vars(_) -> + []. + +any_in_body(Vars, BodyVars) -> + lists:any(fun({_,Vs}) -> + intersection(Vs, BodyVars) =/= [] + end, Vars). + +intersection(A,B) when is_list(A), is_list(B) -> + A -- (A -- B). 
+ +is_wild('_') -> + true; +is_wild(A) when is_atom(A) -> + case atom_to_list(A) of + "\$" ++ S -> + try begin + _ = list_to_integer(S), + true + end + catch + error:_ -> + false + end; + _ -> + false + end. + +wild_in_body(BodyVars) -> + intersection(BodyVars, ['$$','$_']) =/= []. + +bound_in_headpat(HP) when is_atom(HP) -> + {all, HP}; +bound_in_headpat(HP) when is_tuple(HP) -> + [_|T] = tuple_to_list(HP), + map_vars(T, 2). + +map_vars([H|T], P) -> + case extract_vars(H) of + [] -> + map_vars(T, P+1); + Vs -> + [{P, Vs}|map_vars(T, P+1)] + end; +map_vars([], _) -> + []. + +select_traverse({ok, K, V}, Limit, Pfx, MS, I, #sel{ref = R} = Sel, + AccKeys, Acc) -> + case is_prefix(Pfx, K) of + true -> + DecKey = decode_key(K, R), + Rec = decode_val(V, DecKey, R), + case ets:match_spec_run([Rec], MS) of + [] -> + select_traverse( + rocksdb:iterator_move(I, next), Limit, Pfx, MS, + I, Sel, AccKeys, Acc); + [Match] -> + Acc1 = if AccKeys -> + [{K, Match}|Acc]; + true -> + [Match|Acc] + end, + traverse_continue(K, decr(Limit), Pfx, MS, I, Sel, AccKeys, Acc1) + end; + false when Limit == infinity -> + lists:reverse(Acc); + false -> + {lists:reverse(Acc), '$end_of_table'} + end; +select_traverse({error, _}, Limit, _, _, _, _, _, Acc) -> + select_return(Limit, {lists:reverse(Acc), '$end_of_table'}). + +select_return(infinity, {L, '$end_of_table'}) -> + L; +select_return(_, Ret) -> + Ret. + +is_prefix(A, B) when is_binary(A), is_binary(B) -> + Sa = byte_size(A), + case B of + <> -> + true; + _ -> + false + end. + +decr(I) when is_integer(I) -> + I-1; +decr(infinity) -> + infinity. 
+ +traverse_continue(K, 0, Pfx, MS, _I, #sel{limit = Limit, ref = Ref} = Sel, AccKeys, Acc) -> + {lists:reverse(Acc), + fun() -> + mrdb:with_rdb_iterator( + Ref, + fun(NewI) -> + select_traverse(iterator_next(NewI, K), + Limit, Pfx, MS, NewI, Sel, + AccKeys, []) + end) + end}; +traverse_continue(_K, Limit, Pfx, MS, I, Sel, AccKeys, Acc) -> + select_traverse(rocksdb:iterator_move(I, next), Limit, Pfx, MS, I, Sel, AccKeys, Acc). + +iterator_next(I, K) -> + case rocksdb:iterator_move(I, K) of + {ok, K, _} -> + rocksdb:iterator_move(I, next); + Other -> + Other + end. + +keypat([H|T], KeyPos, Ref) -> + keypat(T, KeyPos, Ref, keypat_pfx(H, KeyPos, Ref)). + +keypat(_, _, _, <<>>) -> <<>>; +keypat([H|T], KeyPos, Ref, Pfx0) -> + Pfx = keypat_pfx(H, KeyPos, Ref), + keypat(T, KeyPos, Ref, common_prefix(Pfx, Pfx0)); +keypat([], _, _, Pfx) -> + Pfx. + +common_prefix(<>, <>) -> + <>; +common_prefix(_, _) -> + <<>>. + +keypat_pfx({HeadPat,_Gs,_}, KeyPos, #{encoding := {sext,_}}) when is_tuple(HeadPat) -> + KP = element(KeyPos, HeadPat), + sext:prefix(KP); +keypat_pfx(_, _, _) -> + <<>>. + diff --git a/test/mnesia_rocksdb_SUITE.erl b/test/mnesia_rocksdb_SUITE.erl index d7315da..64ddfd0 100644 --- a/test/mnesia_rocksdb_SUITE.erl +++ b/test/mnesia_rocksdb_SUITE.erl @@ -12,10 +12,23 @@ , end_per_testcase/2 ]). --export([error_handling/1]). +-export([ encoding_sext_attrs/1 + , encoding_binary_binary/1 + , encoding_defaults/1 + ]). +-export([ mrdb_transactions/1 + , mrdb_repeated_transactions/1 + , mrdb_abort/1 + , mrdb_two_procs/1 + , mrdb_two_procs_tx_restart/1 + , mrdb_two_procs_snap/1 + , mrdb_three_procs/1 + ]). -include_lib("common_test/include/ct.hrl"). +-define(TABS_CREATED, tables_created). + suite() -> []. @@ -23,18 +36,38 @@ all() -> [{group, all_tests}]. groups() -> - [{all_tests, [sequence], [error_handling]}]. 
+ [ + {all_tests, [sequence], [ {group, checks} + , {group, mrdb} ]} + %% , error_handling ]} + , {checks, [sequence], [ encoding_sext_attrs + , encoding_binary_binary + , encoding_defaults ]} + , {mrdb, [sequence], [ mrdb_transactions + , mrdb_repeated_transactions + , mrdb_abort + , mrdb_two_procs + , mrdb_two_procs_tx_restart + , mrdb_two_procs_snap + , mrdb_three_procs ]} + ]. -error_handling(_Config) -> - mnesia_rocksdb_error_handling:run(). +%% error_handling(Config) -> +%% mnesia_rocksdb_error_handling:run(Config). init_per_suite(Config) -> - Config. + tr_ct:set_activation_checkpoint(?TABS_CREATED, Config). end_per_suite(_Config) -> ok. +init_per_group(G, Config) when G==mrdb + ; G==checks -> + mnesia:stop(), + ok = mnesia_rocksdb_tlib:start_mnesia(reset), + Config; + init_per_group(_, Config) -> Config. @@ -46,3 +79,453 @@ init_per_testcase(_, Config) -> end_per_testcase(_, _Config) -> ok. + +encoding_sext_attrs(Config) -> + tr_ct:with_trace(fun encoding_sext_attrs_/1, Config, + tr_patterns(mnesia_rocksdb, + [{mnesia_rocksdb,'_',x}], tr_opts())). + +encoding_sext_attrs_(Config) -> + Created = create_tabs([{t, [{attributes, [k, v]}]}], Config), + ok = mrdb:insert(t, {t, 1, a}), + ok = mnesia:dirty_write({t, 2, b}), + expect_error(fun() -> mrdb:insert(t, {t, a}) end, ?LINE, + error, {mrdb_abort, badarg}), + expect_error(fun() -> mnesia:dirty_write({t, a}) end, ?LINE, + exit, '_'), + delete_tabs(Created), + ok. + +encoding_defaults(Config) -> + UP = fun(T) -> mnesia:table_info(T, user_properties) end, + Created = create_tabs([ {a, [ {attributes, [k, v]} + , {type, set}]} + , {b, [ {attributes, [k, v, w]} + , {type, ordered_set}]} + , {c, [ {attributes, [k, v]} + , {type, bag} ]}], Config), + [{mrdb_encoding,{term,{value,term}}}] = UP(a), + [{mrdb_encoding,{sext,{object,term}}}] = UP(b), + [{mrdb_encoding,{sext,{value,term}}}] = UP(c), + delete_tabs(Created), + ok. 
+ +encoding_binary_binary(Config) -> + Created = create_tabs([ {a, [ {attributes, [k,v]} + , {user_properties, + [{mrdb_encoding, {raw, raw}}]}]} + , {b, [ {attributes, [k, v, w]} + , {user_properties, + [{mrdb_encoding, {raw, {object, term}}}]}]} + ], Config), + expect_error(fun() -> + create_tab( + c, [ {attributes, [k, v, w]} + , {user_properties, + [{mrdb_encoding, {raw, {value, raw}}}]}]) + end, ?LINE, error, '_'), + delete_tabs(Created), + ok. + +expect_error(F, Line, Type, Expected) -> + try F() of + Unexpected -> error({unexpected, Line, Unexpected}) + catch + Type:Expected -> + ct:log("Caught expected ~p:~p (Line: ~p)", [Type, Expected, Line]), + ok; + Type:Error when Expected == '_' -> + ct:log("Caught expected ~p:_ (Line:~p): ~p", [Type, Line, Error]), + ok + end. + +mrdb_transactions(Config) -> + tr_ct:with_trace(fun mrdb_transactions_/1, Config, + tr_patterns( + mnesia_rocksdb_admin, + [{mnesia_rocksdb_admin,'_',x}], tr_opts())). + +mrdb_transactions_(Config) -> + Created = create_tabs([{tx, []}], Config), + mrdb:insert(tx, {tx, a, 1}), + [_] = mrdb:read(tx, a), + mrdb:activity( + tx, rdb, + fun() -> + [{tx,a,N}] = mrdb:read(tx, a), + N1 = N+1, + ok = mrdb:insert(tx, {tx,a,N1}) + end), + [{tx,a,2}] = mrdb:read(tx,a), + delete_tabs(Created), + ok. + +mrdb_repeated_transactions(Config) -> + Created = create_tabs([{rtx, []}], Config), + mrdb:insert(rtx, {rtx, a, 0}), + [_] = mrdb:read(rtx, a), + Fun = fun() -> + [{rtx, a, N}] = mrdb:read(rtx, a), + N1 = N+1, + ok = mrdb:insert(rtx, {rtx, a, N1}) + end, + [ok = mrdb:activity(tx, rdb, Fun) || _ <- lists:seq(1,100)], + [{rtx,a,100}] = mrdb:read(rtx, a), + delete_tabs(Created), + ok. 
+ +mrdb_abort(Config) -> + Created = create_tabs([{tx_abort, []}], Config), + mrdb:insert(tx_abort, {tx_abort, a, 1}), + Pre = mrdb:read(tx_abort, a), + TRes = try mrdb:activity( + tx, rdb, + fun() -> + [{tx_abort, a, N}] = mrdb:read(tx_abort, a), + error(abort_here), + ok = mrdb:insert(tx_abort, [{tx_abort, a, N+1}]), + noooo + end) + catch + error:abort_here -> + ok + end, + ok = TRes, + Pre = mrdb:read(tx_abort, a), + delete_tabs(Created), + ok. + +mrdb_two_procs(Config) -> + tr_ct:with_trace(fun mrdb_two_procs_/1, Config, + tr_flags( + {self(), [call, sos, p]}, + tr_patterns( + mrdb, [ {mrdb, insert, 2, x} + , {mrdb, read, 2, x} + , {mrdb, activity, x} ], tr_opts()))). + +mrdb_two_procs_(Config) -> + R = ?FUNCTION_NAME, + Parent = self(), + Created = create_tabs([{R, []}], Config), + mrdb:insert(R, {R, a, 1}), + Pre = mrdb:read(R, a), + F0 = fun() -> + wait_for_other(Parent, ?LINE), + ok = mrdb:insert(R, {R, a, 17}), + wait_for_other(Parent, ?LINE) + end, + {POther, MRef} = spawn_opt( + fun() -> + ok = mrdb:activity(tx, rdb, F0) + end, [monitor]), + F1 = fun() -> + Pre = mrdb:read(R, a), + go_ahead_other(POther), + await_other_down(POther, MRef, ?LINE), + [{R, a, 17}] = mrdb:read(R, a), + ok = mrdb:insert(R, {R, a, 18}) + end, + go_ahead_other(1, POther), + try mrdb:activity({tx, #{no_snapshot => true, + retries => 0}}, rdb, F1) of + ok -> error(unexpected) + catch + error:{error, "Resource busy" ++ _} -> + ok + end, + [{R, a, 17}] = mrdb:read(R, a), + delete_tabs(Created), + ok. + +mrdb_two_procs_tx_restart(Config) -> + tr_ct:with_trace(fun mrdb_two_procs_tx_restart_/1, Config, + light_tr_opts()). 
+ +mrdb_two_procs_tx_restart_(Config) -> + R = ?FUNCTION_NAME, + Parent = self(), + Created = create_tabs([{R, []}], Config), + mrdb:insert(R, {R, a, 1}), + Pre = mrdb:read(R, a), + F0 = fun() -> + wait_for_other(Parent, ?LINE), + ok = mrdb:insert(R, {R, a, 17}), + wait_for_other(Parent, ?LINE) + end, + {POther, MRef} = spawn_opt( + fun() -> + ok = mrdb:activity(tx, rdb, F0) + end, [monitor]), + F1 = fun() -> + OtherWrite = [{R, a, 17}], + Att = get_attempt(), + Expected = case Att of + 1 -> Pre; + _ -> OtherWrite + end, + Expected = mrdb:read(R, a), + go_ahead_other(POther), + await_other_down(POther, MRef, ?LINE), + OtherWrite = mrdb:read(R, a), + ok = mrdb:insert(R, {R, a, 18}) + end, + go_ahead_other(1, POther), + mrdb:activity({tx, #{no_snapshot => true}}, rdb, F1), + [{R, a, 18}] = mrdb:read(R, a), + delete_tabs(Created), + ok. + + +% +%% For testing purposes, we use side-effects inside the transactions +%% to synchronize the concurrent transactions. If a transaction fails due +%% to "Resource busy", it can re-run, but then mustn't attempt to sync with +%% the other transaction, which is already committed. +%% +%% To achieve this, we rely on the `mrdb:current_context()` function, which gives +%% us information about which is the current attempt; we only sync on the first +%% attempt, and ignore the sync ops on retries. +%% +-define(IF_FIRST(N, Expr), + if N == 1 -> + Expr; + true -> + ok + end). 
+ +mrdb_two_procs_snap(Config) -> + %% _snap is now the default tx mode + R = ?FUNCTION_NAME, + Parent = self(), + Created = create_tabs([{R, []}], Config), + mrdb:insert(R, {R, a, 1}), + Pre = mrdb:read(R, a), + mrdb:insert(R, {R, b, 11}), + PreB = mrdb:read(R, b), + F0 = fun() -> + ok = mrdb:insert(R, {R, a, 17}), + wait_for_other(Parent, ?LINE) + end, + {POther, MRef} = + spawn_opt(fun() -> + ok = mrdb:activity(tx, rdb, F0) + end, [monitor]), + F1 = fun() -> + Att = get_attempt(), + go_ahead_other(Att, POther), + ARes = mrdb:read(R, a), + ARes = case Att of + 1 -> Pre; + 2 -> [{R, a, 17}] + end, + await_other_down(POther, MRef, ?LINE), + PreB = mrdb:read(R, b), + mrdb:insert(R, {R, b, 18}), + 1477 + end, + 1477 = mrdb:activity(tx, rdb, F1), + [{R, a, 17}] = mrdb:read(R, a), + [{R, b, 18}] = mrdb:read(R, b), + delete_tabs(Created), + ok. + +%% We spawn two helper processes, making it 3 transactions, with the one +%% in the parent process. P2 writes to key `a`, which the other two try to read. +%% We make sure that P2 commits before finishing the other two, and P3 and the +%% main thread sync, so as to maximize the contention for the retry lock. +mrdb_three_procs(Config) -> + tr_ct:with_trace(fun mrdb_three_procs_/1, Config, light_tr_opts()). 
+ +mrdb_three_procs_(Config) -> + R = ?FUNCTION_NAME, + Parent = self(), + Created = create_tabs([{R, []}], Config), + A0 = {R, a, 1}, + A1 = {R, a, 11}, + A2 = {R, a, 12}, + ok = mrdb:insert(R, A0), + F1 = fun() -> + ok = mrdb:insert(R, A1), + ok = mrdb:insert(R, {R, p1, 1}) + end, + {P1, MRef1} = + spawn_opt(fun() -> + do_when_p_allows( + 1, Parent, ?LINE, + fun() -> + ok = mrdb:activity({tx,#{retries => 0}}, rdb, F1) + end) + end, [monitor]), + F2 = fun() -> + [A0] = mrdb:read(R, a), + Att = get_attempt(), + wait_for_other(Att, Parent, ?LINE), + do_when_p_allows( + Att, Parent, ?LINE, + fun() -> + [A1] = mrdb:read(R, a), + ok = mrdb:insert(R, A2), + ok = mrdb:insert(R, {R, p2, 1}) + end) + end, + {P2, MRef2} = + spawn_opt(fun() -> + try mrdb:activity( + {tx, #{retries => 0, + no_snapshot => true}}, rdb, F2) of + ok -> error(unexpected) + catch + error:{error, "Resource busy" ++ _} -> + ok + end + end, [monitor]), + ok = mrdb:activity(tx, rdb, + fun() -> + Att = get_attempt(), + ARes = case Att of + 1 -> [A0]; + 2 -> [A1] + end, + %% First, ensure that P2 tx is running + go_ahead_other(Att, P2), + ARes = mrdb:read(R, a), + allow_p(Att, P1, ?LINE), + ARes = mrdb:read(R, a), + allow_p(Att, P2, ?LINE), + ARes = mrdb:read(R, a), + await_other_down(P1, MRef1, ?LINE), + await_other_down(P2, MRef2, ?LINE), + ok = mrdb:insert(R, {R, p0, 1}) + end), + [{R, p1, 1}] = mrdb:read(R, p1), + [] = mrdb:read(R, p2), + [A1] = mrdb:read(R, a), + [{R, p0, 1}] = mrdb:read(R, p0), + delete_tabs(Created), + ok. + +tr_opts() -> + #{patterns => [ {mrdb, '_', '_', x} + , {mrdb_lib, '_', '_', x} + , {tr_ttb, event, 3, []} + , {?MODULE, go_ahead_other, 3, x} + , {?MODULE, wait_for_other, 3, x} + , {?MODULE, await_other_down, 3, x} + , {?MODULE, do_when_p_allows, 4, x} + , {?MODULE, allow_p, 3, x} + ]}. + +light_tr_opts() -> + tr_flags( + {self(), [call, sos, p]}, + tr_patterns( + mrdb, [ {mrdb, insert, 2, x} + , {mrdb, read, 2, x} + , {mrdb, activity, x} ], tr_opts())). 
+
+tr_patterns(Mod, Ps, #{patterns := Pats} = Opts) ->
+    Pats1 = [P || P <- Pats, element(1,P) =/= Mod],
+    Opts#{patterns => Ps ++ Pats1}.
+
+tr_flags(Flags, Opts) when is_map(Opts) ->
+    Opts#{flags => Flags}.
+
+wait_for_other(Parent, L) ->
+    wait_for_other(get_attempt(), Parent, 1000, L).
+
+wait_for_other(Att, Parent, L) ->
+    wait_for_other(Att, Parent, 1000, L).
+
+wait_for_other(1, Parent, Timeout, L) ->
+    MRef = monitor(process, Parent),
+    Parent ! {self(), ready},
+    receive
+        {Parent, cont} ->
+            demonitor(MRef),
+            ok;
+        {'DOWN', MRef, _, _, Reason} ->
+            ct:log("Parent died, Reason = ~p", [Reason]),
+            exit(Reason)
+    after Timeout ->
+            demonitor(MRef),
+            error({inner_timeout, L})
+    end;
+wait_for_other(_, _, _, _) ->
+    ok.
+
+do_when_p_allows(Att, P, Line, F) ->
+    wait_for_other(Att, P, Line),
+    F(),
+    %% Tell P that we're done
+    go_ahead_other(Att, P, Line),
+    %% Wait for P to acknowledge
+    wait_for_other(Att, P, Line).
+
+allow_p(Att, P, Line) ->
+    go_ahead_other(Att, P),
+    %% This is where P does its thing.
+    wait_for_other(Att, P, Line),
+    %% Acknowledge
+    go_ahead_other(Att, P, Line).
+
+go_ahead_other(POther) ->
+    go_ahead_other(get_attempt(), POther).
+
+go_ahead_other(Att, POther) ->
+    go_ahead_other(Att, POther, 1000).
+
+go_ahead_other(Att, POther, Timeout) ->
+    ?IF_FIRST(Att, go_ahead_other_(POther, Timeout)).
+
+go_ahead_other_(POther, Timeout) ->
+    receive
+        {POther, ready} ->
+            POther ! {self(), cont}
+    after Timeout ->
+            error(go_ahead_timeout)
+    end.
+
+%% Due to transaction restarts, we may already have collected
+%% a DOWN message. In this case, P will already be dead, and there
+%% will not be a 'DOWN' message still in the msg queue.
+%% This is fine (we assume it is), and we just make sure that the
+%% process didn't die abnormally.
+await_other_down(P, MRef, Line) ->
+    Attempt = get_attempt(),
+    ?IF_FIRST(Attempt, await_other_down_(P, MRef, Line)).
+ +await_other_down_(P, MRef, Line) -> + receive {'DOWN', MRef, _, _, Reason} -> + case Reason of + normal -> ok; + _ -> + error({abnormal_termination, + [ {pid, P} + , {mref, MRef} + , {line, Line} + , {reason, Reason}]}) + end + after 1000 -> + error({monitor_timeout, Line}) + end. + +get_attempt() -> + #{attempt := Attempt} = mrdb:current_context(), + Attempt. + +create_tabs(Tabs, Config) -> + Res = lists:map(fun create_tab/1, Tabs), + tr_ct:trace_checkpoint(?TABS_CREATED, Config), + Res. + +create_tab({T, Opts}) -> create_tab(T, Opts). + +create_tab(T, Opts) -> + {atomic, ok} = mnesia:create_table(T, [{rdb,[node()]} | Opts]), + T. + +delete_tabs(Tabs) -> + [{atomic,ok} = mnesia:delete_table(T) || T <- Tabs], + ok. + diff --git a/test/mnesia_rocksdb_error_handling.erl b/test/mnesia_rocksdb_error_handling.erl deleted file mode 100644 index 4d45037..0000000 --- a/test/mnesia_rocksdb_error_handling.erl +++ /dev/null @@ -1,116 +0,0 @@ --module(mnesia_rocksdb_error_handling). - --export([run/0, - run/4]). - - -run() -> - setup(), - %% run only one test for 'fatal', to save time. - [run(Type, Op, L, MaintainSz) || MaintainSz <- [false, true], - Type <- [set, bag], - Op <- [insert, update, delete], - L <- levels()] - ++ [run(set, insert, fatal, false)]. - -run(Type, Op, Level, MaintainSz) -> - setup(), - {ok, Tab} = create_tab(Type, Level, MaintainSz), - mnesia:dirty_write({Tab, a, 1}), % pre-existing data - with_mock(Level, Op, Tab, fun() -> - try_write(Op, Type, Tab), - expect_error(Level, Tab) - end). - -levels() -> - [debug, verbose, warning, error]. - -setup() -> - mnesia:stop(), - start_mnesia(). 
- -create_tab(Type, Level, MaintainSz) -> - TabName = tab_name(Type, Level, MaintainSz), - %% create error store before the table - case ets:info(?MODULE) of - undefined -> - ?MODULE = ets:new(?MODULE, [bag, public, named_table]), - ok; - _ -> - ok - end, - UserProps = user_props(Level, MaintainSz), - {atomic, ok} = mnesia:create_table(TabName, [{rdb, [node()]}, - {user_properties, UserProps}]), - {ok, TabName}. - -tab_name(Type, Level, MaintainSz) -> - binary_to_atom(iolist_to_binary( - ["t" | [["_", atom_to_list(A)] - || A <- [?MODULE, Type, Level, MaintainSz]]]), utf8). - -user_props(Level, MaintainSz) -> - [{maintain_sz, MaintainSz}, - {rocksdb_opts, [ {on_write_error, Level} - , {on_write_error_store, ?MODULE} ]}]. - -start_mnesia() -> - mnesia_rocksdb_tlib:start_mnesia(reset), - ok. - -with_mock(Level, Op, Tab, F) -> - mnesia:subscribe(system), - mnesia:set_debug_level(debug), - meck:new(mnesia_rocksdb_lib, [passthrough]), - meck:expect(mnesia_rocksdb_lib, put, 4, {error, some_put_error}), - meck:expect(mnesia_rocksdb_lib, write, 3, {error, some_write_error}), - meck:expect(mnesia_rocksdb_lib, delete, 3, {error,some_delete_error}), - try {Level, Op, Tab, F()} of - {_, _, _, ok} -> - ok; - Other -> - io:fwrite("OTHER: ~p~n", [Other]), - ok - catch - exit:{{aborted,_},_} -> - Level = error, - ok - after - mnesia:set_debug_level(none), - mnesia:unsubscribe(system), - meck:unload(mnesia_rocksdb_lib) - end. - -try_write(insert, set, Tab) -> - mnesia:dirty_write({Tab, b, 2}); -try_write(insert, bag, Tab) -> - mnesia:dirty_write({Tab, a, 2}); -try_write(update, _, Tab) -> - mnesia:dirty_write({Tab, a, 1}); -try_write(delete, _, Tab) -> - mnesia:dirty_delete({Tab, a}). 
- - -expect_error(Level, Tab) -> - Tag = rpt_tag(Level), - receive - {mnesia_system_event, {mnesia_fatal, Fmt, Args, _Core}} -> - Tag = mnesia_fatal, - io:fwrite("EVENT(~p, ~p):~n ~s", [Tag, Tab, io_lib:fwrite(Fmt, Args)]), - ok; - {mnesia_system_event, {Tag, Fmt, Args}} -> - io:fwrite("EVENT(~p, ~p):~n ~s", [Tag, Tab, io_lib:fwrite(Fmt, Args)]), - ok - after 1000 -> - error({expected_error, [Level, Tab]}) - - end, - %% Also verify that an error entry has been written into the error store. - 1 = ets:select_delete(?MODULE, [{{{Tab, '_'}, '_', '_'}, [], [true]}]), - ok. - -rpt_tag(fatal ) -> mnesia_fatal; -rpt_tag(error ) -> mnesia_error; -rpt_tag(warning) -> mnesia_warning; -rpt_tag(verbose) -> mnesia_info; -rpt_tag(debug ) -> mnesia_info. diff --git a/test/mnesia_rocksdb_fallback.erl b/test/mnesia_rocksdb_fallback.erl index 43f1874..11bb116 100644 --- a/test/mnesia_rocksdb_fallback.erl +++ b/test/mnesia_rocksdb_fallback.erl @@ -22,7 +22,7 @@ -define(m(A,B), fun() -> L = ?LINE, case {A,B} of - {__X, __X} -> + {X__, X__} -> B; Other -> error({badmatch, [Other, diff --git a/test/mnesia_rocksdb_indexes.erl b/test/mnesia_rocksdb_indexes_SUITE.erl similarity index 52% rename from test/mnesia_rocksdb_indexes.erl rename to test/mnesia_rocksdb_indexes_SUITE.erl index 997d62c..1ff35b5 100644 --- a/test/mnesia_rocksdb_indexes.erl +++ b/test/mnesia_rocksdb_indexes_SUITE.erl @@ -16,13 +16,51 @@ %% under the License. %%---------------------------------------------------------------- --module(mnesia_rocksdb_indexes). +-module(mnesia_rocksdb_indexes_SUITE). + +-export([ + all/0 + , groups/0 + , suite/0 + , init_per_suite/1 + , end_per_suite/1 + , init_per_group/2 + , end_per_group/2 + , init_per_testcase/2 + , end_per_testcase/2 + ]). 
+ +-export([ + index_plugin_mgmt/1 + , add_indexes/1 + , create_bag_index/1 + , create_ordered_index/1 + , test_1_ram_copies/1 + , test_1_disc_copies/1 + , fail_1_disc_only/1 + , plugin_ram_copies1/1 + , plugin_ram_copies2/1 + , plugin_disc_copies/1 + , fail_plugin_disc_only/1 + , plugin_disc_copies_bag/1 + , plugin_rdb_ordered/1 + , index_iterator/1 + ]). + +-include_lib("common_test/include/ct.hrl"). -export([run/0, + run/1, r1/0]). +-define(TAB(T), list_to_atom(lists:flatten(io_lib:fwrite("~w_~w", [T, ?LINE])))). + run() -> + run([]). + +run(Config) -> mnesia:stop(), + maybe_set_dir(Config), ok = mnesia_rocksdb_tlib:start_mnesia(reset), test(1, ram_copies, r1), test(1, disc_copies, d1), @@ -33,15 +71,86 @@ run() -> add_del_indexes(), {atomic,ok} = mnesia_schema:add_index_plugin( {pfx},mnesia_rocksdb, ix_prefixes), - test_index_plugin(pr1, ram_copies, ordered), - test_index_plugin(pr2, ram_copies, bag), - test_index_plugin(pd1, disc_copies, ordered), - fail(test_index_plugin, [pd2, disc_only_copies, ordered]), - test_index_plugin(pd2, disc_copies, bag), - test_index_plugin(pl2, rdb, ordered), - test_index_plugin_mgmt(), + test_index_plugin(cfg([pr1, ram_copies, ordered], Config)), + test_index_plugin(cfg([pr2, ram_copies, bag], Config)), + test_index_plugin(cfg([pd1, disc_copies, ordered], Config)), + fail(test_index_plugin, [cfg([pd2, disc_only_copies, ordered], Config)]), + test_index_plugin(cfg([pd2, disc_copies, bag], Config)), + test_index_plugin(cfg([pl2, rdb, ordered], Config)), + index_plugin_mgmt(Config), ok. +suite() -> + []. + +all() -> + [{group, all_tests}]. 
+ +groups() -> + [ + {all_tests, [sequence], [ {group, mgmt}, {group, access}, {group, plugin} ]} + , {mgmt, [sequence], [ + create_bag_index + , create_ordered_index + , index_plugin_mgmt + , add_indexes + ]} + , {access, [sequence], [ + test_1_ram_copies + , test_1_disc_copies + , fail_1_disc_only + , index_iterator + ]} + , {plugin, [sequence], [ + plugin_ram_copies1 + , plugin_ram_copies2 + , plugin_disc_copies + , fail_plugin_disc_only + , plugin_disc_copies_bag + , plugin_rdb_ordered + ]} + ]. + +%% ====================================================================== + +init_per_suite(Config) -> + mnesia:stop(), + maybe_set_dir(Config), + Config. + +end_per_suite(_) -> + ok. + +init_per_group(Grp, Config) -> + mnesia_rocksdb_tlib:restart_reset_mnesia(), + case Grp of + plugin -> + {atomic,ok} = mnesia_schema:add_index_plugin( + {pfx},mnesia_rocksdb, ix_prefixes); + _ -> + ok + end, + Config. + +end_per_group(_, _) -> + ok. + +init_per_testcase(_, Config) -> + Config. + +end_per_testcase(_, _) -> + ok. + +%% ====================================================================== + +cfg([Tab, Type, IxType], Config) -> + [{my_config, #{tab => Tab, type => Type, ixtype => IxType}} | Config]; +cfg(Cfg, Config) when is_map(Cfg) -> [{my_config, Cfg} | Config]. + +cfg(Config) -> ?config(my_config, Config). + +%% ====================================================================== + r1() -> mnesia:stop(), ok = mnesia_rocksdb_tlib:start_mnesia(reset), @@ -51,17 +160,28 @@ r1() -> dbg:tpl(mnesia_schema,x), dbg:tpl(mnesia_index,x), dbg:p(all,[c]), - test_index_plugin(pd2, disc_only_copies, ordered). + test_index_plugin(cfg([pd2, disc_only_copies, ordered], [])). fail(F, Args) -> try apply(?MODULE, F, Args), error(should_fail) catch - error:_ -> + error:R when R =/= should_fail -> io:fwrite("apply(~p, ~p, ~p) -> fails as expected~n", [?MODULE, F, Args]) end. +test_1_ram_copies( _Cfg) -> test(1, ram_copies, r1). +test_1_disc_copies(_Cfg) -> test(1, disc_copies, d1). 
+fail_1_disc_only( _Cfg) -> fail(test, [1, disc_only_copies, do1]). + +plugin_ram_copies1(Cfg) -> test_index_plugin(cfg([pr1, ram_copies, ordered], Cfg)). +plugin_ram_copies2(Cfg) -> test_index_plugin(cfg([pr2, ram_copies, bag], Cfg)). +plugin_disc_copies(Cfg) -> test_index_plugin(cfg([pd1, disc_copies, ordered], Cfg)). +fail_plugin_disc_only(Cfg) -> fail(test_index_plugin, [cfg([pd2, disc_only_copies, ordered], Cfg)]). +plugin_disc_copies_bag(Cfg) -> test_index_plugin(cfg([pd2, disc_copies, bag], Cfg)). +plugin_rdb_ordered(Cfg) -> test_index_plugin(cfg([pl2, rdb, ordered], Cfg)). + test(N, Type, T) -> {atomic, ok} = mnesia:create_table(T, [{Type,[node()]}, {attributes,[k,a,b,c]}, @@ -81,7 +201,8 @@ add_del_indexes() -> {atomic, ok} = mnesia:add_table_index(l1, a), io:fwrite("add_del_indexes() -> ok~n", []). -test_index_plugin(Tab, Type, IxType) -> +test_index_plugin(Config) -> + #{tab := Tab, type := Type, ixtype := IxType} = cfg(Config), {atomic, ok} = mnesia:create_table(Tab, [{Type, [node()]}, {index, [{{pfx}, IxType}]}]), mnesia:dirty_write({Tab, "foobar", "sentence"}), @@ -100,10 +221,25 @@ test_index_plugin(Tab, Type, IxType) -> Res2 = lists:sort(mnesia:dirty_index_read(Tab,<<"whi">>, {pfx})), [{Tab,"foobar","sentence"}] = mnesia:dirty_index_read( Tab, <<"foo">>, {pfx}) - end, - io:fwrite("test_index_plugin(~p, ~p, ~p) -> ok~n", [Tab,Type,IxType]). + end. -test_index_plugin_mgmt() -> +create_bag_index(_Config) -> + {aborted, {combine_error, _, _}} = + mnesia:create_table(bi, [{rdb, [node()]}, {index, [{val, bag}]}]), + ok. + +create_ordered_index(_Config) -> + {atomic, ok} = + mnesia:create_table(oi, [{rdb, [node()]}, {index, [{val, ordered}]}]), + ok. + +add_indexes(_Config) -> + T = ?TAB(t1), + {atomic, ok} = mnesia:create_table(T, [{rdb, [node()]}, {attributes, [k, a, b, c]}]), + {atomic, ok} = mnesia:add_table_index(T, a), + ok. 
+ +index_plugin_mgmt(_Config) -> {aborted,_} = mnesia:create_table(x, [{index,[{unknown}]}]), {aborted,_} = mnesia:create_table(x, [{index,[{{unknown},bag}]}]), {aborted,_} = mnesia:create_table(x, [{index,[{{unknown},ordered}]}]), @@ -166,9 +302,48 @@ test_index(3, T) -> io:fwrite("test_index(1, ~p) -> ok~n", [T]), ok. +index_iterator(_Cfg) -> + T = ?TAB(it), + Attrs = [ {rdb,[node()]} + , {record_name, i} + , {attributes, [k,a,b]} + , {index, [a,b]} ], + {atomic, ok} = mnesia:create_table(T, Attrs), + ct:log("created tab T=~p: ~p", [T, Attrs]), + L1 = [{i,K,a,y} || K <- lists:seq(4,6)], + L2 = [{i,K,b,x} || K <- lists:seq(1,3)], + true = lists:all(fun(X) -> X == ok end, + [mnesia:dirty_write(T, Obj) || Obj <- L1 ++ L2]), + ct:log("inserted ~p", [L1 ++ L2]), + ResA = [{a,X} || X <- L1] ++ [{b,Y} || Y <- L2], + ResB = [{x,X} || X <- L2] ++ [{y,Y} || Y <- L1], + F = fun iter_all/1, + ResA = mrdb_index:with_iterator(T, a, F), + ct:log("mrdb_index:with_iterator(T, a, F) -> ~p", [ResA]), + ResB = mrdb_index:with_iterator(T, b, F), + ct:log("mrdb_index:with_iterator(T, b, F) -> ~p", [ResB]), + ok. + +iter_all(I) -> + iter_all(mrdb_index:iterator_move(I, first), I). + +iter_all({ok, IxVal, Obj}, I) -> + [{IxVal, Obj} | iter_all(mrdb_index:iterator_move(I, next), I)]; +iter_all(_, _) -> + []. + indexes(1) -> [a,{b,ordered},{c,bag}]; indexes(2) -> [a,b,{c,bag}]; indexes(3) -> [a,{b,ordered},{c,ordered}]. + +maybe_set_dir(Config) -> + case proplists:get_value(priv_dir, Config) of + undefined -> + ok; + PDir -> + Dir = filename:join(PDir, "mnesia_indexes"), + application:set_env(mnesia, dir, Dir) + end. diff --git a/test/mnesia_rocksdb_migration_SUITE.erl b/test/mnesia_rocksdb_migration_SUITE.erl new file mode 100644 index 0000000..b218e29 --- /dev/null +++ b/test/mnesia_rocksdb_migration_SUITE.erl @@ -0,0 +1,190 @@ +-module(mnesia_rocksdb_migration_SUITE). 
+ +-export([ + all/0 + , suite/0 + , groups/0 + , init_per_suite/1 + , end_per_suite/1 + , init_per_group/2 + , end_per_group/2 + , init_per_testcase/2 + , end_per_testcase/2 + ]). + +-export([ + manual_migration/1 + , migrate_with_encoding_change/1 + , auto_migration/1 + ]). + +-include_lib("common_test/include/ct.hrl"). + +-define(TABS_CREATED, tables_created). + +suite() -> + []. + +all() -> + [{group, all_tests}]. + +groups() -> + [ + {all_tests, [sequence], [ manual_migration + , migrate_with_encoding_change ]} + ]. + +init_per_suite(Config) -> + Config. + +end_per_suite(_Config) -> + ok. + +init_per_group(_, Config) -> + Config. + +end_per_group(_, _Config) -> + ok. + +init_per_testcase(_, Config) -> + mnesia:stop(), + ok = mnesia_rocksdb_tlib:start_mnesia(reset), + Config. +%% create_migrateable_db(Config). + +end_per_testcase(_, _Config) -> + ok. + +manual_migration(Config) -> + tr_ct:with_trace(fun manual_migration_/1, Config, tr_opts()). + +manual_migration_(Config) -> + create_migrateable_db(Config), + Tabs = tables(), + ct:log("Analyze (before): ~p", [analyze_tabs(Tabs)]), + Res = mnesia_rocksdb_admin:migrate_standalone(rdb, Tabs), + ct:log("migrate_standalone(rdb, ~p) -> ~p", [Tabs, Res]), + AnalyzeRes = analyze_tabs(Tabs), + ct:log("AnalyzeRes = ~p", [AnalyzeRes]), + MigRes = mnesia_rocksdb_admin:migrate_standalone(rdb, Tabs), + ct:log("MigRes = ~p", [MigRes]), + AnalyzeRes2 = analyze_tabs(Tabs), + ct:log("AnalyzeRes2 = ~p", [AnalyzeRes2]), + ct:log("Admin State = ~p", [sys:get_state(mnesia_rocksdb_admin)]), + ok. 
+ +migrate_with_encoding_change(_Config) -> + ok = create_tab(t, [{user_properties, [{mrdb_encoding, {sext,{object,term}}}, + {rocksdb_standalone, true}]}, + {index,[val]} + ]), + mrdb:insert(t, {t, <<"1">>, <<"a">>}), + mrdb:insert(t, {t, <<"2">>, <<"b">>}), + TRef = mrdb:get_ref(t), + {ok, V1} = mrdb:rdb_get(TRef, sext:encode(<<"1">>), []), + {ok, V2} = mrdb:rdb_get(TRef, sext:encode(<<"2">>), []), + {t,[],<<"a">>} = binary_to_term(V1), + {t,[],<<"b">>} = binary_to_term(V2), + Opts = #{encoding => {raw, raw}}, + MigRes = mnesia_rocksdb_admin:migrate_standalone(rdb, [{t, Opts}]), + ct:log("MigRes (t) = ~p", [MigRes]), + %% + %% Ensure that metadata reflect the migrated table + %% (now a column family, and the rocksdb_standalone prop gone) + %% + TRef1 = mrdb:get_ref(t), + ct:log("TRef1(t) = ~p", [TRef1]), + #{type := column_family, + properties := #{user_properties := UPs}} = TRef1, + error = maps:find(rocksdb_standalone, UPs), + UPsR = lists:sort(maps:values(UPs)), + UPsM = lists:sort(mnesia:table_info(t, user_properties)), + {UPsR,UPsM} = {UPsM,UPsR}, + ct:log("user properties (t): ~p", [UPsM]), + [{<<"2">>, <<"b">>}, + {<<"1">>, <<"a">>}] = mrdb:rdb_fold( + t, fun(K,V,A) -> [{K,V}|A] end, [], <<>>), + ct:log("All data present in new column family", []), + ct:log("Contents of mnesia dir: ~p", + [ok(file:list_dir(mnesia:system_info(directory)))]), + ct:log("mnesia stopped", []), + mnesia:stop(), + mnesia:start(), + ct:log("mnesia started", []), + mnesia:info(), + ok = mnesia:wait_for_tables([t], 3000), + ct:log("tables loaded", []), + [{t,<<"1">>,<<"a">>}, + {t,<<"2">>,<<"b">>}] = mrdb:select( + t, [{'_',[],['$_']}]), + [{<<"2">>,<<"b">>}, + {<<"1">>,<<"a">>}] = mrdb:rdb_fold( + t, fun(K,V,A) -> [{K,V}|A] end, [], <<>>), + ok. + +auto_migration(_Config) -> + ok. + +ok({ok, Value}) -> Value. + +tr_opts() -> + #{ patterns => [ {mnesia_rocksdb_admin, '_', []} + , {mnesia_rocksdb_lib, '_', []} + , {rocksdb, '_', x} | trace_exports(mrdb, x) ] }. 
+ +trace_exports(M, Pat) -> + Fs = M:module_info(exports), + [{M, F, A, Pat} || {F, A} <- Fs]. + +tables() -> + [a]. + +create_migrateable_db(Config) -> + Os = [{user_properties, [{rocksdb_standalone, true}]}], + TabNames = tables(), + Tabs = [{T, Os} || T <- TabNames], + create_tabs(Tabs, Config), + verify_tabs_are_standalone(TabNames), + fill_tabs(TabNames), + Config. + +fill_tabs(Tabs) -> + lists:foreach(fun(Tab) -> + [mrdb:insert(Tab, {Tab, X, a}) || X <- lists:seq(1,3)] + end, Tabs). + +create_tabs(Tabs, Config) -> + Res = lists:map(fun create_tab/1, Tabs), + tr_ct:trace_checkpoint(?TABS_CREATED, Config), + Res. + +create_tab({T, Opts}) -> + create_tab(T, Opts). + +create_tab(T, Opts) -> + {atomic, ok} = mnesia:create_table(T, [{rdb, [node()]} | Opts]), + ok. + +verify_tabs_are_standalone(Tabs) -> + case analyze_tabs(Tabs) of + {_, []} -> + ok; + {[], NotSA} -> + error({not_standalone, NotSA}) + end. + +analyze_tabs(Tabs) -> + Dir = mnesia:system_info(directory), + Files = filelib:wildcard(filename:join(Dir, "*-_tab.extrdb")), + ct:log("Files = ~p", [Files]), + TabNames = lists:map( + fun(F) -> + {match,[TStr]} = + re:run(F, "^.+/([^/]+)-_tab\\.extrdb$", + [{capture, [1], list}]), + list_to_existing_atom(TStr) + end, Files), + ct:log("TabNames = ~p", [TabNames]), + NotSA = Tabs -- TabNames, + {TabNames -- NotSA, NotSA}. 
+ diff --git a/test/mnesia_rocksdb_proper_semantics_test.erl b/test/mnesia_rocksdb_proper_semantics_test.erl index 7deb2ec..c1bf58c 100644 --- a/test/mnesia_rocksdb_proper_semantics_test.erl +++ b/test/mnesia_rocksdb_proper_semantics_test.erl @@ -78,6 +78,18 @@ setup_mnesia() -> ok = mnesia:delete_schema([node()]), ok = mnesia:create_schema([node()]), ok = mnesia:start(), + %% + %% dbg:tracer(), + %% dbg:tpl(mnesia_rocksdb_admin, x), + %% dbg:tpl(mnesia_rocksdb,x), + %% dbg:ctpl(mnesia_rocksdb, check_definition_entry), + %% dbg:ctpl(mnesia_rocksdb, '-check_definition/4-fun-0-'), + %% dbg:tpl(mnesia_rocksdb_lib,x), + %% dbg:tp(mnesia,x), + %% dbg:tpl(mrdb,x), + %% dbg:tp(rocksdb,x), + %% dbg:p(all,[c]), + %% {ok, rocksdb_copies} = mnesia_rocksdb:register(). setup() -> diff --git a/test/mnesia_rocksdb_tlib.erl b/test/mnesia_rocksdb_tlib.erl index 4c2a3da..38a8cb3 100644 --- a/test/mnesia_rocksdb_tlib.erl +++ b/test/mnesia_rocksdb_tlib.erl @@ -20,23 +20,32 @@ -export([start_mnesia/0, start_mnesia/1, + restart_reset_mnesia/0, create_table/1, create_table/3, trace/2]). +restart_reset_mnesia() -> + mnesia:stop(), + start_mnesia(reset). start_mnesia() -> start_mnesia(false). start_mnesia(Mode) -> if Mode==reset -> - mnesia:delete_schema([node()]), - mnesia:create_schema([node()], - [{backend_types, - [{rdb,mnesia_rocksdb}]}]); + DRes = mnesia:delete_schema([node()]), + ct:log("Delete schema: ~p", [DRes]), + CRes = mnesia:create_schema([node()], + [{backend_types, + [{rdb,mnesia_rocksdb}]}]), + ct:log("Create schema: ~p", [CRes]); true -> ok end, - mnesia:start(). + SRes = mnesia:start(), + ct:log("Mnesia start: ~p", [SRes]), + true = lists:member(rdb, mnesia_schema:backend_types()), + SRes. create_table(Backend) -> create_table(Backend, [k,v], [v]). diff --git a/test/mrdb_ttb.erl b/test/mrdb_ttb.erl new file mode 100644 index 0000000..fd448d4 --- /dev/null +++ b/test/mrdb_ttb.erl @@ -0,0 +1,31 @@ +-module(mrdb_ttb). 
+ +-export([ on_nodes/2 + , stop/0 + , stop_nofetch/0 + , format/2 + , format/3 ]). + +-export([ patterns/0 + , flags/0 ]). + +on_nodes(Ns, File) -> + tr_ttb:on_nodes(Ns, File, ?MODULE). + +patterns() -> + mrdb:patterns(). + +flags() -> + {all, call}. + +stop() -> + tr_ttb:stop(). + +stop_nofetch() -> + tr_ttb:stop_nofetch(). + +format(Dir, Out) -> + tr_ttb:format(Dir, Out). + +format(Dir, Out, Opts) -> + tr_ttb:format(Dir, Out, Opts).