You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by da...@apache.org on 2020/02/13 22:15:03 UTC

[couchdb] 01/02: Track the size of data stored in a database

This is an automated email from the ASF dual-hosted git repository.

davisp pushed a commit to branch prototype/fdb-layer-track-database-size
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit 3cb5cd4a6604283f9de8523954af9cd2c3525b52
Author: Paul J. Davis <pa...@gmail.com>
AuthorDate: Wed Dec 4 11:38:48 2019 -0600

    Track the size of data stored in a database
    
    This tracks the number of bytes that would be required to store the
    contents of a database as flat files on disk. Currently the following
    items are tracked:
    
        * Doc ids
        * Revisions
        * Doc body as JSON
        * Attachment names
        * Attachment type
        * Attachment length
        * Attachment md5s
        * Attachment headers
        * Local doc id
        * Local doc revision
        * Local doc bodies
---
 src/couch/src/couch_att.erl                |  19 ++++
 src/fabric/include/fabric2.hrl             |   7 +-
 src/fabric/src/fabric2_db.erl              |  11 ++-
 src/fabric/src/fabric2_fdb.erl             | 144 ++++++++++++++++++++++++-----
 src/fabric/src/fabric2_util.erl            |  52 +++++++++++
 src/fabric/test/fabric2_doc_crud_tests.erl |   5 +-
 6 files changed, 209 insertions(+), 29 deletions(-)

diff --git a/src/couch/src/couch_att.erl b/src/couch/src/couch_att.erl
index 2c33362..90d498c 100644
--- a/src/couch/src/couch_att.erl
+++ b/src/couch/src/couch_att.erl
@@ -27,6 +27,7 @@
 ]).
 
 -export([
+    external_size/1,
     size_info/1,
     to_disk_term/1,
     from_disk_term/3
@@ -179,6 +180,24 @@ merge_stubs([], _, Merged) ->
     {ok, lists:reverse(Merged)}.
 
 
+% Bytes an attachment adds to a database's external size: its name, type,
+% data length (att_len), md5 and headers (as encoded JSON size). A type,
+% md5 or headers value of undefined counts as zero bytes.
+external_size(Att) ->
+    SizeOf = fun
+        (undefined, _Measure) -> 0;
+        (Value, Measure) -> Measure(Value)
+    end,
+
+    lists:sum([
+        size(fetch(name, Att)),
+        SizeOf(fetch(type, Att), fun erlang:size/1),
+        fetch(att_len, Att),
+        SizeOf(fetch(md5, Att), fun erlang:size/1),
+        SizeOf(fetch(headers, Att), fun couch_ejson_size:encoded_size/1)
+    ]).
+
+
 size_info([]) ->
     {ok, []};
 size_info(Atts) ->
diff --git a/src/fabric/include/fabric2.hrl b/src/fabric/include/fabric2.hrl
index 828a51b..5f2571e 100644
--- a/src/fabric/include/fabric2.hrl
+++ b/src/fabric/include/fabric2.hrl
@@ -45,8 +45,13 @@
 
 % 0 - Initial implementation
 % 1 - Added attachment hash
+% 2 - Added size information
 
--define(CURR_REV_FORMAT, 1).
+-define(CURR_REV_FORMAT, 2).
+
+% 0 - Adding local doc versions
+
+-define(CURR_LDOC_FORMAT, 0).
 
 % Misc constants
 
diff --git a/src/fabric/src/fabric2_db.erl b/src/fabric/src/fabric2_db.erl
index 17c899d..4528194 100644
--- a/src/fabric/src/fabric2_db.erl
+++ b/src/fabric/src/fabric2_db.erl
@@ -1417,12 +1417,14 @@ update_doc_interactive(Db, Doc0, Future, _Options) ->
 
     NewRevInfo = #{
         winner => undefined,
+        exists => false,
         deleted => NewDeleted,
         rev_id => {NewRevPos, NewRev},
         rev_path => NewRevPath,
         sequence => undefined,
         branch_count => undefined,
-        att_hash => fabric2_util:hash_atts(Atts)
+        att_hash => fabric2_util:hash_atts(Atts),
+        rev_size => fabric2_util:rev_size(Doc4)
     },
 
     % Gather the list of possible winnig revisions
@@ -1473,12 +1475,14 @@ update_doc_replicated(Db, Doc0, _Options) ->
 
     DocRevInfo0 = #{
         winner => undefined,
+        exists => false,
         deleted => Deleted,
         rev_id => {RevPos, Rev},
         rev_path => RevPath,
         sequence => undefined,
         branch_count => undefined,
-        att_hash => <<>>
+        att_hash => <<>>,
+        rev_size => null
     },
 
     AllRevInfos = fabric2_fdb:get_all_revs(Db, DocId),
@@ -1518,7 +1522,8 @@ update_doc_replicated(Db, Doc0, _Options) ->
     Doc2 = prep_and_validate(Db, Doc1, PrevRevInfo),
     Doc3 = flush_doc_atts(Db, Doc2),
     DocRevInfo2 = DocRevInfo1#{
-        atts_hash => fabric2_util:hash_atts(Doc3#doc.atts)
+        atts_hash => fabric2_util:hash_atts(Doc3#doc.atts),
+        rev_size => fabric2_util:rev_size(Doc3)
     },
 
     % Possible winners are the previous winner and
diff --git a/src/fabric/src/fabric2_fdb.erl b/src/fabric/src/fabric2_fdb.erl
index 99611b0..d5be6d7 100644
--- a/src/fabric/src/fabric2_fdb.erl
+++ b/src/fabric/src/fabric2_fdb.erl
@@ -36,6 +36,7 @@
 
     get_stat/2,
     incr_stat/3,
+    incr_stat/4,
 
     get_all_revs/2,
     get_winning_revs/3,
@@ -454,6 +455,19 @@ incr_stat(#{} = Db, StatKey, Increment) when is_integer(Increment) ->
     erlfdb:add(Tx, Key, Increment).
 
 
+% Adds Increment to the stat stored under {?DB_STATS, Section, Key}
+% inside the database prefix. An increment of 0 returns ok right away,
+% without calling ensure_current/1 or touching the transaction.
+incr_stat(_Db, _Section, _Key, 0) ->
+    ok;
+
+incr_stat(#{} = Db, Section, Key, Increment) when is_integer(Increment) ->
+    #{tx := Tx, db_prefix := Prefix} = ensure_current(Db),
+
+    StatKey = erlfdb_tuple:pack({?DB_STATS, Section, Key}, Prefix),
+    erlfdb:add(Tx, StatKey, Increment).
+
+
 get_all_revs(#{} = Db, DocId) ->
     #{
         tx := Tx,
@@ -573,6 +587,15 @@ get_local_doc(#{} = Db0, <<?LOCAL_DOC_PREFIX, _/binary>> = DocId) ->
 
 get_local_doc_rev(_Db0, <<?LOCAL_DOC_PREFIX, _/binary>> = DocId, Val) ->
     case Val of
+        <<255, RevBin/binary>> ->
+            % Versioned local docs
+            try
+                case erlfdb_tuple:unpack(RevBin) of
+                    {?CURR_LDOC_FORMAT, Rev, _Size} -> Rev
+                end
+            catch _:_ ->
+                erlang:error({invalid_local_doc_rev, DocId, Val})
+            end;
         <<131, _/binary>> ->
             % Compatibility clause for an older encoding format
             try binary_to_term(Val, [safe]) of
@@ -639,7 +662,9 @@ write_doc(#{} = Db0, Doc, NewWinner0, OldWinner, ToUpdate, ToRemove) ->
 
     % Revision tree
 
-    NewWinner = NewWinner0#{winner := true},
+    NewWinner = NewWinner0#{
+        winner := true
+    },
     NewRevId = maps:get(rev_id, NewWinner),
 
     {WKey, WVal, WinnerVS} = revinfo_to_fdb(Tx, DbPrefix, DocId, NewWinner),
@@ -701,7 +726,7 @@ write_doc(#{} = Db0, Doc, NewWinner0, OldWinner, ToUpdate, ToRemove) ->
     NewSeqVal = erlfdb_tuple:pack({DocId, Deleted, NewRevId}),
     erlfdb:set_versionstamped_key(Tx, NewSeqKey, NewSeqVal),
 
-    % And all the rest...
+    % Bump db version on design doc changes
 
     IsDDoc = case Doc#doc.id of
         <<?DESIGN_DOC_PREFIX, _/binary>> -> true;
@@ -712,6 +737,8 @@ write_doc(#{} = Db0, Doc, NewWinner0, OldWinner, ToUpdate, ToRemove) ->
         bump_db_version(Db)
     end,
 
+    % Update our document counts
+
     case UpdateStatus of
         created ->
             if not IsDDoc -> ok; true ->
@@ -738,6 +765,11 @@ write_doc(#{} = Db0, Doc, NewWinner0, OldWinner, ToUpdate, ToRemove) ->
             ok
     end,
 
+    % Update database size
+    AddSize = sum_add_rev_sizes([NewWinner | ToUpdate]),
+    RemSize = sum_rem_rev_sizes(ToRemove),
+    incr_stat(Db, <<"sizes">>, <<"external">>, AddSize - RemSize),
+
     ok.
 
 
@@ -749,11 +781,18 @@ write_local_doc(#{} = Db0, Doc) ->
 
     Id = Doc#doc.id,
 
-    {LDocKey, LDocVal, Rows} = local_doc_to_fdb(Db, Doc),
+    {LDocKey, LDocVal, NewSize, Rows} = local_doc_to_fdb(Db, Doc),
 
-    WasDeleted = case erlfdb:wait(erlfdb:get(Tx, LDocKey)) of
-        <<_/binary>> -> false;
-        not_found -> true
+    {WasDeleted, PrevSize} = case erlfdb:wait(erlfdb:get(Tx, LDocKey)) of
+        <<255, RevBin/binary>> ->
+            case erlfdb_tuple:unpack(RevBin) of
+                {?CURR_LDOC_FORMAT, _Rev, Size} ->
+                    {false, Size}
+            end;
+        <<_/binary>> ->
+            {false, 0};
+        not_found ->
+            {true, 0}
     end,
 
     BPrefix = erlfdb_tuple:pack({?DB_LOCAL_DOC_BODIES, Id}, DbPrefix),
@@ -779,6 +818,8 @@ write_local_doc(#{} = Db0, Doc) ->
             ok
     end,
 
+    incr_stat(Db, <<"sizes">>, <<"external">>, NewSize - PrevSize),
+
     ok.
 
 
@@ -1045,9 +1086,10 @@ write_doc_body(#{} = Db0, #doc{} = Doc) ->
         tx := Tx
     } = Db = ensure_current(Db0),
 
+    Rows = doc_to_fdb(Db, Doc),
     lists:foreach(fun({Key, Value}) ->
         ok = erlfdb:set(Tx, Key, Value)
-    end, doc_to_fdb(Db, Doc)).
+    end, Rows).
 
 
 clear_doc_body(_Db, _DocId, not_found) ->
@@ -1123,7 +1165,8 @@ revinfo_to_fdb(Tx, DbPrefix, DocId, #{winner := true} = RevId) ->
         rev_id := {RevPos, Rev},
         rev_path := RevPath,
         branch_count := BranchCount,
-        att_hash := AttHash
+        att_hash := AttHash,
+        rev_size := RevSize
     } = RevId,
     VS = new_versionstamp(Tx),
     Key = {?DB_REVS, DocId, not Deleted, RevPos, Rev},
@@ -1132,7 +1175,8 @@ revinfo_to_fdb(Tx, DbPrefix, DocId, #{winner := true} = RevId) ->
         VS,
         BranchCount,
         list_to_tuple(RevPath),
-        AttHash
+        AttHash,
+        RevSize
     },
     KBin = erlfdb_tuple:pack(Key, DbPrefix),
     VBin = erlfdb_tuple:pack_vs(Val),
@@ -1143,39 +1187,44 @@ revinfo_to_fdb(_Tx, DbPrefix, DocId, #{} = RevId) ->
         deleted := Deleted,
         rev_id := {RevPos, Rev},
         rev_path := RevPath,
-        att_hash := AttHash
+        att_hash := AttHash,
+        rev_size := RevSize
     } = RevId,
     Key = {?DB_REVS, DocId, not Deleted, RevPos, Rev},
-    Val = {?CURR_REV_FORMAT, list_to_tuple(RevPath), AttHash},
+    Val = {?CURR_REV_FORMAT, list_to_tuple(RevPath), AttHash, RevSize},
     KBin = erlfdb_tuple:pack(Key, DbPrefix),
     VBin = erlfdb_tuple:pack(Val),
     {KBin, VBin, undefined}.
 
 
-fdb_to_revinfo(Key, {?CURR_REV_FORMAT, _, _, _, _} = Val) ->
+fdb_to_revinfo(Key, {?CURR_REV_FORMAT, _, _, _, _, _} = Val) ->
     {?DB_REVS, _DocId, NotDeleted, RevPos, Rev} = Key,
-    {_RevFormat, Sequence, BranchCount, RevPath, AttHash} = Val,
+    {_RevFormat, Sequence, BranchCount, RevPath, AttHash, RevSize} = Val,
     #{
         winner => true,
+        exists => true,
         deleted => not NotDeleted,
         rev_id => {RevPos, Rev},
         rev_path => tuple_to_list(RevPath),
         sequence => Sequence,
         branch_count => BranchCount,
-        att_hash => AttHash
+        att_hash => AttHash,
+        rev_size => RevSize
     };
 
-fdb_to_revinfo(Key, {?CURR_REV_FORMAT, _, _} = Val)  ->
+fdb_to_revinfo(Key, {?CURR_REV_FORMAT, _, _, _} = Val)  ->
     {?DB_REVS, _DocId, NotDeleted, RevPos, Rev} = Key,
-    {_RevFormat, RevPath, AttHash} = Val,
+    {_RevFormat, RevPath, AttHash, RevSize} = Val,
     #{
         winner => false,
+        exists => true,
         deleted => not NotDeleted,
         rev_id => {RevPos, Rev},
         rev_path => tuple_to_list(RevPath),
         sequence => undefined,
         branch_count => undefined,
-        att_hash => AttHash
+        att_hash => AttHash,
+        rev_size => RevSize
     };
 
 fdb_to_revinfo(Key, {0, Seq, BCount, RPath}) ->
@@ -1184,6 +1233,14 @@ fdb_to_revinfo(Key, {0, Seq, BCount, RPath}) ->
 
 fdb_to_revinfo(Key, {0, RPath}) ->
-    Val = {?CURR_REV_FORMAT, RPath, <<>>},
+    Val = {1, RPath, <<>>}, % upgrade to format 1; that clause adds the size
+    fdb_to_revinfo(Key, Val);
+
+fdb_to_revinfo(Key, {1, Seq, BCount, RPath, AttHash}) ->
+    Val = {?CURR_REV_FORMAT, Seq, BCount, RPath, AttHash, 0},
+    fdb_to_revinfo(Key, Val);
+
+fdb_to_revinfo(Key, {1, RPath, AttHash}) ->
+    Val = {?CURR_REV_FORMAT, RPath, AttHash, 0},
     fdb_to_revinfo(Key, Val).
 
 
@@ -1203,11 +1260,13 @@ doc_to_fdb(Db, #doc{} = Doc) ->
     DiskAtts = lists:map(fun couch_att:to_disk_term/1, Atts),
 
     Value = term_to_binary({Body, DiskAtts, Deleted}, [{minor_version, 1}]),
+    Chunks = chunkify_binary(Value),
 
     {Rows, _} = lists:mapfoldl(fun(Chunk, ChunkId) ->
         Key = erlfdb_tuple:pack({?DB_DOCS, Id, Start, Rev, ChunkId}, DbPrefix),
         {{Key, Chunk}, ChunkId + 1}
-    end, 0, chunkify_binary(Value)),
+    end, 0, Chunks),
+
     Rows.
 
 
@@ -1258,8 +1317,17 @@ local_doc_to_fdb(Db, #doc{} = Doc) ->
         {{K, Chunk}, ChunkId + 1}
     end, 0, chunkify_binary(BVal)),
 
-    {Key, StoreRev, Rows}.
+    NewSize = fabric2_util:ldoc_size(Doc),
+    RawValue = erlfdb_tuple:pack({?CURR_LDOC_FORMAT, StoreRev, NewSize}),
+
+    % Prefix our tuple encoding to make upgrades easier
+    Value = <<255, RawValue/binary>>,
 
+    {Key, Value, NewSize, Rows}.
+
+
+fdb_to_local_doc(_Db, _DocId, not_found, []) ->
+    {not_found, missing};
 
 fdb_to_local_doc(_Db, DocId, <<131, _/binary>> = Val, []) ->
     % This is an upgrade clause for the old encoding. We allow reading the old
@@ -1272,18 +1340,48 @@ fdb_to_local_doc(_Db, DocId, <<131, _/binary>> = Val, []) ->
         body = Body
     };
 
-fdb_to_local_doc(_Db, _DocId, not_found, []) ->
-    {not_found, missing};
+fdb_to_local_doc(_Db, DocId, <<255, RevBin/binary>>, Rows) when is_list(Rows) ->
+    Rev = case erlfdb_tuple:unpack(RevBin) of
+        {?CURR_LDOC_FORMAT, Rev0, _Size} -> Rev0
+    end,
 
-fdb_to_local_doc(_Db, DocId, Rev, Rows) when is_list(Rows), is_binary(Rev) ->
     BodyBin = iolist_to_binary(Rows),
     Body = binary_to_term(BodyBin, [safe]),
+
     #doc{
         id = DocId,
         revs = {0, [Rev]},
         deleted = false,
         body = Body
-    }.
+    };
+
+fdb_to_local_doc(Db, DocId, RawRev, Rows) ->
+    BaseRev = erlfdb_tuple:pack({?CURR_LDOC_FORMAT, RawRev, 0}),
+    Rev = <<255, BaseRev/binary>>,
+    fdb_to_local_doc(Db, DocId, Rev, Rows).
+
+
+% Sums the rev_size of every rev info whose exists flag is false; rev
+% infos whose exists flag is true are skipped and add nothing.
+sum_add_rev_sizes(RevInfos) ->
+    AddIfNew = fun(RevInfo, Total) ->
+        #{exists := Exists, rev_size := RevSize} = RevInfo,
+        case Exists of
+            false -> RevSize + Total;
+            true -> Total
+        end
+    end,
+    lists:foldl(AddIfNew, 0, RevInfos).
+
+
+% Sums the rev_size of every rev info in RevInfos. Each entry is asserted
+% to have exists => true; any other entry fails the match with badmatch.
+sum_rem_rev_sizes(RevInfos) ->
+    AddSize = fun(RevInfo, Total) ->
+        #{exists := true, rev_size := RevSize} = RevInfo,
+        RevSize + Total
+    end,
+    lists:foldl(AddSize, 0, RevInfos).
 
 
 chunkify_binary(Data) ->
diff --git a/src/fabric/src/fabric2_util.erl b/src/fabric/src/fabric2_util.erl
index 2a94065..a4faf39 100644
--- a/src/fabric/src/fabric2_util.erl
+++ b/src/fabric/src/fabric2_util.erl
@@ -17,6 +17,8 @@
     revinfo_to_revs/1,
     revinfo_to_path/1,
     sort_revinfos/1,
+    rev_size/1,
+    ldoc_size/1,
 
     seq_zero_vs/0,
     seq_max_vs/0,
@@ -80,6 +82,56 @@ rev_sort_key(#{} = RevInfo) ->
     {not Deleted, RevPos, Rev}.
 
 
+% External size of a document revision: the doc id, the packed revision
+% position, the leading revision id, one byte for the deleted flag, the
+% encoded JSON size of the body and the external size of each attachment.
+rev_size(#doc{id = Id, revs = Revs, body = Body, atts = Atts}) ->
+    % Revs of {0, []} are measured as position 0 with an empty rev id
+    {Pos, RevId} = case Revs of
+        {0, []} -> {0, <<>>};
+        {N, [First | _]} -> {N, First}
+    end,
+
+    IdSize = size(Id),
+    PosSize = size(erlfdb_tuple:pack({Pos})),
+    RevIdSize = size(RevId),
+
+    % FDB tuple encoding of booleans for deleted flag is 1 byte
+    DeletedSize = 1,
+
+    BodySize = couch_ejson_size:encoded_size(Body),
+    AttsSize = lists:foldl(fun(Att, Sum) ->
+        Sum + couch_att:external_size(Att)
+    end, 0, Atts),
+
+    IdSize + PosSize + RevIdSize + DeletedSize + BodySize + AttsSize.
+
+
+% External size of a local doc: the byte sizes of its id and revision
+% plus the encoded JSON size of its body, or 0 when the doc is deleted.
+% Only docs whose id starts with "_local/" and whose revs are {0, [Rev]}
+% are accepted; an integer Rev is measured in its decimal binary form.
+ldoc_size(#doc{id = <<"_local/", _/binary>>} = Doc) ->
+    #doc{id = Id, revs = {0, [Rev]}, deleted = Deleted, body = Body} = Doc,
+
+    % Normalized before the deleted check, so a revision that is neither
+    % an integer nor a binary fails here even when the doc is deleted.
+    RevBin = case Rev of
+        Int when is_integer(Int) -> integer_to_binary(Int);
+        Bin when is_binary(Bin) -> Bin
+    end,
+
+    case Deleted of
+        false ->
+            IdSize = size(Id),
+            RevSize = size(RevBin),
+            BodySize = couch_ejson_size:encoded_size(Body),
+            IdSize + RevSize + BodySize;
+        true ->
+            0
+    end.
+
+
 seq_zero_vs() ->
     {versionstamp, 0, 0, 0}.
 
diff --git a/src/fabric/test/fabric2_doc_crud_tests.erl b/src/fabric/test/fabric2_doc_crud_tests.erl
index 184eb4a..46cd4fc 100644
--- a/src/fabric/test/fabric2_doc_crud_tests.erl
+++ b/src/fabric/test/fabric2_doc_crud_tests.erl
@@ -884,11 +884,12 @@ local_doc_with_previous_encoding({Db, _}) ->
     ?assertEqual(NewBody, Doc3#doc.body),
 
     % Old doc now has only the rev number in it
-    OldDocBin = fabric2_fdb:transactional(Db, fun(TxDb) ->
+    <<255, OldDocBin/binary>> = fabric2_fdb:transactional(Db, fun(TxDb) ->
         #{tx := Tx} = TxDb,
         erlfdb:wait(erlfdb:get(Tx, Key))
     end),
-    ?assertEqual(<<"2">> , OldDocBin).
+    Unpacked = erlfdb_tuple:unpack(OldDocBin),
+    ?assertMatch({?CURR_LDOC_FORMAT, <<"2">>, _}, Unpacked).
 
 
 before_doc_update_skips_local_docs({Db0, _}) ->