You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by da...@apache.org on 2019/12/06 20:40:16 UTC

[couchdb] branch prototype/fdb-layer-get-dbs-info updated: Delete attachments when possible

This is an automated email from the ASF dual-hosted git repository.

davisp pushed a commit to branch prototype/fdb-layer-get-dbs-info
in repository https://gitbox.apache.org/repos/asf/couchdb.git


The following commit(s) were added to refs/heads/prototype/fdb-layer-get-dbs-info by this push:
     new 3e6c529  Delete attachments when possible
3e6c529 is described below

commit 3e6c529f8434203cc6fcbdd113e5d2f11edee21e
Author: Paul J. Davis <pa...@gmail.com>
AuthorDate: Fri Dec 6 14:40:55 2019 -0600

    Delete attachments when possible
---
 src/fabric/include/fabric2.hrl  |   4 +-
 src/fabric/src/fabric2_db.erl   |  12 +++--
 src/fabric/src/fabric2_fdb.erl  | 115 ++++++++++++++++++++++++++++++++++------
 src/fabric/src/fabric2_util.erl |  13 +++++
 4 files changed, 124 insertions(+), 20 deletions(-)

diff --git a/src/fabric/include/fabric2.hrl b/src/fabric/include/fabric2.hrl
index fc07f33..78b59c3 100644
--- a/src/fabric/include/fabric2.hrl
+++ b/src/fabric/include/fabric2.hrl
@@ -37,14 +37,16 @@
 -define(DB_ATTS, 23).
 -define(DB_VIEWS, 24).
 -define(DB_LOCAL_DOC_BODIES, 25).
+-define(DB_ATT_NAMES, 26).
 
 
 % Versions
 
 % 0 - Initial implementation
 % 1 - Added size information
+% 2 - Added attachment hash
 
--define(CURR_REV_FORMAT, 1).
+-define(CURR_REV_FORMAT, 2).
 
 % 0 - Adding local doc versions
 
diff --git a/src/fabric/src/fabric2_db.erl b/src/fabric/src/fabric2_db.erl
index 1bc53ca..c57d33d 100644
--- a/src/fabric/src/fabric2_db.erl
+++ b/src/fabric/src/fabric2_db.erl
@@ -1411,7 +1411,8 @@ update_doc_interactive(Db, Doc0, Future, _Options) ->
 
     #doc{
         deleted = NewDeleted,
-        revs = {NewRevPos, [NewRev | NewRevPath]}
+        revs = {NewRevPos, [NewRev | NewRevPath]},
+        atts = Atts
     } = Doc4 = stem_revisions(Db, Doc3),
 
     NewRevInfo = #{
@@ -1421,7 +1422,8 @@ update_doc_interactive(Db, Doc0, Future, _Options) ->
         rev_path => NewRevPath,
         sequence => undefined,
         branch_count => undefined,
-        rev_size => null
+        rev_size => null,
+        att_hash => fabric2_util:hash_atts(Atts)
     },
 
     % Gather the list of possible winnig revisions
@@ -1467,7 +1469,8 @@ update_doc_replicated(Db, Doc0, _Options) ->
     #doc{
         id = DocId,
         deleted = Deleted,
-        revs = {RevPos, [Rev | RevPath]}
+        revs = {RevPos, [Rev | RevPath]},
+        atts = Atts
     } = Doc0,
 
     DocRevInfo0 = #{
@@ -1477,7 +1480,8 @@ update_doc_replicated(Db, Doc0, _Options) ->
         rev_path => RevPath,
         sequence => undefined,
         branch_count => undefined,
-        rev_size => null
+        rev_size => null,
+        att_hash => fabric2_util:hash_atts(Atts)
     },
 
     AllRevInfos = fabric2_fdb:get_all_revs(Db, DocId),
diff --git a/src/fabric/src/fabric2_fdb.erl b/src/fabric/src/fabric2_fdb.erl
index a91f82a..8c8baf2 100644
--- a/src/fabric/src/fabric2_fdb.erl
+++ b/src/fabric/src/fabric2_fdb.erl
@@ -623,7 +623,8 @@ write_doc(#{} = Db0, Doc, NewWinner0, OldWinner, ToUpdate, ToRemove) ->
 
     #doc{
         id = DocId,
-        deleted = Deleted
+        deleted = Deleted,
+        atts = Atts
     } = Doc,
 
     % Doc body
@@ -654,6 +655,31 @@ write_doc(#{} = Db0, Doc, NewWinner0, OldWinner, ToUpdate, ToRemove) ->
         ok = clear_doc_body(Db, DocId, RI0)
     end, ToRemove),
 
+    % Attachment bookkeeping
+
+    % If a document's attachments have changed we have to scan
+    % for any attachments that may need to be deleted. The check
+    % for `>= 2` is a bit subtle. The important point is that
+    % one of the revisions will be from the new document so we
+    % have to find at least one more beyond that to assert that
+    % the attachments have not changed.
+    %
+    % This has to happen after we update the revision tree above
+    % or else we won't delete attachments until the second update
+    % after they have been removed.
+    AttHash = fabric2_util:hash_atts(Atts),
+    RevsToCheck = [NewWinner0] ++ ToUpdate ++ ToRemove,
+    AttHashCount = lists:foldl(fun(Att, Count) ->
+        #{att_hash := RevAttHash} = Att,
+        case RevAttHash == AttHash of
+            true -> Count + 1;
+            false -> Count
+        end
+    end, 0, RevsToCheck),
+    if AttHashCount >= 2 -> ok; true ->
+        scan_attachments(Db, DocId, Doc)
+    end,
+
     % _all_docs
 
     UpdateStatus = case {OldWinner, NewWinner} of
@@ -823,6 +849,9 @@ write_attachment(#{} = Db, DocId, Data) when is_binary(Data) ->
     AttId = fabric2_util:uuid(),
     Chunks = chunkify_binary(Data),
 
+    IdKey = erlfdb_tuple:pack({?DB_ATT_NAMES, DocId, AttId}, DbPrefix),
+    ok = erlfdb:set(Tx, IdKey, <<>>),
+
     lists:foldl(fun(Chunk, ChunkId) ->
         AttKey = erlfdb_tuple:pack({?DB_ATTS, DocId, AttId, ChunkId}, DbPrefix),
         ok = erlfdb:set(Tx, AttKey, Chunk),
@@ -1084,17 +1113,62 @@ clear_doc_body(#{} = Db, DocId, #{} = RevInfo) ->
     ok = erlfdb:clear_range(Tx, StartKey, EndKey).
 
 
+% Delete stored attachments of DocId that are no longer referenced by any
+% revision, counting both the revisions read back from the db and NewDoc.
+scan_attachments(Db, DocId, NewDoc) ->
+    #{tx := Tx, db_prefix := DbPrefix} = Db,
+
+    % Gather all known document revisions
+    {ok, DiskDocs} = fabric2_db:open_doc_revs(Db, DocId, all, []),
+    AllDocs = [{ok, NewDoc} | DiskDocs],
+
+    % Get referenced attachment ids
+    ActiveIdSet = lists:foldl(fun({ok, Doc}, Acc) ->
+        lists:foldl(fun(Att, InnerAcc) ->
+            {loc, _Db, _DocId, AttId} = couch_att:fetch(data, Att),
+            sets:add_element(AttId, InnerAcc)
+        end, Acc, Doc#doc.atts)
+    end, sets:new(), AllDocs),
+
+    % Attachment ids recorded under ?DB_ATT_NAMES for this document
+    AttPrefix = erlfdb_tuple:pack({?DB_ATT_NAMES, DocId}, DbPrefix),
+    Options = [{streaming_mode, want_all}],
+    Future = erlfdb:get_range_startswith(Tx, AttPrefix, Options),
+    ExistingIdSet = lists:foldl(fun({K, _}, Acc) ->
+        {?DB_ATT_NAMES, DocId, AttId} = erlfdb_tuple:unpack(K, DbPrefix),
+        sets:add_element(AttId, Acc)
+    end, sets:new(), erlfdb:wait(Future)),
+
+    % Recorded but unreferenced ids: clear the name key and the data chunks
+    ToRemove = sets:subtract(ExistingIdSet, ActiveIdSet),
+    lists:foreach(fun(AttId) ->
+        IdKey = erlfdb_tuple:pack({?DB_ATT_NAMES, DocId, AttId}, DbPrefix),
+        erlfdb:clear(Tx, IdKey),
+
+        ChunkKey = erlfdb_tuple:pack({?DB_ATTS, DocId, AttId}, DbPrefix),
+        erlfdb:clear_range_startswith(Tx, ChunkKey)
+    end, sets:to_list(ToRemove)).
+
+
 revinfo_to_fdb(Tx, DbPrefix, DocId, #{winner := true} = RevId) ->
     #{
         deleted := Deleted,
         rev_id := {RevPos, Rev},
         rev_path := RevPath,
         branch_count := BranchCount,
-        rev_size := RevSize
+        rev_size := RevSize,
+        att_hash := AttHash
     } = RevId,
     VS = new_versionstamp(Tx),
     Key = {?DB_REVS, DocId, not Deleted, RevPos, Rev},
-    Val = {?CURR_REV_FORMAT, VS, BranchCount, list_to_tuple(RevPath), RevSize},
+    Val = {
+        ?CURR_REV_FORMAT,
+        VS,
+        BranchCount,
+        list_to_tuple(RevPath),
+        RevSize,
+        AttHash
+    },
     KBin = erlfdb_tuple:pack(Key, DbPrefix),
     VBin = erlfdb_tuple:pack_vs(Val),
     {KBin, VBin, VS};
@@ -1104,18 +1178,19 @@ revinfo_to_fdb(_Tx, DbPrefix, DocId, #{} = RevId) ->
         deleted := Deleted,
         rev_id := {RevPos, Rev},
         rev_path := RevPath,
-        rev_size := RevSize
+        rev_size := RevSize,
+        att_hash := AttHash
     } = RevId,
     Key = {?DB_REVS, DocId, not Deleted, RevPos, Rev},
-    Val = {?CURR_REV_FORMAT, list_to_tuple(RevPath), RevSize},
+    Val = {?CURR_REV_FORMAT, list_to_tuple(RevPath), RevSize, AttHash},
     KBin = erlfdb_tuple:pack(Key, DbPrefix),
     VBin = erlfdb_tuple:pack(Val),
     {KBin, VBin, undefined}.
 
 
-fdb_to_revinfo(Key, {?CURR_REV_FORMAT, _, _, _, _} = Val) ->
+fdb_to_revinfo(Key, {?CURR_REV_FORMAT, _, _, _, _, _} = Val) ->
     {?DB_REVS, _DocId, NotDeleted, RevPos, Rev} = Key,
-    {_RevFormat, Sequence, BranchCount, RevPath, RevSize} = Val,
+    {_RevFormat, Sequence, BranchCount, RevPath, RevSize, AttHash} = Val,
     #{
         winner => true,
         deleted => not NotDeleted,
@@ -1123,12 +1198,13 @@ fdb_to_revinfo(Key, {?CURR_REV_FORMAT, _, _, _, _} = Val) ->
         rev_path => tuple_to_list(RevPath),
         sequence => Sequence,
         branch_count => BranchCount,
-        rev_size => RevSize
+        rev_size => RevSize,
+        att_hash => AttHash
     };
 
-fdb_to_revinfo(Key, {?CURR_REV_FORMAT, _, _} = Val)  ->
+fdb_to_revinfo(Key, {?CURR_REV_FORMAT, _, _, _} = Val)  ->
     {?DB_REVS, _DocId, NotDeleted, RevPos, Rev} = Key,
-    {_RevFormat, RevPath, RevSize} = Val,
+    {_RevFormat, RevPath, RevSize, AttHash} = Val,
     #{
         winner => false,
         deleted => not NotDeleted,
@@ -1136,15 +1212,24 @@ fdb_to_revinfo(Key, {?CURR_REV_FORMAT, _, _} = Val)  ->
         rev_path => tuple_to_list(RevPath),
         sequence => undefined,
         branch_count => undefined,
-        rev_size => RevSize
+        rev_size => RevSize,
+        att_hash => AttHash
     };
 
-fdb_to_revinfo(Key, {0, S, B, R}) ->
-    Val = {?CURR_REV_FORMAT, S, B, R, null},
+fdb_to_revinfo(Key, {0, Seq, BCount, RPath}) ->
+    Val = {?CURR_REV_FORMAT, Seq, BCount, RPath, null, <<>>},
+    fdb_to_revinfo(Key, Val);
+
+fdb_to_revinfo(Key, {0, RPath}) ->
+    Val = {?CURR_REV_FORMAT, RPath, null, <<>>},
+    fdb_to_revinfo(Key, Val);
+
+fdb_to_revinfo(Key, {1, Seq, BCount, RPath, Size}) ->
+    Val = {?CURR_REV_FORMAT, Seq, BCount, RPath, Size, <<>>},
     fdb_to_revinfo(Key, Val);
 
-fdb_to_revinfo(Key, {0, R}) ->
-    Val = {?CURR_REV_FORMAT, R, null},
+fdb_to_revinfo(Key, {1, RPath, Size}) ->
+    Val = {?CURR_REV_FORMAT, RPath, Size, <<>>},
     fdb_to_revinfo(Key, Val).
 
 
diff --git a/src/fabric/src/fabric2_util.erl b/src/fabric/src/fabric2_util.erl
index 2b8e49e..297eaaf 100644
--- a/src/fabric/src/fabric2_util.erl
+++ b/src/fabric/src/fabric2_util.erl
@@ -25,6 +25,8 @@
 
     validate_security_object/1,
 
+    hash_atts/1,
+
     dbname_ends_with/2,
 
     get_value/2,
@@ -124,6 +126,17 @@ validate_json_list_of_strings(Member, Props) ->
     end.
 
 
+% Digest of the attachment ids in Atts (couch_hash md5); <<>> when empty.
+hash_atts([]) ->
+    <<>>;
+hash_atts(Atts) ->
+    Md5St = lists:foldl(fun(Att, Acc) ->
+        {loc, _Db, _DocId, AttId} = couch_att:fetch(data, Att),
+        couch_hash:md5_hash_update(Acc, AttId)
+    end, couch_hash:md5_hash_init(), Atts),
+    couch_hash:md5_hash_final(Md5St).
+
+
 dbname_ends_with(#{} = Db, Suffix) ->
     dbname_ends_with(fabric2_db:name(Db), Suffix);