You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by da...@apache.org on 2019/12/06 20:40:16 UTC
[couchdb] branch prototype/fdb-layer-get-dbs-info updated: Delete
attachments when possible
This is an automated email from the ASF dual-hosted git repository.
davisp pushed a commit to branch prototype/fdb-layer-get-dbs-info
in repository https://gitbox.apache.org/repos/asf/couchdb.git
The following commit(s) were added to refs/heads/prototype/fdb-layer-get-dbs-info by this push:
new 3e6c529 Delete attachments when possible
3e6c529 is described below
commit 3e6c529f8434203cc6fcbdd113e5d2f11edee21e
Author: Paul J. Davis <pa...@gmail.com>
AuthorDate: Fri Dec 6 14:40:55 2019 -0600
Delete attachments when possible
---
src/fabric/include/fabric2.hrl | 4 +-
src/fabric/src/fabric2_db.erl | 12 +++--
src/fabric/src/fabric2_fdb.erl | 115 ++++++++++++++++++++++++++++++++++------
src/fabric/src/fabric2_util.erl | 13 +++++
4 files changed, 124 insertions(+), 20 deletions(-)
diff --git a/src/fabric/include/fabric2.hrl b/src/fabric/include/fabric2.hrl
index fc07f33..78b59c3 100644
--- a/src/fabric/include/fabric2.hrl
+++ b/src/fabric/include/fabric2.hrl
@@ -37,14 +37,16 @@
-define(DB_ATTS, 23).
-define(DB_VIEWS, 24).
-define(DB_LOCAL_DOC_BODIES, 25).
+-define(DB_ATT_NAMES, 26).
% Versions
% 0 - Initial implementation
% 1 - Added size information
+% 2 - Added attachment hash
--define(CURR_REV_FORMAT, 1).
+-define(CURR_REV_FORMAT, 2).
% 0 - Adding local doc versions
diff --git a/src/fabric/src/fabric2_db.erl b/src/fabric/src/fabric2_db.erl
index 1bc53ca..c57d33d 100644
--- a/src/fabric/src/fabric2_db.erl
+++ b/src/fabric/src/fabric2_db.erl
@@ -1411,7 +1411,8 @@ update_doc_interactive(Db, Doc0, Future, _Options) ->
#doc{
deleted = NewDeleted,
- revs = {NewRevPos, [NewRev | NewRevPath]}
+ revs = {NewRevPos, [NewRev | NewRevPath]},
+ atts = Atts
} = Doc4 = stem_revisions(Db, Doc3),
NewRevInfo = #{
@@ -1421,7 +1422,8 @@ update_doc_interactive(Db, Doc0, Future, _Options) ->
rev_path => NewRevPath,
sequence => undefined,
branch_count => undefined,
- rev_size => null
+ rev_size => null,
+ att_hash => fabric2_util:hash_atts(Atts)
},
% Gather the list of possible winnig revisions
@@ -1467,7 +1469,8 @@ update_doc_replicated(Db, Doc0, _Options) ->
#doc{
id = DocId,
deleted = Deleted,
- revs = {RevPos, [Rev | RevPath]}
+ revs = {RevPos, [Rev | RevPath]},
+ atts = Atts
} = Doc0,
DocRevInfo0 = #{
@@ -1477,7 +1480,8 @@ update_doc_replicated(Db, Doc0, _Options) ->
rev_path => RevPath,
sequence => undefined,
branch_count => undefined,
- rev_size => null
+ rev_size => null,
+ att_hash => fabric2_util:hash_atts(Atts)
},
AllRevInfos = fabric2_fdb:get_all_revs(Db, DocId),
diff --git a/src/fabric/src/fabric2_fdb.erl b/src/fabric/src/fabric2_fdb.erl
index a91f82a..8c8baf2 100644
--- a/src/fabric/src/fabric2_fdb.erl
+++ b/src/fabric/src/fabric2_fdb.erl
@@ -623,7 +623,8 @@ write_doc(#{} = Db0, Doc, NewWinner0, OldWinner, ToUpdate, ToRemove) ->
#doc{
id = DocId,
- deleted = Deleted
+ deleted = Deleted,
+ atts = Atts
} = Doc,
% Doc body
@@ -654,6 +655,31 @@ write_doc(#{} = Db0, Doc, NewWinner0, OldWinner, ToUpdate, ToRemove) ->
ok = clear_doc_body(Db, DocId, RI0)
end, ToRemove),
+ % Attachment bookkeeping
+
+ % If a document's attachments have changed we have to scan
+ % for any attachments that may need to be deleted. The check
+ % for `>= 2` is a bit subtle. The important point is that
+ % one of the revisions will be from the new document so we
+ % have to find at least one more beyond that to assert that
+ % the attachments have not changed.
+ %
+ % This has to happen after we update the revision tree above
+ % or else we won't delete attachments until the second update
+ % after they have been removed.
+ AttHash = fabric2_util:hash_atts(Atts),
+ RevsToCheck = [NewWinner0] ++ ToUpdate ++ ToRemove,
+ AttHashCount = lists:foldl(fun(Att, Count) ->
+ #{att_hash := RevAttHash} = Att,
+ case RevAttHash == AttHash of
+ true -> Count + 1;
+ false -> Count
+ end
+ end, 0, RevsToCheck),
+ if AttHashCount >= 2 -> ok; true ->
+ scan_attachments(Db, DocId, Doc)
+ end,
+
% _all_docs
UpdateStatus = case {OldWinner, NewWinner} of
@@ -823,6 +849,9 @@ write_attachment(#{} = Db, DocId, Data) when is_binary(Data) ->
AttId = fabric2_util:uuid(),
Chunks = chunkify_binary(Data),
+ IdKey = erlfdb_tuple:pack({?DB_AT_NAMES, DocId, AttId}, DbPrefix),
+ ok = erlfdb:set(Tx, IdKey, <<>>),
+
lists:foldl(fun(Chunk, ChunkId) ->
AttKey = erlfdb_tuple:pack({?DB_ATTS, DocId, AttId, ChunkId}, DbPrefix),
ok = erlfdb:set(Tx, AttKey, Chunk),
@@ -1084,17 +1113,62 @@ clear_doc_body(#{} = Db, DocId, #{} = RevInfo) ->
ok = erlfdb:clear_range(Tx, StartKey, EndKey).
+% Delete attachment data stored for DocId that no revision references.
+%
+% The set of referenced attachment ids is gathered from NewDoc plus every
+% revision returned by fabric2_db:open_doc_revs/4. The set of stored ids
+% is read from the ?DB_ATT_NAMES keys for DocId. For each stored id that
+% is not referenced, both its ?DB_ATT_NAMES key and its ?DB_ATTS chunk
+% range are cleared in the current transaction.
+scan_attachments(Db, DocId, NewDoc) ->
+    % NOTE(review): assumes the db handle exposes its key prefix under
+    % `db_prefix` -- confirm against the other accessors in this module.
+    #{
+        tx := Tx,
+        db_prefix := DbPrefix
+    } = Db,
+
+    % Gather all known document revisions
+    {ok, DiskDocs} = fabric2_db:open_doc_revs(Db, DocId, all, []),
+    AllDocs = [{ok, NewDoc} | DiskDocs],
+
+    % Get referenced attachment ids
+    ActiveIdSet = lists:foldl(fun({ok, Doc}, Acc) ->
+        lists:foldl(fun(Att, InnerAcc) ->
+            {loc, _Db, _DocId, AttId} = couch_att:fetch(data, Att),
+            sets:add_element(AttId, InnerAcc)
+        end, Acc, Doc#doc.atts)
+    end, sets:new(), AllDocs),
+
+    % Get the attachment ids currently recorded for this document
+    AttPrefix = erlfdb_tuple:pack({?DB_ATT_NAMES, DocId}, DbPrefix),
+    Options = [{streaming_mode, want_all}],
+    Future = erlfdb:get_range_startswith(Tx, AttPrefix, Options),
+
+    ExistingIdSet = lists:foldl(fun({K, _}, Acc) ->
+        {?DB_ATT_NAMES, DocId, AttId} = erlfdb_tuple:unpack(K, DbPrefix),
+        sets:add_element(AttId, Acc)
+    end, sets:new(), erlfdb:wait(Future)),
+
+    % Anything recorded but no longer referenced can be removed
+    ToRemove = sets:subtract(ExistingIdSet, ActiveIdSet),
+
+    lists:foreach(fun(AttId) ->
+        IdKey = erlfdb_tuple:pack({?DB_ATT_NAMES, DocId, AttId}, DbPrefix),
+        erlfdb:clear(Tx, IdKey),
+
+        ChunkKey = erlfdb_tuple:pack({?DB_ATTS, DocId, AttId}, DbPrefix),
+        erlfdb:clear_range_startswith(Tx, ChunkKey)
+    end, sets:to_list(ToRemove)).
+
+
revinfo_to_fdb(Tx, DbPrefix, DocId, #{winner := true} = RevId) ->
#{
deleted := Deleted,
rev_id := {RevPos, Rev},
rev_path := RevPath,
branch_count := BranchCount,
- rev_size := RevSize
+ rev_size := RevSize,
+ att_hash := AttHash
} = RevId,
VS = new_versionstamp(Tx),
Key = {?DB_REVS, DocId, not Deleted, RevPos, Rev},
- Val = {?CURR_REV_FORMAT, VS, BranchCount, list_to_tuple(RevPath), RevSize},
+ Val = {
+ ?CURR_REV_FORMAT,
+ VS,
+ BranchCount,
+ list_to_tuple(RevPath),
+ RevSize,
+ AttHash
+ },
KBin = erlfdb_tuple:pack(Key, DbPrefix),
VBin = erlfdb_tuple:pack_vs(Val),
{KBin, VBin, VS};
@@ -1104,18 +1178,19 @@ revinfo_to_fdb(_Tx, DbPrefix, DocId, #{} = RevId) ->
deleted := Deleted,
rev_id := {RevPos, Rev},
rev_path := RevPath,
- rev_size := RevSize
+ rev_size := RevSize,
+ att_hash := AttHash
} = RevId,
Key = {?DB_REVS, DocId, not Deleted, RevPos, Rev},
- Val = {?CURR_REV_FORMAT, list_to_tuple(RevPath), RevSize},
+ Val = {?CURR_REV_FORMAT, list_to_tuple(RevPath), RevSize, AttHash},
KBin = erlfdb_tuple:pack(Key, DbPrefix),
VBin = erlfdb_tuple:pack(Val),
{KBin, VBin, undefined}.
-fdb_to_revinfo(Key, {?CURR_REV_FORMAT, _, _, _, _} = Val) ->
+fdb_to_revinfo(Key, {?CURR_REV_FORMAT, _, _, _, _, _} = Val) ->
{?DB_REVS, _DocId, NotDeleted, RevPos, Rev} = Key,
- {_RevFormat, Sequence, BranchCount, RevPath, RevSize} = Val,
+ {_RevFormat, Sequence, BranchCount, RevPath, RevSize, AttHash} = Val,
#{
winner => true,
deleted => not NotDeleted,
@@ -1123,12 +1198,13 @@ fdb_to_revinfo(Key, {?CURR_REV_FORMAT, _, _, _, _} = Val) ->
rev_path => tuple_to_list(RevPath),
sequence => Sequence,
branch_count => BranchCount,
- rev_size => RevSize
+ rev_size => RevSize,
+ att_hash => AttHash
};
-fdb_to_revinfo(Key, {?CURR_REV_FORMAT, _, _} = Val) ->
+fdb_to_revinfo(Key, {?CURR_REV_FORMAT, _, _, _} = Val) ->
{?DB_REVS, _DocId, NotDeleted, RevPos, Rev} = Key,
- {_RevFormat, RevPath, RevSize} = Val,
+ {_RevFormat, RevPath, RevSize, AttHash} = Val,
#{
winner => false,
deleted => not NotDeleted,
@@ -1136,15 +1212,24 @@ fdb_to_revinfo(Key, {?CURR_REV_FORMAT, _, _} = Val) ->
rev_path => tuple_to_list(RevPath),
sequence => undefined,
branch_count => undefined,
- rev_size => RevSize
+ rev_size => RevSize,
+ att_hash => AttHash
};
-fdb_to_revinfo(Key, {0, S, B, R}) ->
- Val = {?CURR_REV_FORMAT, S, B, R, null},
+fdb_to_revinfo(Key, {0, Seq, BCount, RPath}) ->
+ Val = {?CURR_REV_FORMAT, Seq, BCount, RPath, null, <<>>},
+ fdb_to_revinfo(Key, Val);
+
+fdb_to_revinfo(Key, {0, RPath}) ->
+ Val = {?CURR_REV_FORMAT, RPath, null, <<>>},
+ fdb_to_revinfo(Key, Val);
+
+% Upgrade a format-1 winner value, which carries a size but no attachment
+% hash, to the current layout by appending an empty hash and re-dispatching.
+fdb_to_revinfo(Key, {1, Seq, BCount, RPath, Size}) ->
+    Val = {?CURR_REV_FORMAT, Seq, BCount, RPath, Size, <<>>},
     fdb_to_revinfo(Key, Val);
-fdb_to_revinfo(Key, {0, R}) ->
- Val = {?CURR_REV_FORMAT, R, null},
+fdb_to_revinfo(Key, {1, RPath, Size}) ->
+ Val = {?CURR_REV_FORMAT, RPath, Size, <<>>},
fdb_to_revinfo(Key, Val).
diff --git a/src/fabric/src/fabric2_util.erl b/src/fabric/src/fabric2_util.erl
index 2b8e49e..297eaaf 100644
--- a/src/fabric/src/fabric2_util.erl
+++ b/src/fabric/src/fabric2_util.erl
@@ -25,6 +25,8 @@
validate_security_object/1,
+ hash_atts/1,
+
dbname_ends_with/2,
get_value/2,
@@ -124,6 +126,17 @@ validate_json_list_of_strings(Member, Props) ->
end.
+% Compute a hash identifying a document's list of attachments.
+%
+% Returns <<>> for an empty list. Otherwise the attachment id taken from
+% each attachment's {loc, _, _, AttId} data location is fed, in list
+% order, through couch_hash's md5_hash_* functions and the final digest
+% is returned.
+hash_atts([]) ->
+    <<>>;
+
+hash_atts(Atts) ->
+    Md5St = lists:foldl(fun(Att, Acc) ->
+        {loc, _Db, _DocId, AttId} = couch_att:fetch(data, Att),
+        couch_hash:md5_hash_update(Acc, AttId)
+    end, couch_hash:md5_hash_init(), Atts),
+    couch_hash:md5_hash_final(Md5St).
+
+
dbname_ends_with(#{} = Db, Suffix) ->
dbname_ends_with(fabric2_db:name(Db), Suffix);