You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by rn...@apache.org on 2014/08/22 17:05:52 UTC

couch commit: updated refs/heads/windsor-merge to 37f2218

Repository: couchdb-couch
Updated Branches:
  refs/heads/windsor-merge 6d3e5b8d7 -> 37f2218b8


Merge CouchDB's idea of data_size

This patches dbcore to include the CouchDB notion of data sizes which is
defined as the size of all live data in a database file. This number is
useful for deciding when to compact databases.

Technically speaking this measure is lacking a bit of information
because it does not currently account for headers, security objects, or
purged document information.

The new sizes are named slightly differently than either the CouchDB
version or the Cloudant version. The new names are:

     file - Total number of bytes in the file
     active - Active bytes in the current MVCC snapshot
     external - Theoretical bytes to store user data uncompressed

BugzId: 27061

N.B. Heavily modified by rnewson for the merge; credit him if it works, blame
davisp if it doesn't. Thanks.


Project: http://git-wip-us.apache.org/repos/asf/couchdb-couch/repo
Commit: http://git-wip-us.apache.org/repos/asf/couchdb-couch/commit/37f2218b
Tree: http://git-wip-us.apache.org/repos/asf/couchdb-couch/tree/37f2218b
Diff: http://git-wip-us.apache.org/repos/asf/couchdb-couch/diff/37f2218b

Branch: refs/heads/windsor-merge
Commit: 37f2218b83d39fd5a88dfc39d2969f0066b32768
Parents: 6d3e5b8
Author: Paul J. Davis <pa...@gmail.com>
Authored: Tue Jan 28 17:43:10 2014 -0600
Committer: Robert Newson <rn...@apache.org>
Committed: Fri Aug 22 16:05:34 2014 +0100

----------------------------------------------------------------------
 include/couch_db.hrl     |  4 +-
 src/couch_att.erl        |  2 +-
 src/couch_db.erl         | 51 ++++++++++++-----------
 src/couch_db_updater.erl | 96 +++++++++++++++++++++++++------------------
 4 files changed, 88 insertions(+), 65 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/couchdb-couch/blob/37f2218b/include/couch_db.hrl
----------------------------------------------------------------------
diff --git a/include/couch_db.hrl b/include/couch_db.hrl
index 8bb44a0..698c96e 100644
--- a/include/couch_db.hrl
+++ b/include/couch_db.hrl
@@ -63,7 +63,7 @@
     update_seq = 0,
     deleted = false,
     rev_tree = [],
-    leafs_size = 0
+    sizes = {0, 0}
 }).
 
 -record(httpd, {
@@ -204,6 +204,6 @@
     deleted,
     ptr,
     seq,
-    size = nil
+    sizes = nil
 }).
 

http://git-wip-us.apache.org/repos/asf/couchdb-couch/blob/37f2218b/src/couch_att.erl
----------------------------------------------------------------------
diff --git a/src/couch_att.erl b/src/couch_att.erl
index 232bb60..128b9d2 100644
--- a/src/couch_att.erl
+++ b/src/couch_att.erl
@@ -287,7 +287,7 @@ disk_info(ActiveFd, Atts) ->
                 [{_, Pos}, AttLen] = fetch([data, att_len], Att),
                 {Pos, AttLen}
             end, Atts),
-            {ok, Tuples, Info};
+            {ok, Tuples, lists:usort(Info)};
         true ->
             ?LOG_ERROR("MISMATCH: ~p ; ~p~n", [ActiveFd, Atts]),
             file_mismatch

http://git-wip-us.apache.org/repos/asf/couchdb-couch/blob/37f2218b/src/couch_db.erl
----------------------------------------------------------------------
diff --git a/src/couch_db.erl b/src/couch_db.erl
index bd250ef..4ed85ac 100644
--- a/src/couch_db.erl
+++ b/src/couch_db.erl
@@ -321,12 +321,12 @@ get_db_info(Db) ->
         name=Name,
         instance_start_time=StartTime,
         committed_update_seq=CommittedUpdateSeq,
-        id_tree = IdBtree,
-        seq_tree = SeqBtree,
-        local_tree = LocalBtree
+        id_tree = IdBtree
     } = Db,
-    {ok, Size} = couch_file:bytes(Fd),
+    {ok, FileSize} = couch_file:bytes(Fd),
     {ok, DbReduction} = couch_btree:full_reduce(IdBtree),
+    {ActiveSize0, ExternalSize} = element(3, DbReduction),
+    ActiveSize = active_size(Db, ActiveSize0),
     DiskVersion = couch_db_header:disk_version(Header),
     Uuid = case get_uuid(Db) of
         undefined -> null;
@@ -343,8 +343,14 @@ get_db_info(Db) ->
         {update_seq, SeqNum},
         {purge_seq, couch_db:get_purge_seq(Db)},
         {compact_running, Compactor/=nil},
-        {disk_size, Size},
-        {data_size, db_data_size(DbReduction, [SeqBtree, IdBtree, LocalBtree])},
+        {disk_size, FileSize}, % legacy
+        {other, {[{data_size, ActiveSize}]}}, % legacy
+        {data_size, ActiveSize}, % legacy
+        {sizes, {[
+            {file, FileSize},
+            {active, ActiveSize},
+            {external, ExternalSize}
+        ]}},
         {instance_start_time, StartTime},
         {disk_format_version, DiskVersion},
         {committed_update_seq, CommittedUpdateSeq},
@@ -353,23 +359,22 @@ get_db_info(Db) ->
         ],
     {ok, InfoList}.
 
-db_data_size({_Count, _DelCount}, _Trees) ->
-    % pre 1.2 format, upgraded on compaction
-    null;
-db_data_size({_Count, _DelCount, nil}, _Trees) ->
-    null;
-db_data_size({_Count, _DelCount, DocAndAttsSize}, Trees) ->
-    sum_tree_sizes(DocAndAttsSize, Trees).
-
-sum_tree_sizes(Acc, []) ->
-    Acc;
-sum_tree_sizes(Acc, [T | Rest]) ->
-    case couch_btree:size(T) of
-    nil ->
-        null;
-    Sz ->
-        sum_tree_sizes(Acc + Sz, Rest)
-    end.
+active_size(#db{}=Db, DocActiveSize) ->
+    Trees = [
+        Db#db.id_tree,
+        Db#db.seq_tree,
+        Db#db.local_tree
+    ],
+    lists:foldl(fun(T, Acc) ->
+        case couch_btree:size(T) of
+            _ when Acc == null ->
+                null;
+            undefined ->
+                null;
+            Size ->
+                Acc + Size
+        end
+    end, DocActiveSize, Trees).
 
 get_design_docs(#db{name = <<"shards/", _:18/binary, DbName/binary>>}) ->
     {_, Ref} = spawn_monitor(fun() -> exit(fabric:design_docs(DbName)) end),

http://git-wip-us.apache.org/repos/asf/couchdb-couch/blob/37f2218b/src/couch_db_updater.erl
----------------------------------------------------------------------
diff --git a/src/couch_db_updater.erl b/src/couch_db_updater.erl
index cd434df..f304769 100644
--- a/src/couch_db_updater.erl
+++ b/src/couch_db_updater.erl
@@ -379,34 +379,42 @@ rev_tree(DiskTree) ->
             {#leaf{deleted=?i2b(IsDeleted), ptr=BodyPointer, seq=UpdateSeq}, nil};
         (_RevId, {IsDeleted, BodyPointer, UpdateSeq}, branch, Acc) ->
             {#leaf{deleted=?i2b(IsDeleted), ptr=BodyPointer, seq=UpdateSeq}, Acc};
-        (_RevId, {IsDeleted, BodyPointer, UpdateSeq, Size}, leaf, Acc) ->
-            Acc2 = sum_leaf_sizes(Acc, Size),
-            {#leaf{deleted=?i2b(IsDeleted), ptr=BodyPointer, seq=UpdateSeq, size=Size}, Acc2};
-        (_RevId, {IsDeleted, BodyPointer, UpdateSeq, Size}, branch, Acc) ->
-            {#leaf{deleted=?i2b(IsDeleted), ptr=BodyPointer, seq=UpdateSeq, size=Size}, Acc};
+        (_RevId, {IsDeleted, BodyPointer, UpdateSeq, Sizes0}, leaf, Acc) ->
+            Sizes = upgrade_sizes(Sizes0),
+            Acc2 = reduce_sizes(Acc, Sizes),
+            {#leaf{deleted=?i2b(IsDeleted), ptr=BodyPointer, seq=UpdateSeq,
+                   sizes=Sizes}, Acc2};
+        (_RevId, {IsDeleted, BodyPointer, UpdateSeq, Sizes}, branch, Acc) ->
+            {#leaf{deleted=?i2b(IsDeleted), ptr=BodyPointer, seq=UpdateSeq,
+                   sizes=upgrade_sizes(Sizes)}, Acc};
         (_RevId, ?REV_MISSING, _Type, Acc) ->
             {?REV_MISSING, Acc}
-    end, 0, DiskTree).
+    end, {0, 0}, DiskTree).
 
 disk_tree(RevTree) ->
     couch_key_tree:map(fun
         (_RevId, ?REV_MISSING) ->
             ?REV_MISSING;
-        (_RevId, #leaf{deleted=IsDeleted, ptr=BodyPointer, seq=UpdateSeq, size=Size}) ->
-            {?b2i(IsDeleted), BodyPointer, UpdateSeq, Size}
+        (_RevId, #leaf{deleted=IsDeleted, ptr=BodyPointer, seq=UpdateSeq, sizes=Sizes}) ->
+            {?b2i(IsDeleted), BodyPointer, UpdateSeq, upgrade_sizes(Sizes)}
     end, RevTree).
 
+upgrade_sizes({_, _}=Sizes) ->
+    Sizes;
+upgrade_sizes(S) when is_integer(S) ->
+    {0, S}.
+
 btree_by_seq_split(#full_doc_info{id=Id, update_seq=Seq, deleted=Del, rev_tree=T}) ->
     {Seq, {Id, ?b2i(Del), disk_tree(T)}}.
 
 btree_by_seq_join(Seq, {Id, Del, DiskTree}) when is_integer(Del) ->
-    {RevTree, LeafsSize} = rev_tree(DiskTree),
+    {RevTree, Sizes} = rev_tree(DiskTree),
     #full_doc_info{
         id = Id,
         update_seq = Seq,
         deleted = ?i2b(Del),
         rev_tree = RevTree,
-        leafs_size = LeafsSize
+        sizes = upgrade_sizes(Sizes)
     };
 btree_by_seq_join(KeySeq, {Id, RevInfos, DeletedRevInfos}) ->
     % Older versions stored #doc_info records in the seq_tree.
@@ -425,44 +433,47 @@ btree_by_id_split(#full_doc_info{id=Id, update_seq=Seq,
     {Id, {Seq, ?b2i(Deleted), disk_tree(Tree)}}.
 
 btree_by_id_join(Id, {HighSeq, Deleted, DiskTree}) ->
-    {Tree, LeafsSize} = rev_tree(DiskTree),
+    {Tree, Sizes} = rev_tree(DiskTree),
     #full_doc_info{
         id = Id,
         update_seq = HighSeq,
         deleted = ?i2b(Deleted),
         rev_tree = Tree,
-        leafs_size = LeafsSize
+        sizes = upgrade_sizes(Sizes)
     }.
 
 btree_by_id_reduce(reduce, FullDocInfos) ->
     lists:foldl(
-        fun(Info, {NotDeleted, Deleted, Size}) ->
-            Size2 = sum_leaf_sizes(Size, Info#full_doc_info.leafs_size),
+        fun(Info, {NotDeleted, Deleted, Sizes}) ->
+            Sizes2 = reduce_sizes(Sizes, Info#full_doc_info.sizes),
             case Info#full_doc_info.deleted of
             true ->
-                {NotDeleted, Deleted + 1, Size2};
+                {NotDeleted, Deleted + 1, Sizes2};
             false ->
-                {NotDeleted + 1, Deleted, Size2}
+                {NotDeleted + 1, Deleted, Sizes2}
             end
         end,
-        {0, 0, 0}, FullDocInfos);
+        {0, 0, {0, 0}}, FullDocInfos);
 btree_by_id_reduce(rereduce, Reds) ->
     lists:foldl(
         fun({NotDeleted, Deleted}, {AccNotDeleted, AccDeleted, _AccSize}) ->
             % pre 1.2 format, will be upgraded on compaction
             {AccNotDeleted + NotDeleted, AccDeleted + Deleted, nil};
-        ({NotDeleted, Deleted, Size}, {AccNotDeleted, AccDeleted, AccSize}) ->
-            AccSize2 = sum_leaf_sizes(AccSize, Size),
-            {AccNotDeleted + NotDeleted, AccDeleted + Deleted, AccSize2}
+        ({NotDeleted, Deleted, Sizes}, {AccNotDeleted, AccDeleted, AccSizes}) ->
+            AccSizes2 = reduce_sizes(AccSizes, Sizes),
+            {AccNotDeleted + NotDeleted, AccDeleted + Deleted, AccSizes2}
         end,
-        {0, 0, 0}, Reds).
+        {0, 0, {0, 0}}, Reds).
 
-sum_leaf_sizes(nil, _) ->
+reduce_sizes(nil, _) ->
     nil;
-sum_leaf_sizes(_, nil) ->
+reduce_sizes(_, nil) ->
     nil;
-sum_leaf_sizes(Size1, Size2) ->
-    Size1 + Size2.
+reduce_sizes({A1, E1}, {A2, E2}) ->
+    {A1 + A2, E1 + E2};
+reduce_sizes(S, {_, _} = Acc) when is_integer(Acc) ->
+    reduce_sizes({0, S}, Acc).
+
 
 btree_by_seq_reduce(reduce, DocInfos) ->
     % count the number of documents
@@ -574,7 +585,7 @@ flush_trees(_Db, [], AccFlushedTrees) ->
 flush_trees(#db{fd = Fd} = Db,
         [InfoUnflushed | RestUnflushed], AccFlushed) ->
     #full_doc_info{update_seq=UpdateSeq, rev_tree=Unflushed} = InfoUnflushed,
-    {Flushed, LeafsSize} = couch_key_tree:mapfold(
+    {Flushed, Sizes} = couch_key_tree:mapfold(
         fun(_Rev, Value, Type, Acc) ->
             case Value of
             #doc{deleted = IsDeleted, body = {summary, Summary, AttsFd}} ->
@@ -596,28 +607,33 @@ flush_trees(#db{fd = Fd} = Db,
                             " changed. Possibly retrying.", []),
                     throw(retry)
                 end,
+                ExternalSize = ?term_size(Summary),
                 {ok, NewSummaryPointer, SummarySize} =
                     couch_file:append_raw_chunk(Fd, Summary),
-                TotalSize = lists:foldl(
+                AttsSize = lists:foldl(
                     fun(Att, A) -> A + couch_att:fetch(att_len, Att) end,
-                    SummarySize, Value#doc.atts),
-                NewValue = #leaf{deleted=IsDeleted, ptr=NewSummaryPointer,
-                                 seq=UpdateSeq, size=TotalSize},
+                    0, Value#doc.atts),
+                NewValue = #leaf{deleted=IsDeleted,
+                                 ptr=NewSummaryPointer,
+                                 seq=UpdateSeq,
+                                 sizes={SummarySize + AttsSize,
+                                        ExternalSize + AttsSize}},
                 case Type of
                 leaf ->
-                    {NewValue, Acc + TotalSize};
+                    {NewValue, reduce_sizes(Acc, {SummarySize + AttsSize,
+                                                  ExternalSize + AttsSize})};
                 branch ->
                     {NewValue, Acc}
                 end;
-             {_, _, _, LeafSize} when Type =:= leaf, LeafSize =/= nil ->
-                {Value, Acc + LeafSize};
-             _ ->
+            {_, _, _, Sizes1} when Type =:= leaf, Sizes1 =/= nil ->
+                {Value, reduce_sizes(Acc, Sizes1)};
+            _ ->
                 {Value, Acc}
             end
-        end, 0, Unflushed),
+        end, {0, 0}, Unflushed),
     InfoFlushed = InfoUnflushed#full_doc_info{
         rev_tree = Flushed,
-        leafs_size = LeafsSize
+        sizes = Sizes
     },
     flush_trees(Db, RestUnflushed, [InfoFlushed | AccFlushed]).
 
@@ -975,12 +991,14 @@ copy_docs(Db, #db{fd = DestFd} = NewDb, MixedInfos, Retry) ->
                     {_Body, AttsInfo} = Summary = copy_doc_attachments(
                         Db, Sp, DestFd),
                     SummaryChunk = make_doc_summary(NewDb, Summary),
+                    ExternalSize = ?term_size(SummaryChunk),
                     {ok, Pos, SummarySize} = couch_file:append_raw_chunk(
                         DestFd, SummaryChunk),
-                    TotalLeafSize = lists:foldl(
+                    AttsSize = lists:foldl(
                         fun({_, _, _, AttLen, _, _, _, _}, S) -> S + AttLen end,
-                        SummarySize, AttsInfo),
-                    Leaf#leaf{ptr=Pos, size=TotalLeafSize}
+                        0, AttsInfo),
+                    Leaf#leaf{ptr=Pos, sizes={SummarySize + AttsSize,
+                                              ExternalSize + AttsSize}}
                 end, RevTree)}
         end, NewInfos0),