You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@couchdb.apache.org by Andy Wenk <an...@nms.de> on 2014/02/11 09:13:31 UTC

Fwd: couch commit: updated refs/heads/2001-feature-external-size to d9d11a9

This is not a rant but a selfish wish :)

I wish all commit messages were as explanatory as the one below. As
I am not really deep into the Erlang code but would love to understand
more, a good way for me is to read all the commit messages. But mostly we
have one-liners. Reading the code is OK, but with an explanatory message it
is clearer.

Maybe you could keep this in mind when you write your next commit message,
to help me and others :)

Thanks a lot

Cheers

Andy

---------- Forwarded message ----------
From: <da...@apache.org>
Date: 10 February 2014 22:54
Subject: couch commit: updated refs/heads/2001-feature-external-size to
d9d11a9
To: commits@couchdb.apache.org


Updated Branches:
  refs/heads/2001-feature-external-size [created] d9d11a943


Implement database external size calculations

This patch adds calculations to show the "external size" of a database
which is roughly a measure of how much disk space it would take to store
the contents of the database in flat files. It is used to calculate
rough compression ratios for capacity planning.

COUCHDB-2001


Project: http://git-wip-us.apache.org/repos/asf/couchdb-couch/repo
Commit: http://git-wip-us.apache.org/repos/asf/couchdb-couch/commit/d9d11a94
Tree: http://git-wip-us.apache.org/repos/asf/couchdb-couch/tree/d9d11a94
Diff: http://git-wip-us.apache.org/repos/asf/couchdb-couch/diff/d9d11a94

Branch: refs/heads/2001-feature-external-size
Commit: d9d11a943e2815897ee5b5b3d1b1eef4b417c1fa
Parents: 09c6556
Author: Paul J. Davis <pa...@gmail.com>
Authored: Mon Feb 10 14:03:47 2014 -0600
Committer: Paul J. Davis <pa...@gmail.com>
Committed: Mon Feb 10 15:32:21 2014 -0600

----------------------------------------------------------------------
 include/couch_db.hrl     |   5 +-
 src/couch_btree.erl      |  10 +-
 src/couch_compress.erl   |  10 ++
 src/couch_db.erl         |  56 +++++-----
 src/couch_db_updater.erl | 237 ++++++++++++++++++++++++++----------------
 5 files changed, 196 insertions(+), 122 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/couchdb-couch/blob/d9d11a94/include/couch_db.hrl
----------------------------------------------------------------------
diff --git a/include/couch_db.hrl b/include/couch_db.hrl
index 2c015df..2ce5ebe 100644
--- a/include/couch_db.hrl
+++ b/include/couch_db.hrl
@@ -65,7 +65,7 @@
     update_seq = 0,
     deleted = false,
     rev_tree = [],
-    leafs_size = 0
+    sizes = {0, 0}
     }).

 -record(httpd,
@@ -251,6 +251,7 @@
     deleted,
     ptr,
     seq,
-    size = nil
+    sizes = {0, 0},
+    atts = []
 }).


http://git-wip-us.apache.org/repos/asf/couchdb-couch/blob/d9d11a94/src/couch_btree.erl
----------------------------------------------------------------------
diff --git a/src/couch_btree.erl b/src/couch_btree.erl
index 9caceb8..ac5681d 100644
--- a/src/couch_btree.erl
+++ b/src/couch_btree.erl
@@ -61,8 +61,8 @@ final_reduce(#btree{reduce=Reduce}, Val) ->
     final_reduce(Reduce, Val);
 final_reduce(Reduce, {[], []}) ->
     Reduce(reduce, []);
-final_reduce(_Bt, {[], [Red]}) ->
-    Red;
+final_reduce(Reduce, {[], [Red]}) ->
+    Reduce(rereduce, [Red]);
 final_reduce(Reduce, {[], Reductions}) ->
     Reduce(rereduce, Reductions);
 final_reduce(Reduce, {KVs, Reductions}) ->
@@ -92,14 +92,14 @@ fold_reduce(#btree{root=Root}=Bt, Fun, Acc, Options) ->

 full_reduce(#btree{root=nil,reduce=Reduce}) ->
     {ok, Reduce(reduce, [])};
-full_reduce(#btree{root=Root}) ->
-    {ok, element(2, Root)}.
+full_reduce(#btree{root=Root, reduce=Reduce}) ->
+    {ok, Reduce(rereduce, [element(2, Root)])}.

 size(#btree{root = nil}) ->
     0;
 size(#btree{root = {_P, _Red}}) ->
     % pre 1.2 format
-    nil;
+    undefined;
 size(#btree{root = {_P, _Red, Size}}) ->
     Size.


http://git-wip-us.apache.org/repos/asf/couchdb-couch/blob/d9d11a94/src/couch_compress.erl
----------------------------------------------------------------------
diff --git a/src/couch_compress.erl b/src/couch_compress.erl
index 6b47a7a..49876d8 100644
--- a/src/couch_compress.erl
+++ b/src/couch_compress.erl
@@ -14,6 +14,7 @@

 -export([compress/2, decompress/1, is_compressed/2]).
 -export([get_compression_method/0]).
+-export([uncompressed_length/1]).

 -include_lib("couch/include/couch_db.hrl").

@@ -82,3 +83,12 @@ is_compressed(<<?TERM_PREFIX, _/binary>>, Method) ->
 is_compressed(Term, _Method) when not is_binary(Term) ->
     false.

+
+uncompressed_length(<<?SNAPPY_PREFIX, _/binary>> = Bin) ->
+    snappy:uncompressed_length(Bin);
+uncompressed_length(<<?COMPRESSED_TERM_PREFIX, _/binary>> = Bin) ->
+    <<131, 80, Size:4/big-unsigned-integer-unit:8, _/binary>> = Bin,
+    Size;
+uncompressed_length(<<?TERM_PREFIX, _/binary>> = Bin) ->
+    size(Bin).
+

http://git-wip-us.apache.org/repos/asf/couchdb-couch/blob/d9d11a94/src/couch_db.erl
----------------------------------------------------------------------
diff --git a/src/couch_db.erl b/src/couch_db.erl
index 32a0049..6432e8e 100644
--- a/src/couch_db.erl
+++ b/src/couch_db.erl
@@ -302,44 +302,48 @@ get_db_info(Db) ->
         name=Name,
         instance_start_time=StartTime,
         committed_update_seq=CommittedUpdateSeq,
-        id_tree = IdBtree,
-        seq_tree = SeqBtree,
-        local_tree = LocalBtree
+        id_tree = IdBtree
     } = Db,
-    {ok, Size} = couch_file:bytes(Fd),
-    {ok, DbReduction} = couch_btree:full_reduce(IdBtree),
+    {ok, FileSize} = couch_file:bytes(Fd),
+    {ok, {Count, DelCount, Sizes}} = couch_btree:full_reduce(IdBtree),
+    {ActiveSize0, ExternalSize} = Sizes,
+    ActiveSize = active_size(Db, ActiveSize0),
     InfoList = [
         {db_name, Name},
-        {doc_count, element(1, DbReduction)},
-        {doc_del_count, element(2, DbReduction)},
+        {doc_count, Count},
+        {doc_del_count, DelCount},
         {update_seq, SeqNum},
         {purge_seq, couch_db:get_purge_seq(Db)},
         {compact_running, Compactor/=nil},
-        {disk_size, Size},
-        {data_size, db_data_size(DbReduction, [SeqBtree, IdBtree,
LocalBtree])},
+        {disk_size, FileSize},
+        {data_size, ActiveSize},
+        {sizes, {[
+            {file, FileSize},
+            {active, ActiveSize},
+            {external, ExternalSize}
+        ]}},
         {instance_start_time, StartTime},
         {disk_format_version, DiskVersion},
         {committed_update_seq, CommittedUpdateSeq}
         ],
     {ok, InfoList}.

-db_data_size({_Count, _DelCount}, _Trees) ->
-    % pre 1.2 format, upgraded on compaction
-    null;
-db_data_size({_Count, _DelCount, nil}, _Trees) ->
-    null;
-db_data_size({_Count, _DelCount, DocAndAttsSize}, Trees) ->
-    sum_tree_sizes(DocAndAttsSize, Trees).
-
-sum_tree_sizes(Acc, []) ->
-    Acc;
-sum_tree_sizes(Acc, [T | Rest]) ->
-    case couch_btree:size(T) of
-    nil ->
-        null;
-    Sz ->
-        sum_tree_sizes(Acc + Sz, Rest)
-    end.
+active_size(#db{}=Db, DocActiveSize) ->
+    Trees = [
+        Db#db.id_tree,
+        Db#db.seq_tree,
+        Db#db.local_tree
+    ],
+    lists:foldl(fun(T, Acc) ->
+        case couch_btree:size(T) of
+            _ when Acc == null ->
+                null;
+            undefined ->
+                null;
+            Size ->
+                Acc + Size
+        end
+    end, DocActiveSize, Trees).

 get_design_docs(#db{name = <<"shards/", _:18/binary, DbName/binary>>}) ->
     {_, Ref} = spawn_monitor(fun() -> exit(fabric:design_docs(DbName))
end),

http://git-wip-us.apache.org/repos/asf/couchdb-couch/blob/d9d11a94/src/couch_db_updater.erl
----------------------------------------------------------------------
diff --git a/src/couch_db_updater.erl b/src/couch_db_updater.erl
index 649826a..eb75177 100644
--- a/src/couch_db_updater.erl
+++ b/src/couch_db_updater.erl
@@ -342,40 +342,60 @@ collect_updates(GroupedDocsAcc, ClientsAcc,
MergeConflicts, FullCommit) ->
     end.

 rev_tree(DiskTree) ->
-    couch_key_tree:mapfold(fun
-        (_RevId, {IsDeleted, BodyPointer, UpdateSeq}, leaf, _Acc) ->
+    couch_key_tree:map(fun
+        (_RevId, {Del, Ptr, Seq}) ->
             % pre 1.2 format, will be upgraded on compaction
-            {#leaf{deleted=?i2b(IsDeleted), ptr=BodyPointer,
seq=UpdateSeq}, nil};
-        (_RevId, {IsDeleted, BodyPointer, UpdateSeq}, branch, Acc) ->
-            {#leaf{deleted=?i2b(IsDeleted), ptr=BodyPointer,
seq=UpdateSeq}, Acc};
-        (_RevId, {IsDeleted, BodyPointer, UpdateSeq, Size}, leaf, Acc) ->
-            Acc2 = sum_leaf_sizes(Acc, Size),
-            {#leaf{deleted=?i2b(IsDeleted), ptr=BodyPointer,
seq=UpdateSeq, size=Size}, Acc2};
-        (_RevId, {IsDeleted, BodyPointer, UpdateSeq, Size}, branch, Acc) ->
-            {#leaf{deleted=?i2b(IsDeleted), ptr=BodyPointer,
seq=UpdateSeq, size=Size}, Acc};
-        (_RevId, ?REV_MISSING, _Type, Acc) ->
-            {?REV_MISSING, Acc}
-    end, 0, DiskTree).
+            #leaf{deleted=?i2b(Del), ptr=Ptr, seq=Seq};
+        (_RevId, {Del, Ptr, Seq, Size}) ->
+            % Pre-bigcouch format, will be upgraded on compaction
+            #leaf{
+                deleted = ?i2b(Del),
+                ptr = Ptr,
+                seq = Seq,
+                sizes = {Size, 0},
+                atts = []
+            };
+        (_RevId, {Del, Ptr, Seq, Sizes, Atts}) ->
+            #leaf{
+                deleted = ?i2b(Del),
+                ptr = Ptr,
+                seq = Seq,
+                sizes = Sizes,
+                atts = Atts
+            };
+        (_RevId, ?REV_MISSING) ->
+            ?REV_MISSING
+    end, DiskTree).

 disk_tree(RevTree) ->
     couch_key_tree:map(fun
         (_RevId, ?REV_MISSING) ->
             ?REV_MISSING;
-        (_RevId, #leaf{deleted=IsDeleted, ptr=BodyPointer, seq=UpdateSeq,
size=Size}) ->
-            {?b2i(IsDeleted), BodyPointer, UpdateSeq, Size}
+        (_RevId, #leaf{}=Leaf) ->
+            #leaf{
+                deleted = Del,
+                ptr = Ptr,
+                seq = Seq,
+                sizes = Sizes,
+                atts = Atts
+            } = Leaf,
+            {?b2i(Del), Ptr, Seq, upgrade_sizes(Sizes), Atts}
     end, RevTree).

+upgrade_sizes({_, _} = Sizes) ->
+    Sizes;
+upgrade_sizes(S) when is_integer(S) ->
+    {S, 0}.
+
 btree_by_seq_split(#full_doc_info{id=Id, update_seq=Seq, deleted=Del,
rev_tree=T}) ->
     {Seq, {Id, ?b2i(Del), disk_tree(T)}}.

 btree_by_seq_join(Seq, {Id, Del, DiskTree}) when is_integer(Del) ->
-    {RevTree, LeafsSize} = rev_tree(DiskTree),
     #full_doc_info{
         id = Id,
         update_seq = Seq,
         deleted = ?i2b(Del),
-        rev_tree = RevTree,
-        leafs_size = LeafsSize
+        rev_tree = rev_tree(DiskTree)
     };
 btree_by_seq_join(KeySeq, {Id, RevInfos, DeletedRevInfos}) ->
     % Older versions stored #doc_info records in the seq_tree.
@@ -389,49 +409,59 @@ btree_by_seq_join(KeySeq, {Id, RevInfos,
DeletedRevInfos}) ->
             [#rev_info{rev=Rev,seq=Seq,deleted=true,body_sp = Bp} ||
                 {Rev, Seq, Bp} <- DeletedRevInfos]}.

-btree_by_id_split(#full_doc_info{id=Id, update_seq=Seq,
-        deleted=Deleted, rev_tree=Tree}) ->
-    {Id, {Seq, ?b2i(Deleted), disk_tree(Tree)}}.
+btree_by_id_split(#full_doc_info{}=Info) ->
+    #full_doc_info{
+        id = Id,
+        update_seq = Seq,
+        deleted = Del,
+        sizes = Sizes,
+        rev_tree = Tree
+    } = Info,
+    {Id, {Seq, ?b2i(Del), upgrade_sizes(Sizes), disk_tree(Tree)}}.

 btree_by_id_join(Id, {HighSeq, Deleted, DiskTree}) ->
-    {Tree, LeafsSize} = rev_tree(DiskTree),
+    % Upgrade from pre-BigCouch disk format
+    ActiveSize = couch_key_tree:fold(fun
+        (_RevId, {_Del, _Ptr, _Seq}, _, Acc) ->
+            Acc;
+        (_RevId, {_Del, _Ptr, _Seq, Size}, _, Acc) ->
+            Acc + Size;
+        (_RevId, {_Del, _Ptr, _Seq, Sizes, _Atts}, _, Acc) ->
+            {Active, _} = Sizes,
+            Active + Acc;
+        (_RevId, ?REV_MISSING, _, Acc) ->
+            Acc
+    end, 0, DiskTree),
+    btree_by_id_join(Id, {HighSeq, Deleted, {ActiveSize, 0}, DiskTree});
+
+btree_by_id_join(Id, {HighSeq, Deleted, Sizes, DiskTree}) ->
     #full_doc_info{
         id = Id,
         update_seq = HighSeq,
         deleted = ?i2b(Deleted),
-        rev_tree = Tree,
-        leafs_size = LeafsSize
+        sizes = Sizes,
+        rev_tree = rev_tree(DiskTree)
     }.

 btree_by_id_reduce(reduce, FullDocInfos) ->
-    lists:foldl(
-        fun(Info, {NotDeleted, Deleted, Size}) ->
-            Size2 = sum_leaf_sizes(Size, Info#full_doc_info.leafs_size),
-            case Info#full_doc_info.deleted of
-            true ->
-                {NotDeleted, Deleted + 1, Size2};
-            false ->
-                {NotDeleted + 1, Deleted, Size2}
-            end
-        end,
-        {0, 0, 0}, FullDocInfos);
-btree_by_id_reduce(rereduce, Reds) ->
-    lists:foldl(
-        fun({NotDeleted, Deleted}, {AccNotDeleted, AccDeleted, _AccSize})
->
-            % pre 1.2 format, will be upgraded on compaction
-            {AccNotDeleted + NotDeleted, AccDeleted + Deleted, nil};
-        ({NotDeleted, Deleted, Size}, {AccNotDeleted, AccDeleted,
AccSize}) ->
-            AccSize2 = sum_leaf_sizes(AccSize, Size),
-            {AccNotDeleted + NotDeleted, AccDeleted + Deleted, AccSize2}
-        end,
-        {0, 0, 0}, Reds).
-
-sum_leaf_sizes(nil, _) ->
-    nil;
-sum_leaf_sizes(_, nil) ->
-    nil;
-sum_leaf_sizes(Size1, Size2) ->
-    Size1 + Size2.
+    lists:foldl(fun
+        (#full_doc_info{deleted=false, sizes=Sizes}, {NotDel, Del, SAcc})
->
+            {NotDel + 1, Del, reduce_sizes(Sizes, SAcc)};
+        (#full_doc_info{deleted=true, sizes=Sizes}, {NotDel, Del, SAcc}) ->
+            {NotDel, Del + 1, reduce_sizes(Sizes, SAcc)}
+    end, {0, 0, {0, 0}}, FullDocInfos);
+btree_by_id_reduce(rereduce, Reductions) ->
+    lists:foldl(fun
+        ({NotDel, Del}, {NDAcc, DAcc, SAcc}) ->
+            {NotDel + NDAcc, Del + DAcc, SAcc};
+        ({NotDel, Del, Sizes}, {NDAcc, DAcc, SAcc}) ->
+            {NotDel + NDAcc, Del + DAcc, reduce_sizes(Sizes, SAcc)}
+    end, {0, 0, {0, 0}}, Reductions).
+
+reduce_sizes({A1, E1}, {A2, E2}) ->
+    {A1 + A2, E1 + E2};
+reduce_sizes(S, {_, _} = Acc) when is_integer(S) ->
+    reduce_sizes({S, 0}, Acc).

 btree_by_seq_reduce(reduce, DocInfos) ->
     % count the number of documents
@@ -549,10 +579,15 @@ flush_trees(_Db, [], AccFlushedTrees) ->
 flush_trees(#db{fd = Fd} = Db,
         [InfoUnflushed | RestUnflushed], AccFlushed) ->
     #full_doc_info{update_seq=UpdateSeq, rev_tree=Unflushed} =
InfoUnflushed,
-    {Flushed, LeafsSize} = couch_key_tree:mapfold(
+    {Flushed, FinalAcc} = couch_key_tree:mapfold(
         fun(_Rev, Value, Type, Acc) ->
             case Value of
-            #doc{deleted = IsDeleted, body = {summary, Summary, AttsFd}} ->
+            #doc{} = Doc ->
+                #doc{
+                    deleted = IsDeleted,
+                    body = {summary, Summary, AttsFd},
+                    atts = Atts
+                } = Doc,
                 % this node value is actually an unwritten document
summary,
                 % write to disk.
                 % make sure the Fd in the written bins is the same Fd we
are
@@ -571,31 +606,44 @@ flush_trees(#db{fd = Fd} = Db,
                             " changed. Possibly retrying.", []),
                     throw(retry)
                 end,
-                {ok, NewSummaryPointer, SummarySize} =
-                    couch_file:append_raw_chunk(Fd, Summary),
-                TotalSize = lists:foldl(
-                    fun(#att{att_len = L}, A) -> A + L end,
-                    SummarySize, Value#doc.atts),
-                NewValue = #leaf{deleted=IsDeleted, ptr=NewSummaryPointer,
-                                 seq=UpdateSeq, size=TotalSize},
-                case Type of
-                leaf ->
-                    {NewValue, Acc + TotalSize};
-                branch ->
-                    {NewValue, Acc}
-                end;
-             {_, _, _, LeafSize} when Type =:= leaf, LeafSize =/= nil ->
-                {Value, Acc + LeafSize};
-             _ ->
+                AttsInfo = lists:usort([
+                        {P, L} || #att{data = {_, P}, att_len = L} <- Atts
+                    ]),
+                [_, _, SummaryBin] = Summary,
+                ExternalSize =
couch_compress:uncompressed_length(SummaryBin),
+                {ok, NewPtr, ActiveSize}
+                    = couch_file:append_raw_chunk(Fd, Summary),
+                Leaf = #leaf{
+                    deleted = IsDeleted,
+                    ptr = NewPtr,
+                    seq = UpdateSeq,
+                    sizes = {ActiveSize, ExternalSize},
+                    atts = AttsInfo
+                },
+                {Leaf, add_sizes(Type, Leaf, Acc)};
+            #leaf{} = Leaf ->
+                {Value, add_sizes(Type, Leaf, Acc)};
+             ?REV_MISSING ->
                 {Value, Acc}
             end
-        end, 0, Unflushed),
+        end, {0, 0, []}, Unflushed),
+    {FinalAS, FinalES, FinalAtts} = FinalAcc,
+    TotalAttSize = lists:foldl(fun({_, S}, A) -> S + A end, 0, FinalAtts),
     InfoFlushed = InfoUnflushed#full_doc_info{
         rev_tree = Flushed,
-        leafs_size = LeafsSize
+        sizes = {FinalAS + TotalAttSize, FinalES + TotalAttSize}
     },
     flush_trees(Db, RestUnflushed, [InfoFlushed | AccFlushed]).

+add_sizes(branch, _, Acc) ->
+    Acc;
+add_sizes(leaf, #leaf{sizes=Sizes, atts=AttSizes}, Acc) ->
+    {ActiveSize, ExternalSize} = upgrade_sizes(Sizes),
+    {ASAcc, ESAcc, AttsAcc} = Acc,
+    NewASAcc = ActiveSize + ASAcc,
+    NewESAcc = ExternalSize + ESAcc,
+    NewAttsAcc = lists:umerge(AttSizes, AttsAcc),
+    {NewASAcc, NewESAcc, NewAttsAcc}.

 send_result(Client, Ref, NewResult) ->
     % used to send a result to the client
@@ -896,23 +944,34 @@ copy_docs(Db, #db{fd = DestFd} = NewDb, MixedInfos,
Retry) ->
         A =< B
     end, merge_lookups(MixedInfos, LookupResults)),

-    NewInfos1 = lists:map(
-        fun(#full_doc_info{rev_tree=RevTree}=Info) ->
-            Info#full_doc_info{rev_tree=couch_key_tree:map(
-                fun(_, _, branch) ->
-                    ?REV_MISSING;
-                (_Rev, #leaf{ptr=Sp}=Leaf, leaf) ->
-                    {_Body, AttsInfo} = Summary = copy_doc_attachments(
-                        Db, Sp, DestFd),
-                    SummaryChunk = make_doc_summary(NewDb, Summary),
-                    {ok, Pos, SummarySize} = couch_file:append_raw_chunk(
-                        DestFd, SummaryChunk),
-                    TotalLeafSize = lists:foldl(
-                        fun({_, _, _, AttLen, _, _, _, _}, S) -> S +
AttLen end,
-                        SummarySize, AttsInfo),
-                    Leaf#leaf{ptr=Pos, size=TotalLeafSize}
-                end, RevTree)}
-        end, NewInfos0),
+    NewInfos1 = lists:map(fun(Info) ->
+        {NewRevTree, FinalAcc} = couch_key_tree:mapfold(fun
+            (_Rev, #leaf{ptr=Sp}=Leaf, leaf, SizesAcc) ->
+                {Body, AttInfos} = copy_doc_attachments(Db, Sp, DestFd),
+                Summary = make_doc_summary(NewDb, {Body, AttInfos}),
+                [_, _, SummaryBin] = Summary,
+                ExternalSize =
couch_compress:uncompressed_length(SummaryBin),
+                {ok, Pos, ActiveSize}
+                    = couch_file:append_raw_chunk(DestFd, Summary),
+                AttSizes = [{element(3, A), element(4, A)} || A <-
AttInfos],
+                NewLeaf = Leaf#leaf{
+                    ptr = Pos,
+                    sizes = {ActiveSize, ExternalSize},
+                    atts = lists:usort(AttSizes)
+                },
+                {NewLeaf, add_sizes(leaf, NewLeaf, SizesAcc)};
+            (_Rev, _Value, branch, SizesAcc) ->
+                {?REV_MISSING, SizesAcc}
+        end, {0, 0, []}, Info#full_doc_info.rev_tree),
+        {FinalAS, FinalES, FinalAtts} = FinalAcc,
+        TotalAttSize = lists:foldl(fun({_, S}, A) -> S + A end, 0,
FinalAtts),
+        NewActiveSize = FinalAS + TotalAttSize,
+        NewExternalSize = FinalES + TotalAttSize,
+        Info#full_doc_info{
+            rev_tree = NewRevTree,
+            sizes = {NewActiveSize, NewExternalSize}
+        }
+    end, NewInfos0),

     NewInfos = stem_full_doc_infos(Db, NewInfos1),
     RemoveSeqs =




-- 
Andy Wenk
Hamburg - Germany
RockIt!

http://www.couchdb-buch.de
http://www.pg-praxisbuch.de

GPG fingerprint: C044 8322 9E12 1483 4FEC 9452 B65D 6BE3 9ED3 9588

https://people.apache.org/keys/committer/andywenk.asc

Re: couch commit: updated refs/heads/2001-feature-external-size to d9d11a9

Posted by Andy Wenk <an...@nms.de>.
awesome! Thanks Bob


On 11 February 2014 10:51, Robert Samuel Newson <rn...@apache.org> wrote:

> Clear commit messages are something we’ve committed to in the past and
> will be a required practice once we get the merges nailed down. The review
> process will include ensuring that commits follow the standard we’ve agreed
> to.
>
> B.
>
> On 11 Feb 2014, at 08:13, Andy Wenk <an...@nms.de> wrote:
>
> > This is not a rant but a selfish wish :)
> >
> > I wish all commit messages would be this explanatory like the one below.
> As
> > I am not really deep into the erlang code but would love to understand
> > more, a good way for me is to read all the commit messages. But mostly we
> > have oneliner. Reading the code is ok but with a explaining message, it
> is
> > clearer.
> >
> > Maybe you remember this when you write your next message to help me and
> > others :)
> >
> > Thanks a lot
> >
> > Cheers
> >
> > Andy
> >
> > ---------- Forwarded message ----------
> > From: <da...@apache.org>
> > Date: 10 February 2014 22:54
> > Subject: couch commit: updated refs/heads/2001-feature-external-size to
> > d9d11a9
> > To: commits@couchdb.apache.org
> >
> >
> > Updated Branches:
> >  refs/heads/2001-feature-external-size [created] d9d11a943
> >
> >
> > Implement database external size calculations
> >
> > This patch adds calculations to show the "external size" of a database
> > which is roughly a measure of how much disk space it would take to store
> > the contents of the database in flat files. It is used to calculate
> > rough compression ratios for capacity planning.
> >
> > COUCHDB-2001
> >
> >
> > Project: http://git-wip-us.apache.org/repos/asf/couchdb-couch/repo
> > Commit:
> http://git-wip-us.apache.org/repos/asf/couchdb-couch/commit/d9d11a94
> > Tree: http://git-wip-us.apache.org/repos/asf/couchdb-couch/tree/d9d11a94
> > Diff: http://git-wip-us.apache.org/repos/asf/couchdb-couch/diff/d9d11a94
> >
> > Branch: refs/heads/2001-feature-external-size
> > Commit: d9d11a943e2815897ee5b5b3d1b1eef4b417c1fa
> > Parents: 09c6556
> > Author: Paul J. Davis <pa...@gmail.com>
> > Authored: Mon Feb 10 14:03:47 2014 -0600
> > Committer: Paul J. Davis <pa...@gmail.com>
> > Committed: Mon Feb 10 15:32:21 2014 -0600
> >
> > ----------------------------------------------------------------------
> > include/couch_db.hrl     |   5 +-
> > src/couch_btree.erl      |  10 +-
> > src/couch_compress.erl   |  10 ++
> > src/couch_db.erl         |  56 +++++-----
> > src/couch_db_updater.erl | 237 ++++++++++++++++++++++++++----------------
> > 5 files changed, 196 insertions(+), 122 deletions(-)
> > ----------------------------------------------------------------------
> >
> >
> >
> http://git-wip-us.apache.org/repos/asf/couchdb-couch/blob/d9d11a94/include/couch_db.hrl
> > ----------------------------------------------------------------------
> > diff --git a/include/couch_db.hrl b/include/couch_db.hrl
> > index 2c015df..2ce5ebe 100644
> > --- a/include/couch_db.hrl
> > +++ b/include/couch_db.hrl
> > @@ -65,7 +65,7 @@
> >     update_seq = 0,
> >     deleted = false,
> >     rev_tree = [],
> > -    leafs_size = 0
> > +    sizes = {0, 0}
> >     }).
> >
> > -record(httpd,
> > @@ -251,6 +251,7 @@
> >     deleted,
> >     ptr,
> >     seq,
> > -    size = nil
> > +    sizes = {0, 0},
> > +    atts = []
> > }).
> >
> >
> >
> http://git-wip-us.apache.org/repos/asf/couchdb-couch/blob/d9d11a94/src/couch_btree.erl
> > ----------------------------------------------------------------------
> > diff --git a/src/couch_btree.erl b/src/couch_btree.erl
> > index 9caceb8..ac5681d 100644
> > --- a/src/couch_btree.erl
> > +++ b/src/couch_btree.erl
> > @@ -61,8 +61,8 @@ final_reduce(#btree{reduce=Reduce}, Val) ->
> >     final_reduce(Reduce, Val);
> > final_reduce(Reduce, {[], []}) ->
> >     Reduce(reduce, []);
> > -final_reduce(_Bt, {[], [Red]}) ->
> > -    Red;
> > +final_reduce(Reduce, {[], [Red]}) ->
> > +    Reduce(rereduce, [Red]);
> > final_reduce(Reduce, {[], Reductions}) ->
> >     Reduce(rereduce, Reductions);
> > final_reduce(Reduce, {KVs, Reductions}) ->
> > @@ -92,14 +92,14 @@ fold_reduce(#btree{root=Root}=Bt, Fun, Acc, Options)
> ->
> >
> > full_reduce(#btree{root=nil,reduce=Reduce}) ->
> >     {ok, Reduce(reduce, [])};
> > -full_reduce(#btree{root=Root}) ->
> > -    {ok, element(2, Root)}.
> > +full_reduce(#btree{root=Root, reduce=Reduce}) ->
> > +    {ok, Reduce(rereduce, [element(2, Root)])}.
> >
> > size(#btree{root = nil}) ->
> >     0;
> > size(#btree{root = {_P, _Red}}) ->
> >     % pre 1.2 format
> > -    nil;
> > +    undefined;
> > size(#btree{root = {_P, _Red, Size}}) ->
> >     Size.
> >
> >
> >
> http://git-wip-us.apache.org/repos/asf/couchdb-couch/blob/d9d11a94/src/couch_compress.erl
> > ----------------------------------------------------------------------
> > diff --git a/src/couch_compress.erl b/src/couch_compress.erl
> > index 6b47a7a..49876d8 100644
> > --- a/src/couch_compress.erl
> > +++ b/src/couch_compress.erl
> > @@ -14,6 +14,7 @@
> >
> > -export([compress/2, decompress/1, is_compressed/2]).
> > -export([get_compression_method/0]).
> > +-export([uncompressed_length/1]).
> >
> > -include_lib("couch/include/couch_db.hrl").
> >
> > @@ -82,3 +83,12 @@ is_compressed(<<?TERM_PREFIX, _/binary>>, Method) ->
> > is_compressed(Term, _Method) when not is_binary(Term) ->
> >     false.
> >
> > +
> > +uncompressed_length(<<?SNAPPY_PREFIX, _/binary>> = Bin) ->
> > +    snappy:uncompressed_length(Bin);
> > +uncompressed_length(<<?COMPRESSED_TERM_PREFIX, _/binary>> = Bin) ->
> > +    <<131, 80, Size:4/big-unsigned-integer-unit:8, _/binary>> = Bin,
> > +    Size;
> > +uncompressed_length(<<?TERM_PREFIX, _/binary>> = Bin) ->
> > +    size(Bin).
> > +
> >
> >
> http://git-wip-us.apache.org/repos/asf/couchdb-couch/blob/d9d11a94/src/couch_db.erl
> > ----------------------------------------------------------------------
> > diff --git a/src/couch_db.erl b/src/couch_db.erl
> > index 32a0049..6432e8e 100644
> > --- a/src/couch_db.erl
> > +++ b/src/couch_db.erl
> > @@ -302,44 +302,48 @@ get_db_info(Db) ->
> >         name=Name,
> >         instance_start_time=StartTime,
> >         committed_update_seq=CommittedUpdateSeq,
> > -        id_tree = IdBtree,
> > -        seq_tree = SeqBtree,
> > -        local_tree = LocalBtree
> > +        id_tree = IdBtree
> >     } = Db,
> > -    {ok, Size} = couch_file:bytes(Fd),
> > -    {ok, DbReduction} = couch_btree:full_reduce(IdBtree),
> > +    {ok, FileSize} = couch_file:bytes(Fd),
> > +    {ok, {Count, DelCount, Sizes}} = couch_btree:full_reduce(IdBtree),
> > +    {ActiveSize0, ExternalSize} = Sizes,
> > +    ActiveSize = active_size(Db, ActiveSize0),
> >     InfoList = [
> >         {db_name, Name},
> > -        {doc_count, element(1, DbReduction)},
> > -        {doc_del_count, element(2, DbReduction)},
> > +        {doc_count, Count},
> > +        {doc_del_count, DelCount},
> >         {update_seq, SeqNum},
> >         {purge_seq, couch_db:get_purge_seq(Db)},
> >         {compact_running, Compactor/=nil},
> > -        {disk_size, Size},
> > -        {data_size, db_data_size(DbReduction, [SeqBtree, IdBtree,
> > LocalBtree])},
> > +        {disk_size, FileSize},
> > +        {data_size, ActiveSize},
> > +        {sizes, {[
> > +            {file, FileSize},
> > +            {active, ActiveSize},
> > +            {external, ExternalSize}
> > +        ]}},
> >         {instance_start_time, StartTime},
> >         {disk_format_version, DiskVersion},
> >         {committed_update_seq, CommittedUpdateSeq}
> >         ],
> >     {ok, InfoList}.
> >
> > -db_data_size({_Count, _DelCount}, _Trees) ->
> > -    % pre 1.2 format, upgraded on compaction
> > -    null;
> > -db_data_size({_Count, _DelCount, nil}, _Trees) ->
> > -    null;
> > -db_data_size({_Count, _DelCount, DocAndAttsSize}, Trees) ->
> > -    sum_tree_sizes(DocAndAttsSize, Trees).
> > -
> > -sum_tree_sizes(Acc, []) ->
> > -    Acc;
> > -sum_tree_sizes(Acc, [T | Rest]) ->
> > -    case couch_btree:size(T) of
> > -    nil ->
> > -        null;
> > -    Sz ->
> > -        sum_tree_sizes(Acc + Sz, Rest)
> > -    end.
> > +active_size(#db{}=Db, DocActiveSize) ->
> > +    Trees = [
> > +        Db#db.id_tree,
> > +        Db#db.seq_tree,
> > +        Db#db.local_tree
> > +    ],
> > +    lists:foldl(fun(T, Acc) ->
> > +        case couch_btree:size(T) of
> > +            _ when Acc == null ->
> > +                null;
> > +            undefined ->
> > +                null;
> > +            Size ->
> > +                Acc + Size
> > +        end
> > +    end, DocActiveSize, Trees).
> >
> > get_design_docs(#db{name = <<"shards/", _:18/binary, DbName/binary>>}) ->
> >     {_, Ref} = spawn_monitor(fun() -> exit(fabric:design_docs(DbName))
> > end),
> >
> >
> http://git-wip-us.apache.org/repos/asf/couchdb-couch/blob/d9d11a94/src/couch_db_updater.erl
> > ----------------------------------------------------------------------
> > diff --git a/src/couch_db_updater.erl b/src/couch_db_updater.erl
> > index 649826a..eb75177 100644
> > --- a/src/couch_db_updater.erl
> > +++ b/src/couch_db_updater.erl
> > @@ -342,40 +342,60 @@ collect_updates(GroupedDocsAcc, ClientsAcc,
> > MergeConflicts, FullCommit) ->
> >     end.
> >
> > rev_tree(DiskTree) ->
> > -    couch_key_tree:mapfold(fun
> > -        (_RevId, {IsDeleted, BodyPointer, UpdateSeq}, leaf, _Acc) ->
> > +    couch_key_tree:map(fun
> > +        (_RevId, {Del, Ptr, Seq}) ->
> >             % pre 1.2 format, will be upgraded on compaction
> > -            {#leaf{deleted=?i2b(IsDeleted), ptr=BodyPointer,
> > seq=UpdateSeq}, nil};
> > -        (_RevId, {IsDeleted, BodyPointer, UpdateSeq}, branch, Acc) ->
> > -            {#leaf{deleted=?i2b(IsDeleted), ptr=BodyPointer,
> > seq=UpdateSeq}, Acc};
> > -        (_RevId, {IsDeleted, BodyPointer, UpdateSeq, Size}, leaf, Acc)
> ->
> > -            Acc2 = sum_leaf_sizes(Acc, Size),
> > -            {#leaf{deleted=?i2b(IsDeleted), ptr=BodyPointer,
> > seq=UpdateSeq, size=Size}, Acc2};
> > -        (_RevId, {IsDeleted, BodyPointer, UpdateSeq, Size}, branch,
> Acc) ->
> > -            {#leaf{deleted=?i2b(IsDeleted), ptr=BodyPointer,
> > seq=UpdateSeq, size=Size}, Acc};
> > -        (_RevId, ?REV_MISSING, _Type, Acc) ->
> > -            {?REV_MISSING, Acc}
> > -    end, 0, DiskTree).
> > +            #leaf{deleted=?i2b(Del), ptr=Ptr, seq=Seq};
> > +        (_RevId, {Del, Ptr, Seq, Size}) ->
> > +            % Pre-bigcouch format, will be upgraded on compaction
> > +            #leaf{
> > +                deleted = ?i2b(Del),
> > +                ptr = Ptr,
> > +                seq = Seq,
> > +                sizes = {Size, 0},
> > +                atts = []
> > +            };
> > +        (_RevId, {Del, Ptr, Seq, Sizes, Atts}) ->
> > +            #leaf{
> > +                deleted = ?i2b(Del),
> > +                ptr = Ptr,
> > +                seq = Seq,
> > +                sizes = Sizes,
> > +                atts = Atts
> > +            };
> > +        (_RevId, ?REV_MISSING) ->
> > +            ?REV_MISSING
> > +    end, DiskTree).
> >
> > disk_tree(RevTree) ->
> >     couch_key_tree:map(fun
> >         (_RevId, ?REV_MISSING) ->
> >             ?REV_MISSING;
> > -        (_RevId, #leaf{deleted=IsDeleted, ptr=BodyPointer,
> seq=UpdateSeq,
> > size=Size}) ->
> > -            {?b2i(IsDeleted), BodyPointer, UpdateSeq, Size}
> > +        (_RevId, #leaf{}=Leaf) ->
> > +            #leaf{
> > +                deleted = Del,
> > +                ptr = Ptr,
> > +                seq = Seq,
> > +                sizes = Sizes,
> > +                atts = Atts
> > +            } = Leaf,
> > +            {?b2i(Del), Ptr, Seq, upgrade_sizes(Sizes), Atts}
> >     end, RevTree).
> >
> > +upgrade_sizes({_, _} = Sizes) ->
> > +    Sizes;
> > +upgrade_sizes(S) when is_integer(S) ->
> > +    {S, 0}.
> > +
> > btree_by_seq_split(#full_doc_info{id=Id, update_seq=Seq, deleted=Del,
> > rev_tree=T}) ->
> >     {Seq, {Id, ?b2i(Del), disk_tree(T)}}.
> >
> > btree_by_seq_join(Seq, {Id, Del, DiskTree}) when is_integer(Del) ->
> > -    {RevTree, LeafsSize} = rev_tree(DiskTree),
> >     #full_doc_info{
> >         id = Id,
> >         update_seq = Seq,
> >         deleted = ?i2b(Del),
> > -        rev_tree = RevTree,
> > -        leafs_size = LeafsSize
> > +        rev_tree = rev_tree(DiskTree)
> >     };
> > btree_by_seq_join(KeySeq, {Id, RevInfos, DeletedRevInfos}) ->
> >     % Older versions stored #doc_info records in the seq_tree.
> > @@ -389,49 +409,59 @@ btree_by_seq_join(KeySeq, {Id, RevInfos,
> > DeletedRevInfos}) ->
> >             [#rev_info{rev=Rev,seq=Seq,deleted=true,body_sp = Bp} ||
> >                 {Rev, Seq, Bp} <- DeletedRevInfos]}.
> >
> > -btree_by_id_split(#full_doc_info{id=Id, update_seq=Seq,
> > -        deleted=Deleted, rev_tree=Tree}) ->
> > -    {Id, {Seq, ?b2i(Deleted), disk_tree(Tree)}}.
> > +btree_by_id_split(#full_doc_info{}=Info) ->
> > +    #full_doc_info{
> > +        id = Id,
> > +        update_seq = Seq,
> > +        deleted = Del,
> > +        sizes = Sizes,
> > +        rev_tree = Tree
> > +    } = Info,
> > +    {Id, {Seq, ?b2i(Del), upgrade_sizes(Sizes), disk_tree(Tree)}}.
> >
> > btree_by_id_join(Id, {HighSeq, Deleted, DiskTree}) ->
> > -    {Tree, LeafsSize} = rev_tree(DiskTree),
> > +    % Upgrade from pre-BigCouch disk format
> > +    ActiveSize = couch_key_tree:fold(fun
> > +        (_RevId, {_Del, _Ptr, _Seq}, _, Acc) ->
> > +            Acc;
> > +        (_RevId, {_Del, _Ptr, _Seq, Size}, _, Acc) ->
> > +            Acc + Size;
> > +        (_RevId, {_Del, _Ptr, _Seq, Sizes, _Atts}, _, Acc) ->
> > +            {Active, _} = Sizes,
> > +            Active + Acc;
> > +        (_RevId, ?REV_MISSING, _, Acc) ->
> > +            Acc
> > +    end, 0, DiskTree),
> > +    btree_by_id_join(Id, {HighSeq, Deleted, {ActiveSize, 0}, DiskTree});
> > +
> > +btree_by_id_join(Id, {HighSeq, Deleted, Sizes, DiskTree}) ->
> >     #full_doc_info{
> >         id = Id,
> >         update_seq = HighSeq,
> >         deleted = ?i2b(Deleted),
> > -        rev_tree = Tree,
> > -        leafs_size = LeafsSize
> > +        sizes = Sizes,
> > +        rev_tree = rev_tree(DiskTree)
> >     }.
> >
> > btree_by_id_reduce(reduce, FullDocInfos) ->
> > -    lists:foldl(
> > -        fun(Info, {NotDeleted, Deleted, Size}) ->
> > -            Size2 = sum_leaf_sizes(Size, Info#full_doc_info.leafs_size),
> > -            case Info#full_doc_info.deleted of
> > -            true ->
> > -                {NotDeleted, Deleted + 1, Size2};
> > -            false ->
> > -                {NotDeleted + 1, Deleted, Size2}
> > -            end
> > -        end,
> > -        {0, 0, 0}, FullDocInfos);
> > -btree_by_id_reduce(rereduce, Reds) ->
> > -    lists:foldl(
> > -        fun({NotDeleted, Deleted}, {AccNotDeleted, AccDeleted,
> _AccSize})
> > ->
> > -            % pre 1.2 format, will be upgraded on compaction
> > -            {AccNotDeleted + NotDeleted, AccDeleted + Deleted, nil};
> > -        ({NotDeleted, Deleted, Size}, {AccNotDeleted, AccDeleted,
> > AccSize}) ->
> > -            AccSize2 = sum_leaf_sizes(AccSize, Size),
> > -            {AccNotDeleted + NotDeleted, AccDeleted + Deleted, AccSize2}
> > -        end,
> > -        {0, 0, 0}, Reds).
> > -
> > -sum_leaf_sizes(nil, _) ->
> > -    nil;
> > -sum_leaf_sizes(_, nil) ->
> > -    nil;
> > -sum_leaf_sizes(Size1, Size2) ->
> > -    Size1 + Size2.
> > +    lists:foldl(fun
> > +        (#full_doc_info{deleted=false, sizes=Sizes}, {NotDel, Del,
> SAcc})
> > ->
> > +            {NotDel + 1, Del, reduce_sizes(Sizes, SAcc)};
> > +        (#full_doc_info{deleted=true, sizes=Sizes}, {NotDel, Del,
> SAcc}) ->
> > +            {NotDel, Del + 1, reduce_sizes(Sizes, SAcc)}
> > +    end, {0, 0, {0, 0}}, FullDocInfos);
> > +btree_by_id_reduce(rereduce, Reductions) ->
> > +    lists:foldl(fun
> > +        ({NotDel, Del}, {NDAcc, DAcc, SAcc}) ->
> > +            {NotDel + NDAcc, Del + DAcc, SAcc};
> > +        ({NotDel, Del, Sizes}, {NDAcc, DAcc, SAcc}) ->
> > +            {NotDel + NDAcc, Del + DAcc, reduce_sizes(Sizes, SAcc)}
> > +    end, {0, 0, {0, 0}}, Reductions).
> > +
> > +reduce_sizes({A1, E1}, {A2, E2}) ->
> > +    {A1 + A2, E1 + E2};
> > +reduce_sizes(S, {_, _} = Acc) when is_integer(S) ->
> > +    reduce_sizes({S, 0}, Acc).
> >
> > btree_by_seq_reduce(reduce, DocInfos) ->
> >     % count the number of documents
> > @@ -549,10 +579,15 @@ flush_trees(_Db, [], AccFlushedTrees) ->
> > flush_trees(#db{fd = Fd} = Db,
> >         [InfoUnflushed | RestUnflushed], AccFlushed) ->
> >     #full_doc_info{update_seq=UpdateSeq, rev_tree=Unflushed} =
> > InfoUnflushed,
> > -    {Flushed, LeafsSize} = couch_key_tree:mapfold(
> > +    {Flushed, FinalAcc} = couch_key_tree:mapfold(
> >         fun(_Rev, Value, Type, Acc) ->
> >             case Value of
> > -            #doc{deleted = IsDeleted, body = {summary, Summary,
> AttsFd}} ->
> > +            #doc{} = Doc ->
> > +                #doc{
> > +                    deleted = IsDeleted,
> > +                    body = {summary, Summary, AttsFd},
> > +                    atts = Atts
> > +                } = Doc,
> >                 % this node value is actually an unwritten document
> > summary,
> >                 % write to disk.
> >                 % make sure the Fd in the written bins is the same Fd we
> > are
> > @@ -571,31 +606,44 @@ flush_trees(#db{fd = Fd} = Db,
> >                             " changed. Possibly retrying.", []),
> >                     throw(retry)
> >                 end,
> > -                {ok, NewSummaryPointer, SummarySize} =
> > -                    couch_file:append_raw_chunk(Fd, Summary),
> > -                TotalSize = lists:foldl(
> > -                    fun(#att{att_len = L}, A) -> A + L end,
> > -                    SummarySize, Value#doc.atts),
> > -                NewValue = #leaf{deleted=IsDeleted,
> ptr=NewSummaryPointer,
> > -                                 seq=UpdateSeq, size=TotalSize},
> > -                case Type of
> > -                leaf ->
> > -                    {NewValue, Acc + TotalSize};
> > -                branch ->
> > -                    {NewValue, Acc}
> > -                end;
> > -             {_, _, _, LeafSize} when Type =:= leaf, LeafSize =/= nil ->
> > -                {Value, Acc + LeafSize};
> > -             _ ->
> > +                AttsInfo = lists:usort([
> > +                        {P, L} || #att{data = {_, P}, att_len = L} <-
> Atts
> > +                    ]),
> > +                [_, _, SummaryBin] = Summary,
> > +                ExternalSize =
> > couch_compress:uncompressed_length(SummaryBin),
> > +                {ok, NewPtr, ActiveSize}
> > +                    = couch_file:append_raw_chunk(Fd, Summary),
> > +                Leaf = #leaf{
> > +                    deleted = IsDeleted,
> > +                    ptr = NewPtr,
> > +                    seq = UpdateSeq,
> > +                    sizes = {ActiveSize, ExternalSize},
> > +                    atts = AttsInfo
> > +                },
> > +                {Leaf, add_sizes(Type, Leaf, Acc)};
> > +            #leaf{} = Leaf ->
> > +                {Value, add_sizes(Type, Leaf, Acc)};
> > +             ?REV_MISSING ->
> >                 {Value, Acc}
> >             end
> > -        end, 0, Unflushed),
> > +        end, {0, 0, []}, Unflushed),
> > +    {FinalAS, FinalES, FinalAtts} = FinalAcc,
> > +    TotalAttSize = lists:foldl(fun({_, S}, A) -> S + A end, 0,
> FinalAtts),
> >     InfoFlushed = InfoUnflushed#full_doc_info{
> >         rev_tree = Flushed,
> > -        leafs_size = LeafsSize
> > +        sizes = {FinalAS + TotalAttSize, FinalES + TotalAttSize}
> >     },
> >     flush_trees(Db, RestUnflushed, [InfoFlushed | AccFlushed]).
> >
> > +add_sizes(branch, _, Acc) ->
> > +    Acc;
> > +add_sizes(leaf, #leaf{sizes=Sizes, atts=AttSizes}, Acc) ->
> > +    {ActiveSize, ExternalSize} = upgrade_sizes(Sizes),
> > +    {ASAcc, ESAcc, AttsAcc} = Acc,
> > +    NewASAcc = ActiveSize + ASAcc,
> > +    NewESAcc = ExternalSize + ESAcc,
> > +    NewAttsAcc = lists:umerge(AttSizes, AttsAcc),
> > +    {NewASAcc, NewESAcc, NewAttsAcc}.
> >
> > send_result(Client, Ref, NewResult) ->
> >     % used to send a result to the client
> > @@ -896,23 +944,34 @@ copy_docs(Db, #db{fd = DestFd} = NewDb, MixedInfos,
> > Retry) ->
> >         A =< B
> >     end, merge_lookups(MixedInfos, LookupResults)),
> >
> > -    NewInfos1 = lists:map(
> > -        fun(#full_doc_info{rev_tree=RevTree}=Info) ->
> > -            Info#full_doc_info{rev_tree=couch_key_tree:map(
> > -                fun(_, _, branch) ->
> > -                    ?REV_MISSING;
> > -                (_Rev, #leaf{ptr=Sp}=Leaf, leaf) ->
> > -                    {_Body, AttsInfo} = Summary = copy_doc_attachments(
> > -                        Db, Sp, DestFd),
> > -                    SummaryChunk = make_doc_summary(NewDb, Summary),
> > -                    {ok, Pos, SummarySize} =
> couch_file:append_raw_chunk(
> > -                        DestFd, SummaryChunk),
> > -                    TotalLeafSize = lists:foldl(
> > -                        fun({_, _, _, AttLen, _, _, _, _}, S) -> S +
> > AttLen end,
> > -                        SummarySize, AttsInfo),
> > -                    Leaf#leaf{ptr=Pos, size=TotalLeafSize}
> > -                end, RevTree)}
> > -        end, NewInfos0),
> > +    NewInfos1 = lists:map(fun(Info) ->
> > +        {NewRevTree, FinalAcc} = couch_key_tree:mapfold(fun
> > +            (_Rev, #leaf{ptr=Sp}=Leaf, leaf, SizesAcc) ->
> > +                {Body, AttInfos} = copy_doc_attachments(Db, Sp, DestFd),
> > +                Summary = make_doc_summary(NewDb, {Body, AttInfos}),
> > +                [_, _, SummaryBin] = Summary,
> > +                ExternalSize =
> > couch_compress:uncompressed_length(SummaryBin),
> > +                {ok, Pos, ActiveSize}
> > +                    = couch_file:append_raw_chunk(DestFd, Summary),
> > +                AttSizes = [{element(3, A), element(4, A)} || A <-
> > AttInfos],
> > +                NewLeaf = Leaf#leaf{
> > +                    ptr = Pos,
> > +                    sizes = {ActiveSize, ExternalSize},
> > +                    atts = lists:usort(AttSizes)
> > +                },
> > +                {NewLeaf, add_sizes(leaf, NewLeaf, SizesAcc)};
> > +            (_Rev, _Value, branch, SizesAcc) ->
> > +                {?REV_MISSING, SizesAcc}
> > +        end, {0, 0, []}, Info#full_doc_info.rev_tree),
> > +        {FinalAS, FinalES, FinalAtts} = FinalAcc,
> > +        TotalAttSize = lists:foldl(fun({_, S}, A) -> S + A end, 0,
> > FinalAtts),
> > +        NewActiveSize = FinalAS + TotalAttSize,
> > +        NewExternalSize = FinalES + TotalAttSize,
> > +        Info#full_doc_info{
> > +            rev_tree = NewRevTree,
> > +            sizes = {NewActiveSize, NewExternalSize}
> > +        }
> > +    end, NewInfos0),
> >
> >     NewInfos = stem_full_doc_infos(Db, NewInfos1),
> >     RemoveSeqs =
> >
> >
> >
> >
> > --
> > Andy Wenk
> > Hamburg - Germany
> > RockIt!
> >
> > http://www.couchdb-buch.de
> > http://www.pg-praxisbuch.de
> >
> > GPG fingerprint: C044 8322 9E12 1483 4FEC 9452 B65D 6BE3 9ED3 9588
> >
> > https://people.apache.org/keys/committer/andywenk.asc
>
>


-- 
Andy Wenk
Hamburg - Germany
RockIt!

http://www.couchdb-buch.de
http://www.pg-praxisbuch.de

GPG fingerprint: C044 8322 9E12 1483 4FEC 9452 B65D 6BE3 9ED3 9588

https://people.apache.org/keys/committer/andywenk.asc

Re: couch commit: updated refs/heads/2001-feature-external-size to d9d11a9

Posted by Robert Samuel Newson <rn...@apache.org>.
Clear commit messages are something we’ve committed to in the past and will be a required practice once we get the merges nailed down. The review process will include ensuring that commits follow the standard we’ve agreed to.

B.

On 11 Feb 2014, at 08:13, Andy Wenk <an...@nms.de> wrote:

> This is not a rant but a selfish wish :)
> 
> I wish all commit messages were as explanatory as the one below. As
> I am not really deep into the Erlang code but would love to understand
> more, a good way for me is to read all the commit messages. But mostly we
> have one-liners. Reading the code is OK, but with an explanatory message it
> is clearer.
> 
> Maybe you could keep this in mind when you write your next commit message,
> to help me and others :)
> 
> Thanks a lot
> 
> Cheers
> 
> Andy
> 
> ---------- Forwarded message ----------
> From: <da...@apache.org>
> Date: 10 February 2014 22:54
> Subject: couch commit: updated refs/heads/2001-feature-external-size to
> d9d11a9
> To: commits@couchdb.apache.org
> 
> 
> Updated Branches:
>  refs/heads/2001-feature-external-size [created] d9d11a943
> 
> 
> Implement database external size calculations
> 
> This patch adds calculations to show the "external size" of a database
> which is roughly a measure of how much disk space it would take to store
> the contents of the database in flat files. It is used to calculate
> rough compression ratios for capacity planning.
> 
> COUCHDB-2001
> 
> 
> Project: http://git-wip-us.apache.org/repos/asf/couchdb-couch/repo
> Commit: http://git-wip-us.apache.org/repos/asf/couchdb-couch/commit/d9d11a94
> Tree: http://git-wip-us.apache.org/repos/asf/couchdb-couch/tree/d9d11a94
> Diff: http://git-wip-us.apache.org/repos/asf/couchdb-couch/diff/d9d11a94
> 
> Branch: refs/heads/2001-feature-external-size
> Commit: d9d11a943e2815897ee5b5b3d1b1eef4b417c1fa
> Parents: 09c6556
> Author: Paul J. Davis <pa...@gmail.com>
> Authored: Mon Feb 10 14:03:47 2014 -0600
> Committer: Paul J. Davis <pa...@gmail.com>
> Committed: Mon Feb 10 15:32:21 2014 -0600
> 
> ----------------------------------------------------------------------
> include/couch_db.hrl     |   5 +-
> src/couch_btree.erl      |  10 +-
> src/couch_compress.erl   |  10 ++
> src/couch_db.erl         |  56 +++++-----
> src/couch_db_updater.erl | 237 ++++++++++++++++++++++++++----------------
> 5 files changed, 196 insertions(+), 122 deletions(-)
> ----------------------------------------------------------------------
> 
> 
> http://git-wip-us.apache.org/repos/asf/couchdb-couch/blob/d9d11a94/include/couch_db.hrl
> ----------------------------------------------------------------------
> diff --git a/include/couch_db.hrl b/include/couch_db.hrl
> index 2c015df..2ce5ebe 100644
> --- a/include/couch_db.hrl
> +++ b/include/couch_db.hrl
> @@ -65,7 +65,7 @@
>     update_seq = 0,
>     deleted = false,
>     rev_tree = [],
> -    leafs_size = 0
> +    sizes = {0, 0}
>     }).
> 
> -record(httpd,
> @@ -251,6 +251,7 @@
>     deleted,
>     ptr,
>     seq,
> -    size = nil
> +    sizes = {0, 0},
> +    atts = []
> }).
> 
> 
> http://git-wip-us.apache.org/repos/asf/couchdb-couch/blob/d9d11a94/src/couch_btree.erl
> ----------------------------------------------------------------------
> diff --git a/src/couch_btree.erl b/src/couch_btree.erl
> index 9caceb8..ac5681d 100644
> --- a/src/couch_btree.erl
> +++ b/src/couch_btree.erl
> @@ -61,8 +61,8 @@ final_reduce(#btree{reduce=Reduce}, Val) ->
>     final_reduce(Reduce, Val);
> final_reduce(Reduce, {[], []}) ->
>     Reduce(reduce, []);
> -final_reduce(_Bt, {[], [Red]}) ->
> -    Red;
> +final_reduce(Reduce, {[], [Red]}) ->
> +    Reduce(rereduce, [Red]);
> final_reduce(Reduce, {[], Reductions}) ->
>     Reduce(rereduce, Reductions);
> final_reduce(Reduce, {KVs, Reductions}) ->
> @@ -92,14 +92,14 @@ fold_reduce(#btree{root=Root}=Bt, Fun, Acc, Options) ->
> 
> full_reduce(#btree{root=nil,reduce=Reduce}) ->
>     {ok, Reduce(reduce, [])};
> -full_reduce(#btree{root=Root}) ->
> -    {ok, element(2, Root)}.
> +full_reduce(#btree{root=Root, reduce=Reduce}) ->
> +    {ok, Reduce(rereduce, [element(2, Root)])}.
> 
> size(#btree{root = nil}) ->
>     0;
> size(#btree{root = {_P, _Red}}) ->
>     % pre 1.2 format
> -    nil;
> +    undefined;
> size(#btree{root = {_P, _Red, Size}}) ->
>     Size.
> 
> 
> http://git-wip-us.apache.org/repos/asf/couchdb-couch/blob/d9d11a94/src/couch_compress.erl
> ----------------------------------------------------------------------
> diff --git a/src/couch_compress.erl b/src/couch_compress.erl
> index 6b47a7a..49876d8 100644
> --- a/src/couch_compress.erl
> +++ b/src/couch_compress.erl
> @@ -14,6 +14,7 @@
> 
> -export([compress/2, decompress/1, is_compressed/2]).
> -export([get_compression_method/0]).
> +-export([uncompressed_length/1]).
> 
> -include_lib("couch/include/couch_db.hrl").
> 
> @@ -82,3 +83,12 @@ is_compressed(<<?TERM_PREFIX, _/binary>>, Method) ->
> is_compressed(Term, _Method) when not is_binary(Term) ->
>     false.
> 
> +
> +uncompressed_length(<<?SNAPPY_PREFIX, _/binary>> = Bin) ->
> +    snappy:uncompressed_length(Bin);
> +uncompressed_length(<<?COMPRESSED_TERM_PREFIX, _/binary>> = Bin) ->
> +    <<131, 80, Size:4/big-unsigned-integer-unit:8, _/binary>> = Bin,
> +    Size;
> +uncompressed_length(<<?TERM_PREFIX, _/binary>> = Bin) ->
> +    size(Bin).
> +
> 
> http://git-wip-us.apache.org/repos/asf/couchdb-couch/blob/d9d11a94/src/couch_db.erl
> ----------------------------------------------------------------------
> diff --git a/src/couch_db.erl b/src/couch_db.erl
> index 32a0049..6432e8e 100644
> --- a/src/couch_db.erl
> +++ b/src/couch_db.erl
> @@ -302,44 +302,48 @@ get_db_info(Db) ->
>         name=Name,
>         instance_start_time=StartTime,
>         committed_update_seq=CommittedUpdateSeq,
> -        id_tree = IdBtree,
> -        seq_tree = SeqBtree,
> -        local_tree = LocalBtree
> +        id_tree = IdBtree
>     } = Db,
> -    {ok, Size} = couch_file:bytes(Fd),
> -    {ok, DbReduction} = couch_btree:full_reduce(IdBtree),
> +    {ok, FileSize} = couch_file:bytes(Fd),
> +    {ok, {Count, DelCount, Sizes}} = couch_btree:full_reduce(IdBtree),
> +    {ActiveSize0, ExternalSize} = Sizes,
> +    ActiveSize = active_size(Db, ActiveSize0),
>     InfoList = [
>         {db_name, Name},
> -        {doc_count, element(1, DbReduction)},
> -        {doc_del_count, element(2, DbReduction)},
> +        {doc_count, Count},
> +        {doc_del_count, DelCount},
>         {update_seq, SeqNum},
>         {purge_seq, couch_db:get_purge_seq(Db)},
>         {compact_running, Compactor/=nil},
> -        {disk_size, Size},
> -        {data_size, db_data_size(DbReduction, [SeqBtree, IdBtree,
> LocalBtree])},
> +        {disk_size, FileSize},
> +        {data_size, ActiveSize},
> +        {sizes, {[
> +            {file, FileSize},
> +            {active, ActiveSize},
> +            {external, ExternalSize}
> +        ]}},
>         {instance_start_time, StartTime},
>         {disk_format_version, DiskVersion},
>         {committed_update_seq, CommittedUpdateSeq}
>         ],
>     {ok, InfoList}.
> 
> -db_data_size({_Count, _DelCount}, _Trees) ->
> -    % pre 1.2 format, upgraded on compaction
> -    null;
> -db_data_size({_Count, _DelCount, nil}, _Trees) ->
> -    null;
> -db_data_size({_Count, _DelCount, DocAndAttsSize}, Trees) ->
> -    sum_tree_sizes(DocAndAttsSize, Trees).
> -
> -sum_tree_sizes(Acc, []) ->
> -    Acc;
> -sum_tree_sizes(Acc, [T | Rest]) ->
> -    case couch_btree:size(T) of
> -    nil ->
> -        null;
> -    Sz ->
> -        sum_tree_sizes(Acc + Sz, Rest)
> -    end.
> +active_size(#db{}=Db, DocActiveSize) ->
> +    Trees = [
> +        Db#db.id_tree,
> +        Db#db.seq_tree,
> +        Db#db.local_tree
> +    ],
> +    lists:foldl(fun(T, Acc) ->
> +        case couch_btree:size(T) of
> +            _ when Acc == null ->
> +                null;
> +            undefined ->
> +                null;
> +            Size ->
> +                Acc + Size
> +        end
> +    end, DocActiveSize, Trees).
> 
> get_design_docs(#db{name = <<"shards/", _:18/binary, DbName/binary>>}) ->
>     {_, Ref} = spawn_monitor(fun() -> exit(fabric:design_docs(DbName))
> end),
> 
> http://git-wip-us.apache.org/repos/asf/couchdb-couch/blob/d9d11a94/src/couch_db_updater.erl
> ----------------------------------------------------------------------
> diff --git a/src/couch_db_updater.erl b/src/couch_db_updater.erl
> index 649826a..eb75177 100644
> --- a/src/couch_db_updater.erl
> +++ b/src/couch_db_updater.erl
> @@ -342,40 +342,60 @@ collect_updates(GroupedDocsAcc, ClientsAcc,
> MergeConflicts, FullCommit) ->
>     end.
> 
> rev_tree(DiskTree) ->
> -    couch_key_tree:mapfold(fun
> -        (_RevId, {IsDeleted, BodyPointer, UpdateSeq}, leaf, _Acc) ->
> +    couch_key_tree:map(fun
> +        (_RevId, {Del, Ptr, Seq}) ->
>             % pre 1.2 format, will be upgraded on compaction
> -            {#leaf{deleted=?i2b(IsDeleted), ptr=BodyPointer,
> seq=UpdateSeq}, nil};
> -        (_RevId, {IsDeleted, BodyPointer, UpdateSeq}, branch, Acc) ->
> -            {#leaf{deleted=?i2b(IsDeleted), ptr=BodyPointer,
> seq=UpdateSeq}, Acc};
> -        (_RevId, {IsDeleted, BodyPointer, UpdateSeq, Size}, leaf, Acc) ->
> -            Acc2 = sum_leaf_sizes(Acc, Size),
> -            {#leaf{deleted=?i2b(IsDeleted), ptr=BodyPointer,
> seq=UpdateSeq, size=Size}, Acc2};
> -        (_RevId, {IsDeleted, BodyPointer, UpdateSeq, Size}, branch, Acc) ->
> -            {#leaf{deleted=?i2b(IsDeleted), ptr=BodyPointer,
> seq=UpdateSeq, size=Size}, Acc};
> -        (_RevId, ?REV_MISSING, _Type, Acc) ->
> -            {?REV_MISSING, Acc}
> -    end, 0, DiskTree).
> +            #leaf{deleted=?i2b(Del), ptr=Ptr, seq=Seq};
> +        (_RevId, {Del, Ptr, Seq, Size}) ->
> +            % Pre-bigcouch format, will be upgraded on compaction
> +            #leaf{
> +                deleted = ?i2b(Del),
> +                ptr = Ptr,
> +                seq = Seq,
> +                sizes = {Size, 0},
> +                atts = []
> +            };
> +        (_RevId, {Del, Ptr, Seq, Sizes, Atts}) ->
> +            #leaf{
> +                deleted = ?i2b(Del),
> +                ptr = Ptr,
> +                seq = Seq,
> +                sizes = Sizes,
> +                atts = Atts
> +            };
> +        (_RevId, ?REV_MISSING) ->
> +            ?REV_MISSING
> +    end, DiskTree).
> 
> disk_tree(RevTree) ->
>     couch_key_tree:map(fun
>         (_RevId, ?REV_MISSING) ->
>             ?REV_MISSING;
> -        (_RevId, #leaf{deleted=IsDeleted, ptr=BodyPointer, seq=UpdateSeq,
> size=Size}) ->
> -            {?b2i(IsDeleted), BodyPointer, UpdateSeq, Size}
> +        (_RevId, #leaf{}=Leaf) ->
> +            #leaf{
> +                deleted = Del,
> +                ptr = Ptr,
> +                seq = Seq,
> +                sizes = Sizes,
> +                atts = Atts
> +            } = Leaf,
> +            {?b2i(Del), Ptr, Seq, upgrade_sizes(Sizes), Atts}
>     end, RevTree).
> 
> +upgrade_sizes({_, _} = Sizes) ->
> +    Sizes;
> +upgrade_sizes(S) when is_integer(S) ->
> +    {S, 0}.
> +
> btree_by_seq_split(#full_doc_info{id=Id, update_seq=Seq, deleted=Del,
> rev_tree=T}) ->
>     {Seq, {Id, ?b2i(Del), disk_tree(T)}}.
> 
> btree_by_seq_join(Seq, {Id, Del, DiskTree}) when is_integer(Del) ->
> -    {RevTree, LeafsSize} = rev_tree(DiskTree),
>     #full_doc_info{
>         id = Id,
>         update_seq = Seq,
>         deleted = ?i2b(Del),
> -        rev_tree = RevTree,
> -        leafs_size = LeafsSize
> +        rev_tree = rev_tree(DiskTree)
>     };
> btree_by_seq_join(KeySeq, {Id, RevInfos, DeletedRevInfos}) ->
>     % Older versions stored #doc_info records in the seq_tree.
> @@ -389,49 +409,59 @@ btree_by_seq_join(KeySeq, {Id, RevInfos,
> DeletedRevInfos}) ->
>             [#rev_info{rev=Rev,seq=Seq,deleted=true,body_sp = Bp} ||
>                 {Rev, Seq, Bp} <- DeletedRevInfos]}.
> 
> -btree_by_id_split(#full_doc_info{id=Id, update_seq=Seq,
> -        deleted=Deleted, rev_tree=Tree}) ->
> -    {Id, {Seq, ?b2i(Deleted), disk_tree(Tree)}}.
> +btree_by_id_split(#full_doc_info{}=Info) ->
> +    #full_doc_info{
> +        id = Id,
> +        update_seq = Seq,
> +        deleted = Del,
> +        sizes = Sizes,
> +        rev_tree = Tree
> +    } = Info,
> +    {Id, {Seq, ?b2i(Del), upgrade_sizes(Sizes), disk_tree(Tree)}}.
> 
> btree_by_id_join(Id, {HighSeq, Deleted, DiskTree}) ->
> -    {Tree, LeafsSize} = rev_tree(DiskTree),
> +    % Upgrade from pre-BigCouch disk format
> +    ActiveSize = couch_key_tree:fold(fun
> +        (_RevId, {_Del, _Ptr, _Seq}, _, Acc) ->
> +            Acc;
> +        (_RevId, {_Del, _Ptr, _Seq, Size}, _, Acc) ->
> +            Acc + Size;
> +        (_RevId, {_Del, _Ptr, _Seq, Sizes, _Atts}, _, Acc) ->
> +            {Active, _} = Sizes,
> +            Active + Acc;
> +        (_RevId, ?REV_MISSING, _, Acc) ->
> +            Acc
> +    end, 0, DiskTree),
> +    btree_by_id_join(Id, {HighSeq, Deleted, {ActiveSize, 0}, DiskTree});
> +
> +btree_by_id_join(Id, {HighSeq, Deleted, Sizes, DiskTree}) ->
>     #full_doc_info{
>         id = Id,
>         update_seq = HighSeq,
>         deleted = ?i2b(Deleted),
> -        rev_tree = Tree,
> -        leafs_size = LeafsSize
> +        sizes = Sizes,
> +        rev_tree = rev_tree(DiskTree)
>     }.
> 
> btree_by_id_reduce(reduce, FullDocInfos) ->
> -    lists:foldl(
> -        fun(Info, {NotDeleted, Deleted, Size}) ->
> -            Size2 = sum_leaf_sizes(Size, Info#full_doc_info.leafs_size),
> -            case Info#full_doc_info.deleted of
> -            true ->
> -                {NotDeleted, Deleted + 1, Size2};
> -            false ->
> -                {NotDeleted + 1, Deleted, Size2}
> -            end
> -        end,
> -        {0, 0, 0}, FullDocInfos);
> -btree_by_id_reduce(rereduce, Reds) ->
> -    lists:foldl(
> -        fun({NotDeleted, Deleted}, {AccNotDeleted, AccDeleted, _AccSize})
> ->
> -            % pre 1.2 format, will be upgraded on compaction
> -            {AccNotDeleted + NotDeleted, AccDeleted + Deleted, nil};
> -        ({NotDeleted, Deleted, Size}, {AccNotDeleted, AccDeleted,
> AccSize}) ->
> -            AccSize2 = sum_leaf_sizes(AccSize, Size),
> -            {AccNotDeleted + NotDeleted, AccDeleted + Deleted, AccSize2}
> -        end,
> -        {0, 0, 0}, Reds).
> -
> -sum_leaf_sizes(nil, _) ->
> -    nil;
> -sum_leaf_sizes(_, nil) ->
> -    nil;
> -sum_leaf_sizes(Size1, Size2) ->
> -    Size1 + Size2.
> +    lists:foldl(fun
> +        (#full_doc_info{deleted=false, sizes=Sizes}, {NotDel, Del, SAcc})
> ->
> +            {NotDel + 1, Del, reduce_sizes(Sizes, SAcc)};
> +        (#full_doc_info{deleted=true, sizes=Sizes}, {NotDel, Del, SAcc}) ->
> +            {NotDel, Del + 1, reduce_sizes(Sizes, SAcc)}
> +    end, {0, 0, {0, 0}}, FullDocInfos);
> +btree_by_id_reduce(rereduce, Reductions) ->
> +    lists:foldl(fun
> +        ({NotDel, Del}, {NDAcc, DAcc, SAcc}) ->
> +            {NotDel + NDAcc, Del + DAcc, SAcc};
> +        ({NotDel, Del, Sizes}, {NDAcc, DAcc, SAcc}) ->
> +            {NotDel + NDAcc, Del + DAcc, reduce_sizes(Sizes, SAcc)}
> +    end, {0, 0, {0, 0}}, Reductions).
> +
> +reduce_sizes({A1, E1}, {A2, E2}) ->
> +    {A1 + A2, E1 + E2};
> +reduce_sizes(S, {_, _} = Acc) when is_integer(S) ->
> +    reduce_sizes({S, 0}, Acc).
> 
> btree_by_seq_reduce(reduce, DocInfos) ->
>     % count the number of documents
> @@ -549,10 +579,15 @@ flush_trees(_Db, [], AccFlushedTrees) ->
> flush_trees(#db{fd = Fd} = Db,
>         [InfoUnflushed | RestUnflushed], AccFlushed) ->
>     #full_doc_info{update_seq=UpdateSeq, rev_tree=Unflushed} =
> InfoUnflushed,
> -    {Flushed, LeafsSize} = couch_key_tree:mapfold(
> +    {Flushed, FinalAcc} = couch_key_tree:mapfold(
>         fun(_Rev, Value, Type, Acc) ->
>             case Value of
> -            #doc{deleted = IsDeleted, body = {summary, Summary, AttsFd}} ->
> +            #doc{} = Doc ->
> +                #doc{
> +                    deleted = IsDeleted,
> +                    body = {summary, Summary, AttsFd},
> +                    atts = Atts
> +                } = Doc,
>                 % this node value is actually an unwritten document
> summary,
>                 % write to disk.
>                 % make sure the Fd in the written bins is the same Fd we
> are
> @@ -571,31 +606,44 @@ flush_trees(#db{fd = Fd} = Db,
>                             " changed. Possibly retrying.", []),
>                     throw(retry)
>                 end,
> -                {ok, NewSummaryPointer, SummarySize} =
> -                    couch_file:append_raw_chunk(Fd, Summary),
> -                TotalSize = lists:foldl(
> -                    fun(#att{att_len = L}, A) -> A + L end,
> -                    SummarySize, Value#doc.atts),
> -                NewValue = #leaf{deleted=IsDeleted, ptr=NewSummaryPointer,
> -                                 seq=UpdateSeq, size=TotalSize},
> -                case Type of
> -                leaf ->
> -                    {NewValue, Acc + TotalSize};
> -                branch ->
> -                    {NewValue, Acc}
> -                end;
> -             {_, _, _, LeafSize} when Type =:= leaf, LeafSize =/= nil ->
> -                {Value, Acc + LeafSize};
> -             _ ->
> +                AttsInfo = lists:usort([
> +                        {P, L} || #att{data = {_, P}, att_len = L} <- Atts
> +                    ]),
> +                [_, _, SummaryBin] = Summary,
> +                ExternalSize =
> couch_compress:uncompressed_length(SummaryBin),
> +                {ok, NewPtr, ActiveSize}
> +                    = couch_file:append_raw_chunk(Fd, Summary),
> +                Leaf = #leaf{
> +                    deleted = IsDeleted,
> +                    ptr = NewPtr,
> +                    seq = UpdateSeq,
> +                    sizes = {ActiveSize, ExternalSize},
> +                    atts = AttsInfo
> +                },
> +                {Leaf, add_sizes(Type, Leaf, Acc)};
> +            #leaf{} = Leaf ->
> +                {Value, add_sizes(Type, Leaf, Acc)};
> +             ?REV_MISSING ->
>                 {Value, Acc}
>             end
> -        end, 0, Unflushed),
> +        end, {0, 0, []}, Unflushed),
> +    {FinalAS, FinalES, FinalAtts} = FinalAcc,
> +    TotalAttSize = lists:foldl(fun({_, S}, A) -> S + A end, 0, FinalAtts),
>     InfoFlushed = InfoUnflushed#full_doc_info{
>         rev_tree = Flushed,
> -        leafs_size = LeafsSize
> +        sizes = {FinalAS + TotalAttSize, FinalES + TotalAttSize}
>     },
>     flush_trees(Db, RestUnflushed, [InfoFlushed | AccFlushed]).
> 
> +add_sizes(branch, _, Acc) ->
> +    Acc;
> +add_sizes(leaf, #leaf{sizes=Sizes, atts=AttSizes}, Acc) ->
> +    {ActiveSize, ExternalSize} = upgrade_sizes(Sizes),
> +    {ASAcc, ESAcc, AttsAcc} = Acc,
> +    NewASAcc = ActiveSize + ASAcc,
> +    NewESAcc = ExternalSize + ESAcc,
> +    NewAttsAcc = lists:umerge(AttSizes, AttsAcc),
> +    {NewASAcc, NewESAcc, NewAttsAcc}.
> 
> send_result(Client, Ref, NewResult) ->
>     % used to send a result to the client
> @@ -896,23 +944,34 @@ copy_docs(Db, #db{fd = DestFd} = NewDb, MixedInfos,
> Retry) ->
>         A =< B
>     end, merge_lookups(MixedInfos, LookupResults)),
> 
> -    NewInfos1 = lists:map(
> -        fun(#full_doc_info{rev_tree=RevTree}=Info) ->
> -            Info#full_doc_info{rev_tree=couch_key_tree:map(
> -                fun(_, _, branch) ->
> -                    ?REV_MISSING;
> -                (_Rev, #leaf{ptr=Sp}=Leaf, leaf) ->
> -                    {_Body, AttsInfo} = Summary = copy_doc_attachments(
> -                        Db, Sp, DestFd),
> -                    SummaryChunk = make_doc_summary(NewDb, Summary),
> -                    {ok, Pos, SummarySize} = couch_file:append_raw_chunk(
> -                        DestFd, SummaryChunk),
> -                    TotalLeafSize = lists:foldl(
> -                        fun({_, _, _, AttLen, _, _, _, _}, S) -> S + AttLen end,
> -                        SummarySize, AttsInfo),
> -                    Leaf#leaf{ptr=Pos, size=TotalLeafSize}
> -                end, RevTree)}
> -        end, NewInfos0),
> +    NewInfos1 = lists:map(fun(Info) ->
> +        {NewRevTree, FinalAcc} = couch_key_tree:mapfold(fun
> +            (_Rev, #leaf{ptr=Sp}=Leaf, leaf, SizesAcc) ->
> +                {Body, AttInfos} = copy_doc_attachments(Db, Sp, DestFd),
> +                Summary = make_doc_summary(NewDb, {Body, AttInfos}),
> +                [_, _, SummaryBin] = Summary,
> +                ExternalSize = couch_compress:uncompressed_length(SummaryBin),
> +                {ok, Pos, ActiveSize}
> +                    = couch_file:append_raw_chunk(DestFd, Summary),
> +                AttSizes = [{element(3, A), element(4, A)} || A <- AttInfos],
> +                NewLeaf = Leaf#leaf{
> +                    ptr = Pos,
> +                    sizes = {ActiveSize, ExternalSize},
> +                    atts = lists:usort(AttSizes)
> +                },
> +                {NewLeaf, add_sizes(leaf, NewLeaf, SizesAcc)};
> +            (_Rev, _Value, branch, SizesAcc) ->
> +                {?REV_MISSING, SizesAcc}
> +        end, {0, 0, []}, Info#full_doc_info.rev_tree),
> +        {FinalAS, FinalES, FinalAtts} = FinalAcc,
> +        TotalAttSize = lists:foldl(fun({_, S}, A) -> S + A end, 0, FinalAtts),
> +        NewActiveSize = FinalAS + TotalAttSize,
> +        NewExternalSize = FinalES + TotalAttSize,
> +        Info#full_doc_info{
> +            rev_tree = NewRevTree,
> +            sizes = {NewActiveSize, NewExternalSize}
> +        }
> +    end, NewInfos0),
> 
>     NewInfos = stem_full_doc_infos(Db, NewInfos1),
>     RemoveSeqs =
> 
> 
> 
> 
> -- 
> Andy Wenk
> Hamburg - Germany
> RockIt!
> 
> http://www.couchdb-buch.de
> http://www.pg-praxisbuch.de
> 
> GPG fingerprint: C044 8322 9E12 1483 4FEC 9452 B65D 6BE3 9ED3 9588
> 
> https://people.apache.org/keys/committer/andywenk.asc