You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@couchdb.apache.org by gi...@git.apache.org on 2017/09/15 14:00:31 UTC

[GitHub] nickva commented on a change in pull request #806: FEEDBACK ONLY: Compactor optimize emsort

nickva commented on a change in pull request #806: FEEDBACK ONLY: Compactor optimize emsort
URL: https://github.com/apache/couchdb/pull/806#discussion_r139156221
 
 

 ##########
 File path: src/couch/src/couch_db_updater.erl
 ##########
 @@ -1065,83 +1039,156 @@ merge_lookups([FDI | RestInfos], Lookups) ->
 check_md5(Md5, Md5) -> ok;
 check_md5(_, _) -> throw(md5_mismatch).
 
-copy_docs(Db, #db{fd = DestFd} = NewDb, MixedInfos, Retry) ->
-    DocInfoIds = [Id || #doc_info{id=Id} <- MixedInfos],
-    LookupResults = couch_btree:lookup(Db#db.id_tree, DocInfoIds),
-    % COUCHDB-968, make sure we prune duplicates during compaction
-    NewInfos0 = lists:usort(fun(#full_doc_info{id=A}, #full_doc_info{id=B}) ->
-        A =< B
-    end, merge_lookups(MixedInfos, LookupResults)),
 
-    NewInfos1 = lists:map(fun(Info) ->
-        {NewRevTree, FinalAcc} = couch_key_tree:mapfold(fun
-            (_Rev, #leaf{ptr=Sp}=Leaf, leaf, SizesAcc) ->
-                {Body, AttInfos} = copy_doc_attachments(Db, Sp, DestFd),
-                % In the future, we should figure out how to do this for
-                % upgrade purposes.
-                EJsonBody = case is_binary(Body) of
-                    true ->
-                        couch_compress:decompress(Body);
-                    false ->
-                        Body
-                end,
-                SummaryChunk = make_doc_summary(NewDb, {Body, AttInfos}),
-                ExternalSize = ?term_size(EJsonBody),
-                {ok, Pos, SummarySize} = couch_file:append_raw_chunk(
-                    DestFd, SummaryChunk),
-                AttSizes = [{element(3,A), element(4,A)} || A <- AttInfos],
-                NewLeaf = Leaf#leaf{
-                    ptr = Pos,
-                    sizes = #size_info{
-                        active = SummarySize,
-                        external = ExternalSize
-                    },
-                    atts = AttSizes
-                },
-                {NewLeaf, add_sizes(leaf, NewLeaf, SizesAcc)};
-            (_Rev, _Leaf, branch, SizesAcc) ->
-                {?REV_MISSING, SizesAcc}
-        end, {0, 0, []}, Info#full_doc_info.rev_tree),
-        {FinalAS, FinalES, FinalAtts} = FinalAcc,
-        TotalAttSize = lists:foldl(fun({_, S}, A) -> S + A end, 0, FinalAtts),
-        NewActiveSize = FinalAS + TotalAttSize,
-        NewExternalSize = FinalES + TotalAttSize,
-        Info#full_doc_info{
-            rev_tree = NewRevTree,
-            sizes = #size_info{
-                active = NewActiveSize,
-                external = NewExternalSize
-            }
-        }
-    end, NewInfos0),
+-ifdef(TEST).
+-define(COMP_EVENT(Name),
+        couch_db_updater_ev:event(Name)).
+-else.
+-define(COMP_EVENT(Name), ignore).
+-endif.
 
-    NewInfos = stem_full_doc_infos(Db, NewInfos1),
-    RemoveSeqs =
-    case Retry of
-    nil ->
-        [];
-    OldDocIdTree ->
-        % Compaction is being rerun to catch up to writes during the
-        % first pass. This means we may have docs that already exist
-        % in the seq_tree in the .data file. Here we lookup any old
-        % update_seqs so that they can be removed.
-        Ids = [Id || #full_doc_info{id=Id} <- NewInfos],
-        Existing = couch_btree:lookup(OldDocIdTree, Ids),
-        [Seq || {ok, #full_doc_info{update_seq=Seq}} <- Existing]
+
+start_copy_compact(#db{}=Db) ->
+    erlang:put(io_priority, {db_compact, Db#db.name}),
+    couch_log:debug("Compaction process spawned for db \"~s\"", [Db#db.name]),
+
+    ?COMP_EVENT(init),
+    {ok, InitCompSt} = open_compaction_files(Db),
+    ?COMP_EVENT(files_opened),
+
+    Stages = [
 
 Review comment:
   Nice refactoring here. I like the pattern.
 
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services