You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@couchdb.apache.org by davisp <gi...@git.apache.org> on 2017/03/30 15:06:37 UTC

[GitHub] couchdb-couch pull request #241: Improve compaction task status updates

GitHub user davisp opened a pull request:

    https://github.com/apache/couchdb-couch/pull/241

    Improve compaction task status updates

    Previous the emsort related operations did not update the compaction
    task status. For large databases this leads to some very long waits
    while the compaction task stays at 100%. This change adds progress
    reports to the steps for sorting and copying document ids back into the
    database file.

You can merge this pull request into a Git repository by running:

    $ git pull https://github.com/apache/couchdb-couch feat-improve-compaction-task-status

Alternatively you can review and apply these changes as the patch at:

    https://github.com/apache/couchdb-couch/pull/241.patch

To close this pull request, make a commit to your master/trunk branch
with (at least) the following in the commit message:

    This closes #241
    
----
commit 1db1337301a7c897be41e13cd328270509008478
Author: Paul J. Davis <pa...@gmail.com>
Date:   2017-03-28T21:21:38Z

    Improve compaction task status updates
    
    Previous the emsort related operations did not update the compaction
    task status. For large databases this leads to some very long waits
    while the compaction task stays at 100%. This change adds progress
    reports to the steps for sorting and copying document ids back into the
    database file.

----


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] couchdb-couch pull request #241: Improve compaction task status updates

Posted by iilyak <gi...@git.apache.org>.
Github user iilyak commented on a diff in the pull request:

    https://github.com/apache/couchdb-couch/pull/241#discussion_r109047926
  
    --- Diff: src/couch_db_updater.erl ---
    @@ -1323,12 +1326,82 @@ bind_id_tree(Db, Fd, State) ->
         Db#db{id_tree=IdBtree}.
     
     
    -sort_meta_data(Db0) ->
    -    {ok, Ems} = couch_emsort:merge(Db0#db.id_tree),
    -    Db0#db{id_tree=Ems}.
    +sort_meta_data(Db0, DocCount) ->
    +    couch_task_status:update([
    +        {phase, sort_ids_init},
    +        {total_changes, DocCount},
    +        {changes_done, 0},
    +        {progress, 0}
    +    ]),
    +    Ems0 = Db0#db.id_tree,
    +    Options = [
    +        {event_cb, fun emsort_cb/3},
    +        {event_st, {init, 0, 0}}
    +    ],
    +    Ems1 = couch_emsort:set_options(Ems0, Options),
    +    {ok, Ems2} = couch_emsort:merge(Ems1),
    +    Db0#db{id_tree=Ems2}.
    +
    +
    +emsort_cb(_Ems, {merge, chain}, {init, Copied, Nodes}) ->
    +    {init, Copied, Nodes + 1};
    +emsort_cb(_Ems, row_copy, {init, Copied, Nodes}) when Copied >= 1000 ->
    +    update_compact_task(Copied + 1),
    +    {init, 0, Nodes};
    +emsort_cb(_Ems, row_copy, {init, Copied, Nodes}) ->
    +    {init, Copied + 1, Nodes};
    +emsort_cb(Ems, {merge_start, reverse}, {init, Copied, Nodes}) ->
    --- End diff --
    
    It looks like we wouldn't have a progress updates for the second pass. Is it intentional?


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] couchdb-couch pull request #241: Improve compaction task status updates

Posted by iilyak <gi...@git.apache.org>.
Github user iilyak commented on a diff in the pull request:

    https://github.com/apache/couchdb-couch/pull/241#discussion_r109013211
  
    --- Diff: src/couch_db_updater.erl ---
    @@ -1323,12 +1326,82 @@ bind_id_tree(Db, Fd, State) ->
         Db#db{id_tree=IdBtree}.
     
     
    -sort_meta_data(Db0) ->
    -    {ok, Ems} = couch_emsort:merge(Db0#db.id_tree),
    -    Db0#db{id_tree=Ems}.
    +sort_meta_data(Db0, DocCount) ->
    +    couch_task_status:update([
    +        {phase, sort_ids_init},
    +        {total_changes, DocCount},
    +        {changes_done, 0},
    +        {progress, 0}
    +    ]),
    +    Ems0 = Db0#db.id_tree,
    +    Options = [
    +        {event_cb, fun emsort_cb/3},
    +        {event_st, {init, 0, 0}}
    +    ],
    +    Ems1 = couch_emsort:set_options(Ems0, Options),
    +    {ok, Ems2} = couch_emsort:merge(Ems1),
    +    Db0#db{id_tree=Ems2}.
    +
    +
    +emsort_cb(_Ems, {merge, chain}, {init, Copied, Nodes}) ->
    +    {init, Copied, Nodes + 1};
    +emsort_cb(_Ems, row_copy, {init, Copied, Nodes}) when Copied >= 1000 ->
    +    update_compact_task(Copied + 1),
    +    {init, 0, Nodes};
    +emsort_cb(_Ems, row_copy, {init, Copied, Nodes}) ->
    +    {init, Copied + 1, Nodes};
    +emsort_cb(Ems, {merge_start, reverse}, {init, Copied, Nodes}) ->
    +    BBChunkSize = couch_emsort:get_bb_chunk_size(Ems),
    +
    +    % Subtract one because we already finished the first
    +    % iteration when we were counting the number of nodes
    +    % in the backbone.
    +    Iters = calculate_sort_iters(Nodes, BBChunkSize, 0) - 1,
    +
    +    % Compaction retries mean we may have copied more than
    +    % doc count rows. This accounts for that by using the
    +    % number we've actually copied.
    +    [PrevCopied] = couch_task_status:get([changes_done]),
    +    TotalCopied = PrevCopied + Copied,
    +
    +    couch_task_status:update([
    +        {phase, sort_ids},
    +        {total_changes, Iters * TotalCopied},
    +        {changes_done, 0},
    +        {progress, 0}
    +    ]),
    +    0;
    +
    +emsort_cb(_Ems, row_copy, Copied) when is_integer(Copied), Copied > 1000 ->
    --- End diff --
    
    Magic constant 1000.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] couchdb-couch pull request #241: Improve compaction task status updates

Posted by iilyak <gi...@git.apache.org>.
Github user iilyak commented on a diff in the pull request:

    https://github.com/apache/couchdb-couch/pull/241#discussion_r109049344
  
    --- Diff: src/couch_db_updater.erl ---
    @@ -1323,12 +1326,82 @@ bind_id_tree(Db, Fd, State) ->
         Db#db{id_tree=IdBtree}.
     
     
    -sort_meta_data(Db0) ->
    -    {ok, Ems} = couch_emsort:merge(Db0#db.id_tree),
    -    Db0#db{id_tree=Ems}.
    +sort_meta_data(Db0, DocCount) ->
    +    couch_task_status:update([
    +        {phase, sort_ids_init},
    +        {total_changes, DocCount},
    +        {changes_done, 0},
    +        {progress, 0}
    +    ]),
    +    Ems0 = Db0#db.id_tree,
    +    Options = [
    +        {event_cb, fun emsort_cb/3},
    +        {event_st, {init, 0, 0}}
    +    ],
    +    Ems1 = couch_emsort:set_options(Ems0, Options),
    +    {ok, Ems2} = couch_emsort:merge(Ems1),
    +    Db0#db{id_tree=Ems2}.
    +
    +
    +emsort_cb(_Ems, {merge, chain}, {init, Copied, Nodes}) ->
    +    {init, Copied, Nodes + 1};
    +emsort_cb(_Ems, row_copy, {init, Copied, Nodes}) when Copied >= 1000 ->
    --- End diff --
    
    Yes I mean that it would be better to have a define. Like ?UPDATE_FREQ or ?BATCH_SIZE or something.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] couchdb-couch issue #241: Improve compaction task status updates

Posted by davisp <gi...@git.apache.org>.
Github user davisp commented on the issue:

    https://github.com/apache/couchdb-couch/pull/241
  
    Updated to use define's for batch sizes and fixed the changes/doc count mixup.
    
    For the {merge_start, forward}, thats only there for completeness. It felt a bit weird to not include that event even though I don't necessarily need it for this work.
    
    Updates during the second phase happen here:
    
    https://github.com/apache/couchdb-couch/pull/241/files#diff-f6f654ab26b490bab95be6f502c49d89R1376
    
    This is a bit subtle but it seemed like the best I could do without starting to modify the on-disk contents of files which I like to avoid when at all possible.
    
    Its a bit funky but without starting to store the total number of rows in emsort the best approach I had was to guess with the input being the total number of changes processed in this run and then in the first phase of the merge sort just count how many rows we have. Once that first pass is over we can calculate the total number of rows that will be copied and then just update progress in the normal fashion. Its a bit awkward but that emsort_cb kind of switches modes once it sees that the first phase has ended and then its purely just listening for row_copy events.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] couchdb-couch issue #241: Improve compaction task status updates

Posted by davisp <gi...@git.apache.org>.
Github user davisp commented on the issue:

    https://github.com/apache/couchdb-couch/pull/241
  
    And I hesitate to store the total number of rows because there'd then be upgrade things to worry about and generally speaking this will just sort itself out while still giving an approximate progress update.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] couchdb-couch pull request #241: Improve compaction task status updates

Posted by iilyak <gi...@git.apache.org>.
Github user iilyak commented on a diff in the pull request:

    https://github.com/apache/couchdb-couch/pull/241#discussion_r109013180
  
    --- Diff: src/couch_db_updater.erl ---
    @@ -1323,12 +1326,82 @@ bind_id_tree(Db, Fd, State) ->
         Db#db{id_tree=IdBtree}.
     
     
    -sort_meta_data(Db0) ->
    -    {ok, Ems} = couch_emsort:merge(Db0#db.id_tree),
    -    Db0#db{id_tree=Ems}.
    +sort_meta_data(Db0, DocCount) ->
    +    couch_task_status:update([
    +        {phase, sort_ids_init},
    +        {total_changes, DocCount},
    +        {changes_done, 0},
    +        {progress, 0}
    +    ]),
    +    Ems0 = Db0#db.id_tree,
    +    Options = [
    +        {event_cb, fun emsort_cb/3},
    +        {event_st, {init, 0, 0}}
    +    ],
    +    Ems1 = couch_emsort:set_options(Ems0, Options),
    +    {ok, Ems2} = couch_emsort:merge(Ems1),
    +    Db0#db{id_tree=Ems2}.
    +
    +
    +emsort_cb(_Ems, {merge, chain}, {init, Copied, Nodes}) ->
    +    {init, Copied, Nodes + 1};
    +emsort_cb(_Ems, row_copy, {init, Copied, Nodes}) when Copied >= 1000 ->
    --- End diff --
    
    Magic constant 1000.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] couchdb-couch issue #241: Improve compaction task status updates

Posted by davisp <gi...@git.apache.org>.
Github user davisp commented on the issue:

    https://github.com/apache/couchdb-couch/pull/241
  
    Also I need to change my use of DocCount to be TotalChanges. I noticed while testing the optimizations I tried that its currently not correct during compaction retries.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] couchdb-couch pull request #241: Improve compaction task status updates

Posted by davisp <gi...@git.apache.org>.
Github user davisp commented on a diff in the pull request:

    https://github.com/apache/couchdb-couch/pull/241#discussion_r109026748
  
    --- Diff: src/couch_db_updater.erl ---
    @@ -1323,12 +1326,82 @@ bind_id_tree(Db, Fd, State) ->
         Db#db{id_tree=IdBtree}.
     
     
    -sort_meta_data(Db0) ->
    -    {ok, Ems} = couch_emsort:merge(Db0#db.id_tree),
    -    Db0#db{id_tree=Ems}.
    +sort_meta_data(Db0, DocCount) ->
    +    couch_task_status:update([
    +        {phase, sort_ids_init},
    +        {total_changes, DocCount},
    +        {changes_done, 0},
    +        {progress, 0}
    +    ]),
    +    Ems0 = Db0#db.id_tree,
    +    Options = [
    +        {event_cb, fun emsort_cb/3},
    +        {event_st, {init, 0, 0}}
    +    ],
    +    Ems1 = couch_emsort:set_options(Ems0, Options),
    +    {ok, Ems2} = couch_emsort:merge(Ems1),
    +    Db0#db{id_tree=Ems2}.
    +
    +
    +emsort_cb(_Ems, {merge, chain}, {init, Copied, Nodes}) ->
    +    {init, Copied, Nodes + 1};
    +emsort_cb(_Ems, row_copy, {init, Copied, Nodes}) when Copied >= 1000 ->
    --- End diff --
    
    I assume you mean you'd rather that be a define? I just copied the same shape as merge_docids down below.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] couchdb-couch pull request #241: Improve compaction task status updates

Posted by iilyak <gi...@git.apache.org>.
Github user iilyak commented on a diff in the pull request:

    https://github.com/apache/couchdb-couch/pull/241#discussion_r109014150
  
    --- Diff: src/couch_db_updater.erl ---
    @@ -1323,12 +1326,82 @@ bind_id_tree(Db, Fd, State) ->
         Db#db{id_tree=IdBtree}.
     
     
    -sort_meta_data(Db0) ->
    -    {ok, Ems} = couch_emsort:merge(Db0#db.id_tree),
    -    Db0#db{id_tree=Ems}.
    +sort_meta_data(Db0, DocCount) ->
    +    couch_task_status:update([
    +        {phase, sort_ids_init},
    +        {total_changes, DocCount},
    +        {changes_done, 0},
    +        {progress, 0}
    +    ]),
    +    Ems0 = Db0#db.id_tree,
    +    Options = [
    +        {event_cb, fun emsort_cb/3},
    +        {event_st, {init, 0, 0}}
    +    ],
    +    Ems1 = couch_emsort:set_options(Ems0, Options),
    +    {ok, Ems2} = couch_emsort:merge(Ems1),
    +    Db0#db{id_tree=Ems2}.
    +
    +
    +emsort_cb(_Ems, {merge, chain}, {init, Copied, Nodes}) ->
    +    {init, Copied, Nodes + 1};
    +emsort_cb(_Ems, row_copy, {init, Copied, Nodes}) when Copied >= 1000 ->
    +    update_compact_task(Copied + 1),
    +    {init, 0, Nodes};
    +emsort_cb(_Ems, row_copy, {init, Copied, Nodes}) ->
    +    {init, Copied + 1, Nodes};
    +emsort_cb(Ems, {merge_start, reverse}, {init, Copied, Nodes}) ->
    --- End diff --
    
    I noticed that we can also have `{merge_start, forward}` [event](https://github.com/apache/couchdb-couch/blob/1db1337301a7c897be41e13cd328270509008478/src/couch_emsort.erl#L246:L252). Should we handle it as well?


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---