You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@allura.apache.org by br...@apache.org on 2013/04/25 22:20:34 UTC
[07/13] git commit: [#6078] Chunk processing of commits when cleaning data during repo refresh to avoid BSON doc size limits

[#6078] Chunk processing of commits when cleaning data during repo refresh to avoid BSON doc size limits

Signed-off-by: Cory Johns <cj...@slashdotmedia.com>


Project: http://git-wip-us.apache.org/repos/asf/incubator-allura/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-allura/commit/115cdad1
Tree: http://git-wip-us.apache.org/repos/asf/incubator-allura/tree/115cdad1
Diff: http://git-wip-us.apache.org/repos/asf/incubator-allura/diff/115cdad1

Branch: refs/heads/db/2835
Commit: 115cdad111a046102abf4d8063482c6c6bf0c615
Parents: 51f5361
Author: Cory Johns <cj...@slashdotmedia.com>
Authored: Thu Apr 11 16:18:12 2013 +0000
Committer: Dave Brondsema <db...@slashdotmedia.com>
Committed: Wed Apr 24 21:37:34 2013 +0000

----------------------------------------------------------------------
 Allura/allura/scripts/refreshrepo.py |   66 +++++++++++++++++------------
 1 files changed, 39 insertions(+), 27 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/115cdad1/Allura/allura/scripts/refreshrepo.py
----------------------------------------------------------------------
diff --git a/Allura/allura/scripts/refreshrepo.py b/Allura/allura/scripts/refreshrepo.py
index 43ff7c2..f55a054 100644
--- a/Allura/allura/scripts/refreshrepo.py
+++ b/Allura/allura/scripts/refreshrepo.py
@@ -68,42 +68,54 @@ class RefreshRepo(ScriptTask):
                     if options.clean:
                         ci_ids = list(c.app.repo.all_commit_ids())
                         log.info("Deleting mongo data for %i commits...", len(ci_ids))
-                        tree_ids = [
-                                tree_id for doc in
-                                M.repo.TreesDoc.m.find({"_id": {"$in": ci_ids}},
-                                                       {"tree_ids": 1})
-                                for tree_id in doc.get("tree_ids", [])]
-
-                        i = M.repo.CommitDoc.m.find({"_id": {"$in": ci_ids}}).count()
-                        log.info("Deleting %i CommitDoc docs...", i)
-                        M.repo.CommitDoc.m.remove({"_id": {"$in": ci_ids}})
+                        # like the tree_ids themselves below, we need to process these in
+                        # chunks to avoid hitting the BSON max size limit
+                        tree_ids = []
+                        for ci_ids_chunk in chunked_list(ci_ids, 3000):
+                            tree_ids.extend([
+                                    tree_id for doc in
+                                    M.repo.TreesDoc.m.find({"_id": {"$in": ci_ids_chunk}},
+                                                           {"tree_ids": 1})
+                                    for tree_id in doc.get("tree_ids", [])])
+
+                            i = M.repo.CommitDoc.m.find({"_id": {"$in": ci_ids_chunk}}).count()
+                            if i:
+                                log.info("Deleting %i CommitDoc docs...", i)
+                                M.repo.CommitDoc.m.remove({"_id": {"$in": ci_ids_chunk}})
 
                         # delete these in chunks, otherwise the query doc can
                         # exceed the max BSON size limit (16MB at the moment)
                         for tree_ids_chunk in chunked_list(tree_ids, 300000):
                             i = M.repo.TreeDoc.m.find({"_id": {"$in": tree_ids_chunk}}).count()
-                            log.info("Deleting %i TreeDoc docs...", i)
-                            M.repo.TreeDoc.m.remove({"_id": {"$in": tree_ids_chunk}})
+                            if i:
+                                log.info("Deleting %i TreeDoc docs...", i)
+                                M.repo.TreeDoc.m.remove({"_id": {"$in": tree_ids_chunk}})
                         del tree_ids
 
                         # delete TreesDoc only after TreeDoc (tree_ids are read from TreesDoc),
                         # so that if we crash, we don't lose the ability to delete those
-                        i = M.repo.TreesDoc.m.find({"_id": {"$in": ci_ids}}).count()
-                        log.info("Deleting %i TreesDoc docs...", i)
-                        M.repo.TreesDoc.m.remove({"_id": {"$in": ci_ids}})
-
-                        # delete LastCommitDocs
-                        i = M.repo.LastCommitDoc.m.find(dict(commit_ids={'$in': ci_ids})).count()
-                        log.info("Deleting %i remaining LastCommitDoc docs, by repo id...", i)
-                        M.repo.LastCommitDoc.m.remove(dict(commit_ids={'$in': ci_ids}))
-
-                        i = M.repo.DiffInfoDoc.m.find({"_id": {"$in": ci_ids}}).count()
-                        log.info("Deleting %i DiffInfoDoc docs...", i)
-                        M.repo.DiffInfoDoc.m.remove({"_id": {"$in": ci_ids}})
-
-                        i = M.repo.CommitRunDoc.m.find({"commit_ids": {"$in": ci_ids}}).count()
-                        log.info("Deleting %i CommitRunDoc docs...", i)
-                        M.repo.CommitRunDoc.m.remove({"commit_ids": {"$in": ci_ids}})
+                        for ci_ids_chunk in chunked_list(ci_ids, 3000):
+                            # delete TreesDocs
+                            i = M.repo.TreesDoc.m.find({"_id": {"$in": ci_ids_chunk}}).count()
+                            if i:
+                                log.info("Deleting %i TreesDoc docs...", i)
+                                M.repo.TreesDoc.m.remove({"_id": {"$in": ci_ids_chunk}})
+
+                            # delete LastCommitDocs
+                            i = M.repo.LastCommitDoc.m.find(dict(commit_ids={'$in': ci_ids_chunk})).count()
+                            if i:
+                                log.info("Deleting %i remaining LastCommitDoc docs, by repo id...", i)
+                                M.repo.LastCommitDoc.m.remove(dict(commit_ids={'$in': ci_ids_chunk}))
+
+                            i = M.repo.DiffInfoDoc.m.find({"_id": {"$in": ci_ids_chunk}}).count()
+                            if i:
+                                log.info("Deleting %i DiffInfoDoc docs...", i)
+                                M.repo.DiffInfoDoc.m.remove({"_id": {"$in": ci_ids_chunk}})
+
+                            i = M.repo.CommitRunDoc.m.find({"commit_ids": {"$in": ci_ids_chunk}}).count()
+                            if i:
+                                log.info("Deleting %i CommitRunDoc docs...", i)
+                                M.repo.CommitRunDoc.m.remove({"commit_ids": {"$in": ci_ids_chunk}})
                         del ci_ids
 
                     try: