You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@allura.apache.org by br...@apache.org on 2013/04/25 22:20:34 UTC
[07/13] git commit: [#6078] Chunk processing of commits when cleaning
data during repo refresh to avoid BSON doc size limits
[#6078] Chunk processing of commits when cleaning data during repo refresh to avoid BSON doc size limits
Signed-off-by: Cory Johns <cj...@slashdotmedia.com>
Project: http://git-wip-us.apache.org/repos/asf/incubator-allura/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-allura/commit/115cdad1
Tree: http://git-wip-us.apache.org/repos/asf/incubator-allura/tree/115cdad1
Diff: http://git-wip-us.apache.org/repos/asf/incubator-allura/diff/115cdad1
Branch: refs/heads/db/2835
Commit: 115cdad111a046102abf4d8063482c6c6bf0c615
Parents: 51f5361
Author: Cory Johns <cj...@slashdotmedia.com>
Authored: Thu Apr 11 16:18:12 2013 +0000
Committer: Dave Brondsema <db...@slashdotmedia.com>
Committed: Wed Apr 24 21:37:34 2013 +0000
----------------------------------------------------------------------
Allura/allura/scripts/refreshrepo.py | 66 +++++++++++++++++------------
1 files changed, 39 insertions(+), 27 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/115cdad1/Allura/allura/scripts/refreshrepo.py
----------------------------------------------------------------------
diff --git a/Allura/allura/scripts/refreshrepo.py b/Allura/allura/scripts/refreshrepo.py
index 43ff7c2..f55a054 100644
--- a/Allura/allura/scripts/refreshrepo.py
+++ b/Allura/allura/scripts/refreshrepo.py
@@ -68,42 +68,54 @@ class RefreshRepo(ScriptTask):
if options.clean:
ci_ids = list(c.app.repo.all_commit_ids())
log.info("Deleting mongo data for %i commits...", len(ci_ids))
- tree_ids = [
- tree_id for doc in
- M.repo.TreesDoc.m.find({"_id": {"$in": ci_ids}},
- {"tree_ids": 1})
- for tree_id in doc.get("tree_ids", [])]
-
- i = M.repo.CommitDoc.m.find({"_id": {"$in": ci_ids}}).count()
- log.info("Deleting %i CommitDoc docs...", i)
- M.repo.CommitDoc.m.remove({"_id": {"$in": ci_ids}})
+ # like the tree_ids themselves below, we need to process these in
+ # chunks to avoid hitting the BSON max size limit
+ tree_ids = []
+ for ci_ids_chunk in chunked_list(ci_ids, 3000):
+ tree_ids.extend([
+ tree_id for doc in
+ M.repo.TreesDoc.m.find({"_id": {"$in": ci_ids_chunk}},
+ {"tree_ids": 1})
+ for tree_id in doc.get("tree_ids", [])])
+
+ i = M.repo.CommitDoc.m.find({"_id": {"$in": ci_ids_chunk}}).count()
+ if i:
+ log.info("Deleting %i CommitDoc docs...", i)
+ M.repo.CommitDoc.m.remove({"_id": {"$in": ci_ids_chunk}})
# delete these in chunks, otherwise the query doc can
# exceed the max BSON size limit (16MB at the moment)
for tree_ids_chunk in chunked_list(tree_ids, 300000):
i = M.repo.TreeDoc.m.find({"_id": {"$in": tree_ids_chunk}}).count()
- log.info("Deleting %i TreeDoc docs...", i)
- M.repo.TreeDoc.m.remove({"_id": {"$in": tree_ids_chunk}})
+ if i:
+ log.info("Deleting %i TreeDoc docs...", i)
+ M.repo.TreeDoc.m.remove({"_id": {"$in": tree_ids_chunk}})
del tree_ids
# delete these after TreeDoc and LastCommitDoc so that if
# we crash, we don't lose the ability to delete those
- i = M.repo.TreesDoc.m.find({"_id": {"$in": ci_ids}}).count()
- log.info("Deleting %i TreesDoc docs...", i)
- M.repo.TreesDoc.m.remove({"_id": {"$in": ci_ids}})
-
- # delete LastCommitDocs
- i = M.repo.LastCommitDoc.m.find(dict(commit_ids={'$in': ci_ids})).count()
- log.info("Deleting %i remaining LastCommitDoc docs, by repo id...", i)
- M.repo.LastCommitDoc.m.remove(dict(commit_ids={'$in': ci_ids}))
-
- i = M.repo.DiffInfoDoc.m.find({"_id": {"$in": ci_ids}}).count()
- log.info("Deleting %i DiffInfoDoc docs...", i)
- M.repo.DiffInfoDoc.m.remove({"_id": {"$in": ci_ids}})
-
- i = M.repo.CommitRunDoc.m.find({"commit_ids": {"$in": ci_ids}}).count()
- log.info("Deleting %i CommitRunDoc docs...", i)
- M.repo.CommitRunDoc.m.remove({"commit_ids": {"$in": ci_ids}})
+ for ci_ids_chunk in chunked_list(ci_ids, 3000):
+ # delete TreesDocs
+ i = M.repo.TreesDoc.m.find({"_id": {"$in": ci_ids_chunk}}).count()
+ if i:
+ log.info("Deleting %i TreesDoc docs...", i)
+ M.repo.TreesDoc.m.remove({"_id": {"$in": ci_ids_chunk}})
+
+ # delete LastCommitDocs
+ i = M.repo.LastCommitDoc.m.find(dict(commit_ids={'$in': ci_ids_chunk})).count()
+ if i:
+ log.info("Deleting %i remaining LastCommitDoc docs, by repo id...", i)
+ M.repo.LastCommitDoc.m.remove(dict(commit_ids={'$in': ci_ids_chunk}))
+
+ i = M.repo.DiffInfoDoc.m.find({"_id": {"$in": ci_ids_chunk}}).count()
+ if i:
+ log.info("Deleting %i DiffInfoDoc docs...", i)
+ M.repo.DiffInfoDoc.m.remove({"_id": {"$in": ci_ids_chunk}})
+
+ i = M.repo.CommitRunDoc.m.find({"commit_ids": {"$in": ci_ids_chunk}}).count()
+ if i:
+ log.info("Deleting %i CommitRunDoc docs...", i)
+ M.repo.CommitRunDoc.m.remove({"commit_ids": {"$in": ci_ids_chunk}})
del ci_ids
try: