You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@allura.apache.org by br...@apache.org on 2017/10/20 20:54:05 UTC

[2/2] allura git commit: [#8168] remove TreesDoc (plural) model. Large space savings

[#8168] remove TreesDoc (plural) model.  Large space savings


Project: http://git-wip-us.apache.org/repos/asf/allura/repo
Commit: http://git-wip-us.apache.org/repos/asf/allura/commit/a433213c
Tree: http://git-wip-us.apache.org/repos/asf/allura/tree/a433213c
Diff: http://git-wip-us.apache.org/repos/asf/allura/diff/a433213c

Branch: refs/heads/db/8168
Commit: a433213c7e49d66d3a07d0678df68e6b59b46444
Parents: c373909
Author: Dave Brondsema <da...@brondsema.net>
Authored: Tue Oct 17 16:30:01 2017 -0400
Committer: Dave Brondsema <da...@brondsema.net>
Committed: Fri Oct 20 14:44:01 2017 -0400

----------------------------------------------------------------------
 Allura/allura/model/repo.py                     |  4 +-
 Allura/allura/model/repo_refresh.py             | 83 +-------------------
 Allura/allura/model/repository.py               |  9 +--
 Allura/allura/scripts/refresh_last_commits.py   |  1 +
 Allura/allura/scripts/refreshrepo.py            | 38 ++-------
 Allura/allura/tests/model/test_repo.py          |  8 --
 Allura/test-light.py                            | 49 ------------
 ForgeSVN/forgesvn/model/svn.py                  |  4 -
 .../forgesvn/tests/model/test_repository.py     |  3 -
 .../tests/model/test_svnimplementation.py       |  4 +-
 10 files changed, 13 insertions(+), 190 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/allura/blob/a433213c/Allura/allura/model/repo.py
----------------------------------------------------------------------
diff --git a/Allura/allura/model/repo.py b/Allura/allura/model/repo.py
index 6ba6633..4cbbea3 100644
--- a/Allura/allura/model/repo.py
+++ b/Allura/allura/model/repo.py
@@ -21,11 +21,11 @@
 
 from .repository import SUser, SObjType
 from .repository import QSIZE, README_RE, VIEWABLE_EXTENSIONS, PYPELINE_EXTENSIONS, DIFF_SIMILARITY_THRESHOLD
-from .repository import CommitDoc, TreeDoc, LastCommitDoc, TreesDoc, CommitRunDoc
+from .repository import CommitDoc, TreeDoc, LastCommitDoc, CommitRunDoc
 from .repository import RepoObject, Commit, Tree, Blob, LastCommit
 from .repository import ModelCache
 
 __all__ = [
     'SUser', 'SObjType', 'QSIZE', 'README_RE', 'VIEWABLE_EXTENSIONS', 'PYPELINE_EXTENSIONS',
-    'DIFF_SIMILARITY_THRESHOLD', 'CommitDoc', 'TreeDoc', 'LastCommitDoc', 'TreesDoc', 'CommitRunDoc', 'RepoObject',
+    'DIFF_SIMILARITY_THRESHOLD', 'CommitDoc', 'TreeDoc', 'LastCommitDoc', 'CommitRunDoc', 'RepoObject',
     'Commit', 'Tree', 'Blob', 'LastCommit', 'ModelCache']

http://git-wip-us.apache.org/repos/asf/allura/blob/a433213c/Allura/allura/model/repo_refresh.py
----------------------------------------------------------------------
diff --git a/Allura/allura/model/repo_refresh.py b/Allura/allura/model/repo_refresh.py
index 99fda0b..ccb5a6d 100644
--- a/Allura/allura/model/repo_refresh.py
+++ b/Allura/allura/model/repo_refresh.py
@@ -32,7 +32,7 @@ from ming.orm import mapper, session, ThreadLocalORMSession
 
 from allura.lib import utils
 from allura.lib import helpers as h
-from allura.model.repository import CommitDoc, TreeDoc, TreesDoc
+from allura.model.repository import CommitDoc
 from allura.model.repository import CommitRunDoc
 from allura.model.repository import Commit, Tree, LastCommit, ModelCache
 from allura.model.index import ArtifactReferenceDoc, ShortlinkDoc
@@ -98,33 +98,6 @@ def refresh_repo(repo, all_commits=False, notify=True, new_clone=False):
         rb.cleanup()
         log.info('Finished CommitRunBuilder for %s', repo.full_fs_path)
 
-    # Refresh trees
-    # Like diffs below, pre-computing trees for some SCMs is too expensive,
-    # so we skip it here, then do it on-demand later.
-    if repo._refresh_precompute:
-        cache = {}
-        for i, oid in enumerate(commit_ids):
-            ci = CommitDoc.m.find(dict(_id=oid), validate=False).next()
-            cache = refresh_commit_trees(ci, cache)
-            if (i + 1) % 100 == 0:
-                log.info('Refresh commit trees %d: %s', (i + 1), ci._id)
-
-    # Compute diffs
-    cache = {}
-    # For some SCMs, we don't want to pre-compute the LCDs because that
-    # would be too expensive, so we skip them here and do them on-demand
-    # with caching.
-    if repo._refresh_precompute:
-        model_cache = ModelCache()
-        lcid_cache = {}
-        for i, oid in enumerate(reversed(commit_ids)):
-            ci = model_cache.get(Commit, dict(_id=oid))
-            ci.set_context(repo)
-            compute_lcds(ci, model_cache, lcid_cache)
-            ThreadLocalORMSession.flush_all()
-            if (i + 1) % 100 == 0:
-                log.info('Compute last commit info %d: %s', (i + 1), ci._id)
-
     # Clear any existing caches for branches/tags
     if repo.cached_branches:
         repo.cached_branches = []
@@ -172,20 +145,6 @@ def refresh_repo(repo, all_commits=False, notify=True, new_clone=False):
         send_notifications(repo, reversed(commit_ids))
 
 
-def refresh_commit_trees(ci, cache):
-    '''Refresh the list of trees included withn a commit'''
-    if ci.tree_id is None:
-        return cache
-    trees_doc = TreesDoc(dict(
-        _id=ci._id,
-        tree_ids=list(trees(ci.tree_id, cache))))
-    trees_doc.m.save(safe=False)
-    new_cache = dict(
-        (oid, cache[oid])
-        for oid in trees_doc.tree_ids)
-    return new_cache
-
-
 def refresh_commit_repos(all_commit_ids, repo):
     '''Refresh the list of repositories within which a set of commits are
     contained'''
@@ -349,19 +308,6 @@ class CommitRunBuilder(object):
             del self.runs[p_run_id]
 
 
-def trees(id, cache):
-    '''Recursively generate the list of trees contained within a given tree ID'''
-    yield id
-    entries = cache.get(id, None)
-    if entries is None:
-        t = TreeDoc.m.get(_id=id)
-        entries = [o.id for o in t.tree_ids]
-        cache[id] = entries
-    for i in entries:
-        for x in trees(i, cache):
-            yield x
-
-
 def unknown_commit_ids(all_commit_ids):
     '''filter out all commit ids that have already been cached'''
     result = []
@@ -541,33 +487,6 @@ def last_known_commit_id(all_commit_ids, new_commit_ids):
     return all_commit_ids[all_commit_ids.index(new_commit_ids[0]) - 1]
 
 
-def compute_lcds(commit, model_cache, lcid_cache):
-    '''
-    Compute LastCommit data for every Tree node under this tree.
-    '''
-    trees = model_cache.get(TreesDoc, dict(_id=commit._id))
-    if not trees:
-        log.error('Missing TreesDoc for %s; skipping compute_lcd' % commit)
-        return
-    with h.push_config(c, model_cache=model_cache, lcid_cache=lcid_cache):
-        _update_tree_cache(trees.tree_ids, model_cache)
-        tree = _pull_tree(model_cache, commit.tree_id, commit)
-        _compute_lcds(tree, model_cache)
-        for changed_path in tree.commit.changed_paths:
-            lcid_cache[changed_path] = tree.commit._id
-
-
-def _compute_lcds(tree, cache):
-    path = tree.path().strip('/')
-    if path not in tree.commit.changed_paths:
-        return
-    if not cache.get(LastCommit, dict(commit_id=tree.commit._id, path=path)):
-        lcd = LastCommit._build(tree)
-    for x in tree.tree_ids:
-        sub_tree = _pull_tree(cache, x.id, tree, x.name)
-        _compute_lcds(sub_tree, cache)
-
-
 def _pull_tree(cache, tree_id, *context):
     '''
     Since the Tree instances stick around in our cache,

http://git-wip-us.apache.org/repos/asf/allura/blob/a433213c/Allura/allura/model/repository.py
----------------------------------------------------------------------
diff --git a/Allura/allura/model/repository.py b/Allura/allura/model/repository.py
index 5386ae7..996b92f 100644
--- a/Allura/allura/model/repository.py
+++ b/Allura/allura/model/repository.py
@@ -981,7 +981,7 @@ CommitDoc = collection(
     Field('child_ids', [str], index=True),
     Field('repo_ids', [S.ObjectId()], index=True))
 
-# Basic tree information (also see TreesDoc)
+# Basic tree information
 TreeDoc = collection(
     'repo_tree', main_doc_session,
     Field('_id', str),
@@ -1000,13 +1000,6 @@ LastCommitDoc = collection(
         name=str,
         commit_id=str)]))
 
-# List of all trees contained within a commit
-# TreesDoc._id = CommitDoc._id
-# TreesDoc.tree_ids = [ TreeDoc._id, ... ]
-TreesDoc = collection(
-    'repo_trees', main_doc_session,
-    Field('_id', str),
-    Field('tree_ids', [str]))
 
 # List of commit runs (a run is a linear series of single-parent commits)
 # CommitRunDoc.commit_ids = [ CommitDoc._id, ... ]

http://git-wip-us.apache.org/repos/asf/allura/blob/a433213c/Allura/allura/scripts/refresh_last_commits.py
----------------------------------------------------------------------
diff --git a/Allura/allura/scripts/refresh_last_commits.py b/Allura/allura/scripts/refresh_last_commits.py
index 095fafb..cbaae53 100644
--- a/Allura/allura/scripts/refresh_last_commits.py
+++ b/Allura/allura/scripts/refresh_last_commits.py
@@ -150,6 +150,7 @@ class RefreshLastCommits(ScriptTask):
                 continue
             commit.set_context(c.app.repo)
             with time(timings):
+                # FIXME: compute_lcds() has been removed from repo_refresh; call LastCommit._build() instead, or remove this script?
                 M.repo_refresh.compute_lcds(commit, model_cache, lcid_cache)
                 ThreadLocalORMSession.flush_all()
             if i % 100 == 0:

http://git-wip-us.apache.org/repos/asf/allura/blob/a433213c/Allura/allura/scripts/refreshrepo.py
----------------------------------------------------------------------
diff --git a/Allura/allura/scripts/refreshrepo.py b/Allura/allura/scripts/refreshrepo.py
index 115e5a4..81fdfc5 100644
--- a/Allura/allura/scripts/refreshrepo.py
+++ b/Allura/allura/scripts/refreshrepo.py
@@ -69,17 +69,10 @@ class RefreshRepo(ScriptTask):
                         ci_ids = list(c.app.repo.all_commit_ids())
                         log.info("Deleting mongo data for %i commits...",
                                  len(ci_ids))
-                        # like the tree_ids themselves below, we need to process these in
-                        # chunks to avoid hitting the BSON max size limit
-                        tree_ids = []
-                        for ci_ids_chunk in chunked_list(ci_ids, 3000):
-                            tree_ids.extend([
-                                tree_id for doc in
-                                M.repository.TreesDoc.m.find(
-                                    {"_id": {"$in": ci_ids_chunk}},
-                                    {"tree_ids": 1})
-                                for tree_id in doc.get("tree_ids", [])])
 
+                        # delete these in chunks, otherwise the query doc can
+                        # exceed the max BSON size limit (16MB at the moment)
+                        for ci_ids_chunk in chunked_list(ci_ids, 3000):
                             i = M.repository.CommitDoc.m.find(
                                 {"_id": {"$in": ci_ids_chunk}}).count()
                             if i:
@@ -87,34 +80,17 @@ class RefreshRepo(ScriptTask):
                                 M.repository.CommitDoc.m.remove(
                                     {"_id": {"$in": ci_ids_chunk}})
 
-                        # delete these in chunks, otherwise the query doc can
-                        # exceed the max BSON size limit (16MB at the moment)
-                        for tree_ids_chunk in chunked_list(tree_ids, 300000):
-                            i = M.repository.TreeDoc.m.find(
-                                {"_id": {"$in": tree_ids_chunk}}).count()
-                            if i:
-                                log.info("Deleting %i TreeDoc docs...", i)
-                                M.repository.TreeDoc.m.remove(
-                                    {"_id": {"$in": tree_ids_chunk}})
-                        del tree_ids
+                        # We used to have a TreesDoc (plural) collection mapping each commit_id to its tree_ids
+                        # so that we could delete the relevant TreeDoc records here.  Leaving them in place is
+                        # OK, since refresh_tree_info() recreates them and overwrites any existing records.
 
-                        # delete these after TreeDoc and LastCommitDoc so that if
-                        # we crash, we don't lose the ability to delete those
                         for ci_ids_chunk in chunked_list(ci_ids, 3000):
-                            # delete TreesDocs
-                            i = M.repository.TreesDoc.m.find(
-                                {"_id": {"$in": ci_ids_chunk}}).count()
-                            if i:
-                                log.info("Deleting %i TreesDoc docs...", i)
-                                M.repository.TreesDoc.m.remove(
-                                    {"_id": {"$in": ci_ids_chunk}})
-
                             # delete LastCommitDocs
                             i = M.repository.LastCommitDoc.m.find(
                                 dict(commit_id={'$in': ci_ids_chunk})).count()
                             if i:
                                 log.info(
-                                    "Deleting %i remaining LastCommitDoc docs, by repo id...", i)
+                                    "Deleting %i LastCommitDoc docs...", i)
                                 M.repository.LastCommitDoc.m.remove(
                                     dict(commit_id={'$in': ci_ids_chunk}))
 

http://git-wip-us.apache.org/repos/asf/allura/blob/a433213c/Allura/allura/tests/model/test_repo.py
----------------------------------------------------------------------
diff --git a/Allura/allura/tests/model/test_repo.py b/Allura/allura/tests/model/test_repo.py
index 3761821..189fdf4 100644
--- a/Allura/allura/tests/model/test_repo.py
+++ b/Allura/allura/tests/model/test_repo.py
@@ -443,14 +443,6 @@ class TestModelCache(unittest.TestCase):
         tr_get.assert_called_once_with(_id='foo')
         self.assertEqual(val, tree1)
 
-    @mock.patch.object(M.repository.TreesDoc.m, 'get')
-    def test_get_doc(self, tr_get):
-        trees = tr_get.return_value = mock.Mock(
-            spec=['_id', 'val'], _id='foo', val='bar')
-        val = self.cache.get(M.repository.TreesDoc, {'_id': 'foo'})
-        tr_get.assert_called_once_with(_id='foo')
-        self.assertEqual(val, trees)
-
     def test_set(self):
         tree = mock.Mock(spec=['_id', 'test_set'], _id='foo', val='test_set')
         self.cache.set(M.repository.Tree, {'val': 'test_set'}, tree)

http://git-wip-us.apache.org/repos/asf/allura/blob/a433213c/Allura/test-light.py
----------------------------------------------------------------------
diff --git a/Allura/test-light.py b/Allura/test-light.py
deleted file mode 100644
index f095214..0000000
--- a/Allura/test-light.py
+++ /dev/null
@@ -1,49 +0,0 @@
-#       Licensed to the Apache Software Foundation (ASF) under one
-#       or more contributor license agreements.  See the NOTICE file
-#       distributed with this work for additional information
-#       regarding copyright ownership.  The ASF licenses this file
-#       to you under the Apache License, Version 2.0 (the
-#       "License"); you may not use this file except in compliance
-#       with the License.  You may obtain a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#       Unless required by applicable law or agreed to in writing,
-#       software distributed under the License is distributed on an
-#       "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#       KIND, either express or implied.  See the License for the
-#       specific language governing permissions and limitations
-#       under the License.
-
-import sys
-
-from pylons import tmpl_context as c
-
-from allura.lib import helpers as h
-from allura.model.repository import CommitDoc, TreeDoc, TreesDoc
-from allura.model.repository import LastCommitDoc, CommitRunDoc
-from allura.model.repo_refresh import refresh_repo
-
-
-def main():
-    if len(sys.argv) > 1:
-        h.set_context('test')
-        c.project.install_app('Git', 'code', 'Code',
-                              init_from_url='/home/rick446/src/forge')
-        c.project.install_app('Hg', 'code2', 'Code2',
-                              init_from_url='/home/rick446/src/Kajiki')
-    CommitDoc.m.remove({})
-    TreeDoc.m.remove({})
-    TreesDoc.m.remove({})
-    LastCommitDoc.m.remove({})
-    CommitRunDoc.m.remove({})
-
-    h.set_context('test', 'code')
-    refresh_repo(c.app.repo, notify=False)
-    h.set_context('test', 'code2')
-    refresh_repo(c.app.repo, notify=False)
-
-
-if __name__ == '__main__':
-    main()
-    # dolog()

http://git-wip-us.apache.org/repos/asf/allura/blob/a433213c/ForgeSVN/forgesvn/model/svn.py
----------------------------------------------------------------------
diff --git a/ForgeSVN/forgesvn/model/svn.py b/ForgeSVN/forgesvn/model/svn.py
index db52738..2aa2b40 100644
--- a/ForgeSVN/forgesvn/model/svn.py
+++ b/ForgeSVN/forgesvn/model/svn.py
@@ -439,10 +439,6 @@ class SVNImplementation(M.RepositoryImplementation):
         if is_new:
             commit_id = self._oid(infos[0][1].last_changed_rev.number)
             path = tree_path.strip('/')
-            RM.TreesDoc.m.update_partial(
-                {'_id': commit._id},
-                {'$addToSet': {'tree_ids': tree_id}},
-                upsert=True)
             RM.LastCommitDoc.m.update_partial(
                 {'commit_id': commit_id, 'path': path},
                 {'commit_id': commit_id, 'path':

http://git-wip-us.apache.org/repos/asf/allura/blob/a433213c/ForgeSVN/forgesvn/tests/model/test_repository.py
----------------------------------------------------------------------
diff --git a/ForgeSVN/forgesvn/tests/model/test_repository.py b/ForgeSVN/forgesvn/tests/model/test_repository.py
index 18ec0fa..38dc5fe 100644
--- a/ForgeSVN/forgesvn/tests/model/test_repository.py
+++ b/ForgeSVN/forgesvn/tests/model/test_repository.py
@@ -1002,7 +1002,6 @@ class TestCommit(_TestWithRepo):
             'removed': [],
             'total': 5,
         }
-        M.repo_refresh.refresh_commit_trees(self.ci, {})
         assert_equal(self.ci.diffs.added,
                      ['a', 'a/a', 'a/a/a', 'a/a/b', 'a/b'])
         assert (self.ci.diffs.copied
@@ -1027,7 +1026,6 @@ class TestCommit(_TestWithRepo):
             'removed': ['a', 'a/a', 'a/a/a', 'a/a/b', 'a/b'],
             'total': 10,
         }
-        M.repo_refresh.refresh_commit_trees(ci, {})
         assert_equal(ci.diffs.added, ['b', 'b/a', 'b/a/a', 'b/a/b', 'b/b'])
         assert_equal(ci.diffs.removed, ['a', 'a/a', 'a/a/a', 'a/a/b', 'a/b'])
         assert (ci.diffs.copied
@@ -1063,7 +1061,6 @@ class TestCommit(_TestWithRepo):
             'renamed': [],
             'total': 2
         }
-        M.repo_refresh.refresh_commit_trees(ci, {})
         assert_equal(ci.diffs.added, [u'b/a/z', u'b/c'])
         assert_equal(ci.diffs.changed, [])
         assert_equal(ci.diffs.removed, [u'/b/a/b', u'b/b'])

http://git-wip-us.apache.org/repos/asf/allura/blob/a433213c/ForgeSVN/forgesvn/tests/model/test_svnimplementation.py
----------------------------------------------------------------------
diff --git a/ForgeSVN/forgesvn/tests/model/test_svnimplementation.py b/ForgeSVN/forgesvn/tests/model/test_svnimplementation.py
index c51cdce..216255d 100644
--- a/ForgeSVN/forgesvn/tests/model/test_svnimplementation.py
+++ b/ForgeSVN/forgesvn/tests/model/test_svnimplementation.py
@@ -36,10 +36,9 @@ class TestSVNImplementation(object):
         self._test_compute_tree_new('trunk/foo')
 
     @patch('allura.model.repository.LastCommitDoc.m.update_partial')
-    @patch('allura.model.repository.TreesDoc.m.update_partial')
     @patch('allura.model.repository.Tree.upsert')
     @patch('allura.model.repository.Tree.query.get')
-    def _test_compute_tree_new(self, path, tree_get, tree_upsert, treesdoc_partial, lcd_partial):
+    def _test_compute_tree_new(self, path, tree_get, tree_upsert, lcd_partial):
         repo = Mock(fs_path=g.tmpdir + '/')
         repo.name = 'code'
         impl = SVNImplementation(repo)
@@ -54,7 +53,6 @@ class TestSVNImplementation(object):
 
         assert_equal(impl._svn.info2.call_args[0]
                      [0], 'file://' + g.tmpdir + '/code/trunk/foo')
-        assert treesdoc_partial.called
         assert lcd_partial.called
 
     def test_last_commit_ids(self):