You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@allura.apache.org by jo...@apache.org on 2014/04/03 23:30:00 UTC
[2/3] git commit: [#7305] Consolidate allura.model.repo into allura.model.repository

[#7305] Consolidate allura.model.repo into allura.model.repository

Signed-off-by: Cory Johns <cj...@slashdotmedia.com>


Project: http://git-wip-us.apache.org/repos/asf/allura/repo
Commit: http://git-wip-us.apache.org/repos/asf/allura/commit/839d9cfb
Tree: http://git-wip-us.apache.org/repos/asf/allura/tree/839d9cfb
Diff: http://git-wip-us.apache.org/repos/asf/allura/diff/839d9cfb

Branch: refs/heads/cj/7305
Commit: 839d9cfb992d3401c687621fec2dc6c1eda40584
Parents: f72b7fd
Author: Cory Johns <cj...@slashdotmedia.com>
Authored: Wed Apr 2 20:26:03 2014 +0000
Committer: Cory Johns <cj...@slashdotmedia.com>
Committed: Wed Apr 2 20:26:03 2014 +0000

----------------------------------------------------------------------
 Allura/allura/controllers/repository.py         |   4 +-
 Allura/allura/lib/custom_middleware.py          |  16 +-
 Allura/allura/model/__init__.py                 |   3 +-
 Allura/allura/model/repo.py                     | 983 ------------------
 Allura/allura/model/repo_refresh.py             |   8 +-
 Allura/allura/model/repository.py               | 994 ++++++++++++++++++-
 Allura/allura/scripts/refresh_last_commits.py   |  18 +-
 Allura/allura/scripts/refreshrepo.py            |  26 +-
 Allura/allura/tests/model/test_repo.py          | 210 ++--
 Allura/allura/tests/unit/test_repo.py           |  51 +-
 Allura/test-light.py                            |   4 +-
 ForgeGit/forgegit/model/git_repo.py             |  13 +-
 .../forgegit/tests/model/test_repository.py     |  10 +-
 ForgeSVN/forgesvn/model/svn.py                  |   8 +-
 .../forgesvn/tests/model/test_repository.py     |  22 +-
 .../tests/model/test_svnimplementation.py       |  10 +-
 scripts/migrations/028-remove-svn-trees.py      |   8 +-
 17 files changed, 1175 insertions(+), 1213 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/allura/blob/839d9cfb/Allura/allura/controllers/repository.py
----------------------------------------------------------------------
diff --git a/Allura/allura/controllers/repository.py b/Allura/allura/controllers/repository.py
index aabc467..1751839 100644
--- a/Allura/allura/controllers/repository.py
+++ b/Allura/allura/controllers/repository.py
@@ -223,7 +223,7 @@ class RepoRootController(BaseController, FeedController):
         log.info('Grab %d commit objects by ID', len(commit_ids))
         commits_by_id = {
             c_obj._id: c_obj
-            for c_obj in M.repo.CommitDoc.m.find(dict(_id={'$in': commit_ids}))}
+            for c_obj in M.repository.CommitDoc.m.find(dict(_id={'$in': commit_ids}))}
         log.info('... build graph')
         parents = {}
         children = defaultdict(list)
@@ -629,7 +629,7 @@ class TreeBrowser(BaseController, DispatchIndex):
                     obj = self._tree[filename]
                 except KeyError:
                     raise exc.HTTPNotFound()
-                if isinstance(obj, M.repo.Blob):
+                if isinstance(obj, M.repository.Blob):
                     return self.FileBrowserClass(
                         self._commit,
                         self._tree,

http://git-wip-us.apache.org/repos/asf/allura/blob/839d9cfb/Allura/allura/lib/custom_middleware.py
----------------------------------------------------------------------
diff --git a/Allura/allura/lib/custom_middleware.py b/Allura/allura/lib/custom_middleware.py
index ad1f276..c5737f0 100644
--- a/Allura/allura/lib/custom_middleware.py
+++ b/Allura/allura/lib/custom_middleware.py
@@ -30,7 +30,7 @@ from webob import exc, Request
 import pysolr
 
 from allura.lib import helpers as h
-import allura.model.repo
+import allura.model.repository
 
 log = logging.getLogger(__name__)
 
@@ -235,11 +235,11 @@ class AlluraTimerMiddleware(TimerMiddleware):
                   '_refresh'),
             # urlopen and socket io may or may not overlap partially
             Timer('render', genshi.Stream, 'render'),
-            Timer('repo.Blob.{method_name}', allura.model.repo.Blob, '*'),
-            Timer('repo.Commit.{method_name}', allura.model.repo.Commit, '*'),
+            Timer('repo.Blob.{method_name}', allura.model.repository.Blob, '*'),
+            Timer('repo.Commit.{method_name}', allura.model.repository.Commit, '*'),
             Timer('repo.LastCommit.{method_name}',
-                  allura.model.repo.LastCommit, '*'),
-            Timer('repo.Tree.{method_name}', allura.model.repo.Tree, '*'),
+                  allura.model.repository.LastCommit, '*'),
+            Timer('repo.Tree.{method_name}', allura.model.repository.Tree, '*'),
             Timer('socket_read', socket._fileobject, 'read', 'readline',
                   'readlines', debug_each_call=False),
             Timer('socket_write', socket._fileobject, 'write', 'writelines',
@@ -250,11 +250,11 @@ class AlluraTimerMiddleware(TimerMiddleware):
             Timer('urlopen', urllib2, 'urlopen'),
             Timer('base_repo_tool.{method_name}',
                   allura.model.repository.RepositoryImplementation, 'last_commit_ids'),
-            Timer('_diffs_copied', allura.model.repo.Commit, '_diffs_copied'),
+            Timer('_diffs_copied', allura.model.repository.Commit, '_diffs_copied'),
             Timer(
-                'sequencematcher.{method_name}', allura.model.repo.SequenceMatcher,
+                'sequencematcher.{method_name}', allura.model.repository.SequenceMatcher,
                 'ratio', 'quick_ratio', 'real_quick_ratio'),
-            Timer('unified_diff', allura.model.repo, 'unified_diff'),
+            Timer('unified_diff', allura.model.repository, 'unified_diff'),
         ] + [Timer('sidebar', ep.load(), 'sidebar_menu') for ep in tool_entry_points]
 
     def before_logging(self, stat_record):

http://git-wip-us.apache.org/repos/asf/allura/blob/839d9cfb/Allura/allura/model/__init__.py
----------------------------------------------------------------------
diff --git a/Allura/allura/model/__init__.py b/Allura/allura/model/__init__.py
index 65a4561..a785773 100644
--- a/Allura/allura/model/__init__.py
+++ b/Allura/allura/model/__init__.py
@@ -42,7 +42,8 @@ from .session import artifact_orm_session, repository_orm_session
 from .session import task_orm_session
 from .session import ArtifactSessionExtension
 
-import repo
+from . import repository
+from . import repo_refresh
 
 from ming.orm import Mapper
 Mapper.compile_all()

http://git-wip-us.apache.org/repos/asf/allura/blob/839d9cfb/Allura/allura/model/repo.py
----------------------------------------------------------------------
diff --git a/Allura/allura/model/repo.py b/Allura/allura/model/repo.py
deleted file mode 100644
index 81d843d..0000000
--- a/Allura/allura/model/repo.py
+++ /dev/null
@@ -1,983 +0,0 @@
-#       Licensed to the Apache Software Foundation (ASF) under one
-#       or more contributor license agreements.  See the NOTICE file
-#       distributed with this work for additional information
-#       regarding copyright ownership.  The ASF licenses this file
-#       to you under the Apache License, Version 2.0 (the
-#       "License"); you may not use this file except in compliance
-#       with the License.  You may obtain a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#       Unless required by applicable law or agreed to in writing,
-#       software distributed under the License is distributed on an
-#       "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#       KIND, either express or implied.  See the License for the
-#       specific language governing permissions and limitations
-#       under the License.
-
-import os
-import re
-import logging
-from hashlib import sha1
-from itertools import chain
-from datetime import datetime
-from collections import defaultdict, OrderedDict
-from difflib import SequenceMatcher, unified_diff
-import bson
-
-from pylons import tmpl_context as c
-import pymongo.errors
-
-from ming import Field, collection, Index
-from ming import schema as S
-from ming.base import Object
-from ming.utils import LazyProperty
-from ming.orm import mapper, session
-
-from allura.lib import utils
-from allura.lib import helpers as h
-from allura.lib.security import has_access
-
-from .auth import User
-from .project import AppConfig
-from .session import main_doc_session
-from .session import repository_orm_session
-from .timeline import ActivityObject
-
-log = logging.getLogger(__name__)
-
-# Some schema types
-SUser = dict(name=str, email=str, date=datetime)
-SObjType = S.OneOf('blob', 'tree', 'submodule')
-
-# Used for when we're going to batch queries using $in
-QSIZE = 100
-README_RE = re.compile('^README(\.[^.]*)?$', re.IGNORECASE)
-VIEWABLE_EXTENSIONS = [
-    '.php', '.py', '.js', '.java', '.html', '.htm', '.yaml', '.sh',
-    '.rb', '.phtml', '.txt', '.bat', '.ps1', '.xhtml', '.css', '.cfm', '.jsp', '.jspx',
-    '.pl', '.php4', '.php3', '.rhtml', '.svg', '.markdown', '.json', '.ini', '.tcl', '.vbs', '.xsl']
-PYPELINE_EXTENSIONS = utils.MARKDOWN_EXTENSIONS + ['.rst']
-
-DIFF_SIMILARITY_THRESHOLD = .5  # used for determining file renames
-
-# Basic commit information
-# One of these for each commit in the physical repo on disk. The _id is the
-# hexsha of the commit (for Git and Hg).
-CommitDoc = collection(
-    'repo_ci', main_doc_session,
-    Field('_id', str),
-    Field('tree_id', str),
-    Field('committed', SUser),
-    Field('authored', SUser),
-    Field('message', str),
-    Field('parent_ids', [str], index=True),
-    Field('child_ids', [str], index=True),
-    Field('repo_ids', [S.ObjectId()], index=True))
-
-# Basic tree information (also see TreesDoc)
-TreeDoc = collection(
-    'repo_tree', main_doc_session,
-    Field('_id', str),
-    Field('tree_ids', [dict(name=str, id=str)]),
-    Field('blob_ids', [dict(name=str, id=str)]),
-    Field('other_ids', [dict(name=str, id=str, type=SObjType)]))
-
-# Information about the last commit to touch a tree
-LastCommitDoc = collection(
-    'repo_last_commit', main_doc_session,
-    Field('_id', S.ObjectId()),
-    Field('commit_id', str),
-    Field('path', str),
-    Index('commit_id', 'path'),
-    Field('entries', [dict(
-        name=str,
-        commit_id=str)]))
-
-# List of all trees contained within a commit
-# TreesDoc._id = CommitDoc._id
-# TreesDoc.tree_ids = [ TreeDoc._id, ... ]
-TreesDoc = collection(
-    'repo_trees', main_doc_session,
-    Field('_id', str),
-    Field('tree_ids', [str]))
-
-# Information about which things were added/removed in  commit
-# DiffInfoDoc._id = CommitDoc._id
-DiffInfoDoc = collection(
-    'repo_diffinfo', main_doc_session,
-    Field('_id', str),
-    Field(
-        'differences',
-        [dict(name=str, lhs_id=str, rhs_id=str)]))
-
-# List of commit runs (a run is a linear series of single-parent commits)
-# CommitRunDoc.commit_ids = [ CommitDoc._id, ... ]
-CommitRunDoc = collection(
-    'repo_commitrun', main_doc_session,
-    Field('_id', str),
-    Field('parent_commit_ids', [str], index=True),
-    Field('commit_ids', [str], index=True),
-    Field('commit_times', [datetime]))
-
-
-class RepoObject(object):
-
-    def __repr__(self):  # pragma no cover
-        return '<%s %s>' % (
-            self.__class__.__name__, self._id)
-
-    def primary(self):
-        return self
-
-    def index_id(self):
-        '''Globally unique artifact identifier.  Used for
-        SOLR ID, shortlinks, and maybe elsewhere
-        '''
-        id = '%s.%s#%s' % (
-            self.__class__.__module__,
-            self.__class__.__name__,
-            self._id)
-        return id.replace('.', '/')
-
-    @classmethod
-    def upsert(cls, id, **kwargs):
-        isnew = False
-        r = cls.query.get(_id=id)
-        if r is not None:
-            return r, isnew
-        try:
-            r = cls(_id=id, **kwargs)
-            session(r).flush(r)
-            isnew = True
-        except pymongo.errors.DuplicateKeyError:  # pragma no cover
-            session(r).expunge(r)
-            r = cls.query.get(_id=id)
-        return r, isnew
-
-
-class Commit(RepoObject, ActivityObject):
-    type_s = 'Commit'
-    # Ephemeral attrs
-    repo = None
-
-    def __init__(self, **kw):
-        for k, v in kw.iteritems():
-            setattr(self, k, v)
-
-    @property
-    def activity_name(self):
-        return self.shorthand_id()
-
-    @property
-    def activity_extras(self):
-        d = ActivityObject.activity_extras.fget(self)
-        d.update(summary=self.summary)
-        if self.repo:
-            d.update(app_config_id=self.repo.app.config._id)
-        return d
-
-    def has_activity_access(self, perm, user, activity):
-        """
-        Check access against the original app.
-
-        Commits have no ACLs and are therefore always viewable by any user, if
-        they have access to the tool.
-        """
-        app_config_id = activity.obj.activity_extras.get('app_config_id')
-        if app_config_id:
-            app_config = AppConfig.query.get(_id=app_config_id)
-            return has_access(app_config, perm, user)
-        return True
-
-    def set_context(self, repo):
-        self.repo = repo
-
-    @LazyProperty
-    def author_url(self):
-        u = User.by_email_address(self.authored.email)
-        if u:
-            return u.url()
-
-    @LazyProperty
-    def committer_url(self):
-        u = User.by_email_address(self.committed.email)
-        if u:
-            return u.url()
-
-    @LazyProperty
-    def tree(self):
-        return self.get_tree(create=True)
-
-    def get_tree(self, create=True):
-        if self.tree_id is None and create:
-            self.tree_id = self.repo.compute_tree_new(self)
-        if self.tree_id is None:
-            return None
-        cache = getattr(c, 'model_cache', '') or ModelCache()
-        t = cache.get(Tree, dict(_id=self.tree_id))
-        if t is None and create:
-            self.tree_id = self.repo.compute_tree_new(self)
-            t = Tree.query.get(_id=self.tree_id)
-            cache.set(Tree, dict(_id=self.tree_id), t)
-        if t is not None:
-            t.set_context(self)
-        return t
-
-    @LazyProperty
-    def summary(self):
-        message = h.really_unicode(self.message)
-        first_line = message.split('\n')[0]
-        return h.text.truncate(first_line, 50)
-
-    def shorthand_id(self):
-        if self.repo is None:
-            self.repo = self.guess_repo()
-        if self.repo is None:
-            return repr(self)
-        return self.repo.shorthand_for_commit(self._id)
-
-    @LazyProperty
-    def symbolic_ids(self):
-        return self.repo.symbolics_for_commit(self)
-
-    def get_parent(self, index=0):
-        '''Get the parent of this commit.
-
-        If there is no parent commit, or if an invalid index is given,
-        returns None.
-        '''
-        try:
-            cache = getattr(c, 'model_cache', '') or ModelCache()
-            ci = cache.get(Commit, dict(_id=self.parent_ids[index]))
-            if not ci:
-                return None
-            ci.set_context(self.repo)
-            return ci
-        except IndexError:
-            return None
-
-    def climb_commit_tree(self, predicate=None):
-        '''
-        Returns a generator that walks up the commit tree along
-        the first-parent ancestory, starting with this commit,
-        optionally filtering by a predicate.'''
-        ancestor = self
-        while ancestor:
-            if predicate is None or predicate(ancestor):
-                yield ancestor
-            ancestor = ancestor.get_parent()
-
-    def url(self):
-        if self.repo is None:
-            self.repo = self.guess_repo()
-        if self.repo is None:
-            return '#'
-        return self.repo.url_for_commit(self)
-
-    def guess_repo(self):
-        import traceback
-        log.error('guess_repo: should not be called: %s' %
-                  ''.join(traceback.format_stack()))
-        for ac in c.project.app_configs:
-            try:
-                app = c.project.app_instance(ac)
-                if app.repo._id in self.repo_ids:
-                    return app.repo
-            except AttributeError:
-                pass
-        return None
-
-    def link_text(self):
-        '''The link text that will be used when a shortlink to this artifact
-        is expanded into an <a></a> tag.
-
-        By default this method returns type_s + shorthand_id(). Subclasses should
-        override this method to provide more descriptive link text.
-        '''
-        return self.shorthand_id()
-
-    def context(self):
-        result = dict(prev=None, next=None)
-        if self.parent_ids:
-            result['prev'] = self.query.find(
-                dict(_id={'$in': self.parent_ids})).all()
-            for ci in result['prev']:
-                ci.set_context(self.repo)
-        if self.child_ids:
-            result['next'] = self.query.find(
-                dict(_id={'$in': self.child_ids})).all()
-            for ci in result['next']:
-                ci.set_context(self.repo)
-        return result
-
-    @LazyProperty
-    def diffs(self):
-        return self.paged_diffs()
-
-    def paged_diffs(self, start=0, end=None):
-        di = DiffInfoDoc.m.get(_id=self._id)
-        if di is None:
-            return Object(added=[], removed=[], changed=[], copied=[], total=0)
-        added = []
-        removed = []
-        changed = []
-        copied = []
-        for change in di.differences[start:end]:
-            if change.rhs_id is None:
-                removed.append(change.name)
-            elif change.lhs_id is None:
-                added.append(change.name)
-            else:
-                changed.append(change.name)
-        copied = self._diffs_copied(added, removed)
-        return Object(
-            added=added, removed=removed,
-            changed=changed, copied=copied,
-            total=len(di.differences))
-
-    def _diffs_copied(self, added, removed):
-        '''Return list with file renames diffs.
-
-        Will change `added` and `removed` lists also.
-        '''
-        def _blobs_similarity(removed_blob, added):
-            best = dict(ratio=0, name='', blob=None)
-            for added_name in added:
-                added_blob = self.tree.get_obj_by_path(added_name)
-                if not isinstance(added_blob, Blob):
-                    continue
-                diff = SequenceMatcher(None, removed_blob.text,
-                                       added_blob.text)
-                ratio = diff.quick_ratio()
-                if ratio > best['ratio']:
-                    best['ratio'] = ratio
-                    best['name'] = added_name
-                    best['blob'] = added_blob
-
-                if ratio == 1:
-                    break  # we'll won't find better similarity than 100% :)
-
-            if best['ratio'] > DIFF_SIMILARITY_THRESHOLD:
-                diff = ''
-                if best['ratio'] < 1:
-                    added_blob = best['blob']
-                    rpath = ('a' + removed_blob.path()).encode('utf-8')
-                    apath = ('b' + added_blob.path()).encode('utf-8')
-                    diff = ''.join(unified_diff(list(removed_blob),
-                                                list(added_blob),
-                                                rpath, apath))
-                return dict(new=best['name'],
-                            ratio=best['ratio'], diff=diff)
-
-        def _trees_similarity(removed_tree, added):
-            for added_name in added:
-                added_tree = self.tree.get_obj_by_path(added_name)
-                if not isinstance(added_tree, Tree):
-                    continue
-                if removed_tree._id == added_tree._id:
-                    return dict(new=added_name,
-                                ratio=1, diff='')
-
-        if not removed:
-            return []
-        copied = []
-        prev_commit = self.get_parent()
-        for removed_name in removed[:]:
-            removed_blob = prev_commit.tree.get_obj_by_path(removed_name)
-            rename_info = None
-            if isinstance(removed_blob, Blob):
-                rename_info = _blobs_similarity(removed_blob, added)
-            elif isinstance(removed_blob, Tree):
-                rename_info = _trees_similarity(removed_blob, added)
-            if rename_info is not None:
-                rename_info['old'] = removed_name
-                copied.append(rename_info)
-                removed.remove(rename_info['old'])
-                added.remove(rename_info['new'])
-        return copied
-
-    def get_path(self, path, create=True):
-        path = path.lstrip('/')
-        parts = path.split('/')
-        cur = self.get_tree(create)
-        if cur is not None:
-            for part in parts:
-                if part != '':
-                    cur = cur[part]
-        return cur
-
-    def has_path(self, path):
-        try:
-            self.get_path(path)
-            return True
-        except KeyError:
-            return False
-
-    @LazyProperty
-    def changed_paths(self):
-        '''
-        Returns a list of paths changed in this commit.
-        Leading and trailing slashes are removed, and
-        the list is complete, meaning that if a sub-path
-        is changed, all of the parent paths are included
-        (including '' to represent the root path).
-
-        Example:
-
-            If the file /foo/bar is changed in the commit,
-            this would return ['', 'foo', 'foo/bar']
-        '''
-        changes = self.repo.get_changes(self._id)
-        changed_paths = set()
-        for c in changes:
-            node = c.strip('/')
-            changed_paths.add(node)
-            node_path = os.path.dirname(node)
-            while node_path:
-                changed_paths.add(node_path)
-                node_path = os.path.dirname(node_path)
-            changed_paths.add('')  # include '/' if there are any changes
-        return changed_paths
-
-    @LazyProperty
-    def added_paths(self):
-        '''
-        Returns a list of paths added in this commit.
-        Leading and trailing slashes are removed, and
-        the list is complete, meaning that if a directory
-        with subdirectories is added, all of the child
-        paths are included (this relies on the DiffInfoDoc
-        being complete).
-
-        Example:
-
-            If the directory /foo/bar/ is added in the commit
-            which contains a subdirectory /foo/bar/baz/ with
-            the file /foo/bar/baz/qux.txt, this would return:
-            ['foo/bar', 'foo/bar/baz', 'foo/bar/baz/qux.txt']
-        '''
-        diff_info = DiffInfoDoc.m.get(_id=self._id)
-        diffs = set()
-        if diff_info:
-            for d in diff_info.differences:
-                if d.lhs_id is None:
-                    diffs.add(d.name.strip('/'))
-        return diffs
-
-    @LazyProperty
-    def info(self):
-        return dict(
-            id=self._id,
-            author=self.authored.name,
-            author_email=self.authored.email,
-            date=self.authored.date,
-            author_url=self.author_url,
-            shortlink=self.shorthand_id(),
-            summary=self.summary
-        )
-
-
-class Tree(RepoObject):
-    # Ephemeral attrs
-    repo = None
-    commit = None
-    parent = None
-    name = None
-
-    def compute_hash(self):
-        '''Compute a hash based on the contents of the tree.  Note that this
-        hash does not necessarily correspond to any actual DVCS hash.
-        '''
-        lines = (
-            ['tree' + x.name + x.id for x in self.tree_ids]
-            + ['blob' + x.name + x.id for x in self.blob_ids]
-            + [x.type + x.name + x.id for x in self.other_ids])
-        sha_obj = sha1()
-        for line in sorted(lines):
-            sha_obj.update(line)
-        return sha_obj.hexdigest()
-
-    def __getitem__(self, name):
-        cache = getattr(c, 'model_cache', '') or ModelCache()
-        obj = self.by_name[name]
-        if obj['type'] == 'blob':
-            return Blob(self, name, obj['id'])
-        if obj['type'] == 'submodule':
-            log.info('Skipping submodule "%s"' % name)
-            raise KeyError, name
-        obj = cache.get(Tree, dict(_id=obj['id']))
-        if obj is None:
-            oid = self.repo.compute_tree_new(
-                self.commit, self.path() + name + '/')
-            obj = cache.get(Tree, dict(_id=oid))
-        if obj is None:
-            raise KeyError, name
-        obj.set_context(self, name)
-        return obj
-
-    def get_obj_by_path(self, path):
-        if hasattr(path, 'get'):
-            path = path['new']
-        if path.startswith('/'):
-            path = path[1:]
-        path = path.split('/')
-        obj = self
-        for p in path:
-            try:
-                obj = obj[p]
-            except KeyError:
-                return None
-        return obj
-
-    def get_blob_by_path(self, path):
-        obj = self.get_obj_by_path(path)
-        return obj if isinstance(obj, Blob) else None
-
-    def set_context(self, commit_or_tree, name=None):
-        assert commit_or_tree is not self
-        self.repo = commit_or_tree.repo
-        if name:
-            self.commit = commit_or_tree.commit
-            self.parent = commit_or_tree
-            self.name = name
-        else:
-            self.commit = commit_or_tree
-
-    def readme(self):
-        'returns (filename, unicode text) if a readme file is found'
-        for x in self.blob_ids:
-            if README_RE.match(x.name):
-                name = x.name
-                blob = self[name]
-                return (x.name, h.really_unicode(blob.text))
-        return None, None
-
-    def ls(self):
-        '''
-        List the entries in this tree, with historical commit info for
-        each node.
-        '''
-        last_commit = LastCommit.get(self)
-        # ensure that the LCD is saved, even if
-        # there is an error later in the request
-        if last_commit:
-            session(last_commit).flush(last_commit)
-            return self._lcd_map(last_commit)
-        else:
-            return []
-
-    def _lcd_map(self, lcd):
-        if lcd is None:
-            return []
-        commit_ids = [e.commit_id for e in lcd.entries]
-        commits = list(Commit.query.find(dict(_id={'$in': commit_ids})))
-        for commit in commits:
-            commit.set_context(self.repo)
-        commit_infos = {c._id: c.info for c in commits}
-        by_name = lambda n: n.name
-        tree_names = sorted([n.name for n in self.tree_ids])
-        blob_names = sorted(
-            [n.name for n in chain(self.blob_ids, self.other_ids)])
-
-        results = []
-        for type, names in (('DIR', tree_names), ('BLOB', blob_names)):
-            for name in names:
-                commit_info = commit_infos.get(lcd.by_name.get(name))
-                if not commit_info:
-                    commit_info = defaultdict(str)
-                elif 'id' in commit_info:
-                    commit_info['href'] = self.repo.url_for_commit(
-                        commit_info['id'])
-                results.append(dict(
-                    kind=type,
-                    name=name,
-                    href=name,
-                    last_commit=dict(
-                        author=commit_info['author'],
-                        author_email=commit_info['author_email'],
-                        author_url=commit_info['author_url'],
-                        date=commit_info.get('date'),
-                        href=commit_info.get('href', ''),
-                        shortlink=commit_info['shortlink'],
-                        summary=commit_info['summary'],
-                    ),
-                ))
-        return results
-
-    def path(self):
-        if self.parent:
-            assert self.parent is not self
-            return self.parent.path() + self.name + '/'
-        else:
-            return '/'
-
-    def url(self):
-        return self.commit.url() + 'tree' + self.path()
-
-    @LazyProperty
-    def by_name(self):
-        d = Object((x.name, x) for x in self.other_ids)
-        d.update(
-            (x.name, Object(x, type='tree'))
-            for x in self.tree_ids)
-        d.update(
-            (x.name, Object(x, type='blob'))
-            for x in self.blob_ids)
-        return d
-
-    def is_blob(self, name):
-        return self.by_name[name]['type'] == 'blob'
-
-    def get_blob(self, name):
-        x = self.by_name[name]
-        return Blob(self, name, x.id)
-
-
-class Blob(object):
-
-    '''Lightweight object representing a file in the repo'''
-
-    def __init__(self, tree, name, _id):
-        self._id = _id
-        self.tree = tree
-        self.name = name
-        self.repo = tree.repo
-        self.commit = tree.commit
-        fn, ext = os.path.splitext(self.name)
-        self.extension = ext or fn
-
-    def path(self):
-        return self.tree.path() + h.really_unicode(self.name)
-
-    def url(self):
-        return self.tree.url() + h.really_unicode(self.name)
-
-    @LazyProperty
-    def _content_type_encoding(self):
-        return self.repo.guess_type(self.name)
-
-    @LazyProperty
-    def content_type(self):
-        return self._content_type_encoding[0]
-
-    @LazyProperty
-    def content_encoding(self):
-        return self._content_type_encoding[1]
-
-    @property
-    def has_pypeline_view(self):
-        if README_RE.match(self.name) or self.extension in PYPELINE_EXTENSIONS:
-            return True
-        return False
-
-    @property
-    def has_html_view(self):
-        if (self.content_type.startswith('text/') or
-                self.extension in VIEWABLE_EXTENSIONS or
-                self.extension in PYPELINE_EXTENSIONS or
-                self.extension in self.repo._additional_viewable_extensions or
-                utils.is_text_file(self.text)):
-            return True
-        return False
-
-    @property
-    def has_image_view(self):
-        return self.content_type.startswith('image/')
-
-    def open(self):
-        return self.repo.open_blob(self)
-
-    def __iter__(self):
-        return iter(self.open())
-
-    @LazyProperty
-    def size(self):
-        return self.repo.blob_size(self)
-
-    @LazyProperty
-    def text(self):
-        return self.open().read()
-
-    @classmethod
-    def diff(cls, v0, v1):
-        differ = SequenceMatcher(v0, v1)
-        return differ.get_opcodes()
-
-
-class LastCommit(RepoObject):
-
-    def __repr__(self):
-        return '<LastCommit /%s %s>' % (self.path, self.commit_id)
-
-    @classmethod
-    def _last_commit_id(cls, commit, path):
-        try:
-            rev = commit.repo.log(commit._id, path, id_only=True).next()
-            return commit.repo.rev_to_commit_id(rev)
-        except StopIteration:
-            log.error('Tree node not recognized by SCM: %s @ %s',
-                      path, commit._id)
-            return commit._id
-
-    @classmethod
-    def _prev_commit_id(cls, commit, path):
-        if not commit.parent_ids or path in commit.added_paths:
-            return None  # new paths by definition have no previous LCD
-        lcid_cache = getattr(c, 'lcid_cache', '')
-        if lcid_cache != '' and path in lcid_cache:
-            return lcid_cache[path]
-        try:
-            log_iter = commit.repo.log(commit._id, path, id_only=True)
-            log_iter.next()
-            rev = log_iter.next()
-            return commit.repo.rev_to_commit_id(rev)
-        except StopIteration:
-            return None
-
-    @classmethod
-    def get(cls, tree):
-        '''Find or build the LastCommitDoc for the given tree.'''
-        cache = getattr(c, 'model_cache', '') or ModelCache()
-        path = tree.path().strip('/')
-        last_commit_id = cls._last_commit_id(tree.commit, path)
-        lcd = cache.get(cls, {'path': path, 'commit_id': last_commit_id})
-        if lcd is None:
-            commit = cache.get(Commit, {'_id': last_commit_id})
-            commit.set_context(tree.repo)
-            lcd = cls._build(commit.get_path(path))
-        return lcd
-
-    @classmethod
-    def _build(cls, tree):
-        '''
-          Build the LCD record, presuming that this tree is where it was most
-          recently changed.
-        '''
-        model_cache = getattr(c, 'model_cache', '') or ModelCache()
-        path = tree.path().strip('/')
-        entries = []
-        prev_lcd = None
-        prev_lcd_cid = cls._prev_commit_id(tree.commit, path)
-        if prev_lcd_cid:
-            prev_lcd = model_cache.get(
-                cls, {'path': path, 'commit_id': prev_lcd_cid})
-        entries = {}
-        nodes = set(
-            [node.name for node in chain(tree.tree_ids, tree.blob_ids, tree.other_ids)])
-        changed = set(
-            [node for node in nodes if os.path.join(path, node) in tree.commit.changed_paths])
-        unchanged = [os.path.join(path, node) for node in nodes - changed]
-        if prev_lcd:
-            # get unchanged entries from previously computed LCD
-            entries = prev_lcd.by_name
-        elif unchanged:
-            # no previously computed LCD, so get unchanged entries from SCM
-            # (but only ask for the ones that we know we need)
-            entries = tree.commit.repo.last_commit_ids(tree.commit, unchanged)
-            if entries is None:
-                # something strange went wrong; still show the list of files
-                # and possibly try again later
-                entries = {}
-            # paths are fully-qualified; shorten them back to just node names
-            entries = {
-                os.path.basename(path): commit_id for path, commit_id in entries.iteritems()}
-        # update with the nodes changed in this tree's commit
-        entries.update({node: tree.commit._id for node in changed})
-        # convert to a list of dicts, since mongo doesn't handle arbitrary keys
-        # well (i.e., . and $ not allowed)
-        entries = [{'name': name, 'commit_id': value}
-                   for name, value in entries.iteritems()]
-        lcd = cls(
-            commit_id=tree.commit._id,
-            path=path,
-            entries=entries,
-        )
-        model_cache.set(cls, {'path': path, 'commit_id': tree.commit._id}, lcd)
-        return lcd
-
-    @LazyProperty
-    def by_name(self):
-        return {n.name: n.commit_id for n in self.entries}
-
-mapper(Commit, CommitDoc, repository_orm_session)
-mapper(Tree, TreeDoc, repository_orm_session)
-mapper(LastCommit, LastCommitDoc, repository_orm_session)
-
-
-class ModelCache(object):
-
-    '''
-    Cache model instances based on query params passed to get.
-    '''
-
-    def __init__(self, max_instances=None, max_queries=None):
-        '''
-        By default, each model type can have 2000 instances and
-        8000 queries.  You can override these for specific model
-        types by passing in a dict() for either max_instances or
-        max_queries keyed by the class(es) with the max values.
-        Classes not in the dict() will use the default 2000/8000
-        default.
-
-        If you pass in a number instead of a dict, that value will
-        be used as the max for all classes.
-        '''
-        max_instances_default = 2000
-        max_queries_default = 8000
-        if isinstance(max_instances, int):
-            max_instances_default = max_instances
-        if isinstance(max_queries, int):
-            max_queries_default = max_queries
-        self._max_instances = defaultdict(lambda: max_instances_default)
-        self._max_queries = defaultdict(lambda: max_queries_default)
-        if hasattr(max_instances, 'items'):
-            self._max_instances.update(max_instances)
-        if hasattr(max_queries, 'items'):
-            self._max_queries.update(max_queries)
-
-        # keyed by query, holds _id
-        self._query_cache = defaultdict(OrderedDict)
-        self._instance_cache = defaultdict(OrderedDict)  # keyed by _id
-        self._synthetic_ids = defaultdict(set)
-        self._synthetic_id_queries = defaultdict(set)
-
-    def _normalize_query(self, query):
-        _query = query
-        if not isinstance(_query, tuple):
-            _query = tuple(sorted(_query.items(), key=lambda k: k[0]))
-        return _query
-
-    def _model_query(self, cls):
-        if hasattr(cls, 'query'):
-            return cls.query
-        elif hasattr(cls, 'm'):
-            return cls.m
-        else:
-            raise AttributeError(
-                '%s has neither "query" nor "m" attribute' % cls)
-
-    def get(self, cls, query):
-        _query = self._normalize_query(query)
-        self._touch(cls, _query)
-        if _query not in self._query_cache[cls]:
-            val = self._model_query(cls).get(**query)
-            self.set(cls, _query, val)
-            return val
-        _id = self._query_cache[cls][_query]
-        if _id is None:
-            return None
-        if _id not in self._instance_cache[cls]:
-            val = self._model_query(cls).get(**query)
-            self.set(cls, _query, val)
-            return val
-        return self._instance_cache[cls][_id]
-
-    def set(self, cls, query, val):
-        _query = self._normalize_query(query)
-        if val is not None:
-            _id = getattr(val, '_model_cache_id',
-                          getattr(val, '_id',
-                                  self._query_cache[cls].get(_query,
-                                                             None)))
-            if _id is None:
-                _id = val._model_cache_id = bson.ObjectId()
-                self._synthetic_ids[cls].add(_id)
-            if _id in self._synthetic_ids:
-                self._synthetic_id_queries[cls].add(_query)
-            self._query_cache[cls][_query] = _id
-            self._instance_cache[cls][_id] = val
-        else:
-            self._query_cache[cls][_query] = None
-        self._touch(cls, _query)
-        self._check_sizes(cls)
-
-    def _touch(self, cls, query):
-        '''
-        Keep track of insertion order, prevent duplicates,
-        and expire from the cache in a FIFO manner.
-        '''
-        _query = self._normalize_query(query)
-        if _query not in self._query_cache[cls]:
-            return
-        _id = self._query_cache[cls].pop(_query)
-        self._query_cache[cls][_query] = _id
-
-        if _id not in self._instance_cache[cls]:
-            return
-        val = self._instance_cache[cls].pop(_id)
-        self._instance_cache[cls][_id] = val
-
-    def _check_sizes(self, cls):
-        if self.num_queries(cls) > self._max_queries[cls]:
-            _id = self._remove_least_recently_used(self._query_cache[cls])
-            if _id in self._instance_cache[cls]:
-                instance = self._instance_cache[cls][_id]
-                self._try_flush(instance, expunge=False)
-        if self.num_instances(cls) > self._max_instances[cls]:
-            instance = self._remove_least_recently_used(
-                self._instance_cache[cls])
-            self._try_flush(instance, expunge=True)
-
-    def _try_flush(self, instance, expunge=False):
-        try:
-            inst_session = session(instance)
-        except AttributeError:
-            inst_session = None
-        if inst_session:
-            inst_session.flush(instance)
-            if expunge:
-                inst_session.expunge(instance)
-
-    def _remove_least_recently_used(self, cache):
-        # last-used (most-recently-used) is last in cache, so take first
-        key, val = cache.popitem(last=False)
-        return val
-
-    def expire_new_instances(self, cls):
-        '''
-        Expire any instances that were "new" or had no _id value.
-
-        If a lot of new instances of a class are being created, it's possible
-        for a query to pull a copy from mongo when a copy keyed by the synthetic
-        ID is still in the cache, potentially causing de-sync between the copies
-        leading to one with missing data overwriting the other.  Clear new
-        instances out of the cache relatively frequently (depending on the query
-        and instance cache sizes) to avoid this.
-        '''
-        for _query in self._synthetic_id_queries[cls]:
-            self._query_cache[cls].pop(_query)
-        self._synthetic_id_queries[cls] = set()
-        for _id in self._synthetic_ids[cls]:
-            instance = self._instance_cache[cls].pop(_id)
-            self._try_flush(instance, expunge=True)
-        self._synthetic_ids[cls] = set()
-
-    def num_queries(self, cls=None):
-        if cls is None:
-            return sum([len(c) for c in self._query_cache.values()])
-        else:
-            return len(self._query_cache[cls])
-
-    def num_instances(self, cls=None):
-        if cls is None:
-            return sum([len(c) for c in self._instance_cache.values()])
-        else:
-            return len(self._instance_cache[cls])
-
-    def instance_ids(self, cls):
-        return self._instance_cache[cls].keys()
-
-    def batch_load(self, cls, query, attrs=None):
-        '''
-        Load multiple results given a query.
-
-        Optionally takes a list of attribute names to use
-        as the cache key.  If not given, uses the keys of
-        the given query.
-        '''
-        if attrs is None:
-            attrs = query.keys()
-        for result in self._model_query(cls).find(query):
-            keys = {a: getattr(result, a) for a in attrs}
-            self.set(cls, keys, result)

http://git-wip-us.apache.org/repos/asf/allura/blob/839d9cfb/Allura/allura/model/repo_refresh.py
----------------------------------------------------------------------
diff --git a/Allura/allura/model/repo_refresh.py b/Allura/allura/model/repo_refresh.py
index a2dc628..cdd71dc 100644
--- a/Allura/allura/model/repo_refresh.py
+++ b/Allura/allura/model/repo_refresh.py
@@ -31,9 +31,9 @@ from ming.orm import mapper, session, ThreadLocalORMSession
 
 from allura.lib import utils
 from allura.lib import helpers as h
-from allura.model.repo import CommitDoc, TreeDoc, TreesDoc, DiffInfoDoc
-from allura.model.repo import CommitRunDoc
-from allura.model.repo import Commit, Tree, LastCommit, ModelCache
+from allura.model.repository import CommitDoc, TreeDoc, TreesDoc, DiffInfoDoc
+from allura.model.repository import CommitRunDoc
+from allura.model.repository import Commit, Tree, LastCommit, ModelCache
 from allura.model.index import ArtifactReferenceDoc, ShortlinkDoc
 from allura.model.auth import User
 from allura.model.timeline import TransientActor
@@ -178,7 +178,7 @@ def refresh_commit_repos(all_commit_ids, repo):
                 repo_ids={'$ne': repo._id})):
             oid = ci._id
             ci.repo_ids.append(repo._id)
-            index_id = 'allura.model.repo.Commit#' + oid
+            index_id = 'allura.model.repository.Commit#' + oid
             ref = ArtifactReferenceDoc(dict(
                 _id=index_id,
                 artifact_reference=dict(

http://git-wip-us.apache.org/repos/asf/allura/blob/839d9cfb/Allura/allura/model/repository.py
----------------------------------------------------------------------
diff --git a/Allura/allura/model/repository.py b/Allura/allura/model/repository.py
index 81b4cd4..662ec59 100644
--- a/Allura/allura/model/repository.py
+++ b/Allura/allura/model/repository.py
@@ -25,10 +25,12 @@ from subprocess import Popen, PIPE
 from hashlib import sha1
 from datetime import datetime
 from time import time
-from collections import defaultdict
+from collections import defaultdict, OrderedDict
 from urlparse import urljoin
 from threading import Thread
 from Queue import Queue
+from itertools import chain
+from difflib import SequenceMatcher, unified_diff
 
 import tg
 from paste.deploy.converters import asint
@@ -36,19 +38,25 @@ from pylons import tmpl_context as c
 from pylons import app_globals as g
 import pymongo
 import pymongo.errors
+import bson
 
 from ming import schema as S
+from ming import Field, collection, Index
 from ming.utils import LazyProperty
-from ming.orm import FieldProperty, session, Mapper
+from ming.orm import FieldProperty, session, Mapper, mapper
+from ming.base import Object
 
 from allura.lib import helpers as h
 from allura.lib import utils
+from allura.lib.security import has_access
 
 from .artifact import Artifact, VersionedArtifact
 from .auth import User
-from .repo_refresh import refresh_repo, unknown_commit_ids as unknown_commit_ids_repo
 from .timeline import ActivityObject
 from .monq_model import MonQTask
+from .project import AppConfig
+from .session import main_doc_session
+from .session import repository_orm_session
 
 log = logging.getLogger(__name__)
 config = utils.ConfigProxy(
@@ -62,20 +70,36 @@ VIEWABLE_EXTENSIONS = [
     '.pl', '.php4', '.php3', '.rhtml', '.svg', '.markdown', '.json', '.ini', '.tcl', '.vbs', '.xsl']
 
 
+# Some schema types
+SUser = dict(name=str, email=str, date=datetime)
+SObjType = S.OneOf('blob', 'tree', 'submodule')
+
+# Used for when we're going to batch queries using $in
+QSIZE = 100
+README_RE = re.compile(r'^README(\.[^.]*)?$', re.IGNORECASE)
+VIEWABLE_EXTENSIONS = [
+    '.php', '.py', '.js', '.java', '.html', '.htm', '.yaml', '.sh',
+    '.rb', '.phtml', '.txt', '.bat', '.ps1', '.xhtml', '.css', '.cfm', '.jsp', '.jspx',
+    '.pl', '.php4', '.php3', '.rhtml', '.svg', '.markdown', '.json', '.ini', '.tcl', '.vbs', '.xsl']
+PYPELINE_EXTENSIONS = utils.MARKDOWN_EXTENSIONS + ['.rst']
+
+DIFF_SIMILARITY_THRESHOLD = .5  # used for determining file renames
+
+
 class RepositoryImplementation(object):
 
     # Repository-specific code
     def init(self):  # pragma no cover
-        raise NotImplementedError, 'init'
+        raise NotImplementedError('init')
 
     def clone_from(self, source_url):  # pragma no cover
-        raise NotImplementedError, 'clone_from'
+        raise NotImplementedError('clone_from')
 
     def commit(self, revision):  # pragma no cover
-        raise NotImplementedError, 'commit'
+        raise NotImplementedError('commit')
 
     def all_commit_ids(self):  # pragma no cover
-        raise NotImplementedError, 'all_commit_ids'
+        raise NotImplementedError('all_commit_ids')
 
     def new_commits(self, all_commits=False):  # pragma no cover
         '''Return a list of native commits in topological order (heads first).
@@ -83,21 +107,21 @@ class RepositoryImplementation(object):
         "commit" is a repo-native object, NOT a Commit object.
         If all_commits is False, only return commits not already indexed.
         '''
-        raise NotImplementedError, 'new_commits'
+        raise NotImplementedError('new_commits')
 
     def commit_parents(self, commit):  # pragma no cover
         '''Return a list of native commits for the parents of the given (native)
         commit'''
-        raise NotImplementedError, 'commit_parents'
+        raise NotImplementedError('commit_parents')
 
     def refresh_commit_info(self, oid, lazy=True):  # pragma no cover
         '''Refresh the data in the commit with id oid'''
-        raise NotImplementedError, 'refresh_commit_info'
+        raise NotImplementedError('refresh_commit_info')
 
     def _setup_hooks(self, source_path=None):  # pragma no cover
         '''Install a hook in the repository that will ping the refresh url for
         the repo.  Optionally provide a path from which to copy existing hooks.'''
-        raise NotImplementedError, '_setup_hooks'
+        raise NotImplementedError('_setup_hooks')
 
     # pragma no cover
     def log(self, revs=None, path=None, exclude=None, id_only=True, **kw):
@@ -120,31 +144,31 @@ class RepositoryImplementation(object):
         If id_only is True, returns only the commit ID (which can be faster),
         otherwise it returns detailed information about each commit.
         """
-        raise NotImplementedError, 'log'
+        raise NotImplementedError('log')
 
     def compute_tree_new(self, commit, path='/'):  # pragma no cover
         '''Used in hg and svn to compute a git-like-tree lazily with the new models'''
-        raise NotImplementedError, 'compute_tree'
+        raise NotImplementedError('compute_tree')
 
     def open_blob(self, blob):  # pragma no cover
         '''Return a file-like object that contains the contents of the blob'''
-        raise NotImplementedError, 'open_blob'
+        raise NotImplementedError('open_blob')
 
     def blob_size(self, blob):
         '''Return a blob size in bytes'''
-        raise NotImplementedError, 'blob_size'
+        raise NotImplementedError('blob_size')
 
     def tarball(self, revision, path=None):
         '''Create a tarball for the revision'''
-        raise NotImplementedError, 'tarball'
+        raise NotImplementedError('tarball')
 
     def is_empty(self):
         '''Determine if the repository is empty by checking the filesystem'''
-        raise NotImplementedError, 'is_empty'
+        raise NotImplementedError('is_empty')
 
     def is_file(self, path, rev=None):
         '''Determine if the repository is a file by checking the filesystem'''
-        raise NotImplementedError, 'is_file'
+        raise NotImplementedError('is_file')
 
     @classmethod
     def shorthand_for_commit(cls, oid):
@@ -152,7 +176,7 @@ class RepositoryImplementation(object):
 
     def symbolics_for_commit(self, commit):
         '''Return symbolic branch and tag names for a commit.'''
-        raise NotImplementedError, 'symbolics_for_commit'
+        raise NotImplementedError('symbolics_for_commit')
 
     def url_for_commit(self, commit, url_type='ci'):
         'return an URL, given either a commit or object id'
@@ -194,19 +218,19 @@ class RepositoryImplementation(object):
 
     @property
     def head(self):
-        raise NotImplementedError, 'head'
+        raise NotImplementedError('head')
 
     @property
     def heads(self):
-        raise NotImplementedError, 'heads'
+        raise NotImplementedError('heads')
 
     @property
     def branches(self):
-        raise NotImplementedError, 'branches'
+        raise NotImplementedError('branches')
 
     @property
     def tags(self):
-        raise NotImplementedError, 'tags'
+        raise NotImplementedError('tags')
 
     def last_commit_ids(self, commit, paths):
         '''
@@ -601,11 +625,13 @@ class Repository(Artifact, ActivityObject):
         return content_type, encoding
 
     def unknown_commit_ids(self):
+        from allura.model.repo_refresh import unknown_commit_ids as unknown_commit_ids_repo
         return unknown_commit_ids_repo(self.all_commit_ids())
 
     def refresh(self, all_commits=False, notify=True, new_clone=False):
         '''Find any new commits in the repository and update'''
         try:
+            from allura.model.repo_refresh import refresh_repo
             log.info('... %r analyzing', self)
             self.set_status('analyzing')
             refresh_repo(self, all_commits, notify, new_clone)
@@ -640,7 +666,7 @@ class Repository(Artifact, ActivityObject):
         self._impl.tarball(revision, path)
 
     def rev_to_commit_id(self, rev):
-        raise NotImplementedError, 'rev_to_commit_id'
+        raise NotImplementedError('rev_to_commit_id')
 
     def set_status(self, status):
         '''
@@ -780,6 +806,923 @@ class MergeRequest(VersionedArtifact, ActivityObject):
         return result
 
 
+# Basic commit information
+# One of these for each commit in the physical repo on disk. The _id is the
+# hexsha of the commit (for Git and Hg).
+CommitDoc = collection(
+    'repo_ci', main_doc_session,
+    Field('_id', str),
+    Field('tree_id', str),
+    Field('committed', SUser),
+    Field('authored', SUser),
+    Field('message', str),
+    Field('parent_ids', [str], index=True),
+    Field('child_ids', [str], index=True),
+    Field('repo_ids', [S.ObjectId()], index=True))
+
+# Basic tree information (also see TreesDoc)
+TreeDoc = collection(
+    'repo_tree', main_doc_session,
+    Field('_id', str),
+    Field('tree_ids', [dict(name=str, id=str)]),
+    Field('blob_ids', [dict(name=str, id=str)]),
+    Field('other_ids', [dict(name=str, id=str, type=SObjType)]))
+
+# Information about the last commit to touch a tree
+LastCommitDoc = collection(
+    'repo_last_commit', main_doc_session,
+    Field('_id', S.ObjectId()),
+    Field('commit_id', str),
+    Field('path', str),
+    Index('commit_id', 'path'),
+    Field('entries', [dict(
+        name=str,
+        commit_id=str)]))
+
+# List of all trees contained within a commit
+# TreesDoc._id = CommitDoc._id
+# TreesDoc.tree_ids = [ TreeDoc._id, ... ]
+TreesDoc = collection(
+    'repo_trees', main_doc_session,
+    Field('_id', str),
+    Field('tree_ids', [str]))
+
+# Information about which things were added/removed in a commit
+# DiffInfoDoc._id = CommitDoc._id
+DiffInfoDoc = collection(
+    'repo_diffinfo', main_doc_session,
+    Field('_id', str),
+    Field(
+        'differences',
+        [dict(name=str, lhs_id=str, rhs_id=str)]))
+
+# List of commit runs (a run is a linear series of single-parent commits)
+# CommitRunDoc.commit_ids = [ CommitDoc._id, ... ]
+CommitRunDoc = collection(
+    'repo_commitrun', main_doc_session,
+    Field('_id', str),
+    Field('parent_commit_ids', [str], index=True),
+    Field('commit_ids', [str], index=True),
+    Field('commit_times', [datetime]))
+
+
+class RepoObject(object):
+
+    def __repr__(self):  # pragma no cover
+        return '<%s %s>' % (
+            self.__class__.__name__, self._id)
+
+    def primary(self):
+        return self
+
+    def index_id(self):
+        '''Globally unique artifact identifier.  Used for
+        SOLR ID, shortlinks, and maybe elsewhere
+        '''
+        id = '%s.%s#%s' % (
+            'allura.model.repo',  # preserve index_id after module consolidation
+            self.__class__.__name__,
+            self._id)
+        return id.replace('.', '/')
+
+    @classmethod
+    def upsert(cls, id, **kwargs):
+        isnew = False
+        r = cls.query.get(_id=id)
+        if r is not None:
+            return r, isnew
+        try:
+            r = cls(_id=id, **kwargs)
+            session(r).flush(r)
+            isnew = True
+        except pymongo.errors.DuplicateKeyError:  # pragma no cover
+            session(r).expunge(r)
+            r = cls.query.get(_id=id)
+        return r, isnew
+
+
+class Commit(RepoObject, ActivityObject):
+    type_s = 'Commit'
+    # Ephemeral attrs
+    repo = None
+
+    def __init__(self, **kw):
+        for k, v in kw.iteritems():
+            setattr(self, k, v)
+
+    @property
+    def activity_name(self):
+        return self.shorthand_id()
+
+    @property
+    def activity_extras(self):
+        d = ActivityObject.activity_extras.fget(self)
+        d.update(summary=self.summary)
+        if self.repo:
+            d.update(app_config_id=self.repo.app.config._id)
+        return d
+
+    def has_activity_access(self, perm, user, activity):
+        """
+        Check access against the original app.
+
+        Commits have no ACLs and are therefore always viewable by any user, if
+        they have access to the tool.
+        """
+        app_config_id = activity.obj.activity_extras.get('app_config_id')
+        if app_config_id:
+            app_config = AppConfig.query.get(_id=app_config_id)
+            return has_access(app_config, perm, user)
+        return True
+
+    def set_context(self, repo):
+        self.repo = repo
+
+    @LazyProperty
+    def author_url(self):
+        u = User.by_email_address(self.authored.email)
+        if u:
+            return u.url()
+
+    @LazyProperty
+    def committer_url(self):
+        u = User.by_email_address(self.committed.email)
+        if u:
+            return u.url()
+
+    @LazyProperty
+    def tree(self):
+        return self.get_tree(create=True)
+
+    def get_tree(self, create=True):
+        if self.tree_id is None and create:
+            self.tree_id = self.repo.compute_tree_new(self)
+        if self.tree_id is None:
+            return None
+        cache = getattr(c, 'model_cache', '') or ModelCache()
+        t = cache.get(Tree, dict(_id=self.tree_id))
+        if t is None and create:
+            self.tree_id = self.repo.compute_tree_new(self)
+            t = Tree.query.get(_id=self.tree_id)
+            cache.set(Tree, dict(_id=self.tree_id), t)
+        if t is not None:
+            t.set_context(self)
+        return t
+
+    @LazyProperty
+    def summary(self):
+        message = h.really_unicode(self.message)
+        first_line = message.split('\n')[0]
+        return h.text.truncate(first_line, 50)
+
+    def shorthand_id(self):
+        if self.repo is None:
+            self.repo = self.guess_repo()
+        if self.repo is None:
+            return repr(self)
+        return self.repo.shorthand_for_commit(self._id)
+
+    @LazyProperty
+    def symbolic_ids(self):
+        return self.repo.symbolics_for_commit(self)
+
+    def get_parent(self, index=0):
+        '''Get the parent of this commit.
+
+        If there is no parent commit, or if an invalid index is given,
+        returns None.
+        '''
+        try:
+            cache = getattr(c, 'model_cache', '') or ModelCache()
+            ci = cache.get(Commit, dict(_id=self.parent_ids[index]))
+            if not ci:
+                return None
+            ci.set_context(self.repo)
+            return ci
+        except IndexError:
+            return None
+
+    def climb_commit_tree(self, predicate=None):
+        '''
+        Returns a generator that walks up the commit tree along
+        the first-parent ancestry, starting with this commit,
+        optionally filtering by a predicate.'''
+        ancestor = self
+        while ancestor:
+            if predicate is None or predicate(ancestor):
+                yield ancestor
+            ancestor = ancestor.get_parent()
+
+    def url(self):
+        if self.repo is None:
+            self.repo = self.guess_repo()
+        if self.repo is None:
+            return '#'
+        return self.repo.url_for_commit(self)
+
+    def guess_repo(self):
+        import traceback
+        log.error('guess_repo: should not be called: %s' %
+                  ''.join(traceback.format_stack()))
+        for ac in c.project.app_configs:
+            try:
+                app = c.project.app_instance(ac)
+                if app.repo._id in self.repo_ids:
+                    return app.repo
+            except AttributeError:
+                pass
+        return None
+
+    def link_text(self):
+        '''The link text that will be used when a shortlink to this artifact
+        is expanded into an <a></a> tag.
+
+        For commits this is just shorthand_id(), without a type_s prefix.
+        Subclasses may override this to provide more descriptive link text.
+        '''
+        return self.shorthand_id()
+
+    def context(self):
+        result = dict(prev=None, next=None)
+        if self.parent_ids:
+            result['prev'] = self.query.find(
+                dict(_id={'$in': self.parent_ids})).all()
+            for ci in result['prev']:
+                ci.set_context(self.repo)
+        if self.child_ids:
+            result['next'] = self.query.find(
+                dict(_id={'$in': self.child_ids})).all()
+            for ci in result['next']:
+                ci.set_context(self.repo)
+        return result
+
+    @LazyProperty
+    def diffs(self):
+        return self.paged_diffs()
+
+    def paged_diffs(self, start=0, end=None):
+        di = DiffInfoDoc.m.get(_id=self._id)
+        if di is None:
+            return Object(added=[], removed=[], changed=[], copied=[], total=0)
+        added = []
+        removed = []
+        changed = []
+        copied = []
+        for change in di.differences[start:end]:
+            if change.rhs_id is None:
+                removed.append(change.name)
+            elif change.lhs_id is None:
+                added.append(change.name)
+            else:
+                changed.append(change.name)
+        copied = self._diffs_copied(added, removed)
+        return Object(
+            added=added, removed=removed,
+            changed=changed, copied=copied,
+            total=len(di.differences))
+
+    def _diffs_copied(self, added, removed):
+        '''Return list with file renames diffs.
+
+        Will change `added` and `removed` lists also.
+        '''
+        def _blobs_similarity(removed_blob, added):
+            best = dict(ratio=0, name='', blob=None)
+            for added_name in added:
+                added_blob = self.tree.get_obj_by_path(added_name)
+                if not isinstance(added_blob, Blob):
+                    continue
+                diff = SequenceMatcher(None, removed_blob.text,
+                                       added_blob.text)
+                ratio = diff.quick_ratio()
+                if ratio > best['ratio']:
+                    best['ratio'] = ratio
+                    best['name'] = added_name
+                    best['blob'] = added_blob
+
+                if ratio == 1:
+                    break  # we'll won't find better similarity than 100% :)
+
+            if best['ratio'] > DIFF_SIMILARITY_THRESHOLD:
+                diff = ''
+                if best['ratio'] < 1:
+                    added_blob = best['blob']
+                    rpath = ('a' + removed_blob.path()).encode('utf-8')
+                    apath = ('b' + added_blob.path()).encode('utf-8')
+                    diff = ''.join(unified_diff(list(removed_blob),
+                                                list(added_blob),
+                                                rpath, apath))
+                return dict(new=best['name'],
+                            ratio=best['ratio'], diff=diff)
+
+        def _trees_similarity(removed_tree, added):
+            for added_name in added:
+                added_tree = self.tree.get_obj_by_path(added_name)
+                if not isinstance(added_tree, Tree):
+                    continue
+                if removed_tree._id == added_tree._id:
+                    return dict(new=added_name,
+                                ratio=1, diff='')
+
+        if not removed:
+            return []
+        copied = []
+        prev_commit = self.get_parent()
+        for removed_name in removed[:]:
+            removed_blob = prev_commit.tree.get_obj_by_path(removed_name)
+            rename_info = None
+            if isinstance(removed_blob, Blob):
+                rename_info = _blobs_similarity(removed_blob, added)
+            elif isinstance(removed_blob, Tree):
+                rename_info = _trees_similarity(removed_blob, added)
+            if rename_info is not None:
+                rename_info['old'] = removed_name
+                copied.append(rename_info)
+                removed.remove(rename_info['old'])
+                added.remove(rename_info['new'])
+        return copied
+
+    def get_path(self, path, create=True):
+        path = path.lstrip('/')
+        parts = path.split('/')
+        cur = self.get_tree(create)
+        if cur is not None:
+            for part in parts:
+                if part != '':
+                    cur = cur[part]
+        return cur
+
+    def has_path(self, path):
+        try:
+            self.get_path(path)
+            return True
+        except KeyError:
+            return False
+
+    @LazyProperty
+    def changed_paths(self):
+        '''
+        Returns a set of paths changed in this commit.
+        Leading and trailing slashes are removed, and
+        the set is complete, meaning that if a sub-path
+        is changed, all of the parent paths are included
+        (including '' to represent the root path).
+
+        Example:
+
+            If the file /foo/bar is changed in the commit,
+            this would return set(['', 'foo', 'foo/bar'])
+        '''
+        changes = self.repo.get_changes(self._id)
+        changed_paths = set()
+        for change in changes:
+            node = change.strip('/')
+            changed_paths.add(node)
+            node_path = os.path.dirname(node)
+            while node_path:
+                changed_paths.add(node_path)
+                node_path = os.path.dirname(node_path)
+            changed_paths.add('')  # include '/' if there are any changes
+        return changed_paths
+
+    @LazyProperty
+    def added_paths(self):
+        '''
+        Returns the set of paths added in this commit.
+
+        A path counts as added when its entry in this commit's
+        DiffInfoDoc has no left-hand-side id.  Leading and trailing
+        slashes are removed.  The result is only as complete as the
+        DiffInfoDoc: if a directory with subdirectories is added, the
+        child paths are included only when the DiffInfoDoc lists them.
+        An empty set is returned when there is no DiffInfoDoc.
+
+        Example:
+
+            If the directory /foo/bar/ is added in the commit
+            which contains a subdirectory /foo/bar/baz/ with
+            the file /foo/bar/baz/qux.txt, this would return:
+            set(['foo/bar', 'foo/bar/baz', 'foo/bar/baz/qux.txt'])
+        '''
+        diff_info = DiffInfoDoc.m.get(_id=self._id)
+        if not diff_info:
+            return set()
+        return set(
+            entry.name.strip('/')
+            for entry in diff_info.differences
+            if entry.lhs_id is None)
+
+    @LazyProperty
+    def info(self):
+        '''
+        Summary of this commit as a plain dict: id, author name, email
+        and date (taken from ``authored``), author url, shortlink and
+        summary.
+        '''
+        authored = self.authored
+        return {
+            'id': self._id,
+            'author': authored.name,
+            'author_email': authored.email,
+            'date': authored.date,
+            'author_url': self.author_url,
+            'shortlink': self.shorthand_id(),
+            'summary': self.summary,
+        }
+
+
+class Tree(RepoObject):
+    '''
+    A directory node of a repository, as seen from a particular commit.
+
+    Children are read from the ``tree_ids``, ``blob_ids`` and
+    ``other_ids`` collections.  The class attributes below are ephemeral
+    context that ``set_context`` fills in.
+    '''
+    # Ephemeral attrs
+    repo = None
+    commit = None
+    parent = None
+    name = None
+
+    def compute_hash(self):
+        '''Compute a hash based on the contents of the tree.  Note that this
+        hash does not necessarily correspond to any actual DVCS hash.
+        '''
+        entries = ['tree' + t.name + t.id for t in self.tree_ids]
+        entries += ['blob' + b.name + b.id for b in self.blob_ids]
+        entries += [o.type + o.name + o.id for o in self.other_ids]
+        entries.sort()
+        digest = sha1()
+        for entry in entries:
+            digest.update(entry)
+        return digest.hexdigest()
+
+    def __getitem__(self, name):
+        '''
+        Look up the child called ``name``.
+
+        Returns a ``Blob`` for blob entries and a ``Tree`` (with its
+        context set to this tree) for anything else that can be loaded.
+        Raises ``KeyError`` for unknown names, for submodules, and for
+        sub-trees that cannot be found.
+        '''
+        cache = getattr(c, 'model_cache', '') or ModelCache()
+        entry = self.by_name[name]
+        entry_type = entry['type']
+        if entry_type == 'blob':
+            return Blob(self, name, entry['id'])
+        if entry_type == 'submodule':
+            log.info('Skipping submodule "%s"' % name)
+            raise KeyError(name)
+        subtree = cache.get(Tree, dict(_id=entry['id']))
+        if subtree is None:
+            # lookup missed: ask the repo to compute the tree, then
+            # retry the lookup with the id it hands back
+            oid = self.repo.compute_tree_new(
+                self.commit, self.path() + name + '/')
+            subtree = cache.get(Tree, dict(_id=oid))
+            if subtree is None:
+                raise KeyError(name)
+        subtree.set_context(self, name)
+        return subtree
+
+    def get_obj_by_path(self, path):
+        '''
+        Resolve ``path`` relative to this tree and return the object
+        found there, or None when some segment does not exist.
+
+        ``path`` may also be a dict-like object, in which case its
+        'new' item is used as the path.  A single leading slash is
+        dropped.
+        '''
+        if hasattr(path, 'get'):
+            path = path['new']
+        if path.startswith('/'):
+            path = path[1:]
+        segments = path.split('/')
+        node = self
+        try:
+            for segment in segments:
+                node = node[segment]
+        except KeyError:
+            return None
+        return node
+
+    def get_blob_by_path(self, path):
+        '''Like ``get_obj_by_path``, but returns None unless a Blob is found'''
+        found = self.get_obj_by_path(path)
+        if isinstance(found, Blob):
+            return found
+        return None
+
+    def set_context(self, commit_or_tree, name=None):
+        '''
+        Attach the ephemeral context to this tree.
+
+        With a ``name``, ``commit_or_tree`` is treated as the parent tree:
+        its commit is adopted and parent/name are recorded.  Without one,
+        ``commit_or_tree`` is treated as the commit itself.
+        '''
+        assert commit_or_tree is not self
+        self.repo = commit_or_tree.repo
+        if not name:
+            self.commit = commit_or_tree
+            return
+        self.commit = commit_or_tree.commit
+        self.parent = commit_or_tree
+        self.name = name
+
+    def readme(self):
+        'returns (filename, unicode text) if a readme file is found'
+        for entry in self.blob_ids:
+            if not README_RE.match(entry.name):
+                continue
+            blob = self[entry.name]
+            return (entry.name, h.really_unicode(blob.text))
+        return None, None
+
+    def ls(self):
+        '''
+        List the entries in this tree, with historical commit info for
+        each node.
+        '''
+        last_commit = LastCommit.get(self)
+        if not last_commit:
+            return []
+        # ensure that the LCD is saved, even if
+        # there is an error later in the request
+        session(last_commit).flush(last_commit)
+        return self._lcd_map(last_commit)
+
+    def _lcd_map(self, lcd):
+        '''
+        Turn a LastCommit record into the list of dicts returned by
+        ``ls``: sub-trees first (kind 'DIR'), then blobs and other
+        entries (kind 'BLOB'), each group sorted by name.
+
+        Nodes without a known commit get empty-string placeholders.
+        For known commits an 'href' key is added to the commit's info
+        dict.
+        '''
+        if lcd is None:
+            return []
+        commit_ids = [entry.commit_id for entry in lcd.entries]
+        commits = list(Commit.query.find(dict(_id={'$in': commit_ids})))
+        for commit in commits:
+            commit.set_context(self.repo)
+        commit_infos = dict((ci._id, ci.info) for ci in commits)
+        tree_names = sorted(t.name for t in self.tree_ids)
+        blob_names = sorted(
+            n.name for n in chain(self.blob_ids, self.other_ids))
+
+        results = []
+        for kind, names in (('DIR', tree_names), ('BLOB', blob_names)):
+            for name in names:
+                commit_info = commit_infos.get(lcd.by_name.get(name))
+                if not commit_info:
+                    # unknown commit: every field reads as ''
+                    commit_info = defaultdict(str)
+                elif 'id' in commit_info:
+                    commit_info['href'] = self.repo.url_for_commit(
+                        commit_info['id'])
+                last_commit = {
+                    'author': commit_info['author'],
+                    'author_email': commit_info['author_email'],
+                    'author_url': commit_info['author_url'],
+                    'date': commit_info.get('date'),
+                    'href': commit_info.get('href', ''),
+                    'shortlink': commit_info['shortlink'],
+                    'summary': commit_info['summary'],
+                }
+                results.append({
+                    'kind': kind,
+                    'name': name,
+                    'href': name,
+                    'last_commit': last_commit,
+                })
+        return results
+
+    def path(self):
+        '''Slash-delimited path of this tree; '/' for a tree with no parent'''
+        if not self.parent:
+            return '/'
+        assert self.parent is not self
+        return self.parent.path() + self.name + '/'
+
+    def url(self):
+        '''URL of this tree, built from the commit URL and ``path()``'''
+        return self.commit.url() + 'tree' + self.path()
+
+    @LazyProperty
+    def by_name(self):
+        '''
+        Mapping of child name to its entry.  Tree and blob entries are
+        tagged with type 'tree' / 'blob'; other entries are stored as-is.
+        On a name clash blobs win over trees, and trees over others.
+        '''
+        lookup = Object((o.name, o) for o in self.other_ids)
+        for t in self.tree_ids:
+            lookup[t.name] = Object(t, type='tree')
+        for b in self.blob_ids:
+            lookup[b.name] = Object(b, type='blob')
+        return lookup
+
+    def is_blob(self, name):
+        '''True if the child ``name`` is a blob; KeyError if it is unknown'''
+        entry = self.by_name[name]
+        return entry['type'] == 'blob'
+
+    def get_blob(self, name):
+        '''Return a Blob for the child ``name``; KeyError if it is unknown'''
+        entry = self.by_name[name]
+        return Blob(self, name, entry.id)
+
+
+class Blob(object):
+
+    '''Lightweight object representing a file in the repo'''
+
+    def __init__(self, tree, name, _id):
+        '''
+        ``tree`` is the containing tree (its repo and commit are adopted),
+        ``name`` the file name within that tree, ``_id`` the blob id.
+        '''
+        self._id = _id
+        self.tree = tree
+        self.name = name
+        self.repo = tree.repo
+        self.commit = tree.commit
+        fn, ext = os.path.splitext(self.name)
+        # names without an extension (e.g. dotfiles) use the whole name
+        self.extension = ext or fn
+
+    def path(self):
+        '''Path of the blob: the tree's path followed by the file name'''
+        return self.tree.path() + h.really_unicode(self.name)
+
+    def url(self):
+        '''URL of the blob: the tree's URL followed by the file name'''
+        return self.tree.url() + h.really_unicode(self.name)
+
+    @LazyProperty
+    def _content_type_encoding(self):
+        # (content_type, content_encoding) pair as guessed by the repo
+        return self.repo.guess_type(self.name)
+
+    @LazyProperty
+    def content_type(self):
+        return self._content_type_encoding[0]
+
+    @LazyProperty
+    def content_encoding(self):
+        return self._content_type_encoding[1]
+
+    @property
+    def has_pypeline_view(self):
+        '''True for readme files and for names with a pypeline extension'''
+        return (bool(README_RE.match(self.name)) or
+                self.extension in PYPELINE_EXTENSIONS)
+
+    @property
+    def has_html_view(self):
+        '''
+        True when the blob can be shown as text.  The cheap checks on
+        content type and extension come first; the blob's content is only
+        read as a last resort.
+        '''
+        return bool(
+            self.content_type.startswith('text/') or
+            self.extension in VIEWABLE_EXTENSIONS or
+            self.extension in PYPELINE_EXTENSIONS or
+            self.extension in self.repo._additional_viewable_extensions or
+            utils.is_text_file(self.text))
+
+    @property
+    def has_image_view(self):
+        return self.content_type.startswith('image/')
+
+    def open(self):
+        '''Return the object the repo provides for reading this blob'''
+        return self.repo.open_blob(self)
+
+    def __iter__(self):
+        return iter(self.open())
+
+    @LazyProperty
+    def size(self):
+        return self.repo.blob_size(self)
+
+    @LazyProperty
+    def text(self):
+        '''Entire content of the blob, read once and then cached'''
+        return self.open().read()
+
+    @classmethod
+    def diff(cls, v0, v1):
+        '''
+        Return the opcodes describing how to turn sequence ``v0`` into
+        sequence ``v1``.
+        '''
+        # NOTE(review): assumes SequenceMatcher is difflib's, whose first
+        # positional argument is ``isjunk``.  Passing (v0, v1) directly
+        # would use v0 as the junk filter and compare v1 to an empty
+        # sequence, so the junk filter is given explicitly as None.
+        differ = SequenceMatcher(None, v0, v1)
+        return differ.get_opcodes()
+
+
+class LastCommit(RepoObject):
+    '''
+    Record of which commit last touched each entry of a tree, keyed by
+    the tree's path and the commit in which the tree itself last changed.
+    '''
+
+    def __repr__(self):
+        return '<LastCommit /%s %s>' % (self.path, self.commit_id)
+
+    @classmethod
+    def _last_commit_id(cls, commit, path):
+        '''
+        Id of the most recent commit, at or before ``commit``, in the
+        repo log for ``path``.  Falls back to ``commit`` itself (and logs
+        an error) when the log yields nothing for the path.
+        '''
+        try:
+            rev = commit.repo.log(commit._id, path, id_only=True).next()
+            return commit.repo.rev_to_commit_id(rev)
+        except StopIteration:
+            log.error('Tree node not recognized by SCM: %s @ %s',
+                      path, commit._id)
+            return commit._id
+
+    @classmethod
+    def _prev_commit_id(cls, commit, path):
+        '''
+        Id of the second entry in the repo log for ``path`` starting at
+        ``commit``, or None when there is none.  A ``lcid_cache`` on the
+        context object, when present, is consulted before the repo log.
+        '''
+        if not commit.parent_ids or path in commit.added_paths:
+            return None  # new paths by definition have no previous LCD
+        lcid_cache = getattr(c, 'lcid_cache', '')
+        if lcid_cache != '' and path in lcid_cache:
+            return lcid_cache[path]
+        try:
+            log_iter = commit.repo.log(commit._id, path, id_only=True)
+            log_iter.next()  # skip the first entry; we want the one after it
+            rev = log_iter.next()
+            return commit.repo.rev_to_commit_id(rev)
+        except StopIteration:
+            return None
+
+    @classmethod
+    def get(cls, tree):
+        '''
+        Find or build the LastCommitDoc for the given tree.
+
+        Returns None (after logging an error) when the commit that last
+        changed the tree cannot be loaded, instead of failing with an
+        AttributeError on the missing commit.
+        '''
+        cache = getattr(c, 'model_cache', '') or ModelCache()
+        path = tree.path().strip('/')
+        last_commit_id = cls._last_commit_id(tree.commit, path)
+        lcd = cache.get(cls, {'path': path, 'commit_id': last_commit_id})
+        if lcd is None:
+            commit = cache.get(Commit, {'_id': last_commit_id})
+            if commit is None:
+                log.error('Commit %s not found; cannot build LastCommit for /%s',
+                          last_commit_id, path)
+                return None
+            commit.set_context(tree.repo)
+            lcd = cls._build(commit.get_path(path))
+        return lcd
+
+    @classmethod
+    def _build(cls, tree):
+        '''
+          Build the LCD record, presuming that this tree is where it was most
+          recently changed.
+        '''
+        model_cache = getattr(c, 'model_cache', '') or ModelCache()
+        path = tree.path().strip('/')
+        prev_lcd = None
+        prev_lcd_cid = cls._prev_commit_id(tree.commit, path)
+        if prev_lcd_cid:
+            prev_lcd = model_cache.get(
+                cls, {'path': path, 'commit_id': prev_lcd_cid})
+        entries = {}
+        nodes = set(
+            [node.name for node in chain(tree.tree_ids, tree.blob_ids, tree.other_ids)])
+        changed = set(
+            [node for node in nodes if os.path.join(path, node) in tree.commit.changed_paths])
+        unchanged = [os.path.join(path, node) for node in nodes - changed]
+        if prev_lcd:
+            # get unchanged entries from previously computed LCD; work on a
+            # copy, since by_name is cached on prev_lcd and the update()
+            # below must not leak this commit's ids into the previous record
+            entries = dict(prev_lcd.by_name)
+        elif unchanged:
+            # no previously computed LCD, so get unchanged entries from SCM
+            # (but only ask for the ones that we know we need)
+            entries = tree.commit.repo.last_commit_ids(tree.commit, unchanged)
+            if entries is None:
+                # something strange went wrong; still show the list of files
+                # and possibly try again later
+                entries = {}
+            # paths are fully-qualified; shorten them back to just node names
+            entries = {
+                os.path.basename(full_path): commit_id
+                for full_path, commit_id in entries.iteritems()}
+        # update with the nodes changed in this tree's commit
+        entries.update({node: tree.commit._id for node in changed})
+        # convert to a list of dicts, since mongo doesn't handle arbitrary keys
+        # well (i.e., . and $ not allowed)
+        entries = [{'name': name, 'commit_id': value}
+                   for name, value in entries.iteritems()]
+        lcd = cls(
+            commit_id=tree.commit._id,
+            path=path,
+            entries=entries,
+        )
+        model_cache.set(cls, {'path': path, 'commit_id': tree.commit._id}, lcd)
+        return lcd
+
+    @LazyProperty
+    def by_name(self):
+        '''Mapping of entry name to the id of the commit that last changed it'''
+        return {n.name: n.commit_id for n in self.entries}
+
+
+class ModelCache(object):
+
+    '''
+    Cache model instances based on query params passed to get.
+
+    Two size-bounded caches are kept per model class: one mapping a
+    normalized query to an instance id, and one mapping an instance id
+    to the instance.  Both evict their least-recently-used entry when
+    they grow past their limit.
+    '''
+
+    def __init__(self, max_instances=None, max_queries=None):
+        '''
+        By default, each model type can have 2000 instances and
+        8000 queries.  You can override these for specific model
+        types by passing in a dict() for either max_instances or
+        max_queries keyed by the class(es) with the max values.
+        Classes not in the dict() will use the 2000/8000 defaults.
+
+        If you pass in a number instead of a dict, that value will
+        be used as the max for all classes.
+        '''
+        max_instances_default = 2000
+        max_queries_default = 8000
+        if isinstance(max_instances, int):
+            max_instances_default = max_instances
+        if isinstance(max_queries, int):
+            max_queries_default = max_queries
+        self._max_instances = defaultdict(lambda: max_instances_default)
+        self._max_queries = defaultdict(lambda: max_queries_default)
+        if hasattr(max_instances, 'items'):
+            self._max_instances.update(max_instances)
+        if hasattr(max_queries, 'items'):
+            self._max_queries.update(max_queries)
+
+        # keyed by query, holds _id
+        self._query_cache = defaultdict(OrderedDict)
+        self._instance_cache = defaultdict(OrderedDict)  # keyed by _id
+        # per class: ids invented here for instances that had no _id, and
+        # the queries that resolve to those ids
+        self._synthetic_ids = defaultdict(set)
+        self._synthetic_id_queries = defaultdict(set)
+
+    def _normalize_query(self, query):
+        '''
+        Return a hashable form of ``query``: a tuple of (key, value)
+        pairs sorted by key.  Tuples are assumed to be normalized already.
+        '''
+        _query = query
+        if not isinstance(_query, tuple):
+            _query = tuple(sorted(_query.items(), key=lambda k: k[0]))
+        return _query
+
+    def _model_query(self, cls):
+        '''
+        Return the query manager of ``cls`` (``cls.query``, else
+        ``cls.m``); raises AttributeError when it has neither.
+        '''
+        if hasattr(cls, 'query'):
+            return cls.query
+        elif hasattr(cls, 'm'):
+            return cls.m
+        else:
+            raise AttributeError(
+                '%s has neither "query" nor "m" attribute' % cls)
+
+    def get(self, cls, query):
+        '''
+        Return the instance of ``cls`` matching ``query`` (a dict of
+        keyword arguments for the model's ``get``), or None.  The model
+        is only queried when the answer is not cached; a cached "not
+        found" is answered without querying again.
+        '''
+        _query = self._normalize_query(query)
+        self._touch(cls, _query)
+        if _query in self._query_cache[cls]:
+            _id = self._query_cache[cls][_query]
+            if _id is None:
+                return None
+            if _id in self._instance_cache[cls]:
+                return self._instance_cache[cls][_id]
+        # either the query is unknown or its instance has been evicted
+        val = self._model_query(cls).get(**query)
+        self.set(cls, _query, val)
+        return val
+
+    def set(self, cls, query, val):
+        '''
+        Cache ``val`` (which may be None) as the result of ``query`` for
+        ``cls``.  Instances without an ``_id`` are given a synthetic
+        cache id, stored on the instance as ``_model_cache_id``.
+        '''
+        _query = self._normalize_query(query)
+        if val is not None:
+            _id = getattr(val, '_model_cache_id',
+                          getattr(val, '_id',
+                                  self._query_cache[cls].get(_query,
+                                                             None)))
+            if _id is None:
+                _id = val._model_cache_id = bson.ObjectId()
+                self._synthetic_ids[cls].add(_id)
+            # _synthetic_ids is keyed by class, so the id has to be looked
+            # up in this class's set; testing against the dict itself
+            # would only compare the id with the class keys
+            if _id in self._synthetic_ids[cls]:
+                self._synthetic_id_queries[cls].add(_query)
+            self._query_cache[cls][_query] = _id
+            self._instance_cache[cls][_id] = val
+        else:
+            self._query_cache[cls][_query] = None
+        self._touch(cls, _query)
+        self._check_sizes(cls)
+
+    def _touch(self, cls, query):
+        '''
+        Mark ``query`` and the instance it resolves to as most recently
+        used by moving them to the end of their caches, so that the
+        least recently used entries are the first to be expired.
+        '''
+        _query = self._normalize_query(query)
+        if _query not in self._query_cache[cls]:
+            return
+        _id = self._query_cache[cls].pop(_query)
+        self._query_cache[cls][_query] = _id
+
+        if _id not in self._instance_cache[cls]:
+            return
+        val = self._instance_cache[cls].pop(_id)
+        self._instance_cache[cls][_id] = val
+
+    def _check_sizes(self, cls):
+        '''
+        Evict one entry from each cache of ``cls`` that has grown past
+        its limit.  The affected instance is flushed first; an instance
+        evicted from the instance cache is also expunged.
+        '''
+        if self.num_queries(cls) > self._max_queries[cls]:
+            _id = self._remove_least_recently_used(self._query_cache[cls])
+            if _id in self._instance_cache[cls]:
+                instance = self._instance_cache[cls][_id]
+                self._try_flush(instance, expunge=False)
+        if self.num_instances(cls) > self._max_instances[cls]:
+            instance = self._remove_least_recently_used(
+                self._instance_cache[cls])
+            self._try_flush(instance, expunge=True)
+
+    def _try_flush(self, instance, expunge=False):
+        '''
+        Flush ``instance`` through its session, when it has one, and
+        optionally expunge it from that session afterwards.
+        '''
+        try:
+            inst_session = session(instance)
+        except AttributeError:
+            inst_session = None
+        if inst_session:
+            inst_session.flush(instance)
+            if expunge:
+                inst_session.expunge(instance)
+
+    def _remove_least_recently_used(self, cache):
+        # last-used (most-recently-used) is last in cache, so take first
+        key, val = cache.popitem(last=False)
+        return val
+
+    def expire_new_instances(self, cls):
+        '''
+        Expire any instances that were "new" or had no _id value.
+
+        If a lot of new instances of a class are being created, it's possible
+        for a query to pull a copy from mongo when a copy keyed by the synthetic
+        ID is still in the cache, potentially causing de-sync between the copies
+        leading to one with missing data overwriting the other.  Clear new
+        instances out of the cache relatively frequently (depending on the query
+        and instance cache sizes) to avoid this.
+        '''
+        # entries may already have been evicted by the size limits, so a
+        # missing query or instance is not an error here
+        for _query in self._synthetic_id_queries[cls]:
+            self._query_cache[cls].pop(_query, None)
+        self._synthetic_id_queries[cls] = set()
+        for _id in self._synthetic_ids[cls]:
+            instance = self._instance_cache[cls].pop(_id, None)
+            if instance is not None:
+                self._try_flush(instance, expunge=True)
+        self._synthetic_ids[cls] = set()
+
+    def num_queries(self, cls=None):
+        '''Number of cached queries for ``cls``, or for all classes'''
+        if cls is None:
+            return sum(len(cached) for cached in self._query_cache.values())
+        else:
+            return len(self._query_cache[cls])
+
+    def num_instances(self, cls=None):
+        '''Number of cached instances for ``cls``, or for all classes'''
+        if cls is None:
+            return sum(len(cached) for cached in self._instance_cache.values())
+        else:
+            return len(self._instance_cache[cls])
+
+    def instance_ids(self, cls):
+        '''Ids of the instances currently cached for ``cls``'''
+        return self._instance_cache[cls].keys()
+
+    def batch_load(self, cls, query, attrs=None):
+        '''
+        Load multiple results given a query.
+
+        Optionally takes a list of attribute names to use
+        as the cache key.  If not given, uses the keys of
+        the given query.
+        '''
+        if attrs is None:
+            attrs = query.keys()
+        for result in self._model_query(cls).find(query):
+            keys = {a: getattr(result, a) for a in attrs}
+            self.set(cls, keys, result)
+
+
 class GitLikeTree(object):
 
     '''
@@ -922,4 +1865,7 @@ def zipdir(source, zipfile, exclude=None):
             "STDERR: {3}".format(command, p.returncode, stdout, stderr))
 
 
+mapper(Commit, CommitDoc, repository_orm_session)
+mapper(Tree, TreeDoc, repository_orm_session)
+mapper(LastCommit, LastCommitDoc, repository_orm_session)
 Mapper.compile_all()

http://git-wip-us.apache.org/repos/asf/allura/blob/839d9cfb/Allura/allura/scripts/refresh_last_commits.py
----------------------------------------------------------------------
diff --git a/Allura/allura/scripts/refresh_last_commits.py b/Allura/allura/scripts/refresh_last_commits.py
index 720af10..4673a2a 100644
--- a/Allura/allura/scripts/refresh_last_commits.py
+++ b/Allura/allura/scripts/refresh_last_commits.py
@@ -141,22 +141,22 @@ class RefreshLastCommits(ScriptTask):
         if options.diffs:
             print 'Processing diffs'
             for i, commit_id in enumerate(commit_ids):
-                commit = M.repo.Commit.query.get(_id=commit_id)
+                commit = M.repository.Commit.query.get(_id=commit_id)
                 with time(timings):
                     M.repo_refresh.compute_diffs(
                         c.app.repo._id, tree_cache, commit)
                 if i % 1000 == 0:
                     cls._print_stats(i, timings, 1000)
 
-        model_cache = M.repo.ModelCache(
-            max_instances={M.repo.LastCommit: 4000},
-            max_queries={M.repo.LastCommit: 4000},
+        model_cache = M.repository.ModelCache(
+            max_instances={M.repository.LastCommit: 4000},
+            max_queries={M.repository.LastCommit: 4000},
         )
         lcid_cache = {}
         timings = []
         print 'Processing last commits'
         for i, commit_id in enumerate(commit_ids):
-            commit = M.repo.Commit.query.get(_id=commit_id)
+            commit = M.repository.Commit.query.get(_id=commit_id)
             if commit is None:
                 print "Commit missing, skipping: %s" % commit_id
                 continue
@@ -174,18 +174,18 @@ class RefreshLastCommits(ScriptTask):
     def _clean(cls, commit_ids, clean_diffs):
         if clean_diffs:
             # delete DiffInfoDocs
-            i = M.repo.DiffInfoDoc.m.find(
+            i = M.repository.DiffInfoDoc.m.find(
                 dict(_id={'$in': commit_ids})).count()
             log.info("Deleting %i DiffInfoDoc docs for %i commits...",
                      i, len(commit_ids))
-            M.repo.DiffInfoDoc.m.remove(dict(_id={'$in': commit_ids}))
+            M.repository.DiffInfoDoc.m.remove(dict(_id={'$in': commit_ids}))
 
         # delete LastCommitDocs
-        i = M.repo.LastCommitDoc.m.find(
+        i = M.repository.LastCommitDoc.m.find(
             dict(commit_id={'$in': commit_ids})).count()
         log.info("Deleting %i LastCommitDoc docs for %i commits...",
                  i, len(commit_ids))
-        M.repo.LastCommitDoc.m.remove(dict(commit_id={'$in': commit_ids}))
+        M.repository.LastCommitDoc.m.remove(dict(commit_id={'$in': commit_ids}))
 
     @classmethod
     def _print_stats(cls, processed, timings, debug_step):