You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@allura.apache.org by tv...@apache.org on 2013/02/05 21:23:34 UTC
[28/42] git commit: [#4691] Improved memory footprint for ModelCache
[#4691] Improved memory footprint for ModelCache
Signed-off-by: Cory Johns <jo...@geek.net>
Project: http://git-wip-us.apache.org/repos/asf/incubator-allura/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-allura/commit/508921b0
Tree: http://git-wip-us.apache.org/repos/asf/incubator-allura/tree/508921b0
Diff: http://git-wip-us.apache.org/repos/asf/incubator-allura/diff/508921b0
Branch: refs/heads/master
Commit: 508921b0f05dc31d129f0e184cbdeddba5ab3535
Parents: f3e8809
Author: Cory Johns <jo...@geek.net>
Authored: Thu Dec 13 02:28:34 2012 +0000
Committer: Tim Van Steenburgh <tv...@gmail.com>
Committed: Tue Feb 5 20:22:51 2013 +0000
----------------------------------------------------------------------
Allura/allura/model/repo.py | 148 ++++++++++++++++++--------
Allura/allura/model/repo_refresh.py | 2 +-
Allura/allura/tests/model/test_repo.py | 135 ++++++++++++++++---------
3 files changed, 190 insertions(+), 95 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/508921b0/Allura/allura/model/repo.py
----------------------------------------------------------------------
diff --git a/Allura/allura/model/repo.py b/Allura/allura/model/repo.py
index 67c016d..f1fe984 100644
--- a/Allura/allura/model/repo.py
+++ b/Allura/allura/model/repo.py
@@ -7,6 +7,7 @@ from itertools import chain
from datetime import datetime
from collections import defaultdict, OrderedDict
from difflib import SequenceMatcher, unified_diff
+import bson
from pylons import c
import pymongo.errors
@@ -830,17 +831,37 @@ class ModelCache(object):
'''
Cache model instances based on query params passed to get.
'''
- def __init__(self, max_size=2000):
+ def __init__(self, max_instances=None, max_queries=None):
'''
- The max_size of the cache is tracked separately for
- each model class stored. I.e., you can have 2000
- Commit instances and 2000 Tree instances in the cache
- at once with the default value.
+ By default, each model type can have 2000 instances and
+ 8000 queries. You can override these for specific model
+ types by passing in a dict() for either max_instances or
+ max_queries keyed by the class(es) with the max values.
+ Classes not in the dict() will use the 2000/8000
+ defaults.
+
+ If you pass in a number instead of a dict, that value will
+ be used as the max for all classes.
'''
- self._cache = defaultdict(OrderedDict)
- self.max_size = max_size
+ max_instances_default = 2000
+ max_queries_default = 8000
+ if isinstance(max_instances, int):
+ max_instances_default = max_instances
+ if isinstance(max_queries, int):
+ max_queries_default = max_queries
+ self._max_instances = defaultdict(lambda:max_instances_default)
+ self._max_queries = defaultdict(lambda:max_queries_default)
+ if hasattr(max_instances, 'items'):
+ self._max_instances.update(max_instances)
+ if hasattr(max_queries, 'items'):
+ self._max_queries.update(max_queries)
+
+ self._query_cache = defaultdict(OrderedDict) # keyed by query, holds _id
+ self._instance_cache = defaultdict(OrderedDict) # keyed by _id
+
# temporary, for performance testing
- self._hits = defaultdict(int)
+ self._query_hits = defaultdict(int)
+ self._instance_hits = defaultdict(int)
self._accesses = defaultdict(int)
self._get_calls = 0
self._get_walks = 0
@@ -851,59 +872,94 @@ class ModelCache(object):
self._build_walks = 0
self._build_walks_max = 0
- def _normalize_key(self, key):
- _key = key
- if not isinstance(_key, tuple):
- _key = tuple(sorted(_key.items(), key=lambda k: k[0]))
- return _key
+ def _normalize_query(self, query):
+ _query = query
+ if not isinstance(_query, tuple):
+ _query = tuple(sorted(_query.items(), key=lambda k: k[0]))
+ return _query
+
+ def _model_query(self, cls):
+ if hasattr(cls, 'query'):
+ return cls.query
+ elif hasattr(cls, 'm'):
+ return cls.m
+ else:
+ raise AttributeError('%s has neither "query" nor "m" attribute' % cls)
- def get(self, cls, key):
- _key = self._normalize_key(key)
- self._manage_cache(cls, _key)
+ def get(self, cls, query):
self._accesses[cls] += 1
- if _key not in self._cache[cls]:
- query = getattr(cls, 'query', getattr(cls, 'm', None))
- self.set(cls, _key, query.get(**key))
+ _query = self._normalize_query(query)
+ self._touch(cls, _query)
+ if _query not in self._query_cache[cls]:
+ self.set(cls, _query, self._model_query(cls).get(**query))
else:
- self._hits[cls] += 1
- return self._cache[cls][_key]
-
- def set(self, cls, key, val):
- _key = self._normalize_key(key)
- self._manage_cache(cls, _key)
- self._cache[cls][_key] = val
-
- def _manage_cache(self, cls, key):
+ self._query_hits[cls] += 1
+ _id = self._query_cache[cls][_query]
+ if _id not in self._instance_cache[cls]:
+ model_query = getattr(cls, 'query', getattr(cls, 'm', None))
+ self.set(cls, _query, self._model_query(cls).get(**query))
+ else:
+ self._instance_hits[cls] += 1
+ return self._instance_cache[cls][_id]
+
+ def set(self, cls, query, val):
+ _query = self._normalize_query(query)
+ self._touch(cls, _query)
+ _id = self._query_cache[cls].get(_query, getattr(val, '_id', None))
+ if _id is None:
+ _id = 'None_%s' % bson.ObjectId()
+ self._query_cache[cls][_query] = _id
+ self._instance_cache[cls][_id] = val
+ self._check_sizes(cls)
+
+ def _touch(self, cls, query):
'''
Keep track of insertion order, prevent duplicates,
and expire from the cache in a FIFO manner.
'''
- if key in self._cache[cls]:
- # refresh access time in cache
- val = self._cache[cls].pop(key)
- self._cache[cls][key] = val
- elif len(self._cache[cls]) >= self.max_size:
- # remove the least-recently-used cache item
- key, instance = self._cache[cls].popitem(last=False)
+ _query = self._normalize_query(query)
+ if _query not in self._query_cache[cls]:
+ return
+ _id = self._query_cache[cls].pop(_query)
+ self._query_cache[cls][_query] = _id
+
+ if _id not in self._instance_cache[cls]:
+ return
+ val = self._instance_cache[cls].pop(_id)
+ self._instance_cache[cls][_id] = val
+
+ def _check_sizes(self, cls):
+ if self.num_queries(cls) > self._max_queries[cls]:
+ self._remove_least_recently_used(self._query_cache[cls])
+ if self.num_instances(cls) > self._max_instances[cls]:
+ instance = self._remove_least_recently_used(self._instance_cache[cls])
try:
inst_session = session(instance)
except AttributeError:
inst_session = None
if inst_session:
+ inst_session.flush(instance)
inst_session.expunge(instance)
- def size(self):
- return sum([len(c) for c in self._cache.values()])
+ def _remove_least_recently_used(self, cache):
+ # last-used (most-recently-used) is last in cache, so take first
+ key, val = cache.popitem(last=False)
+ return val
- def keys(self, cls, as_dict=True):
- '''
- Returns all the cache keys for a given class. Each
- cache key will be a dict.
- '''
- if as_dict:
- return [dict(k) for k in self._cache[cls].keys()]
+ def num_queries(self, cls=None):
+ if cls is None:
+ return sum([len(c) for c in self._query_cache.values()])
else:
- return self._cache[cls].keys()
+ return len(self._query_cache[cls])
+
+ def num_instances(self, cls=None):
+ if cls is None:
+ return sum([len(c) for c in self._instance_cache.values()])
+ else:
+ return len(self._instance_cache[cls])
+
+ def instance_ids(self, cls):
+ return self._instance_cache[cls].keys()
def batch_load(self, cls, query, attrs=None):
'''
@@ -915,6 +971,6 @@ class ModelCache(object):
'''
if attrs is None:
attrs = query.keys()
- for result in cls.query.find(query):
+ for result in self._model_query(cls).find(query):
keys = {a: getattr(result, a) for a in attrs}
self.set(cls, keys, result)
http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/508921b0/Allura/allura/model/repo_refresh.py
----------------------------------------------------------------------
diff --git a/Allura/allura/model/repo_refresh.py b/Allura/allura/model/repo_refresh.py
index dfabc5b..b827579 100644
--- a/Allura/allura/model/repo_refresh.py
+++ b/Allura/allura/model/repo_refresh.py
@@ -523,6 +523,6 @@ def _pull_tree(cache, tree_id, *context):
def _update_tree_cache(tree_ids, cache):
current_ids = set(tree_ids)
- cached_ids = set([k[0][1] for k in cache.keys(Tree, as_dict=False)])
+ cached_ids = set(cache.instance_ids(Tree))
new_ids = current_ids - cached_ids
cache.batch_load(Tree, {'_id': {'$in': list(new_ids)}})
http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/508921b0/Allura/allura/tests/model/test_repo.py
----------------------------------------------------------------------
diff --git a/Allura/allura/tests/model/test_repo.py b/Allura/allura/tests/model/test_repo.py
index 040c750..40af7ce 100644
--- a/Allura/allura/tests/model/test_repo.py
+++ b/Allura/allura/tests/model/test_repo.py
@@ -525,99 +525,138 @@ class TestModelCache(unittest.TestCase):
def setUp(self):
self.cache = M.repo.ModelCache()
- def test_normalize_key(self):
- self.assertEqual(self.cache._normalize_key({'foo': 1, 'bar': 2}), (('bar', 2), ('foo', 1)))
+ def test_normalize_query(self):
+ self.assertEqual(self.cache._normalize_query({'foo': 1, 'bar': 2}), (('bar', 2), ('foo', 1)))
+
+ def test_model_query(self):
+ q = mock.Mock(spec_set=['query'], query='foo')
+ m = mock.Mock(spec_set=['m'], m='bar')
+ n = mock.Mock(spec_set=['foo'], foo='qux')
+ self.assertEquals(self.cache._model_query(q), 'foo')
+ self.assertEquals(self.cache._model_query(m), 'bar')
+ self.assertRaises(AttributeError, self.cache._model_query, [n])
@mock.patch.object(M.repo.Tree.query, 'get')
@mock.patch.object(M.repo.LastCommit.query, 'get')
def test_get(self, lc_get, tr_get):
- tr_get.return_value = 'bar'
- lc_get.return_value = 'qux'
+ tree = tr_get.return_value = mock.Mock(_id='foo', val='bar')
+ lcd = lc_get.return_value = mock.Mock(_id='foo', val='qux')
val = self.cache.get(M.repo.Tree, {'_id': 'foo'})
tr_get.assert_called_with(_id='foo')
- self.assertEqual(val, 'bar')
+ self.assertEqual(val, tree)
val = self.cache.get(M.repo.LastCommit, {'_id': 'foo'})
lc_get.assert_called_with(_id='foo')
- self.assertEqual(val, 'qux')
+ self.assertEqual(val, lcd)
@mock.patch.object(M.repo.Tree.query, 'get')
- def test_get_no_dup(self, tr_get):
- tr_get.return_value = 'bar'
+ def test_get_no_query(self, tr_get):
+ tree1 = tr_get.return_value = mock.Mock(_id='foo', val='bar')
val = self.cache.get(M.repo.Tree, {'_id': 'foo'})
tr_get.assert_called_once_with(_id='foo')
- self.assertEqual(val, 'bar')
+ self.assertEqual(val, tree1)
- tr_get.return_value = 'qux'
+ tree2 = tr_get.return_value = mock.Mock(_id='foo', val='qux')
val = self.cache.get(M.repo.Tree, {'_id': 'foo'})
tr_get.assert_called_once_with(_id='foo')
- self.assertEqual(val, 'bar')
+ self.assertEqual(val, tree1)
@mock.patch.object(M.repo.TreesDoc.m, 'get')
def test_get_doc(self, tr_get):
- tr_get.return_value = 'bar'
+ trees = tr_get.return_value = mock.Mock(_id='foo', val='bar')
val = self.cache.get(M.repo.TreesDoc, {'_id': 'foo'})
tr_get.assert_called_once_with(_id='foo')
- self.assertEqual(val, 'bar')
+ self.assertEqual(val, trees)
def test_set(self):
- self.cache.set(M.repo.Tree, {'_id': 'foo'}, 'test_set')
- self.assertEqual(self.cache._cache, {M.repo.Tree: {(('_id', 'foo'),): 'test_set'}})
-
- def test_keys(self):
- self.cache._cache[M.repo.Tree][(('_id', 'test_keys'), ('text', 'tko'))] = 'foo'
- self.cache._cache[M.repo.Tree][(('fubar', 'scm'),)] = 'bar'
- self.assertEqual(self.cache.keys(M.repo.Tree), [{'_id': 'test_keys', 'text': 'tko'}, {'fubar': 'scm'}])
- self.assertEqual(self.cache.keys(M.repo.LastCommit), [])
-
- def test_keys_not_as_dict(self):
- self.cache._cache[M.repo.Tree][(('_id', 'test_keys'), ('text', 'tko'))] = 'foo'
- self.cache._cache[M.repo.Tree][(('fubar', 'scm'),)] = 'bar'
- self.assertEqual(self.cache.keys(M.repo.Tree, as_dict=False), [(('_id', 'test_keys'), ('text', 'tko')), (('fubar', 'scm'),)])
- self.assertEqual(self.cache.keys(M.repo.LastCommit), [])
+ tree = mock.Mock(_id='foo', val='test_set')
+ self.cache.set(M.repo.Tree, {'val': 'test_set'}, tree)
+ self.assertEqual(self.cache._query_cache, {M.repo.Tree: {(('val', 'test_set'),): 'foo'}})
+ self.assertEqual(self.cache._instance_cache, {M.repo.Tree: {'foo': tree}})
+
+ @mock.patch('bson.ObjectId')
+ def test_set_none_id(self, obj_id):
+ obj_id.return_value = 'OBJID'
+ tree = mock.Mock(_id=None, val='test_set')
+ self.cache.set(M.repo.Tree, {'val': 'test_set'}, tree)
+ self.assertEqual(self.cache._query_cache, {M.repo.Tree: {(('val', 'test_set'),): 'None_OBJID'}})
+ self.assertEqual(self.cache._instance_cache, {M.repo.Tree: {'None_OBJID': tree}})
+
+ @mock.patch('bson.ObjectId')
+ def test_set_none_val(self, obj_id):
+ obj_id.return_value = 'OBJID'
+ self.cache.set(M.repo.Tree, {'val': 'test_set'}, None)
+ self.assertEqual(self.cache._query_cache, {M.repo.Tree: {(('val', 'test_set'),): 'None_OBJID'}})
+ self.assertEqual(self.cache._instance_cache, {M.repo.Tree: {'None_OBJID': None}})
+
+ def test_instance_ids(self):
+ tree1 = mock.Mock(_id='id1', val='tree1')
+ tree2 = mock.Mock(_id='id2', val='tree2')
+ self.cache.set(M.repo.Tree, {'val': 'tree1'}, tree1)
+ self.cache.set(M.repo.Tree, {'val': 'tree2'}, tree2)
+ self.assertEqual(set(self.cache.instance_ids(M.repo.Tree)), set(['id1', 'id2']))
+ self.assertEqual(self.cache.instance_ids(M.repo.LastCommit), [])
@mock.patch.object(M.repo.Tree.query, 'find')
def test_batch_load(self, tr_find):
# cls, query, attrs
- m1 = mock.Mock(foo=1, qux=3)
- m2 = mock.Mock(foo=2, qux=5)
+ m1 = mock.Mock(_id='id1', foo=1, qux=3)
+ m2 = mock.Mock(_id='id2', foo=2, qux=5)
tr_find.return_value = [m1, m2]
self.cache.batch_load(M.repo.Tree, {'foo': {'$in': 'bar'}})
tr_find.assert_called_with({'foo': {'$in': 'bar'}})
- self.assertEqual(self.cache._cache[M.repo.Tree], {
- (('foo', 1),): m1,
- (('foo', 2),): m2,
+ self.assertEqual(self.cache._query_cache[M.repo.Tree], {
+ (('foo', 1),): 'id1',
+ (('foo', 2),): 'id2',
+ })
+ self.assertEqual(self.cache._instance_cache[M.repo.Tree], {
+ 'id1': m1,
+ 'id2': m2,
})
@mock.patch.object(M.repo.Tree.query, 'find')
def test_batch_load_attrs(self, tr_find):
# cls, query, attrs
- m1 = mock.Mock(foo=1, qux=3)
- m2 = mock.Mock(foo=2, qux=5)
+ m1 = mock.Mock(_id='id1', foo=1, qux=3)
+ m2 = mock.Mock(_id='id2', foo=2, qux=5)
tr_find.return_value = [m1, m2]
self.cache.batch_load(M.repo.Tree, {'foo': {'$in': 'bar'}}, ['qux'])
tr_find.assert_called_with({'foo': {'$in': 'bar'}})
- self.assertEqual(self.cache._cache[M.repo.Tree], {
- (('qux', 3),): m1,
- (('qux', 5),): m2,
+ self.assertEqual(self.cache._query_cache[M.repo.Tree], {
+ (('qux', 3),): 'id1',
+ (('qux', 5),): 'id2',
+ })
+ self.assertEqual(self.cache._instance_cache[M.repo.Tree], {
+ 'id1': m1,
+ 'id2': m2,
})
def test_pruning(self):
- self.cache.max_size = 3
+ cache = M.repo.ModelCache(max_queries=3, max_instances=2)
# ensure cache expires as LRU
- self.cache.set(M.repo.Tree, {'_id': 'foo'}, 'bar')
- self.cache.set(M.repo.Tree, {'_id': 'qux'}, 'zaz')
- self.cache.set(M.repo.Tree, {'_id': 'f00'}, 'b4r')
- self.cache.set(M.repo.Tree, {'_id': 'foo'}, 'zaz')
- self.cache.get(M.repo.Tree, {'_id': 'f00'})
- self.cache.set(M.repo.Tree, {'_id': 'mee'}, 'you')
- self.assertEqual(self.cache._cache, {
+ tree1 = mock.Mock(_id='foo', val='bar')
+ tree2 = mock.Mock(_id='qux', val='fuz')
+ tree3 = mock.Mock(_id='f00', val='b4r')
+ tree4 = mock.Mock(_id='foo', val='zaz')
+ cache.set(M.repo.Tree, {'_id': 'foo'}, tree1)
+ cache.set(M.repo.Tree, {'_id': 'qux'}, tree2)
+ cache.set(M.repo.Tree, {'_id': 'f00'}, tree3)
+ cache.set(M.repo.Tree, {'_id': 'foo'}, tree4)
+ cache.get(M.repo.Tree, {'_id': 'f00'})
+ cache.set(M.repo.Tree, {'val': 'b4r'}, tree3)
+ self.assertEqual(cache._query_cache, {
+ M.repo.Tree: {
+ (('_id', 'foo'),): 'foo',
+ (('_id', 'f00'),): 'f00',
+ (('val', 'b4r'),): 'f00',
+ },
+ })
+ self.assertEqual(cache._instance_cache, {
M.repo.Tree: {
- (('_id', 'foo'),): 'zaz',
- (('_id', 'f00'),): 'b4r',
- (('_id', 'mee'),): 'you',
+ 'f00': tree3,
+ 'foo': tree4,
},
})