You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@allura.apache.org by je...@apache.org on 2014/09/03 16:00:13 UTC
[3/5] git commit: [#7628] ticket:646 Command to remove duplicate
trove categories
[#7628] ticket:646 Command to remove duplicate trove categories
Project: http://git-wip-us.apache.org/repos/asf/allura/repo
Commit: http://git-wip-us.apache.org/repos/asf/allura/commit/487de12f
Tree: http://git-wip-us.apache.org/repos/asf/allura/tree/487de12f
Diff: http://git-wip-us.apache.org/repos/asf/allura/diff/487de12f
Branch: refs/heads/je/42cc_7628
Commit: 487de12fbded799c03b92bfb7e525c0233707418
Parents: da026b8
Author: Igor Bondarenko <je...@gmail.com>
Authored: Wed Sep 3 16:14:43 2014 +0300
Committer: Igor Bondarenko <je...@gmail.com>
Committed: Wed Sep 3 16:14:43 2014 +0300
----------------------------------------------------------------------
.../allura/scripts/remove_duplicate_troves.py | 103 +++++++++++++++++++
1 file changed, 103 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/allura/blob/487de12f/Allura/allura/scripts/remove_duplicate_troves.py
----------------------------------------------------------------------
diff --git a/Allura/allura/scripts/remove_duplicate_troves.py b/Allura/allura/scripts/remove_duplicate_troves.py
new file mode 100644
index 0000000..aab0bf1
--- /dev/null
+++ b/Allura/allura/scripts/remove_duplicate_troves.py
@@ -0,0 +1,103 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import argparse
+import logging
+from itertools import groupby
+from collections import defaultdict
+from operator import itemgetter
+
+from ming.odm import ThreadLocalORMSession
+
+from allura.scripts import ScriptTask
+from allura import model as M
+
+
+log = logging.getLogger(__name__)  # module-level logger, named after this module
+
+
+class RemoveDuplicateTroves(ScriptTask):
+    """Script task that finds duplicated trove categories and removes unused copies.
+
+    Two categories are duplicates of each other when every field listed in
+    ``_identity_fields`` is equal. In each group of duplicates the copy that is
+    referenced by the most projects is kept; the remaining copies are removed
+    only when no project references any of them.
+    """
+
+    # Fields that must all match for two TroveCategory documents to be duplicates.
+    # 'shortname' stays first: execute() uses it as the display name of a group.
+    _identity_fields = (
+        'shortname',
+        'trove_cat_id',
+        'trove_parent_id',
+        'fullname',
+        'fullpath',
+    )
+
+    # Project fields that are searched for references to a category _id.
+    _trove_fields = (
+        'trove_root_database',
+        'trove_developmentstatus',
+        'trove_audience',
+        'trove_license',
+        'trove_os',
+        'trove_language',
+        'trove_topic',
+        'trove_natlanguage',
+        'trove_environment',
+    )
+
+    @classmethod
+    def execute(cls, options):
+        """Log every group of duplicate categories and delete the unused copies.
+
+        :param options: parsed arguments produced by :meth:`parser`; only
+            ``options.dry_run`` is read. When it is true nothing is removed.
+        """
+        duplicates = cls._find_duplicates()
+        log.info('Found %s duplicate categories: %s', len(duplicates), list(duplicates))
+        for key, dups in duplicates.items():
+            name = key[0]  # shortname, see _identity_fields
+            projects_with_category = {
+                dup._id: cls._projects_with_category(dup._id) for dup in dups
+            }
+            log.info('Following projects are using category %s:', name)
+            for _id, ps in projects_with_category.items():
+                log.info(' with id %s: %s', _id, [p.shortname for p in ps])
+            # Most-used copy first: it survives, the rest are removal candidates.
+            priority = sorted(
+                projects_with_category,
+                key=lambda _id: len(projects_with_category[_id]),
+                reverse=True)
+            live, kill = priority[0], priority[1:]
+            log.info('%s will live %s will die', live, kill)
+            if not any(projects_with_category[_id] for _id in kill):
+                # Duplicates are used nowhere
+                log.info('Removing categories %s', kill)
+                if not options.dry_run:
+                    M.TroveCategory.query.remove({'_id': {'$in': kill}})
+            else:
+                # Duplicates are used somewhere, need to reassign for all projects
+                # that use them. That is not implemented here, so report the skip
+                # instead of silently doing nothing.
+                log.warning(
+                    'Not removing categories %s: some of them are still used by projects',
+                    kill)
+        ThreadLocalORMSession.flush_all()
+
+    @classmethod
+    def _find_duplicates(cls):
+        """Return the groups of categories that share every identity field.
+
+        Grouping uses the full identity tuple rather than the shortname alone,
+        so distinct categories that merely share a shortname are never put in
+        the same group (and therefore never deleted as "duplicates").
+
+        :returns: dict mapping an identity tuple (the values of
+            ``_identity_fields``, shortname first) to the list of two or more
+            category objects carrying exactly those values.
+        """
+        groups = defaultdict(list)
+        # Single pass over the collection instead of one count() query per category.
+        # NOTE(review): assumes the identity field values are hashable
+        # (ints / strings) -- confirm against the TroveCategory schema.
+        for cat in M.TroveCategory.query.find():
+            key = tuple(getattr(cat, field) for field in cls._identity_fields)
+            groups[key].append(cat)
+        return {key: cats for key, cats in groups.items() if len(cats) > 1}
+
+    @classmethod
+    def _projects_with_category(cls, _id):
+        """Return a list of all projects referencing category ``_id`` in any trove field.
+
+        :param _id: ``_id`` of the category to look for.
+        """
+        query = {'$or': [{field: _id} for field in cls._trove_fields]}
+        return M.Project.query.find(query).all()
+
+    @classmethod
+    def parser(cls):
+        """Build the command line parser; the only option is ``--dry-run``."""
+        parser = argparse.ArgumentParser(description='Remove duplicate troves')
+        parser.add_argument('--dry-run', action='store_true', dest='dry_run',
+                            default=False, help='Print what will be changed but do not change anything')
+        return parser
+
+
+if __name__ == '__main__':  # run the task only when this file is executed as a script
+ RemoveDuplicateTroves.main()  # main() is not defined in this file; presumably inherited from ScriptTask