You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@allura.apache.org by je...@apache.org on 2014/09/03 16:00:13 UTC
[3/5] git commit: [#7628] ticket:646 Command to remove duplicate trove categories

[#7628] ticket:646 Command to remove duplicate trove categories


Project: http://git-wip-us.apache.org/repos/asf/allura/repo
Commit: http://git-wip-us.apache.org/repos/asf/allura/commit/487de12f
Tree: http://git-wip-us.apache.org/repos/asf/allura/tree/487de12f
Diff: http://git-wip-us.apache.org/repos/asf/allura/diff/487de12f

Branch: refs/heads/je/42cc_7628
Commit: 487de12fbded799c03b92bfb7e525c0233707418
Parents: da026b8
Author: Igor Bondarenko <je...@gmail.com>
Authored: Wed Sep 3 16:14:43 2014 +0300
Committer: Igor Bondarenko <je...@gmail.com>
Committed: Wed Sep 3 16:14:43 2014 +0300

----------------------------------------------------------------------
 .../allura/scripts/remove_duplicate_troves.py   | 103 +++++++++++++++++++
 1 file changed, 103 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/allura/blob/487de12f/Allura/allura/scripts/remove_duplicate_troves.py
----------------------------------------------------------------------
diff --git a/Allura/allura/scripts/remove_duplicate_troves.py b/Allura/allura/scripts/remove_duplicate_troves.py
new file mode 100644
index 0000000..aab0bf1
--- /dev/null
+++ b/Allura/allura/scripts/remove_duplicate_troves.py
@@ -0,0 +1,103 @@
+#       Licensed to the Apache Software Foundation (ASF) under one
+#       or more contributor license agreements.  See the NOTICE file
+#       distributed with this work for additional information
+#       regarding copyright ownership.  The ASF licenses this file
+#       to you under the Apache License, Version 2.0 (the
+#       "License"); you may not use this file except in compliance
+#       with the License.  You may obtain a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#       Unless required by applicable law or agreed to in writing,
+#       software distributed under the License is distributed on an
+#       "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#       KIND, either express or implied.  See the License for the
+#       specific language governing permissions and limitations
+#       under the License.
+
+import argparse
+import logging
+from itertools import groupby
+from collections import defaultdict
+from operator import itemgetter
+
+from ming.odm import ThreadLocalORMSession
+
+from allura.scripts import ScriptTask
+from allura import model as M
+
+
+# Module-level logger, named after this module via __name__.
+log = logging.getLogger(__name__)
+
+
+class RemoveDuplicateTroves(ScriptTask):
+    """Find duplicated trove categories and remove the copies no project uses.
+
+    Two ``TroveCategory`` records are treated as duplicates when every field
+    listed in ``_IDENTITY_FIELDS`` is equal.  For each set of duplicates the
+    record referenced by the most projects is kept; the remaining records are
+    removed only if no project references any of them.
+    """
+
+    # Fields that must all match for two categories to count as duplicates.
+    _IDENTITY_FIELDS = (
+        'shortname',
+        'trove_cat_id',
+        'trove_parent_id',
+        'fullname',
+        'fullpath',
+    )
+
+    # Project fields that are searched for references to a category id.
+    _PROJECT_TROVE_FIELDS = (
+        'trove_root_database',
+        'trove_developmentstatus',
+        'trove_audience',
+        'trove_license',
+        'trove_os',
+        'trove_language',
+        'trove_topic',
+        'trove_natlanguage',
+        'trove_environment',
+    )
+
+    @classmethod
+    def execute(cls, options):
+        """Remove unused duplicate categories.
+
+        :param options: parsed arguments from :meth:`parser`; only
+            ``options.dry_run`` is read.  When it is true, everything is
+            logged but no category is removed.
+        """
+        duplicates = cls._find_duplicates()
+        log.info('Found %s duplicate categories: %s', len(duplicates),
+                 [dups[0].shortname for dups in duplicates.values()])
+        for dups in duplicates.values():
+            name = dups[0].shortname
+            # A list of pairs (rather than a dict) preserves the order in
+            # which the categories were returned by the query, so ties in the
+            # ranking below are not broken by arbitrary dict ordering.
+            usage = [(dup._id, cls._projects_with_category(dup._id))
+                     for dup in dups]
+            log.info('Following projects are using category %s:', name)
+            for _id, projects in usage:
+                log.info('  with id %s: %s', _id, [p.shortname for p in projects])
+            # Most-used copy first; sorted() is stable, also with reverse=True.
+            ranked = sorted(usage, key=lambda pair: len(pair[1]), reverse=True)
+            live = ranked[0][0]
+            kill = [_id for _id, _ in ranked[1:]]
+            log.info('%s will live %s will die', live, kill)
+            if not any(projects for _, projects in ranked[1:]):
+                # Duplicates are used nowhere
+                log.info('Removing categories %s', kill)
+                if not options.dry_run:
+                    M.TroveCategory.query.remove({'_id': {'$in': kill}})
+            else:
+                # Duplicates are used somewhere; they would need to be
+                # reassigned for all projects that use them, which is not
+                # implemented here.  Say so instead of skipping silently.
+                log.warning('Categories %s are still used by projects; '
+                            'leaving all duplicates of %s in place', kill, name)
+            ThreadLocalORMSession.flush_all()
+
+    @classmethod
+    def _find_duplicates(cls):
+        """Return duplicated categories grouped by their identity fields.
+
+        Grouping uses the full tuple of ``_IDENTITY_FIELDS`` values, not just
+        ``shortname``, so unrelated categories that merely share a shortname
+        are never placed in the same group.  Assumes the identity fields hold
+        hashable scalar values -- TODO confirm against the TroveCategory model.
+
+        :returns: dict mapping a tuple of identity-field values to the list
+            of two or more categories sharing those values.
+        """
+        groups = defaultdict(list)
+        # Single pass over all categories instead of one count() per category.
+        for cat in M.TroveCategory.query.find():
+            key = tuple(getattr(cat, field) for field in cls._IDENTITY_FIELDS)
+            groups[key].append(cat)
+        return {key: cats for key, cats in groups.items() if len(cats) > 1}
+
+    @classmethod
+    def _projects_with_category(cls, _id):
+        """Return all projects referencing category ``_id`` in any trove field.
+
+        :param _id: ``_id`` of a trove category.
+        :returns: list of matching ``M.Project`` objects.
+        """
+        query = {'$or': [{field: _id} for field in cls._PROJECT_TROVE_FIELDS]}
+        return M.Project.query.find(query).all()
+
+    @classmethod
+    def parser(cls):
+        """Build the command-line parser; the only option is ``--dry-run``."""
+        parser = argparse.ArgumentParser(description='Remove duplicate troves')
+        parser.add_argument('--dry-run', action='store_true', dest='dry_run',
+                            default=False, help='Print what will be changed but do not change anything')
+        return parser
+
+
+# Script entry point.  main() is not defined in RemoveDuplicateTroves itself,
+# so it is presumably inherited from ScriptTask -- TODO confirm.
+if __name__ == '__main__':
+    RemoveDuplicateTroves.main()