You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@community.apache.org by hb...@apache.org on 2015/05/31 00:10:40 UTC

svn commit: r1682659 - /comdev/projects.apache.org/scripts/cronjobs/parsereleases.py

Author: hboutemy
Date: Sat May 30 22:10:40 2015
New Revision: 1682659

URL: http://svn.apache.org/r1682659
Log:
improved data filtering from /dist/ content

Modified:
    comdev/projects.apache.org/scripts/cronjobs/parsereleases.py

Modified: comdev/projects.apache.org/scripts/cronjobs/parsereleases.py
URL: http://svn.apache.org/viewvc/comdev/projects.apache.org/scripts/cronjobs/parsereleases.py?rev=1682659&r1=1682658&r2=1682659&view=diff
==============================================================================
--- comdev/projects.apache.org/scripts/cronjobs/parsereleases.py (original)
+++ comdev/projects.apache.org/scripts/cronjobs/parsereleases.py Sat May 30 22:10:40 2015
@@ -3,16 +3,18 @@ import json
 import os
 
 releases = {}
+orig = {}
 mainurl = "http://www.apache.org/dist/"
 
 x = 0
 
-try:
-    with open("../../site/json/foundation/releases.json") as f:
-        releases = json.loads(f.read())
-        f.close()
-except Exception as err:
-    print("Could not read releases.json, assuming blank slate")
+# don't try to maintain history for the moment...
+#try:
+#    with open("../../site/json/foundation/releases.json") as f:
+#        releases = json.loads(f.read())
+#        f.close()
+#except Exception as err:
+#    print("Could not read releases.json, assuming blank slate")
 
 def getDirList(url):
     try:
@@ -22,38 +24,67 @@ def getDirList(url):
     except:
         pass
 
+def cleanFilename(filename):
+    for suffix in ['.tgz', '.gz', '.bz2', '.xz', '.zip', '.rar', '.tar', 'tar', '.deb', '.rpm', '.dmg', '.egg', '.gem', '.pom', '.war', '.exe',
+                   '-scala2.11', '-cdh4', '-hadoop1', '-hadoop2', '-hadoop2.3', '-hadoop2.4', '-all',
+                   '-src', '_src', '.src', '-sources', '_sources', '-source', '-bin', '-dist',
+                   '-source-release', '-source-relase', '-apidocs', '-javadocs', '-javadoc', '_javadoc', '-tests', '-test', '-debug', '-uber',
+                   '-macosx', '-distribution', '-example', '-manual', '-native', '-win', '-win32', '-linux', '-pack', '-packaged', '-lib', '-current', '-embedded',
+                   '-py', '-py2', '-py2.6', '-py2.7', '-no', 'unix-distro', 'windows-distro', 'with', '-dep', '-standalone', '-war', '-webapp', '-dom', '-om', '-manual', '-site',
+                   '-32bit', '-64bit', '-amd64', '-i386', '_i386', '.i386', '-x86_64', '-minimal', '-jettyconfig', '-py2.py3-none-any', 'newkey', 'oldkey', 'jars', '-jre13', '-hadoop1', '-hadoop2', '-project',
+                   '-with-dependencies', '-client', '-server', '-doc', '-docs', 'server-webapps', '-full', '-all', '-standard', '-for-javaee', '-for-tomcat',
+                   'hadoop1-scala2', '-deployer', '-fulldocs', '-windows-i64', '-windows-x64', '-embed', '-apps', '-app', '-ref', '-installer', '-bundle', '-java']:
+        if filename[len(filename)-len(suffix):] == suffix:
+            filename = filename[0:len(filename)-len(suffix)]
+    for repl in ['-assembly-', '-minimal-', '-doc-', '-src-', '-webapp-', '-standalone-', '-parent-', '-project-']:
+        filename = filename.replace(repl, '-')
+    return filename
+
+def cleanReleases(committeeId):
+    if len(releases[committeeId]) == 0:
+        del releases[committeeId]
+        del orig[committeeId]
+
 def parseDir(project, path):
-    print("Parsing %s..." % path)
-    if len(path) < 100:
-        for f, d, xd in getDirList("%s/%s" % (mainurl, path)):
-            if xd:
+    print("              %s..." % path)
+    if len(path) > 100:
+        print("WARN too long path: recursion?")
+        return
+    for f, d, xd in getDirList("%s/%s" % (mainurl, path)):
+        if xd:
+            if ("/%s" % f) not in path and f.lower() not in ['binaries', 'repos', 'updatesite', 'current', 'stable', 'stable1', 'stable2', 'binary', 'notes', 'doc', 'eclipse', 'patches', 'docs', 'changes', 'features', 'tmp', 'cpp', 'php', 'ruby', 'py', 'py3', 'issuesfixed', 'images', 'styles', 'wikipages']:
                 parseDir(project, "%s/%s" % (path, f))
-            elif not re.search(r"(md5|asc|sig)", f, flags=re.IGNORECASE):
-                match =  re.match(r"^(.+?)(\.(\S{1,4}))?\.(\S{1,7})$", f)
-                if match:
-                    filename = match.group(1)
-                    filename = re.sub(r"[-_.]*([Aa]pache|%s|src|bin|deps)[-_.]*" % project, "", filename, count=10, flags=re.IGNORECASE|re.UNICODE)
-                    releases[project][filename] = d
-
-
-for project, d, xdir in getDirList(mainurl):
-    if project != "incubator":
-        print("Parsing %s" % project)
-        releases[project] = releases[project] if project in releases else {}
-        parseDir(project, project)
+        elif not re.search(r"(MD5SUM|SHA1SUM|\.md5|\.mds|\.sh1|\.sh2|\.sha|\.asc|\.sig|\.bin|\.pom|\.jar|\.whl|\.pdf|\.xml|\.html|\.txt|\.cfg|\.ish|\.pl|RELEASE.NOTES|LICENSE|KEYS|CHANGELOG|NOTICE|MANIFEST|Changes|readme|x86|amd64|-docs-|Announcement|current|-deps|-dependencies|binary|-bin-|-javadoc-|-distro|rat_report)", f, flags=re.IGNORECASE):
+            filename = cleanFilename(f)
+            if len(filename) > 1 and filename not in releases[project]:
+                releases[project][filename] = d
+                orig[project][filename] = "%s/%s" % (path, filename)
+                print("                  - %s\t\t\t%s" % (filename, f))
+
+
+for committeeId, d, xdir in getDirList(mainurl):
+    if committeeId != 'incubator':
+        if committeeId not in ['xml', 'zzz', 'maven-repository']:
+            print("Parsing /dist/%s content:" % committeeId)
+            releases[committeeId] = releases[committeeId] if committeeId in releases else {}
+            orig[committeeId] = {}
+            parseDir(committeeId, committeeId)
+            cleanReleases(committeeId)
     else:
         for podling, d, xd in getDirList("%s/incubator/" % mainurl):
-            print("Parsing incubator-%s" % podling)
-            project = "incubator-%s" % podling
-            releases[project] = releases[project] if project in releases else {}
-            parseDir(project, "incubator/%s" % podling)
-    with open("../../site/json/foundation/releases.json", "w") as f:
-        f.write(json.dumps(releases, sort_keys=True, indent=0))
-        f.close()
+            print("Parsing /dist/incubator-%s content:" % podling)
+            committeeId = "incubator-%s" % podling
+            releases[committeeId] = releases[committeeId] if committeeId in releases else {}
+            orig[committeeId] = {}
+            parseDir(committeeId, "incubator/%s" % podling)
+            cleanReleases(committeeId)
 
 print("Writing releases.json")
 with open("../../site/json/foundation/releases.json", "w") as f:
     f.write(json.dumps(releases, sort_keys=True, indent=0))
     f.close()
+with open("../../site/json/foundation/releases-orig.json", "w") as f:
+    f.write(json.dumps(orig, sort_keys=True, indent=0))
+    f.close()
 
 print("All done!")
\ No newline at end of file