You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@community.apache.org by hb...@apache.org on 2015/05/31 00:10:40 UTC
svn commit: r1682659 -
/comdev/projects.apache.org/scripts/cronjobs/parsereleases.py
Author: hboutemy
Date: Sat May 30 22:10:40 2015
New Revision: 1682659
URL: http://svn.apache.org/r1682659
Log:
improved data filtering from /dist/ content
Modified:
comdev/projects.apache.org/scripts/cronjobs/parsereleases.py
Modified: comdev/projects.apache.org/scripts/cronjobs/parsereleases.py
URL: http://svn.apache.org/viewvc/comdev/projects.apache.org/scripts/cronjobs/parsereleases.py?rev=1682659&r1=1682658&r2=1682659&view=diff
==============================================================================
--- comdev/projects.apache.org/scripts/cronjobs/parsereleases.py (original)
+++ comdev/projects.apache.org/scripts/cronjobs/parsereleases.py Sat May 30 22:10:40 2015
@@ -3,16 +3,18 @@ import json
import os
releases = {}
+orig = {}
mainurl = "http://www.apache.org/dist/"
x = 0
-try:
- with open("../../site/json/foundation/releases.json") as f:
- releases = json.loads(f.read())
- f.close()
-except Exception as err:
- print("Could not read releases.json, assuming blank slate")
+# don't try to maintain history for the moment...
+#try:
+# with open("../../site/json/foundation/releases.json") as f:
+# releases = json.loads(f.read())
+# f.close()
+#except Exception as err:
+# print("Could not read releases.json, assuming blank slate")
def getDirList(url):
try:
@@ -22,38 +24,67 @@ def getDirList(url):
except:
pass
+def cleanFilename(filename):
+ for suffix in ['.tgz', '.gz', '.bz2', '.xz', '.zip', '.rar', '.tar', 'tar', '.deb', '.rpm', '.dmg', '.egg', '.gem', '.pom', '.war', '.exe',
+ '-scala2.11', '-cdh4', '-hadoop1', '-hadoop2', '-hadoop2.3', '-hadoop2.4', '-all',
+ '-src', '_src', '.src', '-sources', '_sources', '-source', '-bin', '-dist',
+ '-source-release', '-source-relase', '-apidocs', '-javadocs', '-javadoc', '_javadoc', '-tests', '-test', '-debug', '-uber',
+ '-macosx', '-distribution', '-example', '-manual', '-native', '-win', '-win32', '-linux', '-pack', '-packaged', '-lib', '-current', '-embedded',
+ '-py', '-py2', '-py2.6', '-py2.7', '-no', 'unix-distro', 'windows-distro', 'with', '-dep', '-standalone', '-war', '-webapp', '-dom', '-om', '-manual', '-site',
+ '-32bit', '-64bit', '-amd64', '-i386', '_i386', '.i386', '-x86_64', '-minimal', '-jettyconfig', '-py2.py3-none-any', 'newkey', 'oldkey', 'jars', '-jre13', '-hadoop1', '-hadoop2', '-project',
+ '-with-dependencies', '-client', '-server', '-doc', '-docs', 'server-webapps', '-full', '-all', '-standard', '-for-javaee', '-for-tomcat',
+ 'hadoop1-scala2', '-deployer', '-fulldocs', '-windows-i64', '-windows-x64', '-embed', '-apps', '-app', '-ref', '-installer', '-bundle', '-java']:
+ if filename[len(filename)-len(suffix):] == suffix:
+ filename = filename[0:len(filename)-len(suffix)]
+ for repl in ['-assembly-', '-minimal-', '-doc-', '-src-', '-webapp-', '-standalone-', '-parent-', '-project-']:
+ filename = filename.replace(repl, '-')
+ return filename
+
+def cleanReleases(committeeId):
+ if len(releases[committeeId]) == 0:
+ del releases[committeeId]
+ del orig[committeeId]
+
def parseDir(project, path):
- print("Parsing %s..." % path)
- if len(path) < 100:
- for f, d, xd in getDirList("%s/%s" % (mainurl, path)):
- if xd:
+ print(" %s..." % path)
+ if len(path) > 100:
+ print("WARN too long path: recursion?")
+ return
+ for f, d, xd in getDirList("%s/%s" % (mainurl, path)):
+ if xd:
+ if ("/%s" % f) not in path and f.lower() not in ['binaries', 'repos', 'updatesite', 'current', 'stable', 'stable1', 'stable2', 'binary', 'notes', 'doc', 'eclipse', 'patches', 'docs', 'changes', 'features', 'tmp', 'cpp', 'php', 'ruby', 'py', 'py3', 'issuesfixed', 'images', 'styles', 'wikipages']:
parseDir(project, "%s/%s" % (path, f))
- elif not re.search(r"(md5|asc|sig)", f, flags=re.IGNORECASE):
- match = re.match(r"^(.+?)(\.(\S{1,4}))?\.(\S{1,7})$", f)
- if match:
- filename = match.group(1)
- filename = re.sub(r"[-_.]*([Aa]pache|%s|src|bin|deps)[-_.]*" % project, "", filename, count=10, flags=re.IGNORECASE|re.UNICODE)
- releases[project][filename] = d
-
-
-for project, d, xdir in getDirList(mainurl):
- if project != "incubator":
- print("Parsing %s" % project)
- releases[project] = releases[project] if project in releases else {}
- parseDir(project, project)
+ elif not re.search(r"(MD5SUM|SHA1SUM|\.md5|\.mds|\.sh1|\.sh2|\.sha|\.asc|\.sig|\.bin|\.pom|\.jar|\.whl|\.pdf|\.xml|\.html|\.txt|\.cfg|\.ish|\.pl|RELEASE.NOTES|LICENSE|KEYS|CHANGELOG|NOTICE|MANIFEST|Changes|readme|x86|amd64|-docs-|Announcement|current|-deps|-dependencies|binary|-bin-|-javadoc-|-distro|rat_report)", f, flags=re.IGNORECASE):
+ filename = cleanFilename(f)
+ if len(filename) > 1 and filename not in releases[project]:
+ releases[project][filename] = d
+ orig[project][filename] = "%s/%s" % (path, filename)
+ print(" - %s\t\t\t%s" % (filename, f))
+
+
+for committeeId, d, xdir in getDirList(mainurl):
+ if committeeId != 'incubator':
+ if committeeId not in ['xml', 'zzz', 'maven-repository']:
+ print("Parsing /dist/%s content:" % committeeId)
+ releases[committeeId] = releases[committeeId] if committeeId in releases else {}
+ orig[committeeId] = {}
+ parseDir(committeeId, committeeId)
+ cleanReleases(committeeId)
else:
for podling, d, xd in getDirList("%s/incubator/" % mainurl):
- print("Parsing incubator-%s" % podling)
- project = "incubator-%s" % podling
- releases[project] = releases[project] if project in releases else {}
- parseDir(project, "incubator/%s" % podling)
- with open("../../site/json/foundation/releases.json", "w") as f:
- f.write(json.dumps(releases, sort_keys=True, indent=0))
- f.close()
+ print("Parsing /dist/incubator-%s content:" % podling)
+ committeeId = "incubator-%s" % podling
+ releases[committeeId] = releases[committeeId] if committeeId in releases else {}
+ orig[committeeId] = {}
+ parseDir(committeeId, "incubator/%s" % podling)
+ cleanReleases(committeeId)
print("Writing releases.json")
with open("../../site/json/foundation/releases.json", "w") as f:
f.write(json.dumps(releases, sort_keys=True, indent=0))
f.close()
+with open("../../site/json/foundation/releases-orig.json", "w") as f:
+ f.write(json.dumps(orig, sort_keys=True, indent=0))
+ f.close()
print("All done!")
\ No newline at end of file