You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@community.apache.org by se...@apache.org on 2015/09/23 18:23:42 UTC
svn commit: r1704890 -
/comdev/projects.apache.org/scripts/cronjobs/parsecommitteeinfoBETA.py
Author: sebb
Date: Wed Sep 23 16:23:41 2015
New Revision: 1704890
URL: http://svn.apache.org/viewvc?rev=1704890&view=rev
Log:
Initial checkin of committee-info.json parser
Not yet ready for production use
Added:
comdev/projects.apache.org/scripts/cronjobs/parsecommitteeinfoBETA.py (with props)
Added: comdev/projects.apache.org/scripts/cronjobs/parsecommitteeinfoBETA.py
URL: http://svn.apache.org/viewvc/comdev/projects.apache.org/scripts/cronjobs/parsecommitteeinfoBETA.py?rev=1704890&view=auto
==============================================================================
--- comdev/projects.apache.org/scripts/cronjobs/parsecommitteeinfoBETA.py (added)
+++ comdev/projects.apache.org/scripts/cronjobs/parsecommitteeinfoBETA.py Wed Sep 23 16:23:41 2015
@@ -0,0 +1,335 @@
+import re
+import json
+import sys
+if sys.hexversion < 0x03000000:
+ raise ImportError("This script requires Python 3")
+import io
+import os
+import urllib.request
+import xml.etree.ElementTree as ET
+import xml.dom.minidom as minidom
+import datetime
+import subprocess
+import glob
+
+sys.path.append("..") # module committee_info is in parent directory
+import committee_info
+
+"""
+
+ THIS IS A PRELIMINARY VERSION
+ DO NOT USE YET
+ =============================
+Reads:
+../../site/json/foundation/people.json
+../../data/committees.xml
+committee-info.txt from Whimsy
+
+Updates:
+../../site/json/foundation/committees.json
+../../site/json/foundation/committees-retired.json
+
+Writes:
+../../site/json/foundation/pmcs.json - for all PMCs
+../../site/doap/<committeeId>/pmc.rdf
+
+Copies:
+../../site/doap/<committeeId>/pmc-doap.rdf from PMC RDF file
+
+"""
+
# LDAP group ids not matching committee id; convert group to committeeId
group_ids = dict(ws='webservices')
+
# homepages not matching http://<committee id>.apache.org/
homepages = dict(
    comdev='http://community.apache.org/',
    httpcomponents='http://hc.apache.org/',
    whimsy='http://whimsical.apache.org/',
    tika='http://tika.apache.org/',
)
+
# short description for non-classical committees, that are not listed in http://www.apache.org/#projects-list
# NOTE: fixed typo "Ressources" -> "Resources" in the comdev entry (this text
# is emitted verbatim into committees.json as the shortdesc field)
shortdescs = {
    'attic': 'A home for dormant projects',
    'comdev': 'Resources to help people become involved with Apache projects',
    'incubator': "Entry path into The Apache Software Foundation (ASF) for projects and codebases wishing to become part of the Foundation's efforts",
    'labs': 'A place for innovation where committers of the foundation can experiment with new ideas',
    # Temporary until Whimsy catches up with updated home page
    'aurora': 'Mesos framework for long-running services and cron jobs',
    'bookkeeper': 'Replicated log service which can be used to build replicated state machines',
    'celix': 'Implementation of the OSGi specification adapted to C',
    'devicemap': 'data repository containing device information, images and other relevant information for all sorts of mobile devices',
    'drill': 'Schema-free SQL Query Engine for Hadoop, NoSQL and Cloud Storage',
    'flink': 'platform for scalable batch and stream data processing',
    'ignite': 'High-performance, integrated and distributed in-memory platform for computing and transacting on large-scale data sets in real-time',
    'metamodel': 'common interface for discovery, exploration of metadata and querying of different types of data sources',
    'nifi': 'Easy to use, powerful, and reliable system to process and distribute data',
    'orc': 'the smallest, fastest columnar storage for Hadoop workloads',
    'parquet': 'columnar storage format available to any project in the Hadoop ecosystem',
    'phoenix': 'High performance relational database layer over HBase for low latency applications',
    'samza': 'distributed stream processing framework',
    'serf': 'High performance C-based HTTP client library built upon the Apache Portable Runtime (APR) library',
    'stratos': 'highly-extensible Platform-as-a-Service (PaaS) framework',
    'zest': 'community based effort exploring Composite Oriented Programming for domain centric application development',
}
+
# Delete a path using SVN
def svndel(path):
    """Ask svn to delete *path*; on failure, log to stderr instead of raising."""
    cmd = ["svn", "del", path]
    try:
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError as err:
        # best-effort: report and carry on with the remaining cleanup work
        print("ERROR: error trying to svn del %s : %s" % (path, err), file=sys.stderr)
+
def handleChild(el):
    """Recursively convert an RDF/XML element into a (tag, value) pair.

    Returns a 2-tuple of the element's tag (any '{namespace}' prefix stripped)
    and either:
    - for a leaf element: its 'resource' attribute value (namespace stripped
      from the attribute key) when present, otherwise its text content;
    - for an element with children: a dict mapping each child's tag to that
      child's converted value (NOTE: repeated sibling tags overwrite earlier
      ones — merging of duplicates is handled by the caller).

    Fixes over the original: the flag loop that iterated all children just to
    set hasKids is replaced by a len() test, the shadowed loop variable is
    gone, and the leaf value is no longer computed when it is unused.
    """
    # strip '{namespace}' prefixes from the attribute keys
    attribs = {}
    for key in el.attrib:
        xkey = re.sub(r"\{.+\}", "", key)
        attribs[xkey] = el.attrib[key]
    tag = re.sub(r"\{.+\}", "", el.tag)
    if len(el):
        # element has children: recurse and collect into a dict
        retval = {}
        for child in el:
            k, v = handleChild(child)
            retval[k] = v
    else:
        # leaf: prefer an rdf:resource attribute over the text content
        retval = attribs['resource'] if 'resource' in attribs else el.text
    return tag, retval
+
# Load the committers data so chair nicks can be resolved to full names later.
print("reading people Data (site/json/foundation/people.json)")
with open("../../site/json/foundation/people.json", "r") as f:
    # json.load reads the stream directly; the explicit f.close() inside the
    # with-block was redundant and has been removed
    people = json.load(f)
+
pmcs = {}         # committeeId -> PMC data converted from its RDF file
pmcDataUrls = {}  # id -> url

# get PMC Data from /data/committees.xml
print("reading PMC Data (/data/committees.xml)")
with open("../../data/committees.xml", "r") as f:
    # minidom.parse accepts the open file object directly; the redundant
    # f.close() inside the with-block has been removed
    xmldoc = minidom.parse(f)
+
print("Copying PMC DOAP files to doap/<committeeId>/pmc-doap.rdf...")
for loc in xmldoc.getElementsByTagName('location'):
    url = loc.childNodes[0].data
    try:
        if url.startswith('http'):
            # remote RDF: urlopen().read() yields bytes
            rdf = urllib.request.urlopen(url).read()
        else:
            # local RDF: read as text (was a leaked file handle via
            # open(...).read()), then rewrite url to its canonical SVN location
            with open("../../data/%s" % url, 'r') as f:
                rdf = f.read()
            url = "https://svn.apache.org/repos/asf/comdev/projects.apache.org/data/%s" % url
        rdfxml = ET.fromstring(rdf)
        rdfdata = rdfxml[0]
        committeeId = rdfdata.attrib['{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about']
        pmcDataUrls[committeeId] = url

        # transform PMC data RDF to json
        pmcjson = {
            'rdf': url
        }
        for el in rdfdata:
            k, v = handleChild(el)
            if k in pmcjson:
                # merge multiple values: strings are comma-joined, dicts merged
                if type(pmcjson[k]) is str:
                    pmcjson[k] = "%s, %s" % (pmcjson[k], v)
                else:
                    for xk in v:
                        pmcjson[k][xk] = v[xk]
            else:
                pmcjson[k] = v

        pmcs[committeeId] = pmcjson

        # copy PMC RDF data to /doap/{committeeId}/pmc-doap.rdf
        # text mode for local files (str), binary for urlopen results (bytes)
        mode = "w" if type(rdf) is str else "wb"
        path = "../../site/doap/%s" % committeeId
        # os.path.exists replaces the bare try/except around os.stat, which
        # could also have swallowed unrelated errors
        if not os.path.exists(path):
            print("INFO: creating new directory %s for %s" % (path, url))
            os.mkdir(path)
        with open("%s/pmc-doap.rdf" % path, mode) as f:
            f.write(rdf)

    except Exception as err:
        # best-effort per entry: report and continue with the next location
        print("ERROR: %s" % err, file=sys.stderr)
+
committeeCount = 0    # number of PMCs processed from committee-info section 3
committeesList = []   # committee records, in keyorder() sort order
committeesMap = {}    # committeeId -> committee record (for retirement diffing)
addedCommittees = []  # ids present now but absent from the previous committees.json
+
# temporary fix to ensure comparisons of generated files work better
# The original code relied on the order in the physical file
def keyorder(s):
    """Sort key for committee group ids, matching committee-info.txt ordering."""
    overrides = {
        'apr': 'portableruntime',
        'climate': 'openclimate',
        'comdev': 'communitydevelopment',
        'httpd': 'http',  # so it sorts before HTTP Components (it's wrong in CI)
        'ws': 'webservices',
    }
    return overrides.get(s, s)
+
# extract committees composition
print("Reading committee-info")
committees = committee_info.committees()

# Build the committee records and generate one DOAP file per PMC.
# Non-PMC committees and PMCs without a section-3 entry are skipped; the
# deeply nested if/else of the original is flattened into guard clauses
# (same tests in the same order, so the printed output is unchanged).
print("Writing generated doap/<committeeId>/pmc.rdf...")
for group in sorted(committees, key=keyorder):
    ctte = committees[group]
    fullName = ctte['fullname']  # Full name including Apache prefix
    if not ctte['pmc']:
        # Special Committee (Officer's, President's or Board)
        print("INFO: %s ignored - not a PMC" % fullName)
        continue
    if not ctte['established']:
        # only want ones with entries in section 3
        print("INFO: %s ignored - not yet in section 3" % fullName)
        continue

    # Fix up name where PMC RDF does not agree with LDAP group
    committeeId = group_ids.get(group, group)

    committeeCount += 1
    committee = {}
    committee['id'] = committeeId
    committee['chair'] = ctte['chair']['nick']
    try:
        committee['reporting'] = ctte['reporting']
    except KeyError:
        # not every committee has a reporting schedule entry
        pass
    committee['group'] = group
    committee['name'] = fullName
    committee['established'] = ctte['established']
    # homepage: explicit override, then the CI site entry, then the default URL
    if group in homepages:
        homepage = homepages[group]
    elif ctte['site']:
        homepage = ctte['site']
    else:
        homepage = 'http://%s.apache.org/' % group
    committee['homepage'] = homepage

    # shortdesc: CI description, then the local fallback table, else warn
    # (the record is left without a shortdesc key in the warning case)
    if ctte['description']:
        committee['shortdesc'] = ctte['description']
    elif committeeId in shortdescs:
        committee['shortdesc'] = shortdescs[committeeId]
    else:
        print("WARN: %s (%s) missing from http://www.apache.org/#projects-list" % (group, fullName))

    if committeeId in pmcDataUrls:
        committee['rdf'] = pmcDataUrls[committeeId]
    else:
        print("WARN: %s (%s) missing from /data/committees.xml" % (fullName, committeeId))

    committeesList.append(committee)
    committeesMap[committeeId] = committee  # stray semicolon removed

    # generate TLP PMC DOAP file at http://projects-new.apache.org/doap/{committeeId}/pmc.rdf
    doap = ET.Element('rdf:RDF', attrib={'xml:lang': 'en',
                                         'xmlns': 'http://usefulinc.com/ns/doap#',
                                         'xmlns:rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
                                         'xmlns:asfext': 'http://projects.apache.org/ns/asfext#',
                                         'xmlns:foaf': 'http://xmlns.com/foaf/0.1/'
                                         })
    doap_pmc = ET.SubElement(doap, 'asfext:pmc')
    ET.SubElement(doap_pmc, 'asfext:name').text = fullName
    ET.SubElement(doap_pmc, 'homepage', attrib={'rdf:resource': homepage})
    doap_chair = ET.SubElement(doap_pmc, 'asfext:chair')
    doap_chair_person = ET.SubElement(doap_chair, 'foaf:Person')
    ET.SubElement(doap_chair_person, 'foaf:nick').text = committee['chair']
    ET.SubElement(doap_chair_person, 'foaf:name').text = people[committee['chair']]['name']
    directory = "../../site/doap/%s" % committeeId
    if not os.path.exists(directory):
        print("INFO: creating directory %s" % directory)
        os.makedirs(directory)
    with open("%s/pmc.rdf" % directory, "w") as f:
        # pretty-print the DOAP XML; redundant f.close() removed
        f.write(minidom.parseString(ET.tostring(doap, encoding="utf-8")).toprettyxml(indent="\t"))
+
+
+# detect retired committees to add to committees-retired.json
+with open("../../site/json/foundation/committees-retired.json", "r") as f:
+ committeesRetired = json.loads(f.read())
+ f.close()
+
+with open("../../site/json/foundation/committees.json", "r") as f:
+ committeesPrevious = json.loads(f.read())
+ f.close()
+
# ids present in the current run but not in the previous committees.json.
# Hoisted the previous-id lookup into a set built once: the original rebuilt
# [item['id'] for item in committeesPrevious] on every iteration (O(n^2)).
previousIds = {item['id'] for item in committeesPrevious}
for currId in committeesMap:
    if currId not in previousIds:
        addedCommittees.append(currId)

print("found %s new committees from %s committees in committee_info.txt" % (len(addedCommittees), committeeCount))
addedCommittees.sort()
for added in addedCommittees:
    print("- %s" % added)
+
# Committees present in the previous committees.json but missing from the
# current run are retired: move/delete their SVN artefacts and append a
# trimmed record (with a retirement date) to the retired list.
for previous in committeesPrevious:
    prevId = previous['id']
    if prevId in committeesMap:
        continue  # still an active committee
    print("found retired committee: %s %s" % (prevId, previous['name']))
    try:
        subprocess.check_call(["svn", "mv",
                               "../../data/committees/%s.rdf" % prevId,
                               "../../data/committees-retired/"])
    except subprocess.CalledProcessError as err:
        print("ERROR: error trying to svn mv %s.rdf : %s" % (prevId, err), file=sys.stderr)
    svndel("../../site/doap/%s" % prevId)
    projJson = "../../site/json/projects/%s.json" % prevId
    if os.path.isfile(projJson):
        svndel(projJson)
    else:
        # no single project file: remove any <id>-<project>.json variants
        for f in glob.glob("../../site/json/projects/%s-*.json" % prevId):
            svndel(f)
    previous['retired'] = datetime.date.today().strftime('%Y-%m')
    # remove data that is not useful in a retired committee
    for key in ('chair', 'group', 'rdf', 'reporting'):
        previous.pop(key, None)
    committeesRetired.append(previous)
+
# Persist the three generated JSON files (sorted keys keep diffs stable).
# Redundant f.close() calls inside the with-blocks removed.
print("Writing json/foundation/committees.json...")
with open("../../site/json/foundation/committees.json", "w") as f:
    json.dump(committeesList, f, sort_keys=True, indent=0)

print("Writing json/foundation/committees-retired.json...")
with open("../../site/json/foundation/committees-retired.json", "w") as f:
    json.dump(committeesRetired, f, sort_keys=True, indent=0)

print("Writing json/foundation/pmcs.json...")
with open("../../site/json/foundation/pmcs.json", "w") as f:
    json.dump(pmcs, f, sort_keys=True, indent=0)
Propchange: comdev/projects.apache.org/scripts/cronjobs/parsecommitteeinfoBETA.py
------------------------------------------------------------------------------
svn:eol-style = native