You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@community.apache.org by se...@apache.org on 2015/09/23 18:23:42 UTC

svn commit: r1704890 - /comdev/projects.apache.org/scripts/cronjobs/parsecommitteeinfoBETA.py

Author: sebb
Date: Wed Sep 23 16:23:41 2015
New Revision: 1704890

URL: http://svn.apache.org/viewvc?rev=1704890&view=rev
Log:
Initial checkin of committee-info.json parser
Not yet ready for production use

Added:
    comdev/projects.apache.org/scripts/cronjobs/parsecommitteeinfoBETA.py   (with props)

Added: comdev/projects.apache.org/scripts/cronjobs/parsecommitteeinfoBETA.py
URL: http://svn.apache.org/viewvc/comdev/projects.apache.org/scripts/cronjobs/parsecommitteeinfoBETA.py?rev=1704890&view=auto
==============================================================================
--- comdev/projects.apache.org/scripts/cronjobs/parsecommitteeinfoBETA.py (added)
+++ comdev/projects.apache.org/scripts/cronjobs/parsecommitteeinfoBETA.py Wed Sep 23 16:23:41 2015
@@ -0,0 +1,335 @@
import re
import json
import sys
# Fail fast on Python 2: the rest of the file relies on Python 3
# (urllib.request, print(..., file=...), the str/bytes distinction).
if sys.hexversion < 0x03000000:
    raise ImportError("This script requires Python 3")
import io
import os
import urllib.request
import xml.etree.ElementTree as ET
import xml.dom.minidom as minidom
import datetime
import subprocess
import glob

sys.path.append("..") # module committee_info is in parent directory
import committee_info
+
+"""
+
+               THIS IS A PRELIMINARY VERSION
+                     DO NOT USE YET
+               =============================
+Reads:
+../../site/json/foundation/people.json
+../../data/committees.xml
+committee-info.txt from Whimsy
+
+Updates:
+../../site/json/foundation/committees.json
+../../site/json/foundation/committees-retired.json
+
+Writes:
+../../site/json/foundation/pmcs.json - for all PMCs
+../../site/doap/<committeeId>/pmc.rdf
+
+Copies:
+../../site/doap/<committeeId>/pmc-doap.rdf from PMC RDF file
+
+"""
+
# LDAP group ids not matching committee id; convert group to committeeId
group_ids = {
    'ws': 'webservices'
}

# homepages not matching http://<committee id>.apache.org/
# (keyed by LDAP group id; value is the canonical project home page)
homepages = {
    'comdev': 'http://community.apache.org/',
    'httpcomponents': 'http://hc.apache.org/',
    'whimsy': 'http://whimsical.apache.org/',
    'tika': 'http://tika.apache.org/',
}
+
# short description for non-classical committees, that are not listed in http://www.apache.org/#projects-list
# Used as a fallback when committee-info has no description for the committee;
# the chosen string ends up in the generated committees.json.
shortdescs = {
    'attic': 'A home for dormant projects',
    # fixed typo: was "Ressources"
    'comdev': 'Resources to help people become involved with Apache projects',
    'incubator': "Entry path into The Apache Software Foundation (ASF) for projects and codebases wishing to become part of the Foundation's efforts",
    'labs': 'A place for innovation where committers of the foundation can experiment with new ideas',
    # Temporary until Whimsy catches up with updated home page
    'aurora': 'Mesos framework for long-running services and cron jobs',
    'bookkeeper': 'Replicated log service which can be used to build replicated state machines',
    'celix': 'Implementation of the OSGi specification adapted to C',
    'devicemap': 'data repository containing device information, images and other relevant information for all sorts of mobile devices',
    'drill': 'Schema-free SQL Query Engine for Hadoop, NoSQL and Cloud Storage',
    'flink': 'platform for scalable batch and stream data processing',
    'ignite': 'High-performance, integrated and distributed in-memory platform for computing and transacting on large-scale data sets in real-time',
    'metamodel': 'common interface for discovery, exploration of metadata and querying of different types of data sources',
    'nifi': 'Easy to use, powerful, and reliable system to process and distribute data',
    'orc': 'the smallest, fastest columnar storage for Hadoop workloads',
    'parquet': 'columnar storage format available to any project in the Hadoop ecosystem',
    'phoenix': 'High performance relational database layer over HBase for low latency applications',
    'samza': 'distributed stream processing framework',
    'serf': 'High performance C-based HTTP client library built upon the Apache Portable Runtime (APR) library',
    'stratos': 'highly-extensible Platform-as-a-Service (PaaS) framework',
    'zest': 'community based effort exploring Composite Oriented Programming for domain centric application development',
}
+
# Delete a path using SVN
def svndel(path):
    """Run 'svn del' on path; log failures to stderr without raising.

    Failures (non-zero svn exit status, or the svn executable missing /
    unrunnable, which raises OSError) must not abort the script: a failed
    cleanup of one retired committee should not stop processing the rest.
    """
    try:
        subprocess.check_call(["svn", "del", path])
    except (subprocess.CalledProcessError, OSError) as err:
        # OSError added: previously a missing svn binary crashed the whole job
        print("ERROR: error trying to svn del %s : %s" % (path, err), file=sys.stderr)
+
def handleChild(el):
    """Convert an XML element (sub)tree into plain Python data.

    Returns a (tag, value) pair where tag is el's tag with any leading
    '{namespace}' prefix stripped.  For a leaf element, value is its
    'resource' attribute (namespace-stripped) if present, otherwise the
    element text (which may be None).  For an element with children,
    value is a dict built by recursing into each child; a repeated child
    tag overwrites the earlier entry.
    """
    # Strip '{namespace}' prefixes from attribute names so that e.g.
    # rdf:resource can be looked up as plain 'resource'.
    attribs = {}
    for key in el.attrib:
        xkey = re.sub(r"\{.+\}", "", key)
        attribs[xkey] = el.attrib[key]
    tag = re.sub(r"\{.+\}", "", el.tag)
    value = attribs['resource'] if 'resource' in attribs else el.text
    if len(el) == 0:
        # leaf element (was computed via a pointless loop setting hasKids)
        retval = value
    else:
        retval = {}
        for child in el:
            k, v = handleChild(child)
            retval[k] = v
    return tag, retval
+
# Load committer data (id -> details); used later to resolve a PMC
# chair's nick to their full name when generating pmc.rdf.
print("reading people Data (site/json/foundation/people.json)")
with open("../../site/json/foundation/people.json", "r") as f:
    # json.load(f) replaces json.loads(f.read()); the redundant f.close()
    # inside the with-block has been removed (the context manager closes f)
    people = json.load(f)
+
pmcs = {}         # committeeId -> PMC data converted from its RDF file
pmcDataUrls = {}  # committeeId -> URL of the PMC RDF data file

# get PMC Data from /data/committees.xml
print("reading PMC Data (/data/committees.xml)")
with open("../../data/committees.xml", "r") as f:
    # parse the open file directly; the redundant f.close() inside the
    # with-block has been removed (the context manager closes f)
    xmldoc = minidom.parse(f)
+
print("Copying PMC DOAP files to doap/<committeeId>/pmc-doap.rdf...")
for loc in xmldoc.getElementsByTagName('location'):
    url = loc.childNodes[0].data
    try:
        if url.startswith('http'):
            # remote RDF is fetched as bytes
            rdf = urllib.request.urlopen(url).read()
        else:
            # local RDF file, relative to the data directory, read as str
            # (now opened via 'with' so the handle is always closed)
            with open("../../data/%s" % url, 'r') as rdffile:
                rdf = rdffile.read()
            url = "https://svn.apache.org/repos/asf/comdev/projects.apache.org/data/%s" % url
        rdfxml = ET.fromstring(rdf)
        rdfdata = rdfxml[0]
        # the rdf:about attribute of the first child holds the committee id
        committeeId = rdfdata.attrib['{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about']
        pmcDataUrls[committeeId] = url

        # transform PMC data RDF to json
        pmcjson = {
            'rdf': url
        }
        for el in rdfdata:
            k, v = handleChild(el)
            if k in pmcjson:
                # merge multiple values for a repeated tag
                if type(pmcjson[k]) is str:
                    pmcjson[k] = "%s, %s" % (pmcjson[k], v)
                else:
                    for xk in v:
                        pmcjson[k][xk] = v[xk]
            else:
                pmcjson[k] = v

        pmcs[committeeId] = pmcjson

        # copy PMC RDF data to /doap/{committeeId}/pmc-doap.rdf
        # rdf is str for local files, bytes for http fetches
        if type(rdf) is str:
            mode = "w"
        else:
            mode = "wb"
        path = "../../site/doap/%s" % committeeId
        # replaces the previous bare 'except:' around os.stat()
        if not os.path.exists(path):
            print("INFO: creating new directory %s for %s" % (path, url))
            os.mkdir(path)
        with open("%s/pmc-doap.rdf" % path, mode) as f:
            f.write(rdf)

    except Exception as err:
        # best-effort: one bad entry must not stop processing of the rest
        print("ERROR: %s" % err, file=sys.stderr)
+
committeeCount = 0    # number of established PMCs found in committee-info
committeesList = []   # committee entries, in keyorder() sort order
committeesMap = {}    # committeeId -> committee entry (same dicts as the list)
addedCommittees = []  # ids present now but absent from the previous committees.json
+
# temporary fix to ensure comparisons of generated files work better
# The original code relied on the order in the physical file
def keyorder(s):
    """Return the sort key for committee id *s*.

    A few committees sort under their expanded name in committee-info.txt,
    so map those ids to that name; every other id sorts as itself.
    """
    # dispatch table replaces the original if-chain; same mappings
    special = {
        'apr': 'portableruntime',
        'climate': 'openclimate',
        'comdev': 'communitydevelopment',
        'httpd': 'http',  # so it sorts before HTTP Components (it's wrong in CI)
        'ws': 'webservices',
    }
    return special.get(s, s)
+
# extract committees composition
# committee_info (parent directory) parses committee-info.txt from Whimsy
# and returns a dict keyed by LDAP group id.
print("Reading committee-info")
committees = committee_info.committees()
+
print("Writing generated doap/<committeeId>/pmc.rdf...")
# Walk all committees in stable keyorder(); for each established PMC,
# build its committees.json entry and write a generated pmc.rdf DOAP file.
for group in sorted(committees, key=keyorder):

#     if group == 'apr' or group == 'whimsy':
#         print("DEBUG: see what happens when CI entry %s is removed" % group)
#         continue
#     print(group)
    ctte = committees[group]
    fullName = ctte['fullname'] # Full name including Apache prefix
    if ctte['pmc']: # we only want PMCs
        if ctte['established']: # only want ones with entries in section 3
            # Fix up name where PMC RDF does not agree with LDAP group
            if group in group_ids:
                committeeId = group_ids[group]
            else:
                committeeId = group

            committeeCount += 1
            committee={}
            committee['id'] = committeeId
            committee['chair'] = ctte['chair']['nick']
            try:
                # 'reporting' may be absent (e.g. not yet scheduled) - skip then
                committee['reporting'] = ctte['reporting']
            except KeyError:
                pass
            committee['group'] = group
            committee['name'] = fullName
            committee['established'] = ctte['established']
            # homepage: explicit override table, else the CI 'site' entry,
            # else the conventional http://<group>.apache.org/
            homepage = None
            if group in homepages:
                homepage = homepages[group]
            else:
                if ctte['site']:
                    homepage = ctte['site']
                else:
                    homepage = 'http://%s.apache.org/' % group
            committee['homepage'] = homepage

            # short description: CI entry, else the local shortdescs fallback
            if ctte['description']:
                committee['shortdesc'] = ctte['description']
            else:
                if committeeId in shortdescs:
                    committee['shortdesc'] = shortdescs[committeeId]
                else:
                    print("WARN: %s (%s) missing from http://www.apache.org/#projects-list" % (group, fullName))

            if committeeId in pmcDataUrls:
                committee['rdf'] = pmcDataUrls[committeeId]
            else:
                print("WARN: %s (%s) missing from /data/committees.xml" % (fullName, committeeId))

            committeesList.append(committee)
            committeesMap[committeeId] = committee;
            # generate TLP PMC DOAP file at http://projects-new.apache.org/doap/{committeeId}/pmc.rdf
            doap = ET.Element('rdf:RDF', attrib= { 'xml:lang': 'en',
                                                   'xmlns': 'http://usefulinc.com/ns/doap#',
                                                   'xmlns:rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
                                                   'xmlns:asfext': 'http://projects.apache.org/ns/asfext#',
                                                   'xmlns:foaf': 'http://xmlns.com/foaf/0.1/'
                                                   })
            doap_pmc = ET.SubElement(doap, 'asfext:pmc')
            ET.SubElement(doap_pmc, 'asfext:name').text = fullName
            ET.SubElement(doap_pmc, 'homepage', attrib = { 'rdf:resource': homepage})
            doap_chair = ET.SubElement(doap_pmc, 'asfext:chair')
            doap_chair_person = ET.SubElement(doap_chair, 'foaf:Person')
            ET.SubElement(doap_chair_person, 'foaf:nick').text = committee['chair']
            # resolve the chair's nick to a full name via people.json;
            # KeyError here would abort the script - presumably people.json
            # is always a superset of CI chairs (TODO confirm)
            ET.SubElement(doap_chair_person, 'foaf:name').text = people[committee['chair']]['name']
            directory = "../../site/doap/%s" % committeeId
            if not os.path.exists(directory):
                print("INFO: creating directory %s" % directory)
                os.makedirs(directory)
#             print("INFO: creating %s/pmc.pdf" % directory)
            with open("%s/pmc.rdf" % directory, "w") as f:
                f.write(minidom.parseString(ET.tostring(doap, encoding="utf-8")).toprettyxml(indent="\t"))
                f.close()
        else:
            print("INFO: %s ignored - not yet in section 3" % fullName)
    else:
        # Special Committee (Officer's, President's or Board)
        print("INFO: %s ignored - not a PMC" % fullName)
+
+
# detect retired committees to add to committees-retired.json
with open("../../site/json/foundation/committees-retired.json", "r") as f:
    committeesRetired = json.load(f)

with open("../../site/json/foundation/committees.json", "r") as f:
    committeesPrevious = json.load(f)

# ids present now but absent from the previously generated committees.json.
# Build the membership set once: the original rebuilt the id list on every
# iteration (O(n*m)).
previousIds = set(item['id'] for item in committeesPrevious)
for currId in committeesMap:
    if currId not in previousIds:
        addedCommittees.append(currId)

print("found %s new committees from %s committees in committee_info.txt" % (len(addedCommittees), committeeCount))
addedCommittees.sort()
for added in addedCommittees:
    print("- %s" % added)
+
# Committees present in the previous committees.json but no longer active
# have retired: move their data files aside and record the retirement date.
for previous in committeesPrevious:
    prevId = previous['id']
    if prevId not in committeesMap:
        print("found retired committee: %s %s" % (prevId, previous['name']))
        try:
            subprocess.check_call(["svn", "mv",
                               "../../data/committees/%s.rdf" % prevId,
                               "../../data/committees-retired/"])
        except (subprocess.CalledProcessError, OSError) as err:
            # OSError added: a missing svn binary must not crash the job
            print("ERROR: error trying to svn mv %s.rdf : %s" % (prevId, err), file=sys.stderr)
        svndel("../../site/doap/%s" % prevId)
        # project data is either a single <id>.json file or (for
        # multi-project committees) a set of <id>-*.json files
        projJson = "../../site/json/projects/%s.json" % prevId
        if os.path.isfile(projJson):
            svndel(projJson)
        else:
            for f in glob.glob("../../site/json/projects/%s-*.json" % prevId):
                svndel(f)
        previous['retired'] = datetime.date.today().strftime('%Y-%m')
        # remove data that is not useful in a retired committee
        previous.pop('chair', None)
        previous.pop('group', None)
        previous.pop('rdf', None)
        previous.pop('reporting', None)
        committeesRetired.append(previous)
+
# Persist the generated data.  sort_keys=True + indent=0 keep the output
# stable between runs so svn diffs stay minimal.  The redundant f.close()
# calls inside the with-blocks have been removed.
print("Writing json/foundation/committees.json...")
with open("../../site/json/foundation/committees.json", "w") as f:
    json.dump(committeesList, f, sort_keys=True, indent=0)

print("Writing json/foundation/committees-retired.json...")
with open("../../site/json/foundation/committees-retired.json", "w") as f:
    json.dump(committeesRetired, f, sort_keys=True, indent=0)

print("Writing json/foundation/pmcs.json...")
with open("../../site/json/foundation/pmcs.json", "w") as f:
    json.dump(pmcs, f, sort_keys=True, indent=0)

Propchange: comdev/projects.apache.org/scripts/cronjobs/parsecommitteeinfoBETA.py
------------------------------------------------------------------------------
    svn:eol-style = native