You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@community.apache.org by se...@apache.org on 2015/11/04 01:21:31 UTC
svn commit: r1712445 - /comdev/reporter.apache.org/trunk/scripts/parsepmcs.py

Author: sebb
Date: Wed Nov  4 00:21:31 2015
New Revision: 1712445

URL: http://svn.apache.org/viewvc?rev=1712445&view=rev
Log:
Use url cache to speed up re-runs
Refactor to simplify main loop
Add some progress prints

Modified:
    comdev/reporter.apache.org/trunk/scripts/parsepmcs.py

Modified: comdev/reporter.apache.org/trunk/scripts/parsepmcs.py
URL: http://svn.apache.org/viewvc/comdev/reporter.apache.org/trunk/scripts/parsepmcs.py?rev=1712445&r1=1712444&r2=1712445&view=diff
==============================================================================
--- comdev/reporter.apache.org/trunk/scripts/parsepmcs.py (original)
+++ comdev/reporter.apache.org/trunk/scripts/parsepmcs.py Wed Nov  4 00:21:31 2015
@@ -19,11 +19,15 @@ if sys.hexversion < 0x030000F0:
          time.time() when entry was added to an existing group
          time.time() when entry was last seen,
          ]
+    N.B. The timestamps are now saved as an int (the fractional part is not useful)
+    However existing entry times have not (yet) been trimmed.
+    This would cause a large change to the historical files,
+    so to avoid mixing this with a genuine change, it needs to be planned, and
+    done between normal updates.
 """
 import errtee
 import re
-import urllib.request
-import csv
+from urlutils import UrlCache
 import json
 import os
 import datetime
@@ -33,10 +37,12 @@ __HOME = '../data/'
 
 pmcs = {}
 
+print("Reading pmcs.json")
 with open(__HOME + "pmcs.json", "r", encoding='utf-8') as f:
     pmcs = json.loads(f.read())
 
 projects = {}
+print("Reading projects.json")
 with open(__HOME + "projects.json", "r", encoding='utf-8') as f:
     projects = json.loads(f.read())
 
@@ -49,9 +55,44 @@ for key in sorted(projects.keys()):
 people = {}
 newgroups = []
 
-data = urllib.request.urlopen("http://people.apache.org/committer-index.html").read().decode('utf-8')
+def updateProjects(stamp, group, cid, cname):
+    now = stamp
+    if not group in projects:
+        print("New unx group %s" % group)
+        projects[group] = {}
+        newgroups.append(group)
+    if not cid in projects[group]: # new to the group
+        if group in newgroups: # the group is also new
+            now = 0
+        print("New unx entry %s %s %s %u" % (group, cid, cname, now))
+        projects[group][cid] = [cname, now, stamp]
+    else:
+        # update the entry last seen time (and the public name, which may have changed)
+        projects[group][cid] = [cname, projects[group][cid][1], stamp]
+
+def updatePmcs(stamp, group, cid, cname):
+    now = stamp
+    project = group[0:-4] # drop the "-pmc" suffix
+    if not project in pmcs: # a new project
+        print("New pmc group %s" % project)
+        pmcs[project] = {}
+        newgroups.append(group)
+    if not cid in pmcs[project]: # new to the group
+        if group in newgroups: # the group is also new
+            now = 0
+        print("New pmc entry %s %s %s %u" % (project, cid, cname, now))
+        pmcs[project][cid] = [cname, now, stamp]
+    else:
+        # update the entry last seen time (and the public name, which may have changed)
+        pmcs[project][cid] = [cname, pmcs[project][cid][1], stamp]
+    
+print("Reading committer-index.html")
+uc = UrlCache()
+data = uc.get("http://people.apache.org/committer-index.html","committer-index.html").read().decode('utf-8')
 
-stamp = time.time()
+stamp = int(time.time())
+
+print("Scanning committer-index.html")
 for committer in re.findall(r"<tr>([\S\s]+?)</tr>", data, re.MULTILINE | re.UNICODE):
 
 ##    print(committer)
@@ -84,38 +125,10 @@ for committer in re.findall(r"<tr>([\S\s
             isMember = True
         # process the groups
         for group in re.findall(r"#([-a-z0-9._]+)'", cproj):
-            now = stamp
             if group.endswith("-pmc"):
-                project = group[0:-4] # drop the "-pmc" suffix
-#                 print("PMC %s %s => %s" % (cid, group, project))
-                if not project in pmcs: # a new project
-                    print("New pmc group %s" % project)
-                    pmcs[project] = {}
-                    newgroups.append(group)
-                if not cid in pmcs[project]: # new to the group
-                    if group in newgroups: # the group is also new
-                        now = 0
-                    print("New pmc entry %s %s %s %u" % (project, cid, cname, now))
-                    pmcs[project][cid] = [cname, now, stamp]
-                else:
-                    # update the entry last seen time (and the public name, which may have changed)
-                    pmcs[project][cid] = [cname, pmcs[project][cid][1], stamp]
+                updatePmcs(stamp, group, cid, cname)
             else:
-                project = group
-#                 print("Unx %s %s" % (cid, project))
-                now = stamp
-                if not project in projects:
-                    print("New unx group %s" % project)
-                    projects[project] = {}
-                    newgroups.append(group)
-                if not cid in projects[project]: # new to the group
-                    if group in newgroups: # the group is also new
-                        now = 0
-                    print("New unx entry %s %s %s %u" % (project, cid, cname, now))
-                    projects[project][cid] = [cname, now, stamp]
-                else:
-                    # update the entry last seen time (and the public name, which may have changed)
-                    projects[project][cid] = [cname, projects[project][cid][1], stamp]
+                updateProjects(stamp, group, cid, cname)
 
 
 # Delete retired members