You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@community.apache.org by se...@apache.org on 2015/11/01 20:07:08 UTC
svn commit: r1711841 - /comdev/reporter.apache.org/trunk/mailglomper2.py
Author: sebb
Date: Sun Nov 1 19:07:08 2015
New Revision: 1711841
URL: http://svn.apache.org/viewvc?rev=1711841&view=rev
Log:
Testing new version of mailglomper
TODO - fix output file name if run is successful
Added:
comdev/reporter.apache.org/trunk/mailglomper2.py (with props)
Added: comdev/reporter.apache.org/trunk/mailglomper2.py
URL: http://svn.apache.org/viewvc/comdev/reporter.apache.org/trunk/mailglomper2.py?rev=1711841&view=auto
==============================================================================
--- comdev/reporter.apache.org/trunk/mailglomper2.py (added)
+++ comdev/reporter.apache.org/trunk/mailglomper2.py Sun Nov 1 19:07:08 2015
@@ -0,0 +1,193 @@
+"""
+ Reads public mailing list data from
+ http://mail-archives.us.apache.org/mod_mbox/
+ - listing of mailboxes
+ and from each:
+ http://mail-archives.us.apache.org/mod_mbox/<list>/yyyymm.mbox
+ - messages per week and per last two rolling quarters (92 days)
+
+ Updates:
+ data/maildata_extended.json
+"""
+import sys
+if sys.hexversion < 0x03000000:
+ raise ImportError("This script requires Python 3")
+import re, json, os, time, email.utils
+from datetime import datetime
+import urlutils
+import traceback
+import data.errtee
+
+SECS_PER_DAY = 86400
+SECS_PER_WEEK = 604800
+
def tsprint(s):
    """Print *s* prefixed with a local timestamp.

    If *s* is an Exception instance the message goes to stderr and the
    currently-handled exception's traceback is printed as well, so the
    Exception form must be called from inside an ``except`` block.
    Ordinary values are printed to stdout.
    """
    msg = "%s %s" % (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), s)
    if isinstance(s, Exception):
        print(msg, file=sys.stderr)
        # Equivalent to the sys.exc_info() unpack + print_exception dance,
        # without shadowing the builtin 'type'.
        traceback.print_exc()
    else:
        print(msg)
+
tsprint("Start")

__MAILDATA_EXTENDED = "data/maildata_extended2.json" # TODO change to normal name

__MAILDATA_CACHE = "data/cache/maildata_weekly.json"

# Load the previous run's output so a partial run still has older data;
# start from an empty dict if the file is missing or unparseable.
# (Deliberately best-effort, but no longer a bare except: a bare clause
# would also swallow KeyboardInterrupt/SystemExit.)
try:
    with open(__MAILDATA_EXTENDED, 'r') as f:
        mls = json.load(f)
    tsprint("Read JSON successfully")
except (OSError, ValueError):  # missing file or corrupt JSON
    mls = {}

# Load the per-list/per-month message-count cache from the previous run.
try:
    with open(__MAILDATA_CACHE, 'r') as f:
        mldcacheold = json.load(f)
    tsprint("Read maildata cache successfully")
except (OSError, ValueError):  # missing file or corrupt JSON
    tsprint("Created empty maildata cache")
    mldcacheold = {}

DTNOW = datetime.now()
currentMonth = DTNOW.month
currentYear = DTNOW.year

NOW = time.time()
after = NOW - (SECS_PER_DAY * 92)       # start of the most recent rolling quarter
wayafter = NOW - (SECS_PER_DAY * 92 * 2)  # start of the quarter before that

# Build the list of months to scan: the current month plus the six
# preceding ones, as yyyymm strings, newest first.
months = []
for i in range(7):
    months.append("%04u%02u" % (currentYear, currentMonth))
    currentMonth -= 1
    if currentMonth == 0:
        currentMonth = 12
        currentYear -= 1


fc = urlutils.UrlCache(interval=30)

# Get the index of mailing lists
# Not strictly necessary to cache this, but it makes testing easier
data = fc.get("http://mail-archives.us.apache.org/mod_mbox/", "mod_mbox.html", encoding='utf-8').read()
tsprint("Fetched %u bytes of main data" % len(data))
y = 0  # counts lists processed since the last checkpoint write

"""
N.B. The project name empire-db is truncated to empire in the main list

Rather than fixing this here, it is done in the scripts that read the output file
This is because those scripts assume that the first hyphen separates the
project name from the mailing list name.
Since list names may contain hyphens (e.g. lucene-net-dev) that is a necessary assumption.

Potentially the generated file could use a separator that is not allowed in project names,
but this would require converting the input file and potentially allowing both separators in
the files that process the output for a short while.
"""

# These are the entries we actually used, so we write this copy
# This ensures that entries are dropped when no longer needed
mldcachenew = {}
+
def weekly_stats(ml, date):
    """
    Read the weekly stats from an mbox file, caching the counts.

    ml   -- mailing list id (e.g. 'tomcat-users')
    date -- month to read, as a yyyymm string
    Returns (total message count for the month,
             {week-end unix timestamp: message count}).
    Reads the module-level cache mldcacheold and records the entries it
    actually used (or rebuilt) in mldcachenew for later storage.
    """
    fname = "%s-%s" % (ml, date)
    stamp = None
    # Defaults cover the path where the mbox is neither cached nor fetchable;
    # previously ct/weekly were unbound there, raising NameError at return.
    ct = 0
    weekly = {}
    # NOTE: the committed version tested 'fname+"x"', which could never match
    # and so disabled the cache; that testing hack is removed here.
    if fname in mldcacheold:
        tsprint("Have json cache for: " + fname)
        entry = mldcacheold[fname]
        ct = entry['ct']
        stamp = entry['stamp']
        weekly = {}
        # JSON keys are always stored as strings; fix these up for main code
        for w in entry['weekly']:
            weekly[int(w)] = entry['weekly'][w]
        mldcachenew[fname] = entry # copy the entry for later storage
    else:
        tsprint("Not cached: " + fname)

    url = "http://mail-archives.us.apache.org/mod_mbox/%s/%s.mbox" % (ml, date)
    stamp, mldata = urlutils.getIfNewer(url, stamp) # read binary URL

    if mldata: # we have a new/updated file to process
        tsprint("Processing new/updated version of %s" % fname)
        ct = 0
        weekly = {}
        for line in mldata:
            # mbox message separator: "From <addr> <date>"; match as binary
            c = re.match(rb"^From \S+ (.+)", line)
            if c:
                ct += 1
                try:
                    d = email.utils.parsedate(c.group(1).decode('latin1')) # convert match to string
                    timestamp = int(time.mktime(d))
                    # Round up to the end of the week containing the message
                    rounded = timestamp - (timestamp % SECS_PER_WEEK) + SECS_PER_WEEK
                    weekly[rounded] = weekly.get(rounded, 0) + 1
                except Exception as err:
                    tsprint(err)  # log and keep counting; one bad date shouldn't abort the month
        # create the cache entry
        mldcachenew[fname] = {'ct': ct, 'weekly': weekly, 'stamp': stamp}
    else:
        tsprint("Returning cache for: " + fname)
    # return the new or cached values
    return ct, weekly
+
def add_weeks(total, add):
    """Merge the per-week counts in *add* into *total*, in place."""
    for week, count in add.items():
        total[week] = total.get(week, 0) + count
+
tsprint("Started")

# Iterate over every mailing-list directory linked from the main index page.
# The href pattern restricts matches to lowercase/digit/hyphen list names.
for mlist in re.finditer(r"<a href='([-a-z0-9]+)/'", data):
    ml = mlist.group(1)
    tsprint("Processing: " + ml)
    start = time.time()
    y += 1
    # Reset this list's stats; they are rebuilt from the cache plus any
    # freshly-fetched months, so stale entries from the loaded JSON are dropped.
    mls[ml] = {}
    mls[ml]['quarterly'] = [0, 0];
    mls[ml]['weekly'] = {}

    mlct = 0
    for date in months:
        try:
            ct, weeks = weekly_stats(ml, date)
            add_weeks(mls[ml]['weekly'], weeks)
            # Bucket each week into the two rolling 92-day quarters:
            # [0] = most recent quarter, [1] = the quarter before it.
            for week in weeks:
                if week >= after:
                    mls[ml]['quarterly'][0] += weeks[week]
                elif week >= wayafter:
                    mls[ml]['quarterly'][1] += weeks[week]
            tsprint("Debug: %s %s: has %u mails" % (ml, date, ct)) # total for month
            mlct += ct
        except Exception as err:
            # Log and continue: one bad month must not abort the whole list.
            tsprint(err)

    tsprint("Info: %s has %u mails (%u secs)" % (ml, mlct, time.time() - start)) # total for mail group
    if y == 50: # write data as we go to avoid losing it
        y = 0
        tsprint("Creating checkpoint of JSON files")
        # Checkpoint writes skip sort_keys (cheaper); only the final dump sorts.
        with open(__MAILDATA_EXTENDED,'w+') as f:
            json.dump(mls, f, indent=1) # sort_keys is expensive
        with open(__MAILDATA_CACHE,"w") as f:
            json.dump(mldcachenew, f, indent=1) # sort_keys is expensive

tsprint("Completed scanning, writing JSON files")
# Final dump of both files, sorted for stable diffs between runs.
with open(__MAILDATA_EXTENDED,'w+') as f:
    json.dump(mls, f, indent=1, sort_keys=True)
with open(__MAILDATA_CACHE,"w") as f:
    json.dump(mldcachenew, f, indent=1, sort_keys=True)
tsprint("Dumped JSON files")
Propchange: comdev/reporter.apache.org/trunk/mailglomper2.py
------------------------------------------------------------------------------
svn:eol-style = native