You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@whimsical.apache.org by cu...@apache.org on 2018/02/06 00:26:09 UTC
[whimsy] branch master updated: Improve documentation; deprecate old functions; reorganize

This is an automated email from the ASF dual-hosted git repository.

curcuru pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/whimsy.git


The following commit(s) were added to refs/heads/master by this push:
     new f5c484e  Improve documentation; deprecate old functions; reorganize
f5c484e is described below

commit f5c484ece78361fd2baabfb803ef4c94a8f00e6c
Author: Shane Curcuru <as...@shanecurcuru.org>
AuthorDate: Mon Feb 5 19:26:01 2018 -0500

    Improve documentation; deprecate old functions; reorganize
---
 tools/mboxhdr2csv.rb | 357 ++++++++++++++++++++++++---------------------------
 1 file changed, 170 insertions(+), 187 deletions(-)

diff --git a/tools/mboxhdr2csv.rb b/tools/mboxhdr2csv.rb
index dff5aed..c7ca8c9 100644
--- a/tools/mboxhdr2csv.rb
+++ b/tools/mboxhdr2csv.rb
@@ -1,5 +1,12 @@
 #!/usr/bin/env ruby
-# Analyze mbox files either by headers or by content lines
+# Analyze mbox files for general statistics into CSV
+# - Per list messages per month over time (PMOT)
+# - Count messages group by list -> graph months as time
+# - Per list content lines PMOT
+# - Per user statistics
+# Count lines of text content in mail body, roughly attempting to 
+#   count just new content (not automated, not > replies)
+
 $LOAD_PATH.unshift File.realpath(File.expand_path('../../lib', __FILE__))
 require 'whimsy/asf'
 require 'mail'
@@ -11,18 +18,40 @@ require 'date'
 MBOX_EXT = '.mbox'
 MEMBER = 'member'
 COMMITTER = 'committer'
+COUNSEL = 'counsel'
 
-# Analyzing mbox files for interesting statistics to report:
-# Contentlines are only counted when ! has_key? nondiscuss
-#   Rationale: svn, JIRA, automated messages are primarly tool-created
-# Per list messages per month over time (PMOT)
-# Count messages group by list -> graph months as time
-# Per list contentlines per lists PMOT
-# User messages per lists PMOT
-# User contentlines per lists PMOT
-# User msgs/lines by day of week; hour of day
+# Per-list subject regexes that mark non-discussion (mostly tool-generated) mail.
+# Keys are list ids like '<board.apache.org>'; values map a type symbol to a regex.
+#   A matching subject means the content lines of that message are not worth analyzing.
+NONDISCUSSION_SUBJECTS = { # Note: none applicable to members@
+  '<board.apache.org>' => {
+    missing: /\AMissing\s\S+\sBoard/,
+    feedback: /\ABoard\sfeedback\son\s20/,
+    notice: /\A\[NOTICE\]/i,
+    report: /\A\[REPORT\]/i,
+    resolution: /\A\[RESOLUTION\]/i,
+    svn_agenda: %r{\Aboard: r\d{4,8} - /foundation/board/},
+    svn_iclas: %r{\Aboard: r\d{4,8} - /foundation/officers/iclas\.txt}
+  },
+  '<operations.apache.org>' => {
+    notice: /\A\[NOTICE\]/i,
+    report: /\A\[REPORT\]/i,
+    svn_general: %r{\Asvn commit: r/}, # NOTE(review): literal 'r/' looks odd; r\d+ intended? confirm
+    svn_bills: %r{\Abills: r\d{4,8} -}
+  },
+  '<trademarks.apache.org>' => {
+    report: /\A\[REPORT\]/i,
+    svn_general: %r{\Asvn commit: r/} # NOTE(review): literal 'r/' looks odd; r\d+ intended? confirm
+  },
+  '<fundraising.apache.org>' => {
+    report: /\A\[REPORT\]/i,
+    svn_bills: %r{\Abills: r\d{4,8} -}
+  }
+}.freeze
 
-# Read f.mbox or f.mbox.gz and return [message, message2, ...] or raise error
+# Read a ponyapi.rb mbox file and return mails (text content only)
+# @param f path to .mbox or .mbox.gz
+# @return [mail1, mail2, ...]
 def read_mbox(f)
   if f.end_with? '.gz'
     stream = StringIO.new(mbox)
@@ -39,8 +68,12 @@ def read_mbox(f)
   return messages
 end
 
-# Split mbox into [MailHash, Mail2Hash,...], [ [parseerr, order], ...]
-# Returns nil, [read, errors2...] if mbox file can't be read
+# Process an mbox file into mailhash of selected headers and lines of text
+# @param f path to .mbox or .mbox.gz
+# @return [mail1hash, mail2hash, ...], [ [parseerr, order], ...]
+# @return nil, [read, errors2...] if mbox file can't be read
+# mailhash contains :from, :subject, :listid, :date, :messageid, 
+#   :inreplyto, :lines (count), plus :who and :committer
 def mbox2stats(f)
   begin
     mails = read_mbox(f)
@@ -95,7 +128,30 @@ def mbox2stats(f)
         end
       end
       mdata[:lines] = ctr
+      # Annotate other precomputable data: sender (:who, :committer), then date parts
       find_who_from mdata
+      begin
+        d = DateTime.parse(mdata[:date])
+        mdata[:y] = d.year
+        mdata[:m] = d.month
+        mdata[:d] = d.day
+        mdata[:w] = d.wday
+        mdata[:h] = d.hour
+        mdata[:z] = d.zone
+      rescue => date_err
+        # Not critical: log and keep the message, just without the date-part fields
+        puts "DEBUG: #{date_err.message} parsing: #{mdata[:date]}"
+      end
+      regex = NONDISCUSSION_SUBJECTS[mdata[:listid]] # Use subject regex for this list (if any)
+      if regex
+        regex.each do |typ, rx|
+          if mdata[:subject] =~ rx
+            mdata[:nondiscuss] = typ
+            break # regex.each
+          end
+        end
+      end
+      # Push our hash 
       messages << mdata
     rescue => e
       errs << [e, mdata[:order]]
@@ -104,165 +160,139 @@ def mbox2stats(f)
   return messages, errs
 end
 
-# Scan dir of mbox and output json of statistics for each; return meta-array of all mdata
-def scan_dir_mbox2stats(dir, ext)
-  Dir["#{dir}/**/*#{ext}".untaint].each do |f|
-    mails, errs = mbox2stats(f.untaint)
-    puts "scan_mbox(#{f}) mails: #{mails.length} errors: #{errs.length}"
-    File.open("#{f.chomp(ext)}.json", "w") do |fout|
-      fout.puts JSON.pretty_generate([mails, errs])
-    end
-  end
-end
-
-# Add :who field and Apache committer status
-def find_who_from(msg)
+# Annotate a mailhash in place with :who (sender name) and :committer (ASF status)
+# @param mdata Hash with a :from value to evaluate; mutated in place
+# Frequent senders are matched by regex on :from; everyone else via ASF::Person.find_by_email
+# :committer = MEMBER, COMMITTER or COUNSEL; 'n' if no person found; 'N' if the lookup raised
+def find_who_from(mdata)
   # Micro-optimize unique names
-  case msg[:from]
+  case mdata[:from]
+  when /Mark.Radcliffe/i
+    mdata[:who] = 'Mark.Radcliffe'
+    mdata[:committer] = COUNSEL
   when /mattmann/i
-    msg[:who] = 'Chris Mattmann'
-    msg[:committer] = MEMBER
+    mdata[:who] = 'Chris Mattmann'
+    mdata[:committer] = MEMBER
   when /jagielski/i
-    msg[:who] = 'Jim Jagielski'
-    msg[:committer] = MEMBER
+    mdata[:who] = 'Jim Jagielski'
+    mdata[:committer] = MEMBER
   when /delacretaz/i
-    msg[:who] = 'Bertrand Delacretaz'
-    msg[:committer] = MEMBER
+    mdata[:who] = 'Bertrand Delacretaz'
+    mdata[:committer] = MEMBER
   when /curcuru/i
-    msg[:who] = 'Shane Curcuru'
-    msg[:committer] = MEMBER
+    mdata[:who] = 'Shane Curcuru'
+    mdata[:committer] = MEMBER
   when /steitz/i
-    msg[:who] = 'Phil Steitz'
-    msg[:committer] = MEMBER
+    mdata[:who] = 'Phil Steitz'
+    mdata[:committer] = MEMBER
   when /gardler/i  # Effectively unique (see: Heidi)
-    msg[:who] = 'Ross Gardler'
-    msg[:committer] = MEMBER
+    mdata[:who] = 'Ross Gardler'
+    mdata[:committer] = MEMBER
   when /Craig (L )?Russell/i # Optimize since Secretary sends a lot of mail
-    msg[:who] = 'Craig L Russell'
-    msg[:committer] = MEMBER
+    mdata[:who] = 'Craig L Russell'
+    mdata[:committer] = MEMBER
   when /McGrail/i
-    msg[:who] = 'Kevin A. McGrail'
-    msg[:committer] = MEMBER
+    mdata[:who] = 'Kevin A. McGrail'
+    mdata[:committer] = MEMBER
   when /sallykhudairi@yahoo/i 
-    msg[:who] = 'Sally Khudairi'
-    msg[:committer] = MEMBER
+    mdata[:who] = 'Sally Khudairi'
+    mdata[:committer] = MEMBER
   when /sk@haloworldwide.com/i
-    msg[:who] = 'Sally Khudairi'
-    msg[:committer] = MEMBER
+    mdata[:who] = 'Sally Khudairi'
+    mdata[:committer] = MEMBER
   else
     begin
       # TODO use Real Name (JIRA) to attempt to lookup some notifications
-      tmp = liberal_email_parser(msg[:from])
+      tmp = liberal_email_parser(mdata[:from])
       person = ASF::Person.find_by_email(tmp.address.dup)
       if person
-        msg[:who] = person.cn
+        mdata[:who] = person.cn
         if person.asf_member?
-          msg[:committer] = MEMBER
+          mdata[:committer] = MEMBER
         else
-          msg[:committer] = COMMITTER
+          mdata[:committer] = COMMITTER
         end
       else
-        msg[:who] = "#{tmp.display_name} <#{tmp.address}>"
-        msg[:committer] = 'n'
+        mdata[:who] = "#{tmp.display_name} <#{tmp.address}>"
+        mdata[:committer] = 'n'
       end
     rescue
-      msg[:who] = msg[:from]
-      msg[:committer] = 'N'
+      mdata[:who] = mdata[:from]
+      mdata[:committer] = 'N'
     end
   end
 end
 
-# Subject regexes that are non-discussion oriented
-# Analysis: don't bother with content lines in these messages, 
-#   because most of the content is tool-generated
-NONDISCUSSION_SUBJECTS = { # Note: none applicable to members@
-  '<board.apache.org>' => {
-    missing: /\AMissing\s\S+\sBoard/,
-    feedback: /\ABoard\sfeedback\son\s20/,
-    notice: /\A\[NOTICE\]/i,
-    report: /\A\[REPORT\]/i,
-    resolution: /\A\[RESOLUTION\]/i,
-    svn_agenda: %r{\Aboard: r\d{4,8} - /foundation/board/},
-    svn_iclas: %r{\Aboard: r\d{4,8} - /foundation/officers/iclas.txt}
-  },
-  '<operations.apache.org>' => {
-    notice: /\A\[NOTICE\]/i,
-    report: /\A\[REPORT\]/i,
-    svn_general: %r{\Asvn commit: r/},
-    svn_bills: %r{\Abills: r\d{4,8} -}
-  },
-  '<trademarks.apache.org>' => {
-    report: /\A\[REPORT\]/i,
-    svn_general: %r{\Asvn commit: r/}
-  },
-  '<fundraising.apache.org>' => {
-    report: /\A\[REPORT\]/i,
-    svn_bills: %r{\Abills: r\d{4,8} -}
-  }
-}
-
-# Annotate mbox stats hash w/nondiscussion marker (hint: don't count content lines)
-def mark_nondiscussion(mails)
-  ctr = 0
-  mails.each do |msg|
-    regex = NONDISCUSSION_SUBJECTS[msg['listid']] # Use subject regex for this list (if any)
-    if regex
-      regex.each do |typ, rx|
-        if msg['subject'] =~ rx
-          msg[:nondiscuss] = typ
-          ctr += 1
-          break
-        end
-      end
+# Parse addr into a Mail::Address; if Mail::Address.new raises, fall back to '"Name" <addr>' / 'Name <addr>' regexes, re-raising when neither matches (@see www/secretary/workbench/models/message.rb)
+# @see https://github.com/mikel/mail/issues/39
+def liberal_email_parser(addr)
+  begin
+    addr = Mail::Address.new(addr)
+  rescue
+    if addr =~ /^"([^"]*)" <(.*)>$/
+      addr = Mail::Address.new
+      addr.address = $2
+      addr.display_name = $1
+    elsif addr =~ /^([^"]*) <(.*)>$/
+      addr = Mail::Address.new
+      addr.address = $2
+      addr.display_name = $1
+    else
+      raise
-      end
     end
+  return addr
 end
 
-# Annotate mbox stats hash with various precomputed data
-def annotate_stats(mails)
-  mails.each do |msg|
-    # Translate date into y, m, d, w (day of week), h, z (timezone), (no minutes)
-    begin
-      d = DateTime.parse(msg['date'])
-      msg[:y] = d.year
-      msg[:m] = d.month
-      msg[:d] = d.day
-      msg[:w] = d.wday
-      msg[:h] = d.hour
-      msg[:z] = d.zone
-    rescue => e
-      # no-op
-      puts "DEBUG: #{e.message} parsing: #{msg['date']}"
+# Scan a dir tree for mboxes and write one JSON of [mails, errs] (from mbox2stats) per mbox
+# @param dir to scan (whole tree, globbed as dir/**/*ext)
+# @param ext file extension to glob for (defaults to MBOX_EXT)
+# Side effect: writes out f.chomp(ext).json files, overwriting any existing file of that name
+def scan_dir_mbox2stats(dir, ext = MBOX_EXT)
+  Dir["#{dir}/**/*#{ext}".untaint].each do |f|
+    mails, errs = mbox2stats(f.untaint)
+    File.open("#{f.chomp(ext)}.json", "w") do |fout|
+      fout.puts JSON.pretty_generate([mails, errs])
     end
   end
 end
 
-# Scan dir of mbox .json stats and annotate with nondiscussion markers
-# TODO: this should really be done on first parse pass, not later on
-# @return array of any errors
-def scan_dir_json_nondiscussion(dir)
+# Merge every mailhash JSON under dir (whole tree) into a single overview CSV, one row per mail
+# @return [ error1, error2, ...] message+backtrace strings for files after the first that failed; [] if none
+# Side effect: writes out dir/outname CSV file; raises ArgumentError if no .json files are found
+def scan_dir_stats2csv(dir, outname)
   errors = []
-  Dir["#{dir}/**/*.json".untaint].each do |f|
+  filenames = Dir["#{dir}/**/*.json".untaint]
+  raise ArgumentError, "#{__method__} called with no files in #{dir}" if filenames.length == 0
+  puts "#{__method__} processing #{filenames.length} files"
+  firstfile = filenames.shift
+  jzon = JSON.parse(File.read(firstfile))
+  # Write out headers and the first file in new csv (outside the rescue below, so a bad first file raises)
+  csvfile = File.join("#{dir}", outname)
+  csv = CSV.open(csvfile, "w", headers: %w( year month day weekday hour zone listid who subject lines committer messageid inreplyto ), write_headers: true)
+  jzon[0].each do |m|
+    csv << [ m['y'], m['m'], m['d'], m['w'], m['h'], m['z'], m['listid'], m['who'], m['subject'], m['lines'], m['committer'], m['messageid'], m['inreplyto']  ]
+  end
+  
+  # Write out all remaining files, without headers, appending; failures are collected, not raised
+  filenames.each do |f|
     begin
-      jzon = JSON.parse(File.read(f))
-      msgs = jzon[0]  # Should be an array of [[msgs...], [errs...]}
-      # Run both annotations
-      mark_nondiscussion msgs
-      annotate_stats msgs
-      # Now re-write the same file with this included data
-      File.open("#{f}", "w") do |fout|
-        fout.puts JSON.pretty_generate(jzon)
+      j = JSON.parse(File.read(f))
+      j[0].each do |m|
+        csv << [ m['y'], m['m'], m['d'], m['w'], m['h'], m['z'], m['listid'], m['who'], m['subject'], m['lines'], m['committer'], m['messageid'], m['inreplyto']  ]
       end
     rescue => e
-      puts "ERROR:scan_dir_json_nondiscussion(#{f}) raised #{e.message[0..255]}"
+      puts "ERROR: parse/write of #{f} raised #{e.message[0..255]}"
       errors << "#{e.message}\n\t#{e.backtrace.join("\n\t")}"
       next
     end
   end
+  csv.close # NOTE(review): not reached if writing the first file raises; block form of CSV.open would guarantee the close
   return errors
 end
 
 # Aggregate selected header fields from an mbox
+# @deprecated TODO use mbox2stats et al instead
 def scan_mbox_headers(f, headers)
   begin
     messages = read_mbox(f)
@@ -304,6 +334,7 @@ def scan_mbox_headers(f, headers)
 end
 
 # Return headers for a directory of mboxes
+# @deprecated TODO use mbox2stats et al instead
 def scan_dir_headers(dir, ext)
   headers = []
   errs = []
@@ -314,28 +345,9 @@ def scan_dir_headers(dir, ext)
   return headers
 end
 
-# Copied from www/secretary/workbench/models/message.rb
-# see https://github.com/mikel/mail/issues/39
-def liberal_email_parser(addr)
-  begin
-    addr = Mail::Address.new(addr)
-  rescue
-    if addr =~ /^"([^"]*)" <(.*)>$/
-      addr = Mail::Address.new
-      addr.address = $2
-      addr.display_name = $1
-    elsif addr =~ /^([^"]*) <(.*)>$/
-      addr = Mail::Address.new
-      addr.address = $2
-      addr.display_name = $1
-    else
-      raise
-    end
-  end
-  return addr
-end
-
 # Simple header annotations for best guesses of ASF attributes related to trademarks@
+# @deprecated TODO use mbox2stats et al instead
+# TODO Only additional feature is setting header[:type], which is list-specific
 SHANE = 'Shane'
 def annotate_headers(headers)
   headers.each do |header|
@@ -387,6 +399,7 @@ def annotate_headers(headers)
 end
 
 # Common use case - analyze headers in mbox files to see who asks questions on trademarks@
+# @deprecated TODO use mbox2stats et al instead
 def do_mbox2csv_hdr(dir)
   headers = scan_dir_headers(dir, MBOX_EXT)
   CSV.open(File.join("#{dir}", "mboxhdr2csv.csv"), "w", headers: %w( date who subject messageid committer question ), write_headers: true) do |csv|
@@ -396,45 +409,15 @@ def do_mbox2csv_hdr(dir)
   end
 end
 
-# Combine all jsons of mbox stats into single csv, step by step
-# @return [ error1, error2, ...]
-def scan_stats_to_csv(dir, outname)
-  errors = []
-  filenames = Dir["#{dir}/**/*.json".untaint]
-  puts "scan_stats_to_csv() processing #{filenames.length} files"
-  firstfile = filenames.shift
-  jzon = JSON.parse(File.read(firstfile))
-  # Write out headers and the first file in new csv
-  csvfile = File.join("#{dir}", outname)
-  csv = CSV.open(csvfile, "w", headers: %w( year month day weekday hour zone listid who subject lines committer messageid inreplyto ), write_headers: true)
-  jzon[0].each do |m|
-    csv << [ m['y'], m['m'], m['d'], m['w'], m['h'], m['z'], m['listid'], m['who'], m['subject'], m['lines'], m['committer'], m['messageid'], m['inreplyto']  ]
+#### TODO Sample code: convert each mbox under path to JSON stats, then merge the JSONs into one CSV
+path = File.expand_path('~/src/lists') # Dir[] and File.join do not expand '~' themselves
+output = 'listdata.csv'
+puts "START: #{path} into #{output}"
+scan_dir_mbox2stats(path)
+errs = scan_dir_stats2csv(path, output)
+if errs
+  errs.each do |e|
+    puts "ERROR: #{e}"
   end
-  
-  # Write out all remaining files, without headers, appending
-  filenames.each do |f|
-    begin
-      j = JSON.parse(File.read(f))
-      j[0].each do |m|
-        csv << [ m['y'], m['m'], m['d'], m['w'], m['h'], m['z'], m['listid'], m['who'], m['subject'], m['lines'], m['committer'], m['messageid'], m['inreplyto']  ]
-      end
-    rescue => e
-      puts "ERROR:parse/write of #{f} raised #{e.message[0..255]}"
-      errors << "#{e.message}\n\t#{e.backtrace.join("\n\t")}"
-      next
-    end
-  end
-  # TODO ensure files closed?
-  return errors
-end
-
-# Common use case - analyze mbox files to see how much everyone writes
-puts "DEBUG-TESTcsv"
-# scan_dir_mbox2stats('/Users/curcuru/src/mail/', MBOX_EXT)
-# e = scan_dir_json_nondiscussion('/Users/curcuru/src/mail3/')
-# e = fixup_sally_mail('/Users/curcuru/src/mail/')
-e = scan_stats_to_csv('/Users/curcuru/src/mail/', 'governance_mboxes2010-2017.csv')
-e.each do |x|
-  p x
 end
-puts "DEBUG-END11 e.length #{e.length}"
+puts "END"
\ No newline at end of file

-- 
To stop receiving notification emails like this one, please contact
curcuru@apache.org.