You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@whimsical.apache.org by cu...@apache.org on 2019/06/20 21:55:13 UTC
[whimsy] branch master updated: Namespaces for MailUtils

This is an automated email from the ASF dual-hosted git repository.

curcuru pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/whimsy.git


The following commit(s) were added to refs/heads/master by this push:
     new 2265343  Namespaces for MailUtils
2265343 is described below

commit 22653438068b118d07f86803c6902bfc6a416a77
Author: Shane Curcuru <as...@shanecurcuru.org>
AuthorDate: Thu Jun 20 17:53:59 2019 -0400

    Namespaces for MailUtils
---
 tools/mboxhdr2csv.rb          | 544 +++++++++++++++++++++---------------------
 www/officers/list-traffic.cgi | 112 +--------
 2 files changed, 281 insertions(+), 375 deletions(-)

diff --git a/tools/mboxhdr2csv.rb b/tools/mboxhdr2csv.rb
index 9bc0e59..0987d37 100644
--- a/tools/mboxhdr2csv.rb
+++ b/tools/mboxhdr2csv.rb
@@ -1,5 +1,5 @@
 #!/usr/bin/env ruby
-# Analyze mbox files for general statistics into CSV
+# Analyze mbox files (downloaded by PonyAPI) for general statistics into CSV
 # - Per list messages per month over time (PMOT)
 # - Count messages group by list -> graph months as time
 # - Per list contentlines per lists PMOT
@@ -18,299 +18,307 @@ require 'json'
 require 'date'
 require 'optparse'
 
-MBOX_EXT = '.mbox'
-MEMBER = 'member'
-COMMITTER = 'committer'
-COUNSEL = 'counsel'
-INVALID = '.INVALID'
-VERSION = 'mboxhdr2json'
-URIRX = URI.regexp(['http', 'https'])
+# Various utility functions/data for mailing list analysis
+module MailUtils
+  extend self
+  MEMBER = 'member'
+  COMMITTER = 'committer'
+  COUNSEL = 'counsel'
+  INVALID = '.INVALID'
 
-# Subject regexes that are non-discussion oriented
-# Analysis: don't bother with content lines in these messages, 
-#   because most of the content is tool-generated
-NONDISCUSSION_SUBJECTS = { # Note: none applicable to members@
-  '<board.apache.org>' => {
-    missing: /\AMissing\s((\S+\s){1,3})Board/, # whimsy/www/board/agenda/views/buttons/email.js.rb
-    feedback: /\ABoard\sfeedback\son\s20/, # whimsy/www/board/agenda/views/actions/feedback.json.rb
-    notice: /\A\[NOTICE\]/i,
-    report: /\A\[REPORT\]/i,
-    resolution: /\A\[RESOLUTION\]/i,
-    svn_agenda: %r{\Aboard: r\d{4,8} - /foundation/board/},
-    svn_iclas: %r{\Aboard: r\d{4,8} - /foundation/officers/iclas.txt}
-  },
-  '<operations.apache.org>' => {
-    notice: /\A\[NOTICE\]/i,
-    report: /\A\[REPORT\]/i,
-    svn_general: %r{\Asvn commit: r/},
-    svn_bills: %r{\Abills: r\d{4,8} -}
-  },
-  '<trademarks.apache.org>' => {
-    report: /\A\[REPORT\]/i,
-    svn_general: %r{\Asvn commit: r/}
-  },
-  '<fundraising.apache.org>' => {
-    report: /\A\[REPORT\]/i,
-    svn_bills: %r{\Abills: r\d{4,8} -}
+  # Subject regexes that are non-discussion oriented
+  # Analysis: don't bother with content lines in these messages, 
+  #   because most of the content is tool-generated
+  NONDISCUSSION_SUBJECTS = { # Note: none applicable to members@
+    '<board.apache.org>' => {
+      missing: /\AMissing\s((\S+\s){1,3})Board/, # whimsy/www/board/agenda/views/buttons/email.js.rb
+      feedback: /\ABoard\sfeedback\son\s20/, # whimsy/www/board/agenda/views/actions/feedback.json.rb
+      notice: /\A\[NOTICE\]/i,
+      report: /\A\[REPORT\]/i,
+      resolution: /\A\[RESOLUTION\]/i,
+      svn_agenda: %r{\Aboard: r\d{4,8} - /foundation/board/},
+      svn_iclas: %r{\Aboard: r\d{4,8} - /foundation/officers/iclas.txt}
+    },
+    '<operations.apache.org>' => {
+      notice: /\A\[NOTICE\]/i,
+      report: /\A\[REPORT\]/i,
+      svn_general: %r{\Asvn commit: r/},
+      svn_bills: %r{\Abills: r\d{4,8} -}
+    },
+    '<trademarks.apache.org>' => {
+      report: /\A\[REPORT\]/i,
+      svn_general: %r{\Asvn commit: r/}
+    },
+    '<fundraising.apache.org>' => {
+      report: /\A\[REPORT\]/i,
+      svn_bills: %r{\Abills: r\d{4,8} -}
+    }
   }
-}
 
-# Read a ponyapi.rb mbox file and return mails (text content only)
-# @param f path to .mbox or .mbox.gz
-# @return [mail1, mail2, ...]
-def read_mbox(f)
-  if f.end_with? '.gz'
-    stream = StringIO.new(mbox)
-    reader = Zlib::GzipReader.new(stream)
-    mbox = reader.read
-    reader.close
-    stream.close rescue nil
-  else
-    mbox = File.read(f)
-  end
-  mbox.force_encoding Encoding::ASCII_8BIT
-  messages = mbox.split(/^From .*/)
-  messages.shift # Drop first item (not a message)
-  return messages
-end
-
-# Process an mbox file into mailhash of selected headers and lines of text
-# @param f path to .mbox or .mbox.gz
-# @return [mail1hash, mail2hash, ...], [ [parseerr, order], ...]
-# @return nil, [read, errors2...] if mbox file can't be read
-# mailhash contains :from, :subject, :listid, :date, :messageid, 
-#   :inreplyto, :lines (count), plus :who and :committer
-def mbox2stats(f)
-  begin
-    mails = read_mbox(f)
-  rescue => e
-    return nil, e
-  end
-  errs = []
-  messages = []
-  order = 0
-  mails.each do |message|
-    mdata = {}
-    mail = nil
+  # @see www/secretary/workbench/models/message.rb
+  # @see https://github.com/mikel/mail/issues/39
+  def liberal_email_parser(addr)
     begin
-      # Preserve message order in case it's important
-      order += 1
-      # Enforce linefeeds; makes Mail happy; borks binary attachments (not used in this script)
-      mail = Mail.read_from_string(message.gsub(/\r?\n/, "\r\n"))
-      mdata[:order] = order
-      begin # HACK for cases where some values don't parse, try to get good enough values in rescue
-        mdata[:from] = mail[:from].value
-        mdata[:subject] = mail[:subject].value
-        mdata[:listid] = mail[:List_Id].value
-        mdata[:date] = mail.date.to_s
-      rescue => ee
-        mdata[:from] = mail[:from]
-        mdata[:subject] = mail[:subject]
-        mdata[:listid] = mail[:List_Id]
-        mdata[:date] = mail.date.to_s
-        mdata[:parseerr] = mail.errors
-      end
-      mdata[:messageid] = mail.message_id
-      mdata[:inreplyto] = mail.in_reply_to
-      if mail.multipart?
-        text_part = mail.text_part.decoded.split(/\r?\n/)
+      addr = Mail::Address.new(addr)
+    rescue
+      if addr =~ /^"([^"]*)" <(.*)>$/
+        addr = Mail::Address.new
+        addr.address = $2
+        addr.display_name = $1
+      elsif addr =~ /^([^"]*) <(.*)>$/
+        addr = Mail::Address.new
+        addr.address = $2
+        addr.display_name = $1
       else
-        text_part = mail.body.decoded.split(/\r?\n/)
-      end
-      ctr = 0 # Count text lines of nonblank, nonreply content
-      links = 0 # Count number of apparent hyperlinks
-      text_part.each do |l|
-        case l
-        when /\A\s*>/
-          # Don't count reply lines, even when indented
-        when /\A\s*\z/
-          # Don't count blank lines
-        when /\AOn.*wrote:\z/
-          # Don't count most common reply header
-        when /\A-----Original Message-----/
-          # Stop counting if it seems like a forwarded message
-          break
-          # TODO: figure out if we're in a .sig block, and stop counting
-        else
-          links += 1 if l =~ URIRX
-          ctr += 1
-        end
-      end
-      mdata[:lines] = ctr
-      mdata[:links] = links
-      # Annotate various other precomputable data
-      find_who_from mdata
-      begin
-        d = DateTime.parse(mdata[:date])
-        mdata[:y] = d.year
-        mdata[:m] = d.month
-        mdata[:d] = d.day
-        mdata[:w] = d.wday
-        mdata[:h] = d.hour
-        mdata[:z] = d.zone
-      rescue => noop
-        # no-op - not critical
-        puts "DEBUG: #{e.message} parsing: #{mdata[:date]}"
-      end
-      regex = NONDISCUSSION_SUBJECTS[mdata[:listid]] # Use subject regex for this list (if any)
-      if regex
-        regex.each do |typ, rx|
-          if mdata[:subject] =~ rx
-            mdata[:nondiscuss] = typ
-            break # regex.each
-          end
-        end
+        raise
       end
-      # Push our hash 
-      messages << mdata
-    rescue => e
-      errs << [e, mdata[:order]]
     end
+    return addr
   end
-  return messages, errs
-end
 
-# Annotate mailhash by adding :who and :committer (where known)
-# @param mdata Hash to evaluate and annotate
-# Side effect: adds :who and :committer from ASF::Person.find_by_email
-# :committer = 'n' if not found; 'N' if error, 'counsel' for special case
-def find_who_from(mdata)
-  # Remove bogus INVALID before doing lookups
-  from = mdata[:from].sub(INVALID, '')
-  # Micro-optimize unique names
-  case from
-  when /Mark.Radcliffe/i
-    mdata[:who] = 'Mark.Radcliffe'
-    mdata[:committer] = COUNSEL
-  when /mattmann/i
-    mdata[:who] = 'Chris Mattmann'
-    mdata[:committer] = MEMBER
-  when /jagielski/i
-    mdata[:who] = 'Jim Jagielski'
-    mdata[:committer] = MEMBER
-  when /delacretaz/i
-    mdata[:who] = 'Bertrand Delacretaz'
-    mdata[:committer] = MEMBER
-  when /curcuru/i
-    mdata[:who] = 'Shane Curcuru'
-    mdata[:committer] = MEMBER
-  when /steitz/i
-    mdata[:who] = 'Phil Steitz'
-    mdata[:committer] = MEMBER
-  when /gardler/i  # Effectively unique (see: Heidi)
-    mdata[:who] = 'Ross Gardler'
-    mdata[:committer] = MEMBER
-  when /Craig (L )?Russell/i # Optimize since Secretary sends a lot of mail
-    mdata[:who] = 'Craig L Russell'
-    mdata[:committer] = MEMBER
-  when /McGrail/i
-    mdata[:who] = 'Kevin A. McGrail'
-    mdata[:committer] = MEMBER
-  when /sallykhudairi@yahoo/i 
-    mdata[:who] = 'Sally Khudairi'
-    mdata[:committer] = MEMBER
-  when /sk@haloworldwide.com/i
-    mdata[:who] = 'Sally Khudairi'
-    mdata[:committer] = MEMBER
-  else
-    begin
-      # TODO use Real Name (JIRA) to attempt to lookup some notifications
-      tmp = liberal_email_parser(from)
-      person = ASF::Person.find_by_email(tmp.address.dup)
-      if person
-        mdata[:who] = person.cn
-        if person.asf_member?
-          mdata[:committer] = MEMBER
+  # Annotate mailhash by adding :who and :committer (where known)
+  # @param mdata Hash to evaluate and annotate
+  # Side effect: adds :who and :committer from ASF::Person.find_by_email
+  # :committer = 'n' if not found; 'N' if error, 'counsel' for special case
+  def find_who_from(mdata)
+    # Remove bogus INVALID before doing lookups
+    from = mdata[:from].sub(INVALID, '')
+    # Micro-optimize unique names
+    case from
+    when /Mark.Radcliffe/i
+      mdata[:who] = 'Mark.Radcliffe'
+      mdata[:committer] = COUNSEL
+    when /mattmann/i
+      mdata[:who] = 'Chris Mattmann'
+      mdata[:committer] = MEMBER
+    when /jagielski/i
+      mdata[:who] = 'Jim Jagielski'
+      mdata[:committer] = MEMBER
+    when /delacretaz/i
+      mdata[:who] = 'Bertrand Delacretaz'
+      mdata[:committer] = MEMBER
+    when /curcuru/i
+      mdata[:who] = 'Shane Curcuru'
+      mdata[:committer] = MEMBER
+    when /steitz/i
+      mdata[:who] = 'Phil Steitz'
+      mdata[:committer] = MEMBER
+    when /gardler/i  # Effectively unique (see: Heidi)
+      mdata[:who] = 'Ross Gardler'
+      mdata[:committer] = MEMBER
+    when /Craig (L )?Russell/i # Optimize since Secretary sends a lot of mail
+      mdata[:who] = 'Craig L Russell'
+      mdata[:committer] = MEMBER
+    when /McGrail/i
+      mdata[:who] = 'Kevin A. McGrail'
+      mdata[:committer] = MEMBER
+    when /sallykhudairi@yahoo/i 
+      mdata[:who] = 'Sally Khudairi'
+      mdata[:committer] = MEMBER
+    when /sk@haloworldwide.com/i
+      mdata[:who] = 'Sally Khudairi'
+      mdata[:committer] = MEMBER
+    else
+      begin
+        # TODO use Real Name (JIRA) to attempt to lookup some notifications
+        tmp = liberal_email_parser(from)
+        person = ASF::Person.find_by_email(tmp.address.dup)
+        if person
+          mdata[:who] = person.cn
+          if person.asf_member?
+            mdata[:committer] = MEMBER
+          else
+            mdata[:committer] = COMMITTER
+          end
         else
-          mdata[:committer] = COMMITTER
+          mdata[:who] = "#{tmp.display_name} <#{tmp.address}>"
+          mdata[:committer] = 'n'
         end
-      else
-        mdata[:who] = "#{tmp.display_name} <#{tmp.address}>"
-        mdata[:committer] = 'n'
+      rescue
+        mdata[:who] = mdata[:from] # Use original value here
+        mdata[:committer] = 'N'
       end
-    rescue
-      mdata[:who] = mdata[:from] # Use original value here
-      mdata[:committer] = 'N'
     end
   end
 end
 
-# @see www/secretary/workbench/models/message.rb
-# @see https://github.com/mikel/mail/issues/39
-def liberal_email_parser(addr)
-  begin
-    addr = Mail::Address.new(addr)
-  rescue
-    if addr =~ /^"([^"]*)" <(.*)>$/
-      addr = Mail::Address.new
-      addr.address = $2
-      addr.display_name = $1
-    elsif addr =~ /^([^"]*) <(.*)>$/
-      addr = Mail::Address.new
-      addr.address = $2
-      addr.display_name = $1
+module MboxUtils
+  extend self
+  MBOX_EXT = '.mbox'
+  VERSION = 'mboxhdr2json'
+  URIRX = URI.regexp(['http', 'https'])
+
+  # Read a ponyapi.rb mbox file and return mails (text content only)
+  # @param f path to .mbox or .mbox.gz
+  # @return [mail1, mail2, ...]
+  def read_mbox(f)
+    if f.end_with? '.gz'
+      stream = StringIO.new(mbox)
+      reader = Zlib::GzipReader.new(stream)
+      mbox = reader.read
+      reader.close
+      stream.close rescue nil
     else
-      raise
+      mbox = File.read(f)
     end
+    mbox.force_encoding Encoding::ASCII_8BIT
+    messages = mbox.split(/^From .*/)
+    messages.shift # Drop first item (not a message)
+    return messages
   end
-  return addr
-end
 
-# Scan dir tree for mboxes and output individual mailhash as JSONs
-# @param dir to scan (whole tree)
-# @param ext file extension to glob for
-# Side effect: writes out f.chomp(ext).json files
-# @note writes string VERSION for differentiating from other *.json
-def scan_dir_mbox2stats(dir, ext = MBOX_EXT)
-  Dir["#{dir}/**/*#{ext}".untaint].sort.each do |f|
-    mails, errs = mbox2stats(f.untaint)
-    File.open("#{f.chomp(ext)}.json", "w") do |fout|
-      fout.puts JSON.pretty_generate(["#{VERSION}", mails, errs])
+  # Process an mbox file into mailhash of selected headers and lines of text
+  # @param f path to .mbox or .mbox.gz
+  # @return [mail1hash, mail2hash, ...], [ [parseerr, order], ...]
+  # @return nil, [read, errors2...] if mbox file can't be read
+  # mailhash contains :from, :subject, :listid, :date, :messageid, 
+  #   :inreplyto, :lines (count), plus :who and :committer
+  def mbox2stats(f)
+    begin
+      mails = read_mbox(f)
+    rescue => e
+      return nil, e
     end
+    errs = []
+    messages = []
+    order = 0
+    mails.each do |message|
+      mdata = {}
+      mail = nil
+      begin
+        # Preserve message order in case it's important
+        order += 1
+        # Enforce linefeeds; makes Mail happy; borks binary attachments (not used in this script)
+        mail = Mail.read_from_string(message.gsub(/\r?\n/, "\r\n"))
+        mdata[:order] = order
+        begin # HACK for cases where some values don't parse, try to get good enough values in rescue
+          mdata[:from] = mail[:from].value
+          mdata[:subject] = mail[:subject].value
+          mdata[:listid] = mail[:List_Id].value
+          mdata[:date] = mail.date.to_s
+        rescue => ee
+          mdata[:from] = mail[:from]
+          mdata[:subject] = mail[:subject]
+          mdata[:listid] = mail[:List_Id]
+          mdata[:date] = mail.date.to_s
+          mdata[:parseerr] = mail.errors
+        end
+        mdata[:messageid] = mail.message_id
+        mdata[:inreplyto] = mail.in_reply_to
+        if mail.multipart?
+          text_part = mail.text_part.decoded.split(/\r?\n/)
+        else
+          text_part = mail.body.decoded.split(/\r?\n/)
+        end
+        ctr = 0 # Count text lines of nonblank, nonreply content
+        links = 0 # Count number of apparent hyperlinks
+        text_part.each do |l|
+          case l
+          when /\A\s*>/
+            # Don't count reply lines, even when indented
+          when /\A\s*\z/
+            # Don't count blank lines
+          when /\AOn.*wrote:\z/
+            # Don't count most common reply header
+          when /\A-----Original Message-----/
+            # Stop counting if it seems like a forwarded message
+            break
+            # TODO: figure out if we're in a .sig block, and stop counting
+          else
+            links += 1 if l =~ URIRX
+            ctr += 1
+          end
+        end
+        mdata[:lines] = ctr
+        mdata[:links] = links
+        # Annotate various other precomputable data
+        MailUtils.find_who_from(mdata)
+        begin
+          d = DateTime.parse(mdata[:date])
+          mdata[:y] = d.year
+          mdata[:m] = d.month
+          mdata[:d] = d.day
+          mdata[:w] = d.wday
+          mdata[:h] = d.hour
+          mdata[:z] = d.zone
+        rescue => noop
+          # no-op - not critical
+          puts "DEBUG: #{e.message} parsing: #{mdata[:date]}"
+        end
+        regex = MailUtils::NONDISCUSSION_SUBJECTS[mdata[:listid]] # Use subject regex for this list (if any)
+        if regex
+          regex.each do |typ, rx|
+            if mdata[:subject] =~ rx
+              mdata[:nondiscuss] = typ
+              break # regex.each
+            end
+          end
+        end
+        # Push our hash 
+        messages << mdata
+      rescue => e
+        errs << [e, mdata[:order]]
+      end
+    end
+    return messages, errs
   end
-end
 
-# Scan dir tree for mailhash JSONs and output an overview CSV of all
-# @return [ error1, error2, ...] if any errors
-# Side effect: writes out dir/outname CSV file
-# @note reads string VERSION for differentiating from other *.json
-def scan_dir_stats2csv(dir, outname, ext = '.json')
-  errors = []
-  jzons = []
-  Dir["#{dir}/**/*#{ext}".untaint].sort.each do |f|
-    begin
-      tmp = JSON.parse(File.read(f))
-      if tmp[0].kind_of?(String) && tmp[0].start_with?(VERSION)
-        jzons << tmp.drop(1)
+  # Scan dir tree for mboxes and output individual mailhash as JSONs
+  # @param dir to scan (whole tree)
+  # @param ext file extension to glob for
+  # Side effect: writes out f.chomp(ext).json files
+  # @note writes string VERSION for differentiating from other *.json
+  def scan_dir_mbox2stats(dir, ext = MBOX_EXT)
+    Dir["#{dir}/**/*#{ext}".untaint].sort.each do |f|
+      mails, errs = mbox2stats(f.untaint)
+      File.open("#{f.chomp(ext)}.json", "w") do |fout|
+        fout.puts JSON.pretty_generate(["#{VERSION}", mails, errs])
       end
-    rescue => e
-      puts "ERROR: parse of #{f} raised #{e.message[0..255]}"
-      errors << "#{e.message}\n\t#{e.backtrace.join("\n\t")}"
-      next
     end
   end
-  raise ArgumentError, "#{__method__} called with no valid mbox json files in #{dir}" if jzons.length == 0
-  puts "#{__method__} processing #{jzons.length} mbox json files"
-  # Write out headers and the first array in new csv
-  csvfile = File.join("#{dir}", outname)
-  csv = CSV.open(csvfile, "w", headers: %w( year month day weekday hour zone listid who subject lines links committer messageid inreplyto ), write_headers: true)
-  jzons.shift[0].each do |m|
-    csv << [ m['y'], m['m'], m['d'], m['w'], m['h'], m['z'], m['listid'], m['who'], m['subject'], m['lines'], m['links'], m['committer'], m['messageid'], m['inreplyto']  ]
-  end
-  # Write out all remaining arrays, without headers, appending
-  jzons.each do |j|
-    begin
-      j[0].each do |m|
-        csv << [ m['y'], m['m'], m['d'], m['w'], m['h'], m['z'], m['listid'], m['who'], m['subject'], m['lines'], m['links'], m['committer'], m['messageid'], m['inreplyto']  ]
+
+  # Scan dir tree for mailhash JSONs and output an overview CSV of all
+  # @return [ error1, error2, ...] if any errors
+  # Side effect: writes out dir/outname CSV file
+  # @note reads string VERSION for differentiating from other *.json
+  def scan_dir_stats2csv(dir, outname, ext = '.json')
+    errors = []
+    jzons = []
+    Dir["#{dir}/**/*#{ext}".untaint].sort.each do |f|
+      begin
+        tmp = JSON.parse(File.read(f))
+        if tmp[0].kind_of?(String) && tmp[0].start_with?(VERSION)
+          jzons << tmp.drop(1)
+        end
+      rescue => e
+        puts "ERROR: parse of #{f} raised #{e.message[0..255]}"
+        errors << "#{e.message}\n\t#{e.backtrace.join("\n\t")}"
+        next
+      end
+    end
+    raise ArgumentError, "#{__method__} called with no valid mbox json files in #{dir}" if jzons.length == 0
+    puts "#{__method__} processing #{jzons.length} mbox json files"
+    # Write out headers and the first array in new csv
+    csvfile = File.join("#{dir}", outname)
+    csv = CSV.open(csvfile, "w", headers: %w( year month day weekday hour zone listid who subject lines links committer messageid inreplyto ), write_headers: true)
+    jzons.shift[0].each do |m|
+      csv << [ m['y'], m['m'], m['d'], m['w'], m['h'], m['z'], m['listid'], m['who'], m['subject'], m['lines'], m['links'], m['committer'], m['messageid'], m['inreplyto']  ]
+    end
+    # Write out all remaining arrays, without headers, appending
+    jzons.each do |j|
+      begin
+        j[0].each do |m|
+          csv << [ m['y'], m['m'], m['d'], m['w'], m['h'], m['z'], m['listid'], m['who'], m['subject'], m['lines'], m['links'], m['committer'], m['messageid'], m['inreplyto']  ]
+        end
+      rescue => e
+        puts "ERROR: write of #{f} raised #{e.message[0..255]}"
+        errors << "#{e.message}\n\t#{e.backtrace.join("\n\t")}"
+        next
       end
-    rescue => e
-      puts "ERROR: write of #{f} raised #{e.message[0..255]}"
-      errors << "#{e.message}\n\t#{e.backtrace.join("\n\t")}"
-      next
     end
+    csv.close # Just in case
+    return errors
   end
-  csv.close # Just in case
-  return errors
 end
 
 # ## ### #### ##### ######
@@ -352,11 +360,11 @@ end
 if __FILE__ == $PROGRAM_NAME
   options = optparse
   if options[:json]
-    puts "START: Parsing #{options[:dir]}/*#{MBOX_EXT} into *.json"
-    scan_dir_mbox2stats(options[:dir]) # Side effect: writes out f.chomp(ext).json files
+    puts "START: Parsing #{options[:dir]}/*#{MboxUtils::MBOX_EXT} into *.json"
+    MboxUtils.scan_dir_mbox2stats(options[:dir]) # Side effect: writes out f.chomp(ext).json files
   end
   puts "START: Analyzing #{options[:dir]}/*.json into #{options[:output]}"
-  errs = scan_dir_stats2csv(options[:dir], options[:output])
+  errs = MboxUtils.scan_dir_stats2csv(options[:dir], options[:output])
   if errs
     errs.each do |e|
       puts "ERROR: #{e}"
diff --git a/www/officers/list-traffic.cgi b/www/officers/list-traffic.cgi
index b54a93d..0d27e86 100755
--- a/www/officers/list-traffic.cgi
+++ b/www/officers/list-traffic.cgi
@@ -9,6 +9,7 @@ require 'whimsy/asf'
 require 'whimsy/asf/agenda'
 require 'date'
 require 'mail'
+require '../../tools/mboxhdr2csv.rb'
 
 user = ASF::Person.new($USER)
 unless user.asf_member? or ASF.pmc_chairs.include? user
@@ -29,109 +30,6 @@ MAILCOUNT = 'mailcount'
 WEEK_TOTAL = '@@total' # Use @@ so it can't match who name/emails
 WEEK_START = '@@start'
 
-### ---- Copied from tools/mboxhdr2csv.rb; should be refactored ----
-MEMBER = 'member'
-COMMITTER = 'committer'
-COUNSEL = 'counsel'
-# Subject regexes that are non-discussion oriented for flagging
-NONDISCUSSION_SUBJECTS = { # Note: none applicable to members@
-  '<board.apache.org>' => {
-    missing: /\AMissing\s((\S+\s){1,3})Board/, # whimsy/www/board/agenda/views/buttons/email.js.rb
-    feedback: /\ABoard\sfeedback\son\s20/, # whimsy/www/board/agenda/views/actions/feedback.json.rb
-    notice: /\A\[NOTICE\]/i,
-    report: /\A\[REPORT\]/i,
-    resolution: /\A\[RESOLUTION\]/i,
-    svn_agenda: %r{\Aboard: r\d{4,8} - /foundation/board/},
-    svn_iclas: %r{\Aboard: r\d{4,8} - /foundation/officers/iclas.txt}
-  }
-}
-# Annotate mailhash by adding :who and COMMITTER (where known)
-# @param email address to check
-# @returns ['Full Name', 'committer-flag'
-# COMMITTER = 'n' if not found; 'N' if error, 'counsel' for special case
-def find_who_from(email)
-  # Remove bogus INVALID before doing lookups
-  from = email.sub('.INVALID', '')
-  who = nil
-  committer = nil
-  # Micro-optimize unique names
-  case from
-  when /Mark.Radcliffe/i
-    who = 'Mark.Radcliffe'
-    committer = COUNSEL
-  when /mattmann/i
-    who = 'Chris Mattmann'
-    committer = MEMBER
-  when /jagielski/i
-    who = 'Jim Jagielski'
-    committer = MEMBER
-  when /delacretaz/i
-    who = 'Bertrand Delacretaz'
-    committer = MEMBER
-  when /curcuru/i
-    who = 'Shane Curcuru'
-    committer = MEMBER
-  when /steitz/i
-    who = 'Phil Steitz'
-    committer = MEMBER
-  when /gardler/i  # Effectively unique (see: Heidi)
-    who = 'Ross Gardler'
-    committer = MEMBER
-  when /Craig (L )?Russell/i # Optimize since Secretary sends a lot of mail
-    who = 'Craig L Russell'
-    committer = MEMBER
-  when /McGrail/i
-    who = 'Kevin A. McGrail'
-    committer = MEMBER
-  when /khudairi/i 
-    who = 'Sally Khudairi'
-    committer = MEMBER
-  else
-    begin
-      # TODO use Real Name (JIRA) to attempt to lookup some notifications
-      tmp = liberal_email_parser(from)
-      person = ASF::Person.find_by_email(tmp.address.dup)
-      if person
-        who = person.cn
-        if person.asf_member?
-          committer = MEMBER
-        else
-          committer = COMMITTER
-        end
-      else
-        who = "#{tmp.display_name} <#{tmp.address}>"
-        committer = 'n'
-      end
-    rescue
-      who = from # Use original value here
-      committer = 'N'
-    end
-  end
-  return who, committer
-end
-
-# @see www/secretary/workbench/models/message.rb
-# @see https://github.com/mikel/mail/issues/39
-def liberal_email_parser(addr)
-  begin
-    addr = Mail::Address.new(addr)
-  rescue
-    if addr =~ /^"([^"]*)" <(.*)>$/
-      addr = Mail::Address.new
-      addr.address = $2
-      addr.display_name = $1
-    elsif addr =~ /^([^"]*) <(.*)>$/
-      addr = Mail::Address.new
-      addr.address = $2
-      addr.display_name = $1
-    else
-      raise
-    end
-  end
-  return addr
-end
-### ---- Copied from tools/mboxhdr2csv.rb; should be refactored ----
-
 # Get {MAILS: [{date, who, subject, flag},...\, TOOLS: [{...},...] } from the specified list for a month
 # May cache data in SRV_MAIL/yearmonth.json
 # Returns empty hash if error or if can't find month
@@ -152,7 +50,7 @@ def get_mails_month(yearmonth:, nondiscuss:)
       data = {}
       data[DATE] = DateTime.parse(message[/^Date: (.*)/, 1]).iso8601
       data[FROM] = message[/^From: (.*)/, 1]
-      data[WHO], data[COMMITTER] = find_who_from(data[FROM])
+      data[WHO], data[MailUtils::COMMITTER] = MailUtils.find_who_from(data[FROM])
       data[SUBJECT] = message[/^Subject: (.*)/, 1]
       if nondiscuss
         nondiscuss.each do |typ, rx|
@@ -196,7 +94,7 @@ end
 # Display monthly statistics for all available data
 def display_monthly(months:, nondiscuss:)
   months.sort.reverse.each do |month|
-    data = get_mails_month(yearmonth: month, nondiscuss: NONDISCUSSION_SUBJECTS['<board.apache.org>'])
+    data = get_mails_month(yearmonth: month, nondiscuss: nondiscuss)
     next if data.empty?
     _h1 "board@ statistics for #{month} (total mails: #{data[MAILS].length + data[TOOLS].length})", id: "#{month}"
     _div.row do
@@ -308,9 +206,9 @@ _html do
     ) do
       months = Dir["#{SRV_MAIL}/*"].map {|path| File.basename(path).untaint}.grep(/^\d+$/)
       if ENV['QUERY_STRING'].include? 'week'
-        display_weekly(months: months, nondiscuss: NONDISCUSSION_SUBJECTS['<board.apache.org>'])
+        display_weekly(months: months, nondiscuss: MailUtils::NONDISCUSSION_SUBJECTS['<board.apache.org>'])
       else
-        display_monthly(months: months, nondiscuss: NONDISCUSSION_SUBJECTS['<board.apache.org>'])
+        display_monthly(months: months, nondiscuss: MailUtils::NONDISCUSSION_SUBJECTS['<board.apache.org>'])
       end
     end
   end