You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@whimsical.apache.org by cu...@apache.org on 2019/05/09 16:19:51 UTC

[whimsy] branch master updated: Fix options; scrub .INVALID from; only parse our .jsons

This is an automated email from the ASF dual-hosted git repository.

curcuru pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/whimsy.git


The following commit(s) were added to refs/heads/master by this push:
     new bb34743  Fix options; scrub .INVALID from; only parse our .jsons
bb34743 is described below

commit bb347435dc31a3db50c0a432c80f15b8a578025e
Author: Shane Curcuru <as...@shanecurcuru.org>
AuthorDate: Thu May 9 12:19:43 2019 -0400

    Fix options; scrub .INVALID from; only parse our .jsons
---
 tools/mboxhdr2csv.rb | 111 ++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 84 insertions(+), 27 deletions(-)

diff --git a/tools/mboxhdr2csv.rb b/tools/mboxhdr2csv.rb
index c4fe5b9..002d2f4 100644
--- a/tools/mboxhdr2csv.rb
+++ b/tools/mboxhdr2csv.rb
@@ -6,6 +6,7 @@
 # - Per user statistics
 # Count lines of text content in mail body, roughly attempting to 
 #   count just new content (not automated, not > replies)
+# Attempt to normalize/map email addresses to committer/member status
 
 $LOAD_PATH.unshift '/srv/whimsy/lib'
 require 'whimsy/asf'
@@ -15,18 +16,22 @@ require 'stringio'
 require 'zlib'
 require 'json'
 require 'date'
+require 'optparse'
+
 MBOX_EXT = '.mbox'
 MEMBER = 'member'
 COMMITTER = 'committer'
 COUNSEL = 'counsel'
+INVALID = '.INVALID'
+VERSION = 'mboxhdr2json'
 
 # Subject regexes that are non-discussion oriented
 # Analysis: don't bother with content lines in these messages, 
 #   because most of the content is tool-generated
 NONDISCUSSION_SUBJECTS = { # Note: none applicable to members@
   '<board.apache.org>' => {
-    missing: /\AMissing\s\S+\sBoard/,
-    feedback: /\ABoard\sfeedback\son\s20/,
+    missing: /\AMissing\s\S+\sBoard/, # whimsy/www/board/agenda/views/buttons/email.js.rb
+    feedback: /\ABoard\sfeedback\son\s20/, # whimsy/www/board/agenda/views/actions/feedback.json.rb
     notice: /\A\[NOTICE\]/i,
     report: /\A\[REPORT\]/i,
     resolution: /\A\[RESOLUTION\]/i,
@@ -165,8 +170,10 @@ end
 # Side effect: adds :who and :committer from ASF::Person.find_by_email
 # :committer = 'n' if not found; 'N' if error, 'counsel' for special case
 def find_who_from(mdata)
+  # Strip the bogus '.INVALID' marker from the From value before doing lookups
+  from = mdata[:from].sub(INVALID, '')
   # Micro-optimize unique names
-  case mdata[:from]
+  case from
   when /Mark.Radcliffe/i
     mdata[:who] = 'Mark.Radcliffe'
     mdata[:committer] = COUNSEL
@@ -203,7 +210,7 @@ def find_who_from(mdata)
   else
     begin
       # TODO use Real Name (JIRA) to attempt to lookup some notifications
-      tmp = liberal_email_parser(mdata[:from])
+      tmp = liberal_email_parser(from)
       person = ASF::Person.find_by_email(tmp.address.dup)
       if person
         mdata[:who] = person.cn
@@ -217,7 +224,7 @@ def find_who_from(mdata)
         mdata[:committer] = 'n'
       end
     rescue
-      mdata[:who] = mdata[:from]
+      mdata[:who] = mdata[:from] # Use original value here
       mdata[:committer] = 'N'
     end
   end
@@ -248,11 +255,12 @@ end
 # @param dir to scan (whole tree)
 # @param ext file extension to glob for
 # Side effect: writes out f.chomp(ext).json files
+# @note writes the VERSION string as the first array element, to differentiate these files from other *.json
 def scan_dir_mbox2stats(dir, ext = MBOX_EXT)
   Dir["#{dir}/**/*#{ext}".untaint].each do |f|
     mails, errs = mbox2stats(f.untaint)
     File.open("#{f.chomp(ext)}.json", "w") do |fout|
-      fout.puts JSON.pretty_generate([mails, errs])
+      fout.puts JSON.pretty_generate(["#{VERSION}", mails, errs])
     end
   end
 end
@@ -260,29 +268,39 @@ end
 # Scan dir tree for mailhash JSONs and output an overview CSV of all
 # @return [ error1, error2, ...] if any errors
 # Side effect: writes out dir/outname CSV file
+# @note only uses *.json files whose first array element is a string starting with VERSION; other *.json are skipped
 def scan_dir_stats2csv(dir, outname)
   errors = []
   filenames = Dir["#{dir}/**/*.json".untaint]
-  raise ArgumentError, "#{__method__} called with no files in #{dir}" if filenames.length == 0
-  puts "#{__method__} processing #{filenames.length} files"
-  firstfile = filenames.shift
-  jzon = JSON.parse(File.read(firstfile))
-  # Write out headers and the first file in new csv
+  jzons = []
+  filenames.each do |f|
+    begin
+      tmp = JSON.parse(File.read(f))
+      if tmp[0].kind_of?(String) && tmp[0].start_with?(VERSION)
+        jzons << tmp.drop(1)
+      end
+    rescue => e
+      puts "ERROR: parse of #{f} raised #{e.message[0..255]}"
+      errors << "#{e.message}\n\t#{e.backtrace.join("\n\t")}"
+      next
+    end
+  end
+  raise ArgumentError, "#{__method__} called with no valid mbox json files in #{dir}" if jzons.length == 0
+  puts "#{__method__} processing #{jzons.length} mbox json files"
+  # Write out headers and the first array in new csv
   csvfile = File.join("#{dir}", outname)
   csv = CSV.open(csvfile, "w", headers: %w( year month day weekday hour zone listid who subject lines committer messageid inreplyto ), write_headers: true)
-  jzon[0].each do |m|
+  jzons.shift[0].each do |m|
     csv << [ m['y'], m['m'], m['d'], m['w'], m['h'], m['z'], m['listid'], m['who'], m['subject'], m['lines'], m['committer'], m['messageid'], m['inreplyto']  ]
   end
-  
-  # Write out all remaining files, without headers, appending
-  filenames.each do |f|
+  # Write out all remaining arrays, without headers, appending
+  jzons.each do |j|
     begin
-      j = JSON.parse(File.read(f))
       j[0].each do |m|
         csv << [ m['y'], m['m'], m['d'], m['w'], m['h'], m['z'], m['listid'], m['who'], m['subject'], m['lines'], m['committer'], m['messageid'], m['inreplyto']  ]
       end
     rescue => e
-      puts "ERROR: parse/write of #{f} raised #{e.message[0..255]}"
+      puts "ERROR: write of #{f} raised #{e.message[0..255]}"
       errors << "#{e.message}\n\t#{e.backtrace.join("\n\t")}"
       next
     end
@@ -409,15 +427,54 @@ def do_mbox2csv_hdr(dir)
   end
 end
 
-#### TODO Sample code
-path = '~/src/lists'
-output = 'listdata.csv'
-puts "START: #{path} into #{output}"
-scan_dir_mbox2stats(path)
-errs = scan_dir_stats2csv(path, output)
-if errs
-  errs.each do |e|
-    puts "ERROR: #{e}"
+# ## ### #### ##### ######
+# Check options and call needed methods
+DEFAULT_OUTPUT = 'mbox-analysis.csv'
+def optparse
+  options = {}
+  OptionParser.new do |opts|
+    opts.on('-h') { puts opts; exit }
+    
+    opts.on('-dDIRECTORY', '--directory DIRECTORY', 'Local directory to read existing mboxes and dump output in (default: .)') do |d|
+      if File.directory?(d)
+        options[:dir] = d
+      else
+        raise ArgumentError, "-d #{d} is not a valid directory" 
+      end
+    end
+    opts.on('-oOUTPUT.CSV', '--output OUTPUT.CSV', "Filename to output rows into; default #{DEFAULT_OUTPUT}") do |o|
+      options[:output] = o
+    end
+    opts.on('-j', '--json', "Process .mbox to .json (optional)") do |j|
+      options[:json] = true
+    end
+    begin
+      opts.parse!
+      options[:dir] = '.' if options[:dir].nil?
+      options[:output] = DEFAULT_OUTPUT if options[:output].nil?
+    rescue StandardError => e
+      $stderr.puts "#{e.message}; try -h for valid options, or see code"
+      exit 1
+    end
+  end
+  
+  return options
+end
+
+# ## ### #### ##### ######
+# Main method for command line use
+if __FILE__ == $PROGRAM_NAME
+  options = optparse
+  if options[:json]
+    puts "START: Parsing #{options[:dir]}/*#{MBOX_EXT} into *.json"
+    scan_dir_mbox2stats(options[:dir]) # Side effect: writes out f.chomp(ext).json files
+  end
+  puts "START: Analyzing #{options[:dir]}/*#{MBOX_EXT} into #{options[:output]}"
+  errs = scan_dir_stats2csv(options[:dir], options[:output])
+  if errs
+    errs.each do |e|
+      puts "ERROR: #{e}"
+    end
   end
+  puts "END"
 end
-puts "END"
\ No newline at end of file