You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@whimsical.apache.org by cu...@apache.org on 2017/12/13 17:30:38 UTC
[whimsy] branch master updated: Futureproof svn revs,
fixup comment for sebb
This is an automated email from the ASF dual-hosted git repository.
curcuru pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/whimsy.git
The following commit(s) were added to refs/heads/master by this push:
new 5c08ac6 Futureproof svn revs, fixup comment for sebb
5c08ac6 is described below
commit 5c08ac60f8b20ba0fedd7759331604f387044559
Author: Shane Curcuru <as...@shanecurcuru.org>
AuthorDate: Wed Dec 13 12:32:46 2017 -0500
Futureproof svn revs, fixup comment for sebb
---
tools/mboxhdr2csv.rb | 458 +++++++++++++++++++++++++++++++++++++++++++++++++++
tools/ponypoop.rb | 18 +-
www/docs/index.cgi | 32 ++++
3 files changed, 498 insertions(+), 10 deletions(-)
diff --git a/tools/mboxhdr2csv.rb b/tools/mboxhdr2csv.rb
new file mode 100644
index 0000000..368e752
--- /dev/null
+++ b/tools/mboxhdr2csv.rb
@@ -0,0 +1,458 @@
+#!/usr/bin/env ruby
+# Analyze mbox files either by headers or by content lines
+$LOAD_PATH.unshift File.realpath(File.expand_path('../../lib', __FILE__))
+require 'whimsy/asf'
+require 'mail'
+require 'csv'
+require 'stringio'
+require 'zlib'
+require 'json'
+require 'date'
+MBOX_EXT = '.mbox'
+
+# Analysis Ideas
+# Contentlines are only counted when ! has_key? nondiscuss
+# Per list messages per month over time (PMOT)
+# count messages group by list -> graph months as time
+# Per list contentlines per lists PMOT
+# User messages per lists PMOT
+# User contentlines per lists PMOT
+# User msgs/lines by day of week; hour of day
+
+# Read an mbox file and split it into raw message strings.
+#
+# @param f [String] path to an mbox file; a name ending in '.gz' is gunzipped
+# @return [Array<String>] raw messages (binary encoded), each without its
+#   leading "From " separator line
+# @raise [StandardError] if the file cannot be read or decompressed
+def read_mbox(f)
+  mbox =
+    if f.end_with? '.gz'
+      # Block form closes the reader even when decompression raises
+      Zlib::GzipReader.open(f, &:read)
+    else
+      File.binread(f)
+    end
+  mbox.force_encoding Encoding::ASCII_8BIT
+  messages = mbox.split(/^From .*/)
+  messages.shift # Drop first item (text before the first separator; not a message)
+  return messages
+end
+
+# Parse every message of an mbox file into a hash of per-message statistics.
+#
+# @param f [String] path to an .mbox or .mbox.gz file (see read_mbox)
+# @return [Array] a pair: [messages, errs] on success, where messages is an
+#   array of hashes (:order, :from, :subject, :listid, :date, :messageid,
+#   :inreplyto, :lines, plus :who/:committer added by find_who_from, and
+#   :parseerr when a header needed the fallback path) and errs is an array of
+#   [exception, order] pairs for messages that could not be processed;
+#   or [nil, exception] when the mbox file itself cannot be read
+def mbox2stats(f)
+  begin
+    mails = read_mbox(f)
+  rescue => e
+    return nil, e
+  end
+  errs = []
+  messages = []
+  mails.each_with_index do |message, idx|
+    order = idx + 1 # 1-based position; preserve message order in case it's important
+    begin
+      # Enforce CRLF line endings; makes Mail happy; borks binary attachments (not used)
+      mail = Mail.read_from_string(message.gsub(/\r?\n/, "\r\n"))
+      mdata = { order: order }
+      begin # HACK for cases where some values don't parse, try to get good enough values in rescue
+        mdata[:from] = mail[:from].value
+        mdata[:subject] = mail[:subject].value
+        mdata[:listid] = mail[:List_Id].value
+        mdata[:date] = mail.date.to_s
+      rescue
+        mdata[:from] = mail[:from]
+        mdata[:subject] = mail[:subject]
+        mdata[:listid] = mail[:List_Id]
+        mdata[:date] = mail.date.to_s
+        mdata[:parseerr] = mail.errors
+      end
+      mdata[:messageid] = mail.message_id
+      mdata[:inreplyto] = mail.in_reply_to
+      body = mail.multipart? ? mail.text_part : mail.body
+      # A multipart message may have no text part; count that as zero lines
+      # instead of failing the whole message
+      text_lines = body ? body.decoded.split(/\r?\n/) : []
+      mdata[:lines] = count_content_lines(text_lines)
+      find_who_from mdata
+      messages << mdata
+    rescue => e
+      # Use the loop position: the stats hash does not exist yet when parsing itself raised
+      errs << [e, order]
+    end
+  end
+  return messages, errs
+end
+
+# Count text lines of nonblank, nonreply content; stops at a forwarded message
+def count_content_lines(lines)
+  ctr = 0
+  lines.each do |l|
+    case l
+    when /\A\s*>/
+      # Don't count reply lines, even when indented
+    when /\A\s*\z/
+      # Don't count blank lines
+    when /\AOn.*wrote:\z/
+      # Don't count most common reply header
+    when /\A-----Original Message-----/
+      # Stop counting if it seems like a forwarded message
+      break
+    else
+      ctr += 1
+    end
+  end
+  ctr
+end
+
+# Scan a directory tree for mbox files and write a .json file of statistics
+# ([messages, errors] from mbox2stats) next to each one.
+#
+# @param dir [String] root directory, searched recursively
+# @param ext [String] file extension to match (e.g. MBOX_EXT)
+# @return [Array<String>] the mbox paths that were found
+def scan_dir_mbox2stats(dir, ext)
+  pattern = "#{dir}/**/*#{ext}"
+  pattern.untaint if pattern.respond_to?(:untaint) # taint API was removed in Ruby 3.2
+  Dir[pattern].each do |f|
+    f.untaint if f.respond_to?(:untaint)
+    mails, errs = mbox2stats(f)
+    if mails.nil?
+      # mbox2stats returns [nil, exception] when the file itself is unreadable
+      puts "ERROR:scan_mbox(#{f}) could not be read: #{errs}"
+      next
+    end
+    puts "scan_mbox(#{f}) mails: #{mails.length} errors: #{errs.length}"
+    File.open("#{f.chomp(ext)}.json", "w") do |fout|
+      fout.puts JSON.pretty_generate([mails, errs])
+    end
+  end
+end
+
+# Add :who (display name) and :committer (Apache status) fields to a message
+# hash, based on its :from value.
+#
+# @param msg [Hash] message stats; reads msg[:from], writes msg[:who] and
+#   msg[:committer] (MEMBER, COMMITTER, 'n' when no ASF person matches the
+#   address, or 'N' when the From value cannot be parsed or looked up)
+def find_who_from(msg)
+  # Micro-optimize unique names
+  case msg[:from]
+  when /mattmann/i
+    msg[:who] = 'Chris Mattmann'
+    msg[:committer] = MEMBER
+  when /jagielski/i
+    msg[:who] = 'Jim Jagielski'
+    msg[:committer] = MEMBER
+  when /delacretaz/i
+    msg[:who] = 'Bertrand Delacretaz'
+    msg[:committer] = MEMBER
+  when /curcuru/i
+    msg[:who] = 'Shane Curcuru'
+    msg[:committer] = MEMBER
+  when /steitz/i
+    msg[:who] = 'Phil Steitz'
+    msg[:committer] = MEMBER
+  when /gardler/i # Effectively unique (see: Heidi)
+    msg[:who] = 'Ross Gardler'
+    msg[:committer] = MEMBER
+  when /Craig (L )?Russell/i # Optimize since Secretary sends a lot of mail
+    # One canonical name for both spellings (was the regex source by mistake)
+    msg[:who] = 'Craig L Russell'
+    msg[:committer] = MEMBER
+  when /McGrail/i
+    msg[:who] = 'Kevin A. McGrail'
+    msg[:committer] = MEMBER
+  else
+    begin
+      # TODO use Real Name (JIRA) to attempt to lookup some notifications
+      tmp = liberal_email_parser(msg[:from])
+      person = ASF::Person.find_by_email(tmp.address.dup)
+      if person
+        msg[:who] = person.cn
+        msg[:committer] = person.asf_member? ? MEMBER : COMMITTER
+      else
+        msg[:who] = "#{tmp.display_name} <#{tmp.address}>"
+        msg[:committer] = 'n'
+      end
+    rescue
+      # Unparseable From value or failed lookup: keep the raw value
+      msg[:who] = msg[:from]
+      msg[:committer] = 'N'
+    end
+  end
+end
+
+# Subject regexes that are non-discussion oriented
+# Analysis: don't bother with content lines in these messages,
+# because most of the content is tool-generated
+# Structure: List-Id string => { marker symbol => subject regex }; the marker
+# symbol is what mark_nondiscussion stores on a matching message.
+NONDISCUSSION_SUBJECTS = { # Note: none applicable to members@
+ '<board.apache.org>' => {
+ missing: /\AMissing\s\S+\sBoard/,
+ feedback: /\ABoard\sfeedback\son\s20/,
+ notice: /\A\[NOTICE\]/i,
+ report: /\A\[REPORT\]/i,
+ resolution: /\A\[RESOLUTION\]/i,
+ svn_agenda: %r{\Aboard: r\d{4,8} - /foundation/board/},
+ svn_iclas: %r{\Aboard: r\d{4,8} - /foundation/officers/iclas.txt}
+ },
+ '<operations.apache.org>' => {
+ notice: /\A\[NOTICE\]/i,
+ report: /\A\[REPORT\]/i,
+ # NOTE(review): this pattern needs a literal '/' right after 'r', unlike the
+ # r\d{4,8} patterns in this table - confirm it matches real commit subjects
+ svn_general: %r{\Asvn commit: r/},
+ svn_bills: %r{\Abills: r\d{4,8} -}
+ },
+ '<trademarks.apache.org>' => {
+ report: /\A\[REPORT\]/i,
+ # NOTE(review): same literal 'r/' pattern as operations - confirm
+ svn_general: %r{\Asvn commit: r/}
+ },
+ '<fundraising.apache.org>' => {
+ report: /\A\[REPORT\]/i,
+ svn_bills: %r{\Abills: r\d{4,8} -}
+ }
+}
+
+# Annotate mbox stats hashes w/nondiscussion marker (hint: don't count content lines)
+#
+# For each message whose 'listid' has an entry in NONDISCUSSION_SUBJECTS, the
+# first regex matching 'subject' decides the marker stored under 'nondiscuss'.
+# A string key is used (like the JSON-parsed keys that are read) so that
+# re-running over already-annotated JSON overwrites instead of duplicating it.
+#
+# @param mails [Array<Hash>] JSON-parsed message hashes (string keys); modified in place
+# @return [Array<Hash>] the same mails array
+def mark_nondiscussion(mails)
+  mails.each do |msg|
+    patterns = NONDISCUSSION_SUBJECTS[msg['listid']] # Subject regexes for this list (if any)
+    next unless patterns
+    typ, _rx = patterns.find { |_typ, rx| msg['subject'] =~ rx }
+    msg['nondiscuss'] = typ if typ
+  end
+end
+
+# Annotate mbox stats hashes with various precomputed data
+#
+# All fields are written with string keys, matching the JSON-parsed keys that
+# are read, so the regenerated JSON never carries the same key twice.
+#
+# @param mails [Array<Hash>] JSON-parsed message hashes; modified in place:
+#   'who'/'committer' are corrected for one known address, and 'y', 'm', 'd',
+#   'w' (day of week), 'h', 'z' (timezone) are derived from 'date'
+# @return [Array<Hash>] the same mails array
+def annotate_stats(mails)
+  mails.each do |msg|
+    # Fixup bogus From data for Sally's old email address which might not parse
+    if msg['who'] =~ /sallykhudairi@yahoo/i
+      msg['who'] = 'Sally Khudairi'
+      msg['committer'] = 'member'
+    end
+    # Translate date into y, m, d, w (day of week), h, z (timezone), (no minutes)
+    begin
+      d = DateTime.parse(msg['date'])
+      msg.merge!('y' => d.year, 'm' => d.month, 'd' => d.day,
+                 'w' => d.wday, 'h' => d.hour, 'z' => d.zone)
+    rescue => e
+      # Leave the date fields off this message, but say why
+      puts "DEBUG: #{e.message} parsing: #{msg['date']}"
+    end
+  end
+end
+
+# Scan dir of mbox .json stats and annotate with nondiscussion markers and
+# precomputed date fields, rewriting each file in place.
+#
+# @param dir [String] root directory, searched recursively for *.json
+# @return [Array<String>] one "message + backtrace" entry per file that failed
+def scan_dir_json_nondiscussion(dir)
+  errors = []
+  pattern = "#{dir}/**/*.json"
+  pattern.untaint if pattern.respond_to?(:untaint) # taint API was removed in Ruby 3.2
+  Dir[pattern].each do |f|
+    begin
+      jzon = JSON.parse(File.read(f))
+      msgs = jzon[0] # Should be an array of [[msgs...], [errs...]]
+      # Run both annotations
+      mark_nondiscussion msgs
+      annotate_stats msgs
+      # Now re-write the same file with this included data
+      File.open(f, "w") do |fout|
+        fout.puts JSON.pretty_generate(jzon)
+      end
+    rescue => e
+      # Record the failure and carry on with the next file
+      puts "ERROR:scan_dir_json_nondiscussion(#{f}) raised #{e.message[0..255]}"
+      errors << "#{e.message}\n\t#{e.backtrace.join("\n\t")}"
+    end
+  end
+  return errors
+end
+
+# Scan dir of mbox .json stats and fix Sally's bogus yahoo emails, rewriting
+# each file in place.
+#
+# @param dir [String] root directory, searched recursively for *.json
+# @return [Array<String>] one "message + backtrace" entry per file that failed
+def fixup_sally_mail(dir)
+  errors = []
+  # Old addresses whose From data might not parse
+  sally = /sallykhudairi@yahoo|sk@haloworldwide\.com/i
+  pattern = "#{dir}/**/*.json"
+  pattern.untaint if pattern.respond_to?(:untaint) # taint API was removed in Ruby 3.2
+  Dir[pattern].each do |f|
+    begin
+      jzon = JSON.parse(File.read(f))
+      msgs = jzon[0] # Should be an array of [[msgs...], [errs...]]
+      msgs.each do |msg|
+        next unless msg['who'] =~ sally
+        msg.merge!('who' => 'Sally Khudairi', 'committer' => 'member')
+      end
+      # Now re-write the same file with this included data
+      File.open(f, "w") do |fout|
+        fout.puts JSON.pretty_generate(jzon)
+      end
+    rescue => e
+      # Record the failure and carry on with the next file
+      puts "ERROR:fixup_sally_mail(#{f}) raised #{e.message[0..255]}"
+      errors << "#{e.message}\n\t#{e.backtrace.join("\n\t")}"
+    end
+  end
+  return errors
+end
+
+# Aggregate selected header fields from an mbox
+#
+# Appends one hash per message to +headers+, with whichever of :subject, :from,
+# :date, :listid, :messageid, :inreplyto were found before the first blank line.
+# Header names are matched case-insensitively and both LF and CRLF line endings
+# are accepted. Folded (continuation) header lines are not joined.
+#
+# @param f [String] path to an mbox file (see read_mbox)
+# @param headers [Array<Hash>] accumulator; modified in place
+# @return [nil, Array<String>] nil normally (also when the file cannot be
+#   read, which is only printed); [message, backtrace] strings if processing
+#   the messages raised
+def scan_mbox_headers(f, headers)
+  begin
+    messages = read_mbox(f)
+  rescue => e
+    puts "ERROR:scan_mbox_hdr(#{f}) #{e}"
+    return
+  end
+  begin
+    messages.each do |message|
+      header = {}
+      lines = message.split(/\r?\n/)
+      lines.shift # Drop first bogus line
+      lines.each do |line|
+        break if line.empty? # Blank line ends the header section
+        case line
+        when /\ASubject: (.*)/i
+          header[:subject] = $1
+        when /\AFrom: (.*)/i
+          header[:from] = $1
+        when /\ADate: (.*)/i
+          header[:date] = $1
+        when /\AList-Id: <(.*)>/i
+          header[:listid] = $1
+        when /\AMessage-ID: <(.*)>/i
+          header[:messageid] = $1
+        when /\AIn-Reply-To: <(.*)>/i
+          header[:inreplyto] = $1
+        end
+      end
+      headers << header
+    end
+    return
+  rescue => e
+    puts e # TODO rationalize error processing
+    return ["ERROR:scan_mbox_hdr(#{f}) #{e.message[0..255]}", "\t#{e.backtrace.join("\n\t")}"]
+  end
+end
+
+# Return annotated headers for a directory of mboxes
+#
+# @param dir [String] root directory, searched recursively
+# @param ext [String] mbox extension; files named *ext* match (so .mbox.gz too)
+# @return [Array<Hash>] one header hash per message, after annotate_headers
+def scan_dir_headers(dir, ext)
+  headers = []
+  pattern = "#{dir}/**/*#{ext}*"
+  pattern.untaint if pattern.respond_to?(:untaint) # taint API was removed in Ruby 3.2
+  Dir[pattern].each do |f|
+    f.untaint if f.respond_to?(:untaint)
+    # scan_mbox_headers appends to headers in place; its return value is nil or
+    # error details (already printed), never the header list, so don't assign it
+    scan_mbox_headers(f, headers)
+  end
+  annotate_headers(headers)
+  return headers
+end
+
+# Copied from www/secretary/workbench/models/message.rb
+# see https://github.com/mikel/mail/issues/39
+#
+# Parse an address with Mail::Address; when that raises, fall back to accepting
+# '"Name" <address>' or 'Name <address>' by hand. Anything else re-raises the
+# original parse error.
+def liberal_email_parser(addr)
+  Mail::Address.new(addr)
+rescue
+  matched = (addr =~ /^"([^"]*)" <(.*)>$/) || (addr =~ /^([^"]*) <(.*)>$/)
+  raise unless matched
+  fallback = Mail::Address.new
+  fallback.address = $2
+  fallback.display_name = $1
+  fallback
+end
+
+# Simple annotations for best guesses or ASF attributes
+SHANE = 'Shane'
+MEMBER = 'member'
+COMMITTER = 'committer'
+
+# Annotate each header hash in place with :type, :who and :committer.
+#
+# :type is 'JIRA', 'SVN', 'REPORT', 'Question-Web', 'Question' (no
+# :inreplyto key) or '' (a reply). :who/:committer come from a few hard-coded
+# sender shortcuts, else from an ASF::Person lookup by address; :committer is
+# 'n' when no person matches and 'N' when the From value cannot be processed.
+#
+# @param headers [Array<Hash>] header hashes as built by scan_mbox_headers
+# @return [Array<Hash>] the same headers array
+def annotate_headers(headers)
+  headers.each do |header|
+    header[:type] =
+      if header[:from] =~ /\(JIRA\)/
+        'JIRA'
+      elsif header[:subject] =~ /\Asvn commit/
+        'SVN'
+      elsif header[:subject] =~ /\A\[REPORT\] /
+        'REPORT'
+      elsif header[:subject] =~ /\A\[[A-Z]{5,6}\] / # mailto: from website
+        'Question-Web'
+      else
+        header.key?(:inreplyto) ? '' : 'Question' # Presumably a new incoming question
+      end
+
+    if header[:from] =~ /\AShane Curcuru \(JIRA/i # Optimize case for trademarks@
+      header[:who] = 'Shane-JIRA'
+      header[:committer] = SHANE
+    elsif header[:from] =~ /Shane Curcuru/i # Optimize case for trademarks@
+      header[:who] = 'Shane'
+      header[:committer] = SHANE
+    elsif header[:from] =~ /\AVice President, Brand Management/i # Optimize case for trademarks@
+      header[:who] = 'Shane-VP'
+      header[:committer] = SHANE
+    elsif header[:from] =~ /jira@/i # Optimize case for trademarks@, hackish
+      # Strip the JIRA sender address and any empty quote pair, keeping the display name
+      header[:who] = header[:from].sub("<jira@apache.org>", '').gsub('""', '')
+      header[:committer] = COMMITTER
+    else
+      begin
+        tmp = liberal_email_parser(header[:from])
+        person = ASF::Person.find_by_email(tmp.address.dup)
+        if person
+          header[:who] = person.cn
+          header[:committer] = person.asf_member? ? MEMBER : COMMITTER
+        else
+          header[:who] = tmp.display_name
+          header[:committer] = 'n'
+        end
+      rescue
+        # Unparseable From value or failed lookup: keep the raw value
+        header[:who] = header[:from]
+        header[:committer] = 'N'
+      end
+    end
+  end
+end
+
+# Common use case - analyze headers in mbox files to see who asks questions on trademarks@
+#
+# Scans +dir+ for mbox files and writes one row per message to
+# mboxhdr2csv.csv inside that directory (the :type value goes in the
+# "question" column).
+def do_mbox2csv_hdr(dir)
+  rows = scan_dir_headers(dir, MBOX_EXT)
+  columns = %w( date who subject messageid committer question )
+  outfile = File.join("#{dir}", "mboxhdr2csv.csv")
+  CSV.open(outfile, "w", headers: columns, write_headers: true) do |csv|
+    rows.each { |h| csv << h.values_at(:date, :who, :subject, :messageid, :committer, :type) }
+  end
+end
+
+# Combine all jsons of mbox stats into single csv (mbox2stats.csv in +dir+)
+#
+# Each .json file is expected to hold [[msgs...], [errs...]]; one row is
+# written per message. The date columns come from the 'y', 'm', 'd', 'w', 'h',
+# 'z' fields that annotate_stats stores. A file that cannot be read or parsed
+# contributes no rows and is reported in the return value.
+#
+# @param dir [String] root directory, searched recursively for *.json
+# @return [Array<String>] one "message + backtrace" entry per file that failed
+def scan_stats_to_csv(dir)
+  errors = []
+  pattern = "#{dir}/**/*.json"
+  pattern.untaint if pattern.respond_to?(:untaint) # taint API was removed in Ruby 3.2
+  jzons = Dir[pattern].sort
+  puts "scan_stats_to_csv() processing #{jzons.length} files"
+  columns = %w( year month day weekday hour zone listid who subject lines committer messageid inreplyto )
+  # JSON-parsed hashes have string keys, in the same order as the columns above
+  keys = %w( y m d w h z listid who subject lines committer messageid inreplyto )
+  CSV.open(File.join("#{dir}", "mbox2stats.csv"), "w", headers: columns, write_headers: true) do |csv|
+    jzons.each do |f|
+      begin
+        msgs = JSON.parse(File.read(f))[0]
+        # Build every row before writing, so a bad file adds no partial output
+        rows = msgs.map { |m| m.values_at(*keys) }
+        rows.each { |row| csv << row }
+      rescue => e
+        puts "ERROR:scan_stats_to_csv(#{f}) raised #{e.message[0..255]}"
+        errors << "#{e.message}\n\t#{e.backtrace.join("\n\t")}"
+      end
+    end
+  end
+  return errors
+end
+
+# Common use case - analyze mbox files to see how much everyone writes
+# Ad-hoc driver: runs one pass over a hard-coded local directory and prints
+# every error that pass returned.
+puts "DEBUG-TEST11j"
+# scan_dir_mbox2stats('/Users/curcuru/src/mail/', MBOX_EXT)
+# e = scan_dir_json_nondiscussion('/Users/curcuru/src/mail3/')
+e = fixup_sally_mail('/Users/curcuru/src/mail/')
+e.each { |x| p x }
+puts "DEBUG-END11 e.length #{e.length}"
diff --git a/tools/ponypoop.rb b/tools/ponypoop.rb
index de112e8..b848a9b 100755
--- a/tools/ponypoop.rb
+++ b/tools/ponypoop.rb
@@ -1,12 +1,10 @@
#!/usr/bin/env ruby
-<<~HEREDOC
-Pony poop: utilities for analyzing data from Apache Ponymail APIs
-- Analyze stats.lua JSON output for subject/author analysis
-- Analyze mbox.lua mbox files for author/list/lines written analysis
-
-See also: https://ponymail.incubator.apache.org/docs/api
-See also: https://lists.apache.org/ngrams.html
-HEREDOC
+# Pony poop: utilities for analyzing data from Apache Ponymail APIs
+# - Analyze stats.lua JSON output for subject/author analysis
+# - Analyze mbox.lua mbox files for author/list/lines written analysis
+#
+# See also: https://ponymail.incubator.apache.org/docs/api
+# See also: https://lists.apache.org/ngrams.html
require 'json'
require 'csv'
require 'net/http'
@@ -22,8 +20,8 @@ BOARD_REGEX = { # Non-interesting email subjects from board # TODO add features
notice: /\A\[NOTICE\]/i,
report: /\A\[REPORT\]/i,
resolution: /\A\[RESOLUTION\]/i,
- svn_agenda: %r{\Aboard: r\d{4,7} - /foundation/board/},
- svn_iclas: %r{\Aboard: r\d{4,7} - /foundation/officers/iclas.txt}
+ svn_agenda: %r{\Aboard: r\d{4,8} - /foundation/board/},
+ svn_iclas: %r{\Aboard: r\d{4,8} - /foundation/officers/iclas.txt}
}
# ## ### #### ##### ######
diff --git a/www/docs/index.cgi b/www/docs/index.cgi
new file mode 100644
index 0000000..e6d402c
--- /dev/null
+++ b/www/docs/index.cgi
@@ -0,0 +1,32 @@
+#!/usr/bin/env ruby
+PAGETITLE = "Whimsy Code Documentation" # Wvisible:docs
+
+$LOAD_PATH.unshift File.realpath(File.expand_path('../../../lib', __FILE__))
+require 'json'
+require 'whimsy/asf'
+require 'wunderbar'
+require 'wunderbar/bootstrap'
+
+# Render the documentation landing page: a title/help header with related
+# links, then a heading and a link to the generated API docs.
+_html do
+ _body? do
+ # NOTE(review): _whimsy_body is not defined in this file; presumably it comes
+ # from the whimsy/asf library required above - confirm
+ _whimsy_body(
+ title: PAGETITLE,
+ subtitle: 'About This Documentation',
+ relatedtitle: 'More Useful Links',
+ related: {
+ "/committers/tools" => "Whimsy Tool Listing",
+ "https://github.com/rubys/wunderbar/" => "See Wunderbar Module Documentation",
+ # Link to this script's own source, built from the CGI SCRIPT_NAME variable
+ "https://github.com/apache/whimsy/blob/master/www#{ENV['SCRIPT_NAME']}" => "See This Source Code"
+ },
+ helpblock: -> {
+ _p %{
+ This is the homepage for the code and API documentation for the Apache Whimsy project.
+ }
+ }
+ ) do
+
+ # Content passed as the block to _whimsy_body
+ _h2 "API Documentation"
+ _a "whimsy/asf module APIs", href: '/docs/api/'
+ end
+ end
+end
--
To stop receiving notification emails like this one, please contact
['"commits@whimsical.apache.org" <co...@whimsical.apache.org>'].