You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@whimsical.apache.org by Sam Ruby <ru...@apache.org> on 2015/12/13 16:14:51 UTC
[whimsy.git] [1/37] Commit dddb5be: simple script to parse email and capture headers
Commit dddb5be72638a85cbd6cc7728f421e34e056d220:
simple script to parse email and capture headers
Branch: refs/heads/secmail
Author: Sam Ruby <ru...@intertwingly.net>
Committer: Sam Ruby <ru...@intertwingly.net>
Pusher: rubys <ru...@apache.org>
------------------------------------------------------------
.gitignore | ++
Gemfile | ++
Rakefile | +++++++++++++
parsemail.rb | ++++++++++
------------------------------------------------------------
164 changes: 164 additions, 0 deletions.
------------------------------------------------------------
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..169b1cf
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+Gemfile.lock
+officers-secretary
diff --git a/Gemfile b/Gemfile
new file mode 100644
index 0000000..8a27d17
--- /dev/null
+++ b/Gemfile
@@ -0,0 +1,2 @@
+gem 'mail'
+gem 'zip'
diff --git a/Rakefile b/Rakefile
new file mode 100644
index 0000000..859fdd3
--- /dev/null
+++ b/Rakefile
@@ -0,0 +1,13 @@
+# Rake tasks for fetching and parsing the mail archive
+
+# keep rake from echoing the commands it runs
+verbose(false)
+
+# regenerate the lock file whenever the Gemfile is newer
+file('Gemfile.lock' => 'Gemfile') { sh 'bundle update' }
+
+# run the parser against the local copy of the mailboxes
+task(parse: 'Gemfile.lock') { ruby 'parsemail.rb' }
+
+# run the parser with --fetch so the mailboxes are synced first
+task(fetch: 'Gemfile.lock') { ruby 'parsemail.rb', '--fetch' }
diff --git a/parsemail.rb b/parsemail.rb
new file mode 100644
index 0000000..3af7b64
--- /dev/null
+++ b/parsemail.rb
@@ -0,0 +1,147 @@
+#!/usr/bin/ruby
+
+#
+# Parse (and optionally fetch) officers-secretary emails for later
+# processing.
+#
+
+SOURCE = 'minotaur.apache.org:/home/apmail/private-arch/officers-secretary'
+
+require 'mail'
+require 'zlib'
+require 'zip'
+require 'yaml'
+require 'stringio'
+require 'time'
+
+# name of the local mirror directory (last path component of SOURCE)
+database = File.basename(SOURCE)
+Dir.chdir(File.dirname(File.expand_path(__FILE__)))
+
+# sync the archive on request, or when no local copy exists yet
+need_fetch = ARGV.include?('--fetch') || !Dir.exist?(database)
+system "rsync -av --no-motd --delete --exclude='*.yml' #{SOURCE} ." if need_fetch
+
+# common header logic for messages and attachments
+#
+# part - any object whose header_fields yield fields with a name and value
+# returns a Hash of field name => value (an Array when a name repeats)
+def headers(part)
+  # extract all fields from the mail (recovering from bad encoding issues)
+  pairs = part.header_fields.map do |field|
+    value = field.value
+
+    # escape only strings with broken encodings; nil stays nil, not "nil"
+    value = value.inspect if value && !value.valid_encoding?
+
+    [field.name, value]
+  end
+
+  # group fields by name, unwrapping names that occur only once
+  grouped = pairs.group_by(&:first).map do |name, entries|
+    values = entries.map(&:last)
+    [name, values.length == 1 ? values.first : values]
+  end
+
+  Hash[grouped]
+end
+
+# scan each mailbox for updates, caching parsed headers in a per-mailbox
+# YAML file (keyed by message id) that sits beside the mailbox
+width = 0
+Dir[File.join(database, '2*')].sort.each do |name|
+  # skip YAML files, update output showing latest file being processed
+  next if name.end_with? '.yml'
+  print "#{name.ljust(width)}\r"
+  width = name.length
+
+  # test read the YAML file to see if the mbox needs to be parsed
+  yaml = File.join(database, File.basename(name)[/\d+/] + '.yml')
+  mbox = YAML.load_file(yaml) || {} rescue {}
+  next if mbox[:mtime] == File.mtime(name)
+
+  # open the YAML file for real (locking it this time)
+  File.open(yaml, File::RDWR|File::CREAT, 0644) do |file|
+    file.flock(File::LOCK_EX)
+    mbox = YAML.load_file(yaml) || {} rescue {}
+    mbox[:mtime] = File.mtime(name)
+
+    # read (and unzip) the mailbox, treating the content as raw bytes
+    mails = File.read(name)
+    if name.end_with? '.gz'
+      mails = Zlib::GzipReader.wrap(StringIO.new(mails)) { |gz| gz.read }
+    end
+    mails.force_encoding Encoding::ASCII_8BIT
+
+    # split into individual messages, dropping whatever precedes the first
+    mails = mails.split(/^From .*/)
+    mails.shift
+
+    # process each message
+    mails.each do |mail|
+      # extract id, skip if already processed
+      id = mail[/^Message-ID: <(.*?)>\s*$/i, 1]
+      next if id and mbox[id]
+      mail = Mail.read_from_string(mail)
+      id ||= mail.message_id
+      next if mbox[id]
+
+      # parse from address; fall back to the raw header text when it cannot
+      # be parsed, and to nil when the message has no From header at all
+      raw_from = mail[:from] && mail[:from].value
+      begin
+        from = raw_from && Mail::Address.new(raw_from).display_name
+      rescue StandardError
+        from = raw_from
+      end
+
+      # determine who should be copied on any responses
+      cc = []
+      cc = mail[:to].value.split(/,\s*/) if mail[:to]
+      cc += mail[:cc].value.split(/,\s*/) if mail[:cc]
+
+      # remove the secretary, the sender, and unparseable entries from the
+      # cc list; `next` supplies the block's value (`return` is not valid
+      # for that inside a block in a top-level script)
+      senders = mail.from_addrs
+      cc.reject! do |email|
+        begin
+          address = Mail::Address.new(email).address
+          next true if address == 'secretary@apache.org'
+          senders.include? address
+        rescue StandardError
+          true
+        end
+      end
+
+      # start an entry for this mail
+      mbox[id] = {
+        from: senders.first,
+        name: from,
+        time: (mail.date.to_time.gmtime.iso8601 rescue nil),
+        cc: cc
+      }
+
+      # add in header fields
+      mbox[id].merge! headers(mail)
+
+      # add in attachments
+      next if mail.attachments.empty?
+      mbox[id][:attachments] = mail.attachments.map do |attach|
+        {
+          name: attach.filename,
+          length: attach.body.to_s.length,
+          mime: attach.mime_type
+        }.merge(headers(attach))
+      end
+    end
+
+    # update YAML file, discarding old content first so that a shorter
+    # dump cannot leave stale bytes at the end of the file
+    file.rewind
+    file.truncate(0)
+    YAML.dump(mbox, file)
+  end
+end
+
+puts