You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/04/13 19:43:26 UTC

svn commit: r1091860 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/mail/MailContentHandler.java test/resources/test-documents/testRFC822-CC-BCC test/resources/test-documents/testRFC822-big

Author: jukka
Date: Wed Apr 13 17:43:26 2011
New Revision: 1091860

URL: http://svn.apache.org/viewvc?rev=1091860&view=rev
Log:
TIKA-461: RFC822 messages not parsed

Patch by Sjoerd Smeets, with some minor modifications

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822-CC-BCC
    tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822-big
Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java?rev=1091860&r1=1091859&r2=1091860&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java Wed Apr 13 17:43:26 2011
@@ -22,8 +22,11 @@ import java.io.InputStream;
 import org.apache.james.mime4j.MimeException;
 import org.apache.james.mime4j.descriptor.BodyDescriptor;
 import org.apache.james.mime4j.field.AbstractField;
+import org.apache.james.mime4j.field.AddressListField;
+import org.apache.james.mime4j.field.DateTimeField;
 import org.apache.james.mime4j.field.MailboxListField;
 import org.apache.james.mime4j.field.UnstructuredField;
+import org.apache.james.mime4j.field.address.AddressList;
 import org.apache.james.mime4j.field.address.MailboxList;
 import org.apache.james.mime4j.parser.ContentHandler;
 import org.apache.james.mime4j.parser.Field;
@@ -123,29 +126,80 @@ class MailContentHandler implements Cont
         if (inPart) {
             return;
         }
-        
-        // TODO add metadata to the parts later
+
         String fieldname = field.getName();
         if (fieldname.equalsIgnoreCase("From")) {
-           MailboxListField fromField = (MailboxListField) AbstractField.parse(field.getRaw());
-           MailboxList mailboxList = fromField.getMailboxList();
-           if(mailboxList != null) {
-              // Add each person in turn
-              for (int i = 0; i < mailboxList.size(); ++i) {
-                 metadata.add(Metadata.AUTHOR, mailboxList.get(i).getDisplayString());        		
-              }
-           } else {
-              // Not a typical from field, do our best
-              String from = fromField.getBody();
-              if(from != null) {
-                 if(from.startsWith("<")) from = from.substring(1);
-                 if(from.endsWith(">")) from = from.substring(0, from.length()-1);
-                 metadata.add(Metadata.AUTHOR, from);
-              }
-           }
+            MailboxListField fromField =
+                (MailboxListField) AbstractField.parse(field.getRaw());
+            MailboxList mailboxList = fromField.getMailboxList();
+            if (fromField.isValidField() && mailboxList != null) {
+                for (int i = 0; i < mailboxList.size(); i++) {
+                    String from = mailboxList.get(i).getDisplayString();
+                    metadata.add(Metadata.MESSAGE_FROM, from);
+                    metadata.add(Metadata.AUTHOR, from);
+                }
+            } else {
+                String from =
+                    stripOutFieldPrefix(field.getRaw().toString(), "From:");
+                if (from.startsWith("<")) {
+                    from = from.substring(1);
+                }
+                if (from.endsWith(">")) {
+                    from = from.substring(0, from.length() - 1);
+                }
+                metadata.add(Metadata.MESSAGE_FROM, from);
+                metadata.add(Metadata.AUTHOR, from);
+            }
         } else if (fieldname.equalsIgnoreCase("Subject")) {
-        	UnstructuredField subjectField = (UnstructuredField) AbstractField.parse(field.getRaw());
+            UnstructuredField subjectField =
+                (UnstructuredField) AbstractField.parse(field.getRaw());
             metadata.add(Metadata.SUBJECT, subjectField.getValue());
+        } else if (fieldname.equalsIgnoreCase("To")) {
+            AddressListField toField =
+                (AddressListField) AbstractField.parse(field.getRaw());
+            if (toField.isValidField()) {
+                AddressList addressList = toField.getAddressList();
+                for (int i = 0; i < addressList.size(); ++i) {
+                    metadata.add(Metadata.MESSAGE_TO, addressList.get(i).getDisplayString());
+                }
+            } else {
+                String to = stripOutFieldPrefix(field.getRaw().toString(), "To:");
+                for (String eachTo : to.split(",")) {
+                    metadata.add(Metadata.MESSAGE_TO, eachTo.trim());
+                }
+            }
+        } else if (fieldname.equalsIgnoreCase("CC")) {
+            AddressListField ccField =
+                (AddressListField) AbstractField.parse(field.getRaw());
+            if (ccField.isValidField()) {
+                AddressList addressList = ccField.getAddressList();
+                for (int i = 0; i < addressList.size(); ++i) {
+                    metadata.add(Metadata.MESSAGE_CC, addressList.get(i).getDisplayString());
+                }
+            } else {
+                String Cc = stripOutFieldPrefix(field.getRaw().toString(), "Cc:");
+                for (String eachCc : Cc.split(",")) {
+                    metadata.add(Metadata.MESSAGE_CC, eachCc.trim());
+                }
+            }
+        } else if (fieldname.equalsIgnoreCase("BCC")) {
+            AddressListField bccField =
+                (AddressListField) AbstractField.parse(field.getRaw());
+            if(bccField.isValidField()){
+                AddressList addressList = bccField.getAddressList();
+                for (int i = 0; i < addressList.size(); ++i) {
+                    metadata.add(Metadata.MESSAGE_BCC, addressList.get(i).getDisplayString());
+                }
+            } else {
+                String Bcc = stripOutFieldPrefix(field.getRaw().toString(), "Bcc:");
+                for(String eachBcc : Bcc.split(",")){
+                    metadata.add(Metadata.MESSAGE_CC, eachBcc.trim());
+                }
+            }
+        }  else if (fieldname.equalsIgnoreCase("Date")) {
+            DateTimeField dateField =
+                (DateTimeField) AbstractField.parse(field.getRaw());
+            metadata.set(Metadata.CREATION_DATE, dateField.getDate());
         }
     }
 
@@ -173,4 +227,12 @@ class MailContentHandler implements Cont
         inPart = true;
     }
 
+    public String stripOutFieldPrefix(String rawField, String fieldname){ // returns rawField without its first fieldname.length() chars and without leading spaces
+        String temp = rawField.substring(fieldname.length(), rawField.length()); // cut by length only: the prefix text is never compared to fieldname; substring throws if rawField is shorter than fieldname
+        while (temp.startsWith(" ")) { // strips ' ' only; tabs and line breaks are left in place
+            temp = temp.substring(1);
+        }
+        return temp;
+    }
+
 }
\ No newline at end of file

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822-CC-BCC
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822-CC-BCC?rev=1091860&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822-CC-BCC (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822-CC-BCC Wed Apr 13 17:43:26 2011
@@ -0,0 +1,44 @@
+Message-ID: <48...@thyme>
+Date: Tue, 10 Apr 2001 11:52:00 -0700 (PDT)
+From: beth.apollo@enron.com
+To: shona.wilson@enron.com, jeffrey.gossett@enron.com, stacey.white@enron.com,
+	d.hall@enron.com, sheri.thomas@enron.com, brenda.herod@enron.com,
+	john.j.boudreaux@us.arthurandersen.com,
+	john.vickers@us.arthurandersen.com, kate.agnew@us.arthurandersen.com,
+	jennifer.stevenson@us.arthurandersen.com
+Subject: Confidential Folder to safely pass information to  Arthur Andersen
+Cc: sally.beck@enron.com, tom.bauer@us.arthurandersen.com,
+	georgeanne.hodges@enron.com, vanessa.schulte@enron.com,
+	bob.hall@enron.com, leslie.reeves@enron.com, brent.price@enron.com
+Mime-Version: 1.0
+Content-Type: text/plain; charset=us-ascii
+Content-Transfer-Encoding: 7bit
+Bcc: sally.beck@enron.com, tom.bauer@us.arthurandersen.com,
+	georgeanne.hodges@enron.com, vanessa.schulte@enron.com,
+	bob.hall@enron.com, leslie.reeves@enron.com, brent.price@enron.com
+X-From: Beth Apollo <Beth Apollo/ENRON@enronXgate@ENRON>
+X-To: Shona Wilson <Shona Wilson/NA/Enron@Enron>, Jeffrey C Gossett <Jeffrey C Gossett/HOU/ECT@ECT>, Stacey W White <Stacey W White/HOU/ECT@ECT>, D Todd Hall <D Todd Hall/ENRON@enronXgate>, Sheri Thomas <Sheri Thomas/HOU/ECT@ECT>, Brenda F Herod <Brenda F Herod/ENRON@enronXgate>, john.j.boudreaux@us.arthurandersen.com@SMTP <jo...@enronXgate>, john.vickers@us.arthurandersen.com@SMTP <jo...@enronXgate>, kate.agnew@us.arthurandersen.com@SMTP <ka...@enronXgate>, jennifer.stevenson@us.arthurandersen.com@SMTP <je...@enronXgate>
+X-cc: Sally Beck <Sally Beck/HOU/ECT@ECT>, tom.bauer@us.arthurandersen.com@SMTP <to...@enronXgate>, Georgeanne Hodges <Georgeanne Hodges/ENRON@enronXgate>, Vanessa Schulte <Vanessa Schulte/ENRON@enronXgate>, Bob M Hall <Bob M Hall/NA/Enron@Enron>, Leslie Reeves <Leslie Reeves/HOU/ECT@ECT>, Brent A Price <Brent A Price/ENRON@enronXgate>
+X-bcc:
+X-Folder: \Beck, Sally\Beck, Sally\Apollo, Beth
+X-Origin: BECK-S
+X-FileName: Beck, Sally.pst
+
+
+We have become increasingly concerned about confidential information (dpr/position info, curves, validations/stress tests, etc) being passed to Arthur Andersen for audit purposes over the Web to their Arthur Andersen email addresses. (necessary now they no longer have access to Enron's internal email system)
+
+Please use the folder described below when passing any info (that you would have concerns about if it was picked up by a third party) via the shared drive that has been set up for this specific purpose.
+
+Note:  AA should also use the shared drive to pass info back if there are questions, or the data needs updating.  We should also consider the sensitivity of audit findings and special presentations if they are being distributed electronically.
+
+
+Please pass this note to others in your groups who have the need to pass info back and forth.
+
+
+Details on how to access for those who will use this method to pass info:
+
+A secured folder has been set up on the "o" drive under Corporate called Arthur_Andersen (O:\Corporate\Arthur_Anderson).  Please post all confidential files in this folder rather than emailing the files to their company email address.  If you need access to this folder, submit an eRequest through the IT Central site: http://itcentral.enron.com/Data/Services/SecurityRequests/.  Arthur Andersen will be able to retrieve these files for review with their terminal server access at the Three Allen Center location.
+
+Please contact Vanessa Schulte if you have any problems or questions
+
+Beth Apollo
\ No newline at end of file

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822-big
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822-big?rev=1091860&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822-big (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822-big Wed Apr 13 17:43:26 2011
@@ -0,0 +1,199 @@
+Date: Thu, 7 Jun 2001 02:15:00 -0700 (PDT)
+Message-ID: <00...@PMZL01>
+MIME-Version: 1.0
+Content-Type: text/plain; charset=us-ascii
+Content-Transfer-Encoding: 7bit
+From:  Janette Elbertson
+To:  Alan Aronowitz, Sandi M Braband, Robert Bruce, Teresa G Bushman, Michelle Cash,
+	 Dominic Carolan, Barton Clark, Harry M Collins, Mary Cook, Nancy Corbet, Ned
+	 E Crady, Eddy Daniels, Angela Davis, Peter del Vecchio, Stacy E Dickson, Andrew
+	 Edison, Roseann Engeldorf, Shawna Flynn, Robert H George, Barbara N Gray, Mark
+	 Greenberg, Wayne Gresham, Leslie Hansen, Jeffrey T Hodge, Brent Hendry, Dan
+	 J Hyvl, Anne C Koehler, Cheryl Lindeman, Dan Lyons, Kay Mann, Travis McCullough,
+	 Lisa Mellencamp, Janet H Moore, Harlan Murphy, Julia Murray, Cheryl Nelson,
+	 Gerald Nemec, Marcus Nettelton, Francisco Pinto Leite, David Portz, Coralina
+	 Rivera, Michael A Robison, Daniel R Rogers, Elizabeth Sager, Richard B Sanders,
+	 Frank Sayre, Lance Schuler-Legal, Sara Shackleton, Carlos Sole, Carol St Clair,
+	 Lou Stoler, Mark Taylor, Sheila Tweed, Steve Van Hooser, John Viverito, Ann
+	 Elizabeth White, Randy Young, Susan Bailey, Kimberlee A Bennick, Martha Braddy,
+	 Sarah Bruck, Genia FitzGerald, Nony Flores, Diane Goode, Linda R Guinn, Marie
+	 Heard, Ed B Hearn III, Mary J Heinitz, Tana Jones, Kathleen Carnahan, Deb Korkmas,
+	 Laurie Mayer, Matt Maxwell, Mary Ogden, Stephanie Panus, Debra Perlingiere,
+	 Robert Walker, Kay Young, Merrill W Haas, Samantha Ferguson, Majed Nachawati,
+	 Suzanne Adams, Connie Castillo, Margaret Doucette, Keegan Farrell, Nita Garcia,
+	 Carolyn George, Holly Keiser, MaryHelen Martinez, Taffy Milligan, Linda J Simmons,
+	 Becky Spencer, Twanda Sweet, Alice Wright, Theresa Zucha, Reginald Shanks,
+	 Elizabeth Lauterbach, Claudia Meraz
+Cc:  Gary Bode, Vanessa Griffin, Esmeralda Gonzalez, Martha Keesler, Rae Meadows,
+	 Stephanie Truss
+Subject:  Outlook Migration - EWS Legal
+X-Filename:  sbailey2.nsf
+X-Folder:  \All documents
+X-SDOC:  421977
+X-ZLID:  zl-edrm-enron-v2-bailey-s-1216.eml
+
+Our department will be migrated to Outlook in two groups.  The first group
+will be migrated on Monday, June 11,  and the second group will be migrated
+on Tuesday, June 12.   You will receive four e-mails from the Outlook
+migration team.  Please do not delete them.  You will need to open the four
+e-mails and follow the instructions to migrate to Outlook.
+
+Assistants, you will be responsible for scheduling training for yourself and
+your assignments.  It is recommended everyone attend a one hour training
+class.  Training can be scheduled by contacting Maggie Cruz at extension
+3-1816.  (Assistants, please coordinate training with your backup so both of
+you are not in training at the same time.)  Outlook migration specialists
+will be on the 38th floor to answer questions Tuesday and Wednesday, June 12
+and 13.
+
+Listed below is useful information provided to us by the Outlook Migration
+team.
+
+E-mail Policies
+
+Users will be restricted to a Mailbox size of 100 MB.
+
+Further mailbox size restrictions are detailed as follows:
+
+Issue Warning at 75 MB - users are automatically sent a warning from the
+System Administrator explaining they are near their Mailbox limit.
+
+Prohibit Send at 100 MB - users are prevented from sending e-mail, yet they
+can still receive internal and external messages.  Users must reduce the size
+of their mailbox by deleting old mail, saving attachments to a local drive,
+etc. before they can send e-mail again.
+
+Inbound/Outbound Mail Size Limits - inbound and outbound e-mail messages will
+be limited to a size of 10MB.
+
+Deleted Item Retention - users will be able to recover deleted items from
+their mailbox as old as 8 days.  Deleted items include e-mail messages,
+folders, contacts, calendar entries, tasks, notes, journal entries and
+meeting notices.
+
+Archiving - archiving will not be a supported feature of Outlook 2000.
+
+Migration Preparation
+
+Clean Your Mailbox - due to new space limitations on your mailbox, you are
+advised to clean your Notes mailbox of old, unneeded messages BEFORE
+migration.  If you are at the 100MB limit on the day of migration, you will
+not be able to send messages once you are in Outlook.
+
+Limits on Items Migrated  - from the day of your migration, only 30 days of
+old mail will be migrated from your mailbox.  This includes mail in your
+inbox and other folders.  Calendar items dating back one year from the day of
+migration will be migrated (with the exception of repeating appointments).
+
+The following people will be migrated Monday evening, June 11.
+
+Adams, Suzanne
+Bushman, Teresa
+Cash, Michelle
+Clark, Bart
+Corbet, Nancy
+Daniels, Eddy
+Davis, Angela
+Dickson, Stacy
+Edison, Andy
+Elbertson, Janette
+FitzGerald, Genia
+Flores, Nony
+George, Robert H.
+Goode, Diane
+Guinn, Linda
+Haedicke, Mark
+Hansen, Leslie
+Hearn, Ed
+Heinitz, Mary
+Hodge, Jeff
+Legal Temp 1
+Legal Temp 2
+Legal Temp 3
+Legal Temp 4
+Mann, Kay
+Maxwell, Matt
+McCullough, Travis
+Meraz, Claudia
+Mellencamp, Lisa
+Milligan, Taffy
+Moore, Janet H.
+Nemec, Gerald
+Nettelton, Marcus
+Ogden, Mary
+Perlingiere, Debra
+Portz, David
+Sager, Elizabeth
+Sanders, Richard
+Simmons, Linda
+Sol,, Carlos
+St. Clair, Carol
+Sweet, Twanda
+Tweed, Sheila
+Van Hooser, Steve
+White, Ann Elizabeth
+Zucha, Theresa
+
+The following people will be migrated Tuesday evening, June 12.
+
+Aronowitz, Alan
+Bailey, Susan
+Boyd, Samantha
+Braddy, Martha
+Bruce, Robert
+Bruck, Sarah
+Carolan, Dominic
+Castillo, Connie
+Collins, Harry
+Cook, Mary
+Crady, Ned
+del Vecchio, Peter
+Doucette, Margaret
+Farrell, Keegan
+Ferguson, Samantha
+Garcia, Nita
+George, Carolyn
+Gray, Barbara
+Greenberg, Mark
+Gresham, Wayne
+Haas, Merrill
+Heard, Marie
+Hendry, Brent
+Jones, Tana
+Keiser, Holly
+Koehler, Anne
+Korkmas, Deb
+Lauterbach, Elizabeth
+Legal Temp 5
+Legal Temp 6
+Legal Temp 7
+Lindeman, Cheryl
+Lovelady, Steven
+Lyons, Dan
+Martinez, Mary Helen
+Mayer, Laurie
+Murray, Julia Heintz
+Nachawati, Majed
+Nelson,  Cheryl
+Panus, Stephanie
+Pinto Leite, Francisco
+Rivera, Coralina
+Robison, Michael
+Rogers, Daniel
+Sayre, Frank
+Shackleton, Sara
+Shanks, Reginald
+Spencer, Becky
+Stoler, Lou
+Taylor, Mark
+Viverito, John
+Young, Randy
+
+
+Many thanks for your help in making this a smooth migration to Outlook.
+
+Nony Flores and Janette Elbertson
+
+***********
+EDRM Enron Email Data Set has been produced in EML, PST and NSF format by ZL Technologies, Inc. This Data Set is licensed under a Creative Commons Attribution 3.0 United States License <http://creativecommons.org/licenses/by/3.0/us/> . To provide attribution, please cite to "ZL Technologies, Inc. (http://www.zlti.com)."
+***********
\ No newline at end of file