You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/09/21 01:03:17 UTC

svn commit: r1173421 - in /tika/trunk: ./ tika-parsers/ tika-parsers/src/main/java/org/apache/tika/parser/mail/ tika-parsers/src/test/java/org/apache/tika/parser/mail/

Author: jukka
Date: Tue Sep 20 23:03:17 2011
New Revision: 1173421

URL: http://svn.apache.org/viewvc?rev=1173421&view=rev
Log:
TIKA-716: Upgrade apache-Mime4J to Version 0.7

Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-parsers/pom.xml
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1173421&r1=1173420&r2=1173421&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Tue Sep 20 23:03:17 2011
@@ -24,6 +24,9 @@ The most notable changes in Tika 0.10 ov
 
  * The Tika GUI got a facelift and some extra features (TIKA-635)
 
+ * The apache-mime4j dependency of the email message parser was upgraded
+   from version 0.6 to 0.7 (TIKA-716). The parser also now accepts a
+   MimeConfig object in the ParseContext as configuration (TIKA-640).
 
 Release 0.9 - 02/13/2011
 

Modified: tika/trunk/tika-parsers/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1173421&r1=1173420&r2=1173421&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Tue Sep 20 23:03:17 2011
@@ -36,6 +36,7 @@
 
   <properties>
     <poi.version>3.8-beta4</poi.version>
+    <mime4j.version>0.7</mime4j.version>
   </properties>
 
   <dependencies>
@@ -51,8 +52,13 @@
     </dependency>
     <dependency>
       <groupId>org.apache.james</groupId>
-      <artifactId>apache-mime4j</artifactId>
-      <version>0.6</version>
+      <artifactId>apache-mime4j-core</artifactId>
+      <version>${mime4j.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.james</groupId>
+      <artifactId>apache-mime4j-dom</artifactId>
+      <version>${mime4j.version}</version>
     </dependency>
     <dependency>
       <groupId>org.apache.commons</groupId>

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java?rev=1173421&r1=1173420&r2=1173421&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java Tue Sep 20 23:03:17 2011
@@ -20,17 +20,22 @@ import java.io.IOException;
 import java.io.InputStream;
 
 import org.apache.james.mime4j.MimeException;
-import org.apache.james.mime4j.descriptor.BodyDescriptor;
-import org.apache.james.mime4j.field.AbstractField;
-import org.apache.james.mime4j.field.AddressListField;
-import org.apache.james.mime4j.field.DateTimeField;
-import org.apache.james.mime4j.field.MailboxListField;
-import org.apache.james.mime4j.field.ParsedField;
-import org.apache.james.mime4j.field.UnstructuredField;
-import org.apache.james.mime4j.field.address.AddressList;
-import org.apache.james.mime4j.field.address.MailboxList;
+import org.apache.james.mime4j.codec.DecodeMonitor;
+import org.apache.james.mime4j.codec.DecoderUtil;
+import org.apache.james.mime4j.dom.address.Address;
+import org.apache.james.mime4j.dom.address.AddressList;
+import org.apache.james.mime4j.dom.address.Group;
+import org.apache.james.mime4j.dom.address.Mailbox;
+import org.apache.james.mime4j.dom.address.MailboxList;
+import org.apache.james.mime4j.dom.field.AddressListField;
+import org.apache.james.mime4j.dom.field.DateTimeField;
+import org.apache.james.mime4j.dom.field.MailboxListField;
+import org.apache.james.mime4j.dom.field.ParsedField;
+import org.apache.james.mime4j.dom.field.UnstructuredField;
+import org.apache.james.mime4j.field.LenientFieldParser;
 import org.apache.james.mime4j.parser.ContentHandler;
-import org.apache.james.mime4j.parser.Field;
+import org.apache.james.mime4j.stream.BodyDescriptor;
+import org.apache.james.mime4j.stream.Field;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AutoDetectParser;
@@ -133,13 +138,14 @@ class MailContentHandler implements Cont
 
         try {
             String fieldname = field.getName();
-            ParsedField parsedField = AbstractField.parse(field.getRaw());
+            ParsedField parsedField = LenientFieldParser.getParser().parse(
+                    field, DecodeMonitor.SILENT);
             if (fieldname.equalsIgnoreCase("From")) {
                 MailboxListField fromField = (MailboxListField) parsedField;
                 MailboxList mailboxList = fromField.getMailboxList();
                 if (fromField.isValidField() && mailboxList != null) {
                     for (int i = 0; i < mailboxList.size(); i++) {
-                        String from = mailboxList.get(i).getDisplayString();
+                        String from = getDisplayString(mailboxList.get(i));
                         metadata.add(Metadata.MESSAGE_FROM, from);
                         metadata.add(Metadata.AUTHOR, from);
                     }
@@ -181,8 +187,7 @@ class MailContentHandler implements Cont
         if (toField.isValidField()) {
             AddressList addressList = toField.getAddressList();
             for (int i = 0; i < addressList.size(); ++i) {
-                metadata.add(metadataField, addressList.get(i)
-                        .getDisplayString());
+                metadata.add(metadataField, getDisplayString(addressList.get(i)));
             }
         } else {
             String to = stripOutFieldPrefix(field,
@@ -193,6 +198,21 @@ class MailContentHandler implements Cont
         }
     }
 
+    private String getDisplayString(Address address) {
+        if (address instanceof Mailbox) {
+            Mailbox mailbox = (Mailbox) address;
+            String name = mailbox.getName();
+            if (name != null && name.length() > 0) {
+                name = DecoderUtil.decodeEncodedWords(name, DecodeMonitor.SILENT);
+                return name + " <" + mailbox.getAddress() + ">";
+            } else {
+                return mailbox.getAddress();
+            }
+        } else {
+            return address.toString();
+        }
+    }
+
     public void preamble(InputStream is) throws MimeException, IOException {
     }
 

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java?rev=1173421&r1=1173420&r2=1173421&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java Tue Sep 20 23:03:17 2011
@@ -22,8 +22,8 @@ import java.util.Collections;
 import java.util.Set;
 
 import org.apache.james.mime4j.MimeException;
-import org.apache.james.mime4j.parser.MimeEntityConfig;
 import org.apache.james.mime4j.parser.MimeStreamParser;
+import org.apache.james.mime4j.stream.MimeConfig;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TaggedInputStream;
 import org.apache.tika.metadata.Metadata;
@@ -56,9 +56,9 @@ public class RFC822Parser extends Abstra
             Metadata metadata, ParseContext context) throws IOException,
             SAXException, TikaException {
         // Get the mime4j configuration, or use a default one
-        MimeEntityConfig config = new MimeEntityConfig();
+        MimeConfig config = new MimeConfig();
         config.setMaxLineLen(10000); // max length of any individual header
-        config = context.get(MimeEntityConfig.class, config);
+        config = context.get(MimeConfig.class, config);
 
         MimeStreamParser parser = new MimeStreamParser(config);
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java?rev=1173421&r1=1173420&r2=1173421&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java Tue Sep 20 23:03:17 2011
@@ -28,7 +28,7 @@ import java.io.InputStream;
 
 import junit.framework.TestCase;
 
-import org.apache.james.mime4j.parser.MimeEntityConfig;
+import org.apache.james.mime4j.stream.MimeConfig;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
@@ -192,12 +192,13 @@ public class RFC822ParserTest extends Te
         } catch (TikaException expected) {
         }
 
-        MimeEntityConfig config = new MimeEntityConfig();
+        MimeConfig config = new MimeConfig();
+        config.setMaxHeaderLen(-1);
         config.setMaxLineLen(-1);
-        context.set(MimeEntityConfig.class, config);
+        context.set(MimeConfig.class, config);
         parser.parse(
                 new ByteArrayInputStream(data), handler, metadata, context);
-        assertEquals(name, metadata.get(Metadata.AUTHOR));
+        assertEquals(name.trim(), metadata.get(Metadata.AUTHOR));
     }
     
     /**
@@ -210,8 +211,12 @@ public class RFC822ParserTest extends Te
        ContentHandler handler = new BodyContentHandler();
 
        parser.parse(stream, handler, metadata, new ParseContext());
-       assertEquals("xyz, abc", metadata.get(Metadata.AUTHOR));
-       assertEquals("xyz, abc", metadata.get(Metadata.MESSAGE_FROM));
+       assertEquals(true, metadata.isMultiValued(Metadata.AUTHOR));
+       assertEquals("xyz", metadata.getValues(Metadata.AUTHOR)[0]);
+       assertEquals("abc", metadata.getValues(Metadata.AUTHOR)[1]);
+       assertEquals(true, metadata.isMultiValued(Metadata.MESSAGE_FROM));
+       assertEquals("xyz", metadata.getValues(Metadata.MESSAGE_FROM)[0]);
+       assertEquals("abc", metadata.getValues(Metadata.MESSAGE_FROM)[1]);
        assertEquals(true, metadata.isMultiValued(Metadata.MESSAGE_TO));
        assertEquals("abc", metadata.getValues(Metadata.MESSAGE_TO)[0]);
        assertEquals("def", metadata.getValues(Metadata.MESSAGE_TO)[1]);