You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/09/21 01:03:17 UTC
svn commit: r1173421 - in /tika/trunk: ./ tika-parsers/
tika-parsers/src/main/java/org/apache/tika/parser/mail/
tika-parsers/src/test/java/org/apache/tika/parser/mail/
Author: jukka
Date: Tue Sep 20 23:03:17 2011
New Revision: 1173421
URL: http://svn.apache.org/viewvc?rev=1173421&view=rev
Log:
TIKA-716: Upgrade apache-Mime4J to Version 0.7
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/pom.xml
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1173421&r1=1173420&r2=1173421&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Tue Sep 20 23:03:17 2011
@@ -24,6 +24,9 @@ The most notable changes in Tika 0.10 ov
* The Tika GUI got a facelift and some extra features (TIKA-635)
+ * The apache-mime4j dependency of the email message parser was upgraded
+ from version 0.6 to 0.7 (TIKA-716). The parser also now accepts a
+ MimeConfig object in the ParseContext as configuration (TIKA-640).
Release 0.9 - 02/13/2011
Modified: tika/trunk/tika-parsers/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1173421&r1=1173420&r2=1173421&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Tue Sep 20 23:03:17 2011
@@ -36,6 +36,7 @@
<properties>
<poi.version>3.8-beta4</poi.version>
+ <mime4j.version>0.7</mime4j.version>
</properties>
<dependencies>
@@ -51,8 +52,13 @@
</dependency>
<dependency>
<groupId>org.apache.james</groupId>
- <artifactId>apache-mime4j</artifactId>
- <version>0.6</version>
+ <artifactId>apache-mime4j-core</artifactId>
+ <version>${mime4j.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.james</groupId>
+ <artifactId>apache-mime4j-dom</artifactId>
+ <version>${mime4j.version}</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java?rev=1173421&r1=1173420&r2=1173421&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java Tue Sep 20 23:03:17 2011
@@ -20,17 +20,22 @@ import java.io.IOException;
import java.io.InputStream;
import org.apache.james.mime4j.MimeException;
-import org.apache.james.mime4j.descriptor.BodyDescriptor;
-import org.apache.james.mime4j.field.AbstractField;
-import org.apache.james.mime4j.field.AddressListField;
-import org.apache.james.mime4j.field.DateTimeField;
-import org.apache.james.mime4j.field.MailboxListField;
-import org.apache.james.mime4j.field.ParsedField;
-import org.apache.james.mime4j.field.UnstructuredField;
-import org.apache.james.mime4j.field.address.AddressList;
-import org.apache.james.mime4j.field.address.MailboxList;
+import org.apache.james.mime4j.codec.DecodeMonitor;
+import org.apache.james.mime4j.codec.DecoderUtil;
+import org.apache.james.mime4j.dom.address.Address;
+import org.apache.james.mime4j.dom.address.AddressList;
+import org.apache.james.mime4j.dom.address.Group;
+import org.apache.james.mime4j.dom.address.Mailbox;
+import org.apache.james.mime4j.dom.address.MailboxList;
+import org.apache.james.mime4j.dom.field.AddressListField;
+import org.apache.james.mime4j.dom.field.DateTimeField;
+import org.apache.james.mime4j.dom.field.MailboxListField;
+import org.apache.james.mime4j.dom.field.ParsedField;
+import org.apache.james.mime4j.dom.field.UnstructuredField;
+import org.apache.james.mime4j.field.LenientFieldParser;
import org.apache.james.mime4j.parser.ContentHandler;
-import org.apache.james.mime4j.parser.Field;
+import org.apache.james.mime4j.stream.BodyDescriptor;
+import org.apache.james.mime4j.stream.Field;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
@@ -133,13 +138,14 @@ class MailContentHandler implements Cont
try {
String fieldname = field.getName();
- ParsedField parsedField = AbstractField.parse(field.getRaw());
+ ParsedField parsedField = LenientFieldParser.getParser().parse(
+ field, DecodeMonitor.SILENT);
if (fieldname.equalsIgnoreCase("From")) {
MailboxListField fromField = (MailboxListField) parsedField;
MailboxList mailboxList = fromField.getMailboxList();
if (fromField.isValidField() && mailboxList != null) {
for (int i = 0; i < mailboxList.size(); i++) {
- String from = mailboxList.get(i).getDisplayString();
+ String from = getDisplayString(mailboxList.get(i));
metadata.add(Metadata.MESSAGE_FROM, from);
metadata.add(Metadata.AUTHOR, from);
}
@@ -181,8 +187,7 @@ class MailContentHandler implements Cont
if (toField.isValidField()) {
AddressList addressList = toField.getAddressList();
for (int i = 0; i < addressList.size(); ++i) {
- metadata.add(metadataField, addressList.get(i)
- .getDisplayString());
+ metadata.add(metadataField, getDisplayString(addressList.get(i)));
}
} else {
String to = stripOutFieldPrefix(field,
@@ -193,6 +198,21 @@ class MailContentHandler implements Cont
}
}
+ private String getDisplayString(Address address) {
+ if (address instanceof Mailbox) {
+ Mailbox mailbox = (Mailbox) address;
+ String name = mailbox.getName();
+ if (name != null && name.length() > 0) {
+ name = DecoderUtil.decodeEncodedWords(name, DecodeMonitor.SILENT);
+ return name + " <" + mailbox.getAddress() + ">";
+ } else {
+ return mailbox.getAddress();
+ }
+ } else {
+ return address.toString();
+ }
+ }
+
public void preamble(InputStream is) throws MimeException, IOException {
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java?rev=1173421&r1=1173420&r2=1173421&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java Tue Sep 20 23:03:17 2011
@@ -22,8 +22,8 @@ import java.util.Collections;
import java.util.Set;
import org.apache.james.mime4j.MimeException;
-import org.apache.james.mime4j.parser.MimeEntityConfig;
import org.apache.james.mime4j.parser.MimeStreamParser;
+import org.apache.james.mime4j.stream.MimeConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TaggedInputStream;
import org.apache.tika.metadata.Metadata;
@@ -56,9 +56,9 @@ public class RFC822Parser extends Abstra
Metadata metadata, ParseContext context) throws IOException,
SAXException, TikaException {
// Get the mime4j configuration, or use a default one
- MimeEntityConfig config = new MimeEntityConfig();
+ MimeConfig config = new MimeConfig();
config.setMaxLineLen(10000); // max length of any individual header
- config = context.get(MimeEntityConfig.class, config);
+ config = context.get(MimeConfig.class, config);
MimeStreamParser parser = new MimeStreamParser(config);
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java?rev=1173421&r1=1173420&r2=1173421&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java Tue Sep 20 23:03:17 2011
@@ -28,7 +28,7 @@ import java.io.InputStream;
import junit.framework.TestCase;
-import org.apache.james.mime4j.parser.MimeEntityConfig;
+import org.apache.james.mime4j.stream.MimeConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
@@ -192,12 +192,13 @@ public class RFC822ParserTest extends Te
} catch (TikaException expected) {
}
- MimeEntityConfig config = new MimeEntityConfig();
+ MimeConfig config = new MimeConfig();
+ config.setMaxHeaderLen(-1);
config.setMaxLineLen(-1);
- context.set(MimeEntityConfig.class, config);
+ context.set(MimeConfig.class, config);
parser.parse(
new ByteArrayInputStream(data), handler, metadata, context);
- assertEquals(name, metadata.get(Metadata.AUTHOR));
+ assertEquals(name.trim(), metadata.get(Metadata.AUTHOR));
}
/**
@@ -210,8 +211,12 @@ public class RFC822ParserTest extends Te
ContentHandler handler = new BodyContentHandler();
parser.parse(stream, handler, metadata, new ParseContext());
- assertEquals("xyz, abc", metadata.get(Metadata.AUTHOR));
- assertEquals("xyz, abc", metadata.get(Metadata.MESSAGE_FROM));
+ assertEquals(true, metadata.isMultiValued(Metadata.AUTHOR));
+ assertEquals("xyz", metadata.getValues(Metadata.AUTHOR)[0]);
+ assertEquals("abc", metadata.getValues(Metadata.AUTHOR)[1]);
+ assertEquals(true, metadata.isMultiValued(Metadata.MESSAGE_FROM));
+ assertEquals("xyz", metadata.getValues(Metadata.MESSAGE_FROM)[0]);
+ assertEquals("abc", metadata.getValues(Metadata.MESSAGE_FROM)[1]);
assertEquals(true, metadata.isMultiValued(Metadata.MESSAGE_TO));
assertEquals("abc", metadata.getValues(Metadata.MESSAGE_TO)[0]);
assertEquals("def", metadata.getValues(Metadata.MESSAGE_TO)[1]);