You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/12/01 16:28:59 UTC
svn commit: r1041053 - in /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail: MailContentHandler.java RFC822Parser.java
Author: jukka
Date: Wed Dec 1 15:28:59 2010
New Revision: 1041053
URL: http://svn.apache.org/viewvc?rev=1041053&view=rev
Log:
TIKA-461: RFC822 messages not parsed
Move the MailContentHandler class to a separate file.
Replace StrictBodyContentHandler by a composition of the Embedded- and BodyContentHandler classes.
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
- copied, changed from r1041000, tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
Copied: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java (from r1041000, tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java)
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java?p2=tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java&p1=tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java&r1=1041000&r2=1041053&rev=1041053&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java Wed Dec 1 15:28:59 2010
@@ -18,89 +18,25 @@ package org.apache.tika.parser.mail;
import java.io.IOException;
import java.io.InputStream;
-import java.util.Collections;
-import java.util.Set;
import org.apache.james.mime4j.MimeException;
import org.apache.james.mime4j.descriptor.BodyDescriptor;
+import org.apache.james.mime4j.parser.ContentHandler;
import org.apache.james.mime4j.parser.Field;
-import org.apache.james.mime4j.parser.MimeStreamParser;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
- * Uses apache-mime4j to parse emails. Each part is treated with the
- * corresponding parser and displayed within
- * <p>
- * elements.
- *
- * @author jnioche@digitalpebble.com
- **/
-public class RFC822Parser implements Parser {
-
- private static final Set<MediaType> SUPPORTED_TYPES = Collections
- .singleton(MediaType.parse("message/rfc822"));
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- public void parse(InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context) throws IOException,
- SAXException, TikaException {
- MimeStreamParser parser = new MimeStreamParser();
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-
- MailContentHandler mch = new MailContentHandler(xhtml, metadata);
- parser.setContentHandler(mch);
- try {
- parser.parse(stream);
- } catch (MimeException e) {
- throw new TikaException(e.getMessage());
- }
- }
-
- public void parse(InputStream stream, ContentHandler handler,
- Metadata metadata) throws IOException, SAXException, TikaException {
- parse(stream, handler, metadata, new ParseContext());
- }
-
-}
-
-/**
- * Same as BodyContentHandler but does not even propagate the start |
- * endDocument events
- **/
-class StrictBodyContentHandler extends BodyContentHandler {
-
- public StrictBodyContentHandler(XHTMLContentHandler handler) {
- super(handler);
- }
-
- public void startDocument() throws SAXException {
- }
-
- public void endDocument() throws SAXException {
- }
-
-}
-
-/**
* Bridge between mime4j's content handler and the generic Sax content handler
* used by Tika. See
- * http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j
- * /parser/ContentHandler.html
+ * http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/ContentHandler.html
*/
-class MailContentHandler implements
- org.apache.james.mime4j.parser.ContentHandler {
+class MailContentHandler implements ContentHandler {
private XHTMLContentHandler handler;
private Metadata metadata;
@@ -125,14 +61,9 @@ class MailContentHandler implements
submd.set(Metadata.CONTENT_TYPE, body.getMimeType());
submd.set(Metadata.CONTENT_ENCODING, body.getCharset());
- // filter the events coming from the underlying parser
- // to prevent getting multiple </body> </html> or </title>
- // in the output
-
- StrictBodyContentHandler bch = new StrictBodyContentHandler(handler);
-
try {
- parser.parse(is, bch, submd);
+ BodyContentHandler bch = new BodyContentHandler(handler);
+ parser.parse(is, new EmbeddedContentHandler(bch), submd);
} catch (SAXException e) {
e.printStackTrace();
} catch (TikaException e) {
@@ -222,4 +153,4 @@ class MailContentHandler implements
inPart = true;
}
-}
+}
\ No newline at end of file
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java?rev=1041053&r1=1041052&r2=1041053&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java Wed Dec 1 15:28:59 2010
@@ -22,25 +22,19 @@ import java.util.Collections;
import java.util.Set;
import org.apache.james.mime4j.MimeException;
-import org.apache.james.mime4j.descriptor.BodyDescriptor;
-import org.apache.james.mime4j.parser.Field;
import org.apache.james.mime4j.parser.MimeStreamParser;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
* Uses apache-mime4j to parse emails. Each part is treated with the
- * corresponding parser and displayed within
- * <p>
- * elements.
+ * corresponding parser and displayed within elements.
*
* @author jnioche@digitalpebble.com
**/
@@ -74,152 +68,3 @@ public class RFC822Parser implements Par
}
}
-
-/**
- * Same as BodyContentHandler but does not even propagate the start |
- * endDocument events
- **/
-class StrictBodyContentHandler extends BodyContentHandler {
-
- public StrictBodyContentHandler(XHTMLContentHandler handler) {
- super(handler);
- }
-
- public void startDocument() throws SAXException {
- }
-
- public void endDocument() throws SAXException {
- }
-
-}
-
-/**
- * Bridge between mime4j's content handler and the generic Sax content handler
- * used by Tika. See
- * http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j
- * /parser/ContentHandler.html
- */
-class MailContentHandler implements
- org.apache.james.mime4j.parser.ContentHandler {
-
- private XHTMLContentHandler handler;
- private Metadata metadata;
-
- private boolean inPart = false;
-
- MailContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
- this.handler = xhtml;
- this.metadata = metadata;
- }
-
- public void body(BodyDescriptor body, InputStream is) throws MimeException,
- IOException {
- // call the underlying parser for the part
- // TODO how to retrieve a non-default config?
- AutoDetectParser parser = new AutoDetectParser();
- // use a different metadata object
- // in order to specify the mime type of the
- // sub part without damaging the main metadata
-
- Metadata submd = new Metadata();
- submd.set(Metadata.CONTENT_TYPE, body.getMimeType());
- submd.set(Metadata.CONTENT_ENCODING, body.getCharset());
-
- // filter the events coming from the underlying parser
- // to prevent getting multiple </body> </html> or </title>
- // in the output
-
- StrictBodyContentHandler bch = new StrictBodyContentHandler(handler);
-
- try {
- parser.parse(is, bch, submd);
- } catch (SAXException e) {
- e.printStackTrace();
- } catch (TikaException e) {
- e.printStackTrace();
- }
- }
-
- public void endBodyPart() throws MimeException {
- try {
- handler.endElement("p");
- handler.endElement("div");
- } catch (SAXException e) {
- e.printStackTrace();
- }
- }
-
- public void endHeader() throws MimeException {
- }
-
- public void startMessage() throws MimeException {
- try {
- handler.startDocument();
- } catch (SAXException e) {
- e.printStackTrace();
- }
- }
-
- public void endMessage() throws MimeException {
- try {
- handler.endDocument();
- } catch (SAXException e) {
- e.printStackTrace();
- }
- }
-
- public void endMultipart() throws MimeException {
- inPart = false;
- }
-
- public void epilogue(InputStream is) throws MimeException, IOException {
- }
-
- /**
- * Header for the whole message or its parts
- *
- * @see http
- * ://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/
- * Field.html
- **/
- public void field(Field field) throws MimeException {
- // inPart indicates whether these metadata correspond to the
- // whole message or its parts
- if (inPart)
- return;
- // TODO add metadata to the parts later
- String fieldname = field.getName();
- // TODO value could be parsed and/or encoded
- String value = field.getBody();
- if (fieldname.equalsIgnoreCase("From")) {
- metadata.add(Metadata.AUTHOR, value);
- } else if (fieldname.equalsIgnoreCase("Subject")) {
- metadata.add(Metadata.SUBJECT, value);
- }
- }
-
- public void preamble(InputStream is) throws MimeException, IOException {
- }
-
- public void raw(InputStream is) throws MimeException, IOException {
- }
-
- public void startBodyPart() throws MimeException {
- try {
- handler.startElement("div", "class", "email-entry");
- handler.startElement("p");
- } catch (SAXException e) {
- e.printStackTrace();
- }
- }
-
- public void startHeader() throws MimeException {
- // TODO Auto-generated method stub
-
- }
-
- public void startMultipart(BodyDescriptor descr) throws MimeException {
- inPart = true;
- }
-
-}