You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/12/01 16:28:59 UTC

svn commit: r1041053 - in /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail: MailContentHandler.java RFC822Parser.java

Author: jukka
Date: Wed Dec  1 15:28:59 2010
New Revision: 1041053

URL: http://svn.apache.org/viewvc?rev=1041053&view=rev
Log:
TIKA-461: RFC822 messages not parsed

Move the MailContentHandler class to a separate file.
Replace StrictBodyContentHandler with a composition of the EmbeddedContentHandler and BodyContentHandler classes.

Added:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
      - copied, changed from r1041000, tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java

Copied: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java (from r1041000, tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java)
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java?p2=tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java&p1=tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java&r1=1041000&r2=1041053&rev=1041053&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java Wed Dec  1 15:28:59 2010
@@ -18,89 +18,25 @@ package org.apache.tika.parser.mail;
 
 import java.io.IOException;
 import java.io.InputStream;
-import java.util.Collections;
-import java.util.Set;
 
 import org.apache.james.mime4j.MimeException;
 import org.apache.james.mime4j.descriptor.BodyDescriptor;
+import org.apache.james.mime4j.parser.ContentHandler;
 import org.apache.james.mime4j.parser.Field;
-import org.apache.james.mime4j.parser.MimeStreamParser;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
 /**
- * Uses apache-mime4j to parse emails. Each part is treated with the
- * corresponding parser and displayed within
- * <p>
- * elements.
- * 
- * @author jnioche@digitalpebble.com
- **/
-public class RFC822Parser implements Parser {
-
-    private static final Set<MediaType> SUPPORTED_TYPES = Collections
-            .singleton(MediaType.parse("message/rfc822"));
-
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return SUPPORTED_TYPES;
-    }
-
-    public void parse(InputStream stream, ContentHandler handler,
-            Metadata metadata, ParseContext context) throws IOException,
-            SAXException, TikaException {
-        MimeStreamParser parser = new MimeStreamParser();
-        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-
-        MailContentHandler mch = new MailContentHandler(xhtml, metadata);
-        parser.setContentHandler(mch);
-        try {
-            parser.parse(stream);
-        } catch (MimeException e) {
-            throw new TikaException(e.getMessage());
-        }
-    }
-
-    public void parse(InputStream stream, ContentHandler handler,
-            Metadata metadata) throws IOException, SAXException, TikaException {
-        parse(stream, handler, metadata, new ParseContext());
-    }
-
-}
-
-/**
- * Same as BodyContentHandler but does not even propagate the start |
- * endDocument events
- **/
-class StrictBodyContentHandler extends BodyContentHandler {
-
-    public StrictBodyContentHandler(XHTMLContentHandler handler) {
-        super(handler);
-    }
-
-    public void startDocument() throws SAXException {
-    }
-
-    public void endDocument() throws SAXException {
-    }
-
-}
-
-/**
  * Bridge between mime4j's content handler and the generic Sax content handler
  * used by Tika. See
- * http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j
- * /parser/ContentHandler.html
+ * http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/ContentHandler.html
  */
-class MailContentHandler implements
-        org.apache.james.mime4j.parser.ContentHandler {
+class MailContentHandler implements ContentHandler {
 
     private XHTMLContentHandler handler;
     private Metadata metadata;
@@ -125,14 +61,9 @@ class MailContentHandler implements
         submd.set(Metadata.CONTENT_TYPE, body.getMimeType());
         submd.set(Metadata.CONTENT_ENCODING, body.getCharset());
 
-        // filter the events coming from the underlying parser
-        // to prevent getting multiple </body> </html> or </title>
-        // in the output
-
-        StrictBodyContentHandler bch = new StrictBodyContentHandler(handler);
-
         try {
-            parser.parse(is, bch, submd);
+            BodyContentHandler bch = new BodyContentHandler(handler);
+            parser.parse(is, new EmbeddedContentHandler(bch), submd);
         } catch (SAXException e) {
             e.printStackTrace();
         } catch (TikaException e) {
@@ -222,4 +153,4 @@ class MailContentHandler implements
         inPart = true;
     }
 
-}
+}
\ No newline at end of file

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java?rev=1041053&r1=1041052&r2=1041053&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java Wed Dec  1 15:28:59 2010
@@ -22,25 +22,19 @@ import java.util.Collections;
 import java.util.Set;
 
 import org.apache.james.mime4j.MimeException;
-import org.apache.james.mime4j.descriptor.BodyDescriptor;
-import org.apache.james.mime4j.parser.Field;
 import org.apache.james.mime4j.parser.MimeStreamParser;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
 /**
  * Uses apache-mime4j to parse emails. Each part is treated with the
- * corresponding parser and displayed within
- * <p>
- * elements.
+ * corresponding parser and displayed within elements.
  * 
  * @author jnioche@digitalpebble.com
  **/
@@ -74,152 +68,3 @@ public class RFC822Parser implements Par
     }
 
 }
-
-/**
- * Same as BodyContentHandler but does not even propagate the start |
- * endDocument events
- **/
-class StrictBodyContentHandler extends BodyContentHandler {
-
-    public StrictBodyContentHandler(XHTMLContentHandler handler) {
-        super(handler);
-    }
-
-    public void startDocument() throws SAXException {
-    }
-
-    public void endDocument() throws SAXException {
-    }
-
-}
-
-/**
- * Bridge between mime4j's content handler and the generic Sax content handler
- * used by Tika. See
- * http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j
- * /parser/ContentHandler.html
- */
-class MailContentHandler implements
-        org.apache.james.mime4j.parser.ContentHandler {
-
-    private XHTMLContentHandler handler;
-    private Metadata metadata;
-
-    private boolean inPart = false;
-
-    MailContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
-        this.handler = xhtml;
-        this.metadata = metadata;
-    }
-
-    public void body(BodyDescriptor body, InputStream is) throws MimeException,
-            IOException {
-        // call the underlying parser for the part
-        // TODO how to retrieve a non-default config?
-        AutoDetectParser parser = new AutoDetectParser();
-        // use a different metadata object
-        // in order to specify the mime type of the
-        // sub part without damaging the main metadata
-
-        Metadata submd = new Metadata();
-        submd.set(Metadata.CONTENT_TYPE, body.getMimeType());
-        submd.set(Metadata.CONTENT_ENCODING, body.getCharset());
-
-        // filter the events coming from the underlying parser
-        // to prevent getting multiple </body> </html> or </title>
-        // in the output
-
-        StrictBodyContentHandler bch = new StrictBodyContentHandler(handler);
-
-        try {
-            parser.parse(is, bch, submd);
-        } catch (SAXException e) {
-            e.printStackTrace();
-        } catch (TikaException e) {
-            e.printStackTrace();
-        }
-    }
-
-    public void endBodyPart() throws MimeException {
-        try {
-            handler.endElement("p");
-            handler.endElement("div");
-        } catch (SAXException e) {
-            e.printStackTrace();
-        }
-    }
-
-    public void endHeader() throws MimeException {
-    }
-
-    public void startMessage() throws MimeException {
-        try {
-            handler.startDocument();
-        } catch (SAXException e) {
-            e.printStackTrace();
-        }
-    }
-
-    public void endMessage() throws MimeException {
-        try {
-            handler.endDocument();
-        } catch (SAXException e) {
-            e.printStackTrace();
-        }
-    }
-
-    public void endMultipart() throws MimeException {
-        inPart = false;
-    }
-
-    public void epilogue(InputStream is) throws MimeException, IOException {
-    }
-
-    /**
-     * Header for the whole message or its parts
-     * 
-     * @see http 
-     *      ://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/
-     *      Field.html
-     **/
-    public void field(Field field) throws MimeException {
-        // inPart indicates whether these metadata correspond to the
-        // whole message or its parts
-        if (inPart)
-            return;
-        // TODO add metadata to the parts later
-        String fieldname = field.getName();
-        // TODO value could be parsed and/or encoded
-        String value = field.getBody();
-        if (fieldname.equalsIgnoreCase("From")) {
-            metadata.add(Metadata.AUTHOR, value);
-        } else if (fieldname.equalsIgnoreCase("Subject")) {
-            metadata.add(Metadata.SUBJECT, value);
-        }
-    }
-
-    public void preamble(InputStream is) throws MimeException, IOException {
-    }
-
-    public void raw(InputStream is) throws MimeException, IOException {
-    }
-
-    public void startBodyPart() throws MimeException {
-        try {
-            handler.startElement("div", "class", "email-entry");
-            handler.startElement("p");
-        } catch (SAXException e) {
-            e.printStackTrace();
-        }
-    }
-
-    public void startHeader() throws MimeException {
-        // TODO Auto-generated method stub
-
-    }
-
-    public void startMultipart(BodyDescriptor descr) throws MimeException {
-        inPart = true;
-    }
-
-}