You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/04/11 13:47:29 UTC

svn commit: r1091044 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/microsoft/TNEFParser.java test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java

Author: nick
Date: Mon Apr 11 11:47:29 2011
New Revision: 1091044

URL: http://svn.apache.org/viewvc?rev=1091044&view=rev
Log:
TIKA-615 - POI powered TNEF parser

Added:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java
Modified:
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java?rev=1091044&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java Mon Apr 11 11:47:29 2011
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.poi.hmef.Attachment;
+import org.apache.poi.hmef.HMEFMessage;
+import org.apache.poi.hmef.attribute.MAPIAttribute;
+import org.apache.poi.hmef.attribute.MAPIRtfAttribute;
+import org.apache.poi.hsmf.datatypes.MAPIProperty;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * A POI-powered Tika Parser for TNEF (Transport Neutral Encapsulation
+ *  Format) messages, aka winmail.dat. Body and attachments are handled as embedded documents.
+ */
+public class TNEFParser implements Parser {
+   private static final long serialVersionUID = 4611820730372823452L;
+   // The TNEF media types reported by getSupportedTypes
+   private static final Set<MediaType> SUPPORTED_TYPES =
+        Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+              MediaType.application("vnd.ms-tnef"),
+              MediaType.application("ms-tnef"),
+              MediaType.application("x-tnef")
+         )));
+    /** Returns the fixed, unmodifiable set of TNEF media types; the context is not consulted. */
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Sets the subject metadata, then offers the RTF body and each attachment to the embedded extractor
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+       
+       // Body and attachments are parsed recursively: use the context's extractor, else a parsing one
+       EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
+       EmbeddedDocumentExtractor embeddedExtractor;
+       if (ex==null) {
+           embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context);
+       } else {
+           embeddedExtractor = ex;
+       }
+       
+       // Ask POI (HMEF) to decode the TNEF stream for us
+       HMEFMessage msg = new HMEFMessage(stream);
+       
+       // Set the message subject, skipping null or empty values
+       String subject = msg.getSubject();
+       if(subject != null && subject.length() > 0) {
+          metadata.set(Metadata.SUBJECT, subject);
+       }
+       
+       // Recurse into the message body RTF (the null check is redundant: instanceof is false for null)
+       MAPIAttribute attr = msg.getMessageMAPIAttribute(MAPIProperty.RTF_COMPRESSED);
+       if(attr != null && attr instanceof MAPIRtfAttribute) {
+          MAPIRtfAttribute rtf = (MAPIRtfAttribute)attr;
+          handleEmbedded(
+                "message.rtf", "application/rtf",
+                rtf.getData(),
+                embeddedExtractor, handler
+          );
+       }
+       // Recurse into each attachment in turn. The name is the long filename, else the
+       //  short filename, else "unknown" + extension; it stays null when none of those exist
+       for(Attachment attachment : msg.getAttachments()) {
+          String name = attachment.getLongFilename();
+          if(name == null || name.length() == 0) {
+             name = attachment.getFilename();
+          }
+          if(name == null || name.length() == 0) {
+             String ext = attachment.getExtension();
+             if(ext != null) {
+                name = "unknown" + ext;
+             }
+          }
+          handleEmbedded(
+                name, null, attachment.getContents(),
+                embeddedExtractor, handler
+          );
+       }
+    }
+    /** Offers one embedded resource to the extractor; name and type metadata are set only when non-null. */
+    private void handleEmbedded(String name, String type, byte[] contents,
+          EmbeddedDocumentExtractor embeddedExtractor, ContentHandler handler)
+          throws IOException, SAXException, TikaException {
+       Metadata metadata = new Metadata();
+       if(name != null)
+          metadata.set(Metadata.RESOURCE_NAME_KEY, name);
+       if(type != null)
+          metadata.set(Metadata.CONTENT_TYPE, type);
+
+       if (embeddedExtractor.shouldParseEmbedded(metadata)) {
+         embeddedExtractor.parseEmbedded(
+                 TikaInputStream.get(contents),
+                 new EmbeddedContentHandler(handler),
+                 metadata, false);
+       }
+    }
+
+    /**
+     * @deprecated Will be removed in Apache Tika 1.0; delegates to the four-argument parse with a new ParseContext.
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler, Metadata metadata)
+            throws IOException, SAXException, TikaException {
+        parse(stream, handler, metadata, new ParseContext());
+    }
+}

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java?rev=1091044&r1=1091043&r2=1091044&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java Mon Apr 11 11:47:29 2011
@@ -17,10 +17,15 @@
 package org.apache.tika.parser.microsoft;
 
 import org.apache.tika.detect.ContainerAwareDetector;
+import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.ParserContainerExtractor;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.mime.MimeTypes;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
 
 /**
  * Tests for the TNEF (winmail.dat) parser
@@ -41,4 +46,49 @@ public class TNEFParserTest extends Abst
          stream.close();
      }
    }
+   
+   public void testMetadata() throws Exception {
+      TikaInputStream stream = getTestFile(file);
+      // Purpose: the subject of the TNEF test message should surface as Metadata.SUBJECT
+      Metadata metadata = new Metadata();
+      ContentHandler handler = new BodyContentHandler();
+      // Call the parser directly with a new ParseContext (no custom embedded extractor set)
+      TNEFParser tnef = new TNEFParser();
+      tnef.parse(stream, handler, metadata, new ParseContext());
+      // NOTE(review): stream is never closed in this method
+      assertEquals("This is a test message", metadata.get(Metadata.SUBJECT));
+   }
+   
+    /**
+     * Checks that extraction yields the message body RTF followed by the five
+     *  attachments, each with the expected filename and media type
+     */
+    public void testBodyAndAttachments() throws Exception {
+       ContainerExtractor extractor = new ParserContainerExtractor();
+       
+       // Process it with recursion enabled
+       // Expect six entries: the message body RTF plus the five attachments
+       TrackingHandler handler = process(file, extractor, true);
+       assertEquals(6, handler.filenames.size());
+       assertEquals(6, handler.mediaTypes.size());
+       
+       // Filenames and media types, in the order the handler recorded them
+       assertEquals("message.rtf", handler.filenames.get(0));
+       assertEquals(MediaType.application("rtf"), handler.mediaTypes.get(0));
+       
+       assertEquals("quick.doc", handler.filenames.get(1));
+       assertEquals(MediaType.application("msword"), handler.mediaTypes.get(1));
+       
+       assertEquals("quick.html", handler.filenames.get(2));
+       assertEquals(MediaType.text("html"), handler.mediaTypes.get(2));
+       
+       assertEquals("quick.pdf", handler.filenames.get(3));
+       assertEquals(MediaType.application("pdf"), handler.mediaTypes.get(3));
+       
+       assertEquals("quick.txt", handler.filenames.get(4));
+       assertEquals(MediaType.text("plain"), handler.mediaTypes.get(4));
+       
+       assertEquals("quick.xml", handler.filenames.get(5));
+       assertEquals(MediaType.application("xml"), handler.mediaTypes.get(5));
+    }
 }