You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/11/28 14:57:09 UTC

svn commit: r1207196 - in /tika/trunk: tika-core/src/main/resources/org/apache/tika/mime/ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/

Author: nick
Date: Mon Nov 28 13:57:08 2011
New Revision: 1207196

URL: http://svn.apache.org/viewvc?rev=1207196&view=rev
Log:
TIKA-790: Remove the detection code that was duplicated between OfficeParser and POIFSContainerDetector. Following the pattern from TIKA-791, add a dedicated media type for OLE10Native, and push the rest of the detection work to POIFSContainerDetector.

Modified:
    tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java

Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1207196&r1=1207195&r2=1207196&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Mon Nov 28 13:57:08 2011
@@ -2761,6 +2761,11 @@
     </magic>
   </mime-type>
 
+  <mime-type type="application/x-tika-msoffice-embedded">
+    <sub-class-of type="application/x-tika-msoffice"/>
+    <_comment>OLE10 Native Embedded Document</_comment>
+  </mime-type>
+
   <!-- =================================================================== -->
   <!-- Office Open XML file formats                                        -->
   <!-- http://www.ecma-international.org/publications/standards/Ecma-376.htm -->

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=1207196&r1=1207195&r2=1207196&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Mon Nov 28 13:57:08 2011
@@ -21,10 +21,8 @@ import java.io.InputStream;
 import java.security.GeneralSecurityException;
 import java.util.Arrays;
 import java.util.Collections;
-import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Locale;
-import java.util.Map;
 import java.util.Set;
 
 import org.apache.poi.hdgf.extractor.VisioTextExtractor;
@@ -77,7 +75,7 @@ public class OfficeParser extends Abstra
 
     public enum POIFSDocumentType {
         WORKBOOK("xls", MediaType.application("vnd.ms-excel")),
-        OLE10_NATIVE("ole", MediaType.application("x-tika-msoffice")),
+        OLE10_NATIVE("ole", MediaType.application("x-tika-msoffice-embedded")),
         WORDDOCUMENT("doc", MediaType.application("msword")),
         UNKNOWN("unknown", MediaType.application("x-tika-msoffice")),
         ENCRYPTED("ole", MediaType.application("x-tika-ooxml-protected")),
@@ -113,39 +111,15 @@ public class OfficeParser extends Abstra
        }
 
         public static POIFSDocumentType detectType(DirectoryEntry node) {
+            Set<String> names = new HashSet<String>();
             for (Entry entry : node) {
-                POIFSDocumentType type = detectType(entry);
-                if (type!=UNKNOWN) {
-                    return type;
-                }
+                names.add(entry.getName());
             }
-            return UNKNOWN;
-        }
-
-        // TODO Avoid this duplication with POIFSContainerDetector (TIKA-790)
-        private final static Map<String,POIFSDocumentType> typeMap = new HashMap<String,POIFSDocumentType>();
-        static {
-            typeMap.put("Workbook", WORKBOOK);
-            typeMap.put("EncryptedPackage", ENCRYPTED);
-            typeMap.put("WordDocument", WORDDOCUMENT);
-            typeMap.put("Quill", PUBLISHER);
-            typeMap.put("PowerPoint Document", POWERPOINT);
-            typeMap.put("VisioDocument", VISIO);
-            typeMap.put("CONTENTS", WORKS);
-            typeMap.put("\u0001Ole10Native", POIFSDocumentType.OLE10_NATIVE);
-            typeMap.put("Props", PROJECT);  // Project 8
-            typeMap.put("Props9", PROJECT); // Project 9, 10, 11
-            typeMap.put("Props12", PROJECT); // Project 12+
-        }
-
-        public static POIFSDocumentType detectType(Entry entry) {
-            String name = entry.getName();
-            POIFSDocumentType type = typeMap.get(name);
-            if (type != null) {
-                return type;
-            }
-            if (entry.getName().startsWith("__substg1.0_")) {
-                return OUTLOOK;
+            MediaType type = POIFSContainerDetector.detect(names);
+            for (POIFSDocumentType poifsType : values()) {
+               if (type.equals(poifsType.type)) {
+                  return poifsType;
+               }
             }
             return UNKNOWN;
         }
@@ -193,71 +167,64 @@ public class OfficeParser extends Abstra
         new SummaryExtractor(metadata).parseSummaries(root);
 
         // Parse remaining document entries
-        boolean outlookExtracted = false;
-        for (Entry entry : root) {
-            POIFSDocumentType type = POIFSDocumentType.detectType(entry);
+        POIFSDocumentType type = POIFSDocumentType.detectType(root);
 
-            if (type!=POIFSDocumentType.UNKNOWN) {
-                setType(metadata, type.getType());
-            }
+        if (type!=POIFSDocumentType.UNKNOWN) {
+            setType(metadata, type.getType());
+        }
 
-            switch (type) {
-                case PUBLISHER:
-                    PublisherTextExtractor publisherTextExtractor =
-                        new PublisherTextExtractor(root);
-                    xhtml.element("p", publisherTextExtractor.getText());
-                    break;
-                case WORDDOCUMENT:
-                    new WordExtractor(context).parse(root, xhtml);
-                    break;
-                case POWERPOINT:
-                    new HSLFExtractor(context).parse(root, xhtml);
-                    break;
-                case WORKBOOK:
-                    Locale locale = context.get(Locale.class, Locale.getDefault());
-                    new ExcelExtractor(context).parse(root, xhtml, locale);
-                    break;
-                case PROJECT:
-                    // We currently can't do anything beyond the metadata
-                    break;
-                case VISIO:
-                    VisioTextExtractor visioTextExtractor =
-                        new VisioTextExtractor(root);
-                    for (String text : visioTextExtractor.getAllText()) {
-                        xhtml.element("p", text);
-                    }
-                    break;
-                case OUTLOOK:
-                    if (!outlookExtracted) {
-                        outlookExtracted = true;
-
-                        OutlookExtractor extractor =
-                            new OutlookExtractor(root, context);
-
-                        extractor.parse(xhtml, metadata);
-                    }
-                    break;
-                case ENCRYPTED:
-                    EncryptionInfo info = new EncryptionInfo(root);
-                    Decryptor d = Decryptor.getInstance(info);
-
-                    try {
-                        // TODO Allow the user to specify the password via the ParseContext
-                        if (!d.verifyPassword(Decryptor.DEFAULT_PASSWORD)) {
-                            throw new EncryptedDocumentException();
-                        }
-
-                        // Decrypt the OLE2 stream, and delegate the resulting OOXML
-                        //  file to the regular OOXML parser for normal handling
-                        OOXMLParser parser = new OOXMLParser();
-
-                        parser.parse(d.getDataStream(root), new EmbeddedContentHandler(
-                                        new BodyContentHandler(xhtml)),
-                                        metadata, context);
-                    } catch (GeneralSecurityException ex) {
-                        throw new EncryptedDocumentException(ex);
-                    }
-            }
+        switch (type) {
+        case PUBLISHER:
+           PublisherTextExtractor publisherTextExtractor =
+              new PublisherTextExtractor(root);
+           xhtml.element("p", publisherTextExtractor.getText());
+           break;
+        case WORDDOCUMENT:
+           new WordExtractor(context).parse(root, xhtml);
+           break;
+        case POWERPOINT:
+           new HSLFExtractor(context).parse(root, xhtml);
+           break;
+        case WORKBOOK:
+           Locale locale = context.get(Locale.class, Locale.getDefault());
+           new ExcelExtractor(context).parse(root, xhtml, locale);
+           break;
+        case PROJECT:
+           // We currently can't do anything beyond the metadata
+           break;
+        case VISIO:
+           VisioTextExtractor visioTextExtractor =
+              new VisioTextExtractor(root);
+           for (String text : visioTextExtractor.getAllText()) {
+              xhtml.element("p", text);
+           }
+           break;
+        case OUTLOOK:
+           OutlookExtractor extractor =
+                 new OutlookExtractor(root, context);
+
+           extractor.parse(xhtml, metadata);
+           break;
+        case ENCRYPTED:
+           EncryptionInfo info = new EncryptionInfo(root);
+           Decryptor d = Decryptor.getInstance(info);
+
+           try {
+              // TODO Allow the user to specify the password via the ParseContext
+              if (!d.verifyPassword(Decryptor.DEFAULT_PASSWORD)) {
+                 throw new EncryptedDocumentException();
+              }
+
+              // Decrypt the OLE2 stream, and delegate the resulting OOXML
+              //  file to the regular OOXML parser for normal handling
+              OOXMLParser parser = new OOXMLParser();
+
+              parser.parse(d.getDataStream(root), new EmbeddedContentHandler(
+                    new BodyContentHandler(xhtml)),
+                    metadata, context);
+           } catch (GeneralSecurityException ex) {
+              throw new EncryptedDocumentException(ex);
+           }
         }
     }
 

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?rev=1207196&r1=1207195&r2=1207196&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java Mon Nov 28 13:57:08 2011
@@ -50,6 +50,9 @@ public class POIFSContainerDetector impl
     
     /** The protected OOXML base file format */
     public static final MediaType OOXML_PROTECTED = application("x-tika-ooxml-protected");
+    
+    /** An OLE10 Native embedded document within another OLE2 document */
+    public static final MediaType OLE10_NATIVE = application("x-tika-msoffice-embedded");
 
     /** Microsoft Excel */
     public static final MediaType XLS = application("vnd.ms-excel");
@@ -119,7 +122,16 @@ public class POIFSContainerDetector impl
             // Look for known top level entry names to detect the document type
             names = getTopLevelNames(tis);
         }
-
+        
+        // Detect based on the names (as available)
+        return detect(names);
+    }
+    
+    /**
+     * Internal detection of the specific kind of OLE2 document, based on the 
+     *  names of the top level streams within the file.
+     */
+    protected static MediaType detect(Set<String> names) {
         if (names != null) {
             if (names.contains("Workbook")) {
                 return XLS;
@@ -142,6 +154,8 @@ public class POIFSContainerDetector impl
                 return PPT;
             } else if (names.contains("VisioDocument")) {
                 return VSD;
+            } else if (names.contains("\u0001Ole10Native")) {
+                return OLE10_NATIVE;
             } else if (names.contains("CONTENTS") && names.contains("SPELLING")) {
                // Newer Works files
                return WPS;
@@ -161,8 +175,6 @@ public class POIFSContainerDetector impl
                      return MPP;
                   }
                }
-            } else if (names.contains("\u0001Ole10Native")) {
-                return OLE;
             } else if (names.contains("PerfectOffice_MAIN")) {
                 if (names.contains("SlideShow")) {
                     return MediaType.application("x-corelpresentations"); // .shw