You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/08/21 16:12:11 UTC

svn commit: r1159985 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/detect/ tika-core/src/main/resources/org/apache/tika/mime/ tika-parsers/src/main/java/org/apache/tika/detect/ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ ti...

Author: jukka
Date: Sun Aug 21 14:12:10 2011
New Revision: 1159985

URL: http://svn.apache.org/viewvc?rev=1159985&view=rev
Log:
TIKA-447: Container aware mimetype detection

Move the container detectors to the matching org.apache.tika.parser subpackages to avoid complicating the OSGi bundle classpath.

The ContainerAwareDetector class is no longer needed, as the DefaultDetector automatically loads any available container detectors. Rather than removing the class outright, I moved it to tika-core and marked it as deprecated to avoid breaking backwards compatibility for Tika 0.9 clients.

Added:
    tika/trunk/tika-core/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java   (contents, props changed)
      - copied, changed from r1159980, tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
      - copied, changed from r1159980, tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
      - copied, changed from r1159980, tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java
Removed:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java
Modified:
    tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
    tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java

Copied: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java (from r1159980, tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java)
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java?p2=tika/trunk/tika-core/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java&p1=tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java&r1=1159980&r2=1159985&rev=1159985&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java Sun Aug 21 14:12:10 2011
@@ -18,12 +18,7 @@ package org.apache.tika.detect;
 
 import java.io.IOException;
 import java.io.InputStream;
-import java.util.zip.ZipException;
 
-import org.apache.poi.poifs.common.POIFSConstants;
-import org.apache.poi.poifs.storage.HeaderBlockConstants;
-import org.apache.poi.util.IOUtils;
-import org.apache.poi.util.LittleEndian;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
@@ -38,14 +33,14 @@ import org.apache.tika.mime.MimeTypes;
  *  to handle detection for non container formats. 
  * Should normally be used with a {@link TikaInputStream} to minimise 
  *  the memory usage.
+ *
+ * @deprecated Use the {@link DefaultDetector} class instead
  */
 public class ContainerAwareDetector implements Detector {
 
     private Detector fallbackDetector;
 
-    private Detector zipDetector;
-
-    private Detector poifsDetector;
+    private Detector defaultDetector;
 
     /**
      * Creates a new container detector, which will use the
@@ -54,16 +49,12 @@ public class ContainerAwareDetector impl
      */
     public ContainerAwareDetector(Detector fallbackDetector) {
         this.fallbackDetector = fallbackDetector;
-        poifsDetector = new POIFSContainerDetector();
-        zipDetector = new ZipContainerDetector();
+        this.defaultDetector = new DefaultDetector();
     }
 
     public MediaType detect(InputStream input, Metadata metadata)
             throws IOException {
-        MediaType type = zipDetector.detect(input, metadata);
-        if (MediaType.OCTET_STREAM.equals(type)) {
-            type = poifsDetector.detect(input, metadata);
-        }
+        MediaType type = defaultDetector.detect(input, metadata);
         if (MediaType.OCTET_STREAM.equals(type)) {
             return fallbackDetector.detect(input, metadata);
         }

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java
------------------------------------------------------------------------------
    svn:executable = *

Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1159985&r1=1159984&r2=1159985&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Sun Aug 21 14:12:10 2011
@@ -1317,7 +1317,9 @@
     <glob pattern="*.wks"/>
     <glob pattern="*.wcm"/>
     <glob pattern="*.wdb"/>
+    <sub-class-of type="application/x-tika-msoffice"/>
   </mime-type>
+
   <mime-type type="application/vnd.ms-wpl">
     <glob pattern="*.wpl"/>
   </mime-type>
@@ -1430,6 +1432,7 @@
       </match>
     </magic>
     <glob pattern="*.odf"/>
+    <sub-class-of type="application/zip"/>
   </mime-type>
 
   <mime-type type="application/vnd.oasis.opendocument.formula-template">

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java?rev=1159985&r1=1159984&r2=1159985&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java Sun Aug 21 14:12:10 2011
@@ -31,7 +31,6 @@ import org.apache.poi.poifs.filesystem.O
 import org.apache.poi.poifs.filesystem.Ole10NativeException;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.util.IOUtils;
-import org.apache.tika.detect.ZipContainerDetector;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
@@ -40,6 +39,7 @@ import org.apache.tika.metadata.Metadata
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
+import org.apache.tika.parser.pkg.ZipContainerDetector;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
 

Copied: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java (from r1159980, tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java)
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?p2=tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java&p1=tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java&r1=1159980&r2=1159985&rev=1159985&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java Sun Aug 21 14:12:10 2011
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.tika.detect;
+package org.apache.tika.parser.microsoft;
 
 import static org.apache.tika.mime.MediaType.application;
 
@@ -27,6 +27,8 @@ import java.util.Set;
 
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.io.TemporaryFiles;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
@@ -89,46 +91,50 @@ public class POIFSContainerDetector impl
         }
 
         // We can only detect the exact type when given a TikaInputStream
-        if (!TikaInputStream.isTikaInputStream(input)) {
-            return OLE;
-        }
-
-        // Look for known top level entry names to detect the document type
-        Set<String> names = getTopLevelNames(TikaInputStream.get(input));
-        if (names.contains("Workbook")) {
-            return XLS;
-        } else if (names.contains("EncryptedPackage")) {
-            return OLE;
-        } else if (names.contains("WordDocument")) {
-            return DOC;
-        } else if (names.contains("Quill")) {
-            return PUB;
-        } else if (names.contains("PowerPoint Document")) {
-            return PPT;
-        } else if (names.contains("VisioDocument")) {
-            return VSD;
-        } else if (names.contains("CONTENTS")) {
-            return WPS;
-        } else if (names.contains("\u0001Ole10Native")) {
-            return OLE;
-        } else if (names.contains("PerfectOffice_MAIN")) {
-            if (names.contains("SlideShow")) {
-                return MediaType.application("x-corelpresentations"); // .shw
-            } else if (names.contains("PerfectOffice_OBJECTS")) {
-                return MediaType.application("x-quattro-pro"); // .wb?
-            } else {
-                return OLE;
-            }
-        } else if (names.contains("NativeContent_MAIN")) {
-            return MediaType.application("x-quattro-pro"); // .qpw
-        } else {
-            for (String name : names) {
-                if (name.startsWith("__substg1.0_")) {
-                    return MSG;
+        if (TikaInputStream.isTikaInputStream(input)) {
+            TemporaryFiles tmp = new TemporaryFiles();
+            try {
+                // Look for known top level entry names to detect the document type
+                Set<String> names =
+                    getTopLevelNames(TikaInputStream.get(input, tmp));
+                if (names.contains("Workbook")) {
+                    return XLS;
+                } else if (names.contains("EncryptedPackage")) {
+                    return OLE;
+                } else if (names.contains("WordDocument")) {
+                    return DOC;
+                } else if (names.contains("Quill")) {
+                    return PUB;
+                } else if (names.contains("PowerPoint Document")) {
+                    return PPT;
+                } else if (names.contains("VisioDocument")) {
+                    return VSD;
+                } else if (names.contains("CONTENTS")) {
+                    return WPS;
+                } else if (names.contains("\u0001Ole10Native")) {
+                    return OLE;
+                } else if (names.contains("PerfectOffice_MAIN")) {
+                    if (names.contains("SlideShow")) {
+                        return MediaType.application("x-corelpresentations"); // .shw
+                    } else if (names.contains("PerfectOffice_OBJECTS")) {
+                        return MediaType.application("x-quattro-pro"); // .wb?
+                    }
+                } else if (names.contains("NativeContent_MAIN")) {
+                    return MediaType.application("x-quattro-pro"); // .qpw
+                } else {
+                    for (String name : names) {
+                        if (name.startsWith("__substg1.0_")) {
+                            return MSG;
+                        }
+                    }
                 }
+            } finally {
+                tmp.dispose();
             }
-            return OLE;
         }
+
+        // Couldn't detect a more specific type
+        return OLE;
     }
 
     private static Set<String> getTopLevelNames(TikaInputStream stream)

Copied: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java (from r1159980, tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java)
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java?p2=tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java&p1=tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java&r1=1159980&r2=1159985&rev=1159985&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java Sun Aug 21 14:12:10 2011
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.tika.detect;
+package org.apache.tika.parser.pkg;
 
 import java.io.File;
 import java.io.IOException;
@@ -28,7 +28,9 @@ import org.apache.poi.openxml4j.opc.OPCP
 import org.apache.poi.openxml4j.opc.PackageAccess;
 import org.apache.poi.openxml4j.opc.PackagePart;
 import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
+import org.apache.tika.detect.Detector;
 import org.apache.tika.io.IOUtils;
+import org.apache.tika.io.TemporaryFiles;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
@@ -68,8 +70,9 @@ public class ZipContainerDetector implem
             return MediaType.APPLICATION_ZIP;
         }
 
+        TemporaryFiles tmp = new TemporaryFiles();
         try {
-            File file = TikaInputStream.get(input).getFile();
+            File file = TikaInputStream.get(input, tmp).getFile();
             ZipFile zip = new ZipFile(file);
 
             MediaType type = detectOpenDocument(zip);
@@ -88,6 +91,8 @@ public class ZipContainerDetector implem
             return type;
         } catch (IOException e) {
             return MediaType.APPLICATION_ZIP;
+        } finally {
+            tmp.dispose();
         }
     }
 
@@ -168,4 +173,5 @@ public class ZipContainerDetector implem
             return null;
         }
     }
+
 }
\ No newline at end of file

Modified: tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.detect.Detector?rev=1159985&r1=1159984&r2=1159985&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.detect.Detector (original)
+++ tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.detect.Detector Sun Aug 21 14:12:10 2011
@@ -13,5 +13,5 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
-org.apache.tika.detect.POIFSContainerDetector
-org.apache.tika.detect.ZipContainerDetector
+org.apache.tika.parser.microsoft.POIFSContainerDetector
+org.apache.tika.parser.pkg.ZipContainerDetector

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1159985&r1=1159984&r2=1159985&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java Sun Aug 21 14:12:10 2011
@@ -32,7 +32,7 @@ import org.apache.tika.mime.MimeTypes;
  */
 public class TestContainerAwareDetector extends TestCase {
 
-    private final ContainerAwareDetector detector =
+    private final Detector detector =
         new ContainerAwareDetector(MimeTypes.getDefaultMimeTypes());
 
     private void assertDetect(String file, String type) throws Exception {
@@ -135,7 +135,7 @@ public class TestContainerAwareDetector 
         TikaInputStream xlsx = getTruncatedFile("testEXCEL.xlsx", 300);
         try {
             assertEquals(
-                    MediaType.APPLICATION_ZIP,
+                    MediaType.application("x-tika-ooxml"),
                     detector.detect(xlsx, new Metadata()));
         } finally {
             xlsx.close();