You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/03/11 17:49:19 UTC

svn commit: r1665940 - in /tika/trunk: tika-core/src/main/resources/org/apache/tika/mime/ tika-parsers/src/main/java/org/apache/tika/parser/pkg/ tika-parsers/src/test/java/org/apache/tika/detect/ tika-parsers/src/test/java/org/apache/tika/mime/

Author: nick
Date: Wed Mar 11 16:49:18 2015
New Revision: 1665940

URL: http://svn.apache.org/r1665940
Log:
TIKA-1286 Bring the overall file mime types into line with the other OOXML formats, and add container-aware detection + tests for the Visio OOXML types

Modified:
    tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java

Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1665940&r1=1665939&r2=1665940&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Wed Mar 11 16:49:18 2015
@@ -2389,32 +2389,32 @@
     <sub-class-of type="application/x-tika-msoffice"/>
   </mime-type>
 
-  <mime-type type="application/vnd.ms-visio.drawing.main+xml">
+  <mime-type type="application/vnd.ms-visio.drawing">
     <_comment>Office Open XML Visio Drawing (macro-free)</_comment>
     <glob pattern="*.vsdx"/>
     <sub-class-of type="application/x-tika-visio-ooxml"/>
   </mime-type>
-  <mime-type type="application/vnd.ms-visio.template.main+xml">
+  <mime-type type="application/vnd.ms-visio.template">
     <_comment>Office Open XML Visio Template (macro-free)</_comment>
     <glob pattern="*.vstx"/>
     <sub-class-of type="application/x-tika-visio-ooxml"/>
   </mime-type>
-  <mime-type type="application/vnd.ms-visio.stencil.main+xml">
+  <mime-type type="application/vnd.ms-visio.stencil">
     <_comment>Office Open XML Visio Stencil (macro-free)</_comment>
     <glob pattern="*.vssx"/>
     <sub-class-of type="application/x-tika-visio-ooxml"/>
   </mime-type>
-  <mime-type type="application/vnd.ms-visio.drawing.macroEnabled.main+xml">
+  <mime-type type="application/vnd.ms-visio.drawing.macroEnabled.12">
     <_comment>Office Open XML Visio Drawing (macro-enabled)</_comment>
     <glob pattern="*.vsdm"/>
     <sub-class-of type="application/x-tika-visio-ooxml"/>
   </mime-type>
-  <mime-type type="application/vnd.ms-visio.template.macroEnabled.main+xml">
+  <mime-type type="application/vnd.ms-visio.template.macroEnabled.12">
     <_comment>Office Open XML Visio Template (macro-enabled)</_comment>
     <glob pattern="*.vstm"/>
     <sub-class-of type="application/x-tika-visio-ooxml"/>
   </mime-type>
-  <mime-type type="application/vnd.ms-visio.stencil.macroEnabled.main+xml">
+  <mime-type type="application/vnd.ms-visio.stencil.macroEnabled.12">
     <_comment>Office Open XML Visio Stencil (macro-enabled)</_comment>
     <glob pattern="*.vssm"/>
     <sub-class-of type="application/x-tika-visio-ooxml"/>

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java?rev=1665940&r1=1665939&r2=1665940&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java Wed Mar 11 16:49:18 2015
@@ -58,6 +58,10 @@ import org.apache.tika.parser.iwork.IWor
 public class ZipContainerDetector implements Detector {
     private static final Pattern MACRO_TEMPLATE_PATTERN = Pattern.compile("macroenabledtemplate$", Pattern.CASE_INSENSITIVE);
 
+    // TODO Remove this constant once we upgrade to POI 3.12 beta 2, which defines it in ExtractorFactory
+    private static final String VISIO_DOCUMENT_REL =
+            "http://schemas.microsoft.com/visio/2010/relationships/document";
+    
     /** Serial version UID */
     private static final long serialVersionUID = 2891763938430295453L;
 
@@ -231,8 +235,15 @@ public class ZipContainerDetector implem
      *  opened Package 
      */
     public static MediaType detectOfficeOpenXML(OPCPackage pkg) {
+        // Check for the normal Office core document
         PackageRelationshipCollection core = 
            pkg.getRelationshipsByType(ExtractorFactory.CORE_DOCUMENT_REL);
+        // Otherwise check for some other Office core document types
+        if (core.size() == 0) {
+            core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
+        }
+        
+        // If we didn't find exactly one core document of any type, skip detection
         if (core.size() != 1) {
             // Invalid OOXML Package received
             return null;

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1665940&r1=1665939&r2=1665940&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java Wed Mar 11 16:49:18 2015
@@ -215,15 +215,12 @@ public class TestContainerAwareDetector
         assertTypeByData("testDOTM.dotm", "application/vnd.ms-word.template.macroEnabled.12");
         assertTypeByData("testPPT.xps", "application/vnd.ms-xpsdocument");
 
-        // TODO Support detecting the Visio OOXML files
-/*
-        assertTypeByData("testVISIO.vsdm", "application/vnd.ms-visio.drawing.macroenabled.main+xml");
-        assertTypeByData("testVISIO.vsdx", "application/vnd.ms-visio.drawing.main+xml");
-        assertTypeByData("testVISIO.vssm", "application/vnd.ms-visio.stencil.macroenabled.main+xml");
-        assertTypeByData("testVISIO.vssx", "application/vnd.ms-visio.stencil.main+xml");
-        assertTypeByData("testVISIO.vstm", "application/vnd.ms-visio.template.macroenabled.main+xml");
-        assertTypeByData("testVISIO.vstx", "application/vnd.ms-visio.template.main+xml");
-*/
+        assertTypeByData("testVISIO.vsdm", "application/vnd.ms-visio.drawing.macroenabled.12");
+        assertTypeByData("testVISIO.vsdx", "application/vnd.ms-visio.drawing");
+        assertTypeByData("testVISIO.vssm", "application/vnd.ms-visio.stencil.macroenabled.12");
+        assertTypeByData("testVISIO.vssx", "application/vnd.ms-visio.stencil");
+        assertTypeByData("testVISIO.vstm", "application/vnd.ms-visio.template.macroenabled.12");
+        assertTypeByData("testVISIO.vstx", "application/vnd.ms-visio.template");
         
         // .xlsb is an OOXML file containing the binary parts, and not
         //  an OLE2 file as you might initially expect!

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=1665940&r1=1665939&r2=1665940&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java Wed Mar 11 16:49:18 2015
@@ -291,21 +291,21 @@ public class TestMimeTypes {
     public void testVisioDetection() throws Exception {
         // By Name, should get it right
         assertTypeByName("application/vnd.visio", "testVISIO.vsd");
-        assertTypeByName("application/vnd.ms-visio.drawing.macroenabled.main+xml", "testVISIO.vsdm");
-        assertTypeByName("application/vnd.ms-visio.drawing.main+xml", "testVISIO.vsdx");
-        assertTypeByName("application/vnd.ms-visio.stencil.macroenabled.main+xml", "testVISIO.vssm");
-        assertTypeByName("application/vnd.ms-visio.stencil.main+xml", "testVISIO.vssx");
-        assertTypeByName("application/vnd.ms-visio.template.macroenabled.main+xml", "testVISIO.vstm");
-        assertTypeByName("application/vnd.ms-visio.template.main+xml", "testVISIO.vstx");
+        assertTypeByName("application/vnd.ms-visio.drawing.macroenabled.12", "testVISIO.vsdm");
+        assertTypeByName("application/vnd.ms-visio.drawing", "testVISIO.vsdx");
+        assertTypeByName("application/vnd.ms-visio.stencil.macroenabled.12", "testVISIO.vssm");
+        assertTypeByName("application/vnd.ms-visio.stencil", "testVISIO.vssx");
+        assertTypeByName("application/vnd.ms-visio.template.macroenabled.12", "testVISIO.vstm");
+        assertTypeByName("application/vnd.ms-visio.template", "testVISIO.vstx");
         
         // By Name and Data, should get it right
         assertTypeByNameAndData("application/vnd.visio", "testVISIO.vsd");
-        assertTypeByNameAndData("application/vnd.ms-visio.drawing.macroenabled.main+xml", "testVISIO.vsdm");
-        assertTypeByNameAndData("application/vnd.ms-visio.drawing.main+xml", "testVISIO.vsdx");
-        assertTypeByNameAndData("application/vnd.ms-visio.stencil.macroenabled.main+xml", "testVISIO.vssm");
-        assertTypeByNameAndData("application/vnd.ms-visio.stencil.main+xml", "testVISIO.vssx");
-        assertTypeByNameAndData("application/vnd.ms-visio.template.macroenabled.main+xml", "testVISIO.vstm");
-        assertTypeByNameAndData("application/vnd.ms-visio.template.main+xml", "testVISIO.vstx");
+        assertTypeByNameAndData("application/vnd.ms-visio.drawing.macroenabled.12", "testVISIO.vsdm");
+        assertTypeByNameAndData("application/vnd.ms-visio.drawing", "testVISIO.vsdx");
+        assertTypeByNameAndData("application/vnd.ms-visio.stencil.macroenabled.12", "testVISIO.vssm");
+        assertTypeByNameAndData("application/vnd.ms-visio.stencil", "testVISIO.vssx");
+        assertTypeByNameAndData("application/vnd.ms-visio.template.macroenabled.12", "testVISIO.vstm");
+        assertTypeByNameAndData("application/vnd.ms-visio.template", "testVISIO.vstx");
         
         // By Data only, will get the container parent
         assertTypeByData("application/x-tika-msoffice", "testVISIO.vsd");