You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/03/11 17:03:23 UTC

svn commit: r1665917 - in /tika/trunk: tika-core/src/main/resources/org/apache/tika/mime/ tika-parsers/src/test/java/org/apache/tika/detect/ tika-parsers/src/test/java/org/apache/tika/mime/

Author: nick
Date: Wed Mar 11 16:03:23 2015
New Revision: 1665917

URL: http://svn.apache.org/r1665917
Log:
TIKA-1286 Visio OOXML mimetypes, and non-container detection unit tests

Modified:
    tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java

Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1665917&r1=1665916&r2=1665917&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Wed Mar 11 16:03:23 2015
@@ -2380,6 +2380,7 @@
 
   <!-- http://www.iana.org/assignments/media-types/application/vnd.visio -->
   <mime-type type="application/vnd.visio">
+    <alias type="application/vnd.ms-visio"/>
     <_comment>Microsoft Visio Diagram</_comment>
     <glob pattern="*.vsd"/>
     <glob pattern="*.vst"/>
@@ -2388,6 +2389,37 @@
     <sub-class-of type="application/x-tika-msoffice"/>
   </mime-type>
 
+  <mime-type type="application/vnd.ms-visio.drawing.main+xml">
+    <_comment>Office Open XML Visio Drawing (macro-free)</_comment>
+    <glob pattern="*.vsdx"/>
+    <sub-class-of type="application/x-tika-visio-ooxml"/>
+  </mime-type>
+  <mime-type type="application/vnd.ms-visio.template.main+xml">
+    <_comment>Office Open XML Visio Template (macro-free)</_comment>
+    <glob pattern="*.vstx"/>
+    <sub-class-of type="application/x-tika-visio-ooxml"/>
+  </mime-type>
+  <mime-type type="application/vnd.ms-visio.stencil.main+xml">
+    <_comment>Office Open XML Visio Stencil (macro-free)</_comment>
+    <glob pattern="*.vssx"/>
+    <sub-class-of type="application/x-tika-visio-ooxml"/>
+  </mime-type>
+  <mime-type type="application/vnd.ms-visio.drawing.macroEnabled.main+xml">
+    <_comment>Office Open XML Visio Drawing (macro-enabled)</_comment>
+    <glob pattern="*.vsdm"/>
+    <sub-class-of type="application/x-tika-visio-ooxml"/>
+  </mime-type>
+  <mime-type type="application/vnd.ms-visio.template.macroEnabled.main+xml">
+    <_comment>Office Open XML Visio Template (macro-enabled)</_comment>
+    <glob pattern="*.vstm"/>
+    <sub-class-of type="application/x-tika-visio-ooxml"/>
+  </mime-type>
+  <mime-type type="application/vnd.ms-visio.stencil.macroEnabled.main+xml">
+    <_comment>Office Open XML Visio Stencil (macro-enabled)</_comment>
+    <glob pattern="*.vssm"/>
+    <sub-class-of type="application/x-tika-visio-ooxml"/>
+  </mime-type>
+
   <mime-type type="application/vnd.visionary">
     <glob pattern="*.vis"/>
   </mime-type>
@@ -3737,6 +3769,11 @@
     <_comment>Password Protected OOXML File</_comment>
   </mime-type>
 
+  <mime-type type="application/x-tika-visio-ooxml">
+    <sub-class-of type="application/x-tika-ooxml"/>
+    <_comment>Visio OOXML File</_comment>
+  </mime-type>
+
   <!-- Older StarOffice formats extend up the Microsoft OLE2 format -->
   <mime-type type="application/x-tika-staroffice">
     <sub-class-of type="application/x-tika-msoffice"/>

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1665917&r1=1665916&r2=1665917&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java Wed Mar 11 16:03:23 2015
@@ -92,6 +92,7 @@ public class TestContainerAwareDetector
         assertTypeByData("testPUBLISHER.pub", "application/x-mspublisher");
         assertTypeByData("testWORKS.wps", "application/vnd.ms-works");
         assertTypeByData("testWORKS2000.wps", "application/vnd.ms-works");
+        
         // older Works Word Processor files can't be recognized
         // they were created with Works Word Processor 7.0 (hence the text inside)
         // and exported to the older formats with the "Save As" feature
@@ -100,6 +101,7 @@ public class TestContainerAwareDetector
         assertTypeByData("testWORKSSpreadsheet7.0.xlr", "application/x-tika-msworks-spreadsheet");
         assertTypeByData("testPROJECT2003.mpp", "application/vnd.ms-project");
         assertTypeByData("testPROJECT2007.mpp", "application/vnd.ms-project");
+        
         // Excel95 can be detected by not parsed
         assertTypeByData("testEXCEL_95.xls", "application/vnd.ms-excel");
 
@@ -213,6 +215,16 @@ public class TestContainerAwareDetector
         assertTypeByData("testDOTM.dotm", "application/vnd.ms-word.template.macroEnabled.12");
         assertTypeByData("testPPT.xps", "application/vnd.ms-xpsdocument");
 
+        // TODO Support detecting the Visio OOXML files
+/*
+        assertTypeByData("testVISIO.vsdm", "application/vnd.ms-visio.drawing.macroenabled.main+xml");
+        assertTypeByData("testVISIO.vsdx", "application/vnd.ms-visio.drawing.main+xml");
+        assertTypeByData("testVISIO.vssm", "application/vnd.ms-visio.stencil.macroenabled.main+xml");
+        assertTypeByData("testVISIO.vssx", "application/vnd.ms-visio.stencil.main+xml");
+        assertTypeByData("testVISIO.vstm", "application/vnd.ms-visio.template.macroenabled.main+xml");
+        assertTypeByData("testVISIO.vstx", "application/vnd.ms-visio.template.main+xml");
+*/
+        
         // .xlsb is an OOXML file containing the binary parts, and not
         //  an OLE2 file as you might initially expect!
         assertTypeByData("testEXCEL.xlsb", "application/vnd.ms-excel.sheet.binary.macroEnabled.12");

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=1665917&r1=1665916&r2=1665917&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java Wed Mar 11 16:03:23 2015
@@ -282,6 +282,40 @@ public class TestMimeTypes {
         assertTypeByNameAndData("application/vnd.ms-powerpoint.template.macroenabled.12", "testPPT.potm");
         assertTypeByNameAndData("application/vnd.ms-powerpoint.slideshow.macroenabled.12", "testPPT.ppsm");
     }
+    
+    /**
+     * Note - these are container-based formats, so container-aware
+     *  detection is needed to identify them fully correctly
+     */
+    @Test
+    public void testVisioDetection() throws Exception {
+        // By Name, should get it right
+        assertTypeByName("application/vnd.visio", "testVISIO.vsd");
+        assertTypeByName("application/vnd.ms-visio.drawing.macroenabled.main+xml", "testVISIO.vsdm");
+        assertTypeByName("application/vnd.ms-visio.drawing.main+xml", "testVISIO.vsdx");
+        assertTypeByName("application/vnd.ms-visio.stencil.macroenabled.main+xml", "testVISIO.vssm");
+        assertTypeByName("application/vnd.ms-visio.stencil.main+xml", "testVISIO.vssx");
+        assertTypeByName("application/vnd.ms-visio.template.macroenabled.main+xml", "testVISIO.vstm");
+        assertTypeByName("application/vnd.ms-visio.template.main+xml", "testVISIO.vstx");
+        
+        // By Name and Data, should get it right
+        assertTypeByNameAndData("application/vnd.visio", "testVISIO.vsd");
+        assertTypeByNameAndData("application/vnd.ms-visio.drawing.macroenabled.main+xml", "testVISIO.vsdm");
+        assertTypeByNameAndData("application/vnd.ms-visio.drawing.main+xml", "testVISIO.vsdx");
+        assertTypeByNameAndData("application/vnd.ms-visio.stencil.macroenabled.main+xml", "testVISIO.vssm");
+        assertTypeByNameAndData("application/vnd.ms-visio.stencil.main+xml", "testVISIO.vssx");
+        assertTypeByNameAndData("application/vnd.ms-visio.template.macroenabled.main+xml", "testVISIO.vstm");
+        assertTypeByNameAndData("application/vnd.ms-visio.template.main+xml", "testVISIO.vstx");
+        
+        // By Data only, will get the container parent
+        assertTypeByData("application/x-tika-msoffice", "testVISIO.vsd");
+        assertTypeByData("application/x-tika-ooxml", "testVISIO.vsdm");
+        assertTypeByData("application/x-tika-ooxml", "testVISIO.vsdx");
+        assertTypeByData("application/x-tika-ooxml", "testVISIO.vssm");
+        assertTypeByData("application/x-tika-ooxml", "testVISIO.vssx");
+        assertTypeByData("application/x-tika-ooxml", "testVISIO.vstm");
+        assertTypeByData("application/x-tika-ooxml", "testVISIO.vstx");
+    }
 
     /**
      * Note - detecting container formats by mime magic is very very