You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/03/11 17:03:23 UTC
svn commit: r1665917 - in /tika/trunk:
tika-core/src/main/resources/org/apache/tika/mime/
tika-parsers/src/test/java/org/apache/tika/detect/
tika-parsers/src/test/java/org/apache/tika/mime/
Author: nick
Date: Wed Mar 11 16:03:23 2015
New Revision: 1665917
URL: http://svn.apache.org/r1665917
Log:
TIKA-1286 Visio OOXML mimetypes, and non-container detection unit tests
Modified:
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1665917&r1=1665916&r2=1665917&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Wed Mar 11 16:03:23 2015
@@ -2380,6 +2380,7 @@
<!-- http://www.iana.org/assignments/media-types/application/vnd.visio -->
<mime-type type="application/vnd.visio">
+ <alias type="application/vnd.ms-visio"/>
<_comment>Microsoft Visio Diagram</_comment>
<glob pattern="*.vsd"/>
<glob pattern="*.vst"/>
@@ -2388,6 +2389,37 @@
<sub-class-of type="application/x-tika-msoffice"/>
</mime-type>
+ <mime-type type="application/vnd.ms-visio.drawing.main+xml">
+ <_comment>Office Open XML Visio Drawing (macro-free)</_comment>
+ <glob pattern="*.vsdx"/>
+ <sub-class-of type="application/x-tika-visio-ooxml"/>
+ </mime-type>
+ <mime-type type="application/vnd.ms-visio.template.main+xml">
+ <_comment>Office Open XML Visio Template (macro-free)</_comment>
+ <glob pattern="*.vstx"/>
+ <sub-class-of type="application/x-tika-visio-ooxml"/>
+ </mime-type>
+ <mime-type type="application/vnd.ms-visio.stencil.main+xml">
+ <_comment>Office Open XML Visio Stencil (macro-free)</_comment>
+ <glob pattern="*.vssx"/>
+ <sub-class-of type="application/x-tika-visio-ooxml"/>
+ </mime-type>
+ <mime-type type="application/vnd.ms-visio.drawing.macroEnabled.main+xml">
+ <_comment>Office Open XML Visio Drawing (macro-enabled)</_comment>
+ <glob pattern="*.vsdm"/>
+ <sub-class-of type="application/x-tika-visio-ooxml"/>
+ </mime-type>
+ <mime-type type="application/vnd.ms-visio.template.macroEnabled.main+xml">
+ <_comment>Office Open XML Visio Template (macro-enabled)</_comment>
+ <glob pattern="*.vstm"/>
+ <sub-class-of type="application/x-tika-visio-ooxml"/>
+ </mime-type>
+ <mime-type type="application/vnd.ms-visio.stencil.macroEnabled.main+xml">
+ <_comment>Office Open XML Visio Stencil (macro-enabled)</_comment>
+ <glob pattern="*.vssm"/>
+ <sub-class-of type="application/x-tika-visio-ooxml"/>
+ </mime-type>
+
<mime-type type="application/vnd.visionary">
<glob pattern="*.vis"/>
</mime-type>
@@ -3737,6 +3769,11 @@
<_comment>Password Protected OOXML File</_comment>
</mime-type>
+ <mime-type type="application/x-tika-visio-ooxml">
+ <sub-class-of type="application/x-tika-ooxml"/>
+ <_comment>Visio OOXML File</_comment>
+ </mime-type>
+
<!-- Older StarOffice formats extend up the Microsoft OLE2 format -->
<mime-type type="application/x-tika-staroffice">
<sub-class-of type="application/x-tika-msoffice"/>
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1665917&r1=1665916&r2=1665917&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java Wed Mar 11 16:03:23 2015
@@ -92,6 +92,7 @@ public class TestContainerAwareDetector
assertTypeByData("testPUBLISHER.pub", "application/x-mspublisher");
assertTypeByData("testWORKS.wps", "application/vnd.ms-works");
assertTypeByData("testWORKS2000.wps", "application/vnd.ms-works");
+
// older Works Word Processor files can't be recognized
// they were created with Works Word Processor 7.0 (hence the text inside)
// and exported to the older formats with the "Save As" feature
@@ -100,6 +101,7 @@ public class TestContainerAwareDetector
assertTypeByData("testWORKSSpreadsheet7.0.xlr", "application/x-tika-msworks-spreadsheet");
assertTypeByData("testPROJECT2003.mpp", "application/vnd.ms-project");
assertTypeByData("testPROJECT2007.mpp", "application/vnd.ms-project");
+
// Excel95 can be detected by not parsed
assertTypeByData("testEXCEL_95.xls", "application/vnd.ms-excel");
@@ -213,6 +215,16 @@ public class TestContainerAwareDetector
assertTypeByData("testDOTM.dotm", "application/vnd.ms-word.template.macroEnabled.12");
assertTypeByData("testPPT.xps", "application/vnd.ms-xpsdocument");
+ // TODO Support detecting the Visio OOXML files
+/*
+ assertTypeByData("testVISIO.vsdm", "application/vnd.ms-visio.drawing.macroenabled.main+xml");
+ assertTypeByData("testVISIO.vsdx", "application/vnd.ms-visio.drawing.main+xml");
+ assertTypeByData("testVISIO.vssm", "application/vnd.ms-visio.stencil.macroenabled.main+xml");
+ assertTypeByData("testVISIO.vssx", "application/vnd.ms-visio.stencil.main+xml");
+ assertTypeByData("testVISIO.vstm", "application/vnd.ms-visio.template.macroenabled.main+xml");
+ assertTypeByData("testVISIO.vstx", "application/vnd.ms-visio.template.main+xml");
+*/
+
// .xlsb is an OOXML file containing the binary parts, and not
// an OLE2 file as you might initially expect!
assertTypeByData("testEXCEL.xlsb", "application/vnd.ms-excel.sheet.binary.macroEnabled.12");
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=1665917&r1=1665916&r2=1665917&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java Wed Mar 11 16:03:23 2015
@@ -282,6 +282,40 @@ public class TestMimeTypes {
assertTypeByNameAndData("application/vnd.ms-powerpoint.template.macroenabled.12", "testPPT.potm");
assertTypeByNameAndData("application/vnd.ms-powerpoint.slideshow.macroenabled.12", "testPPT.ppsm");
}
+
+ /**
+ * Note - these are container-based formats, so container detection
+ * is needed to identify them fully correctly
+ */
+ @Test
+ public void testVisioDetection() throws Exception {
+ // By Name, should get it right
+ assertTypeByName("application/vnd.visio", "testVISIO.vsd");
+ assertTypeByName("application/vnd.ms-visio.drawing.macroenabled.main+xml", "testVISIO.vsdm");
+ assertTypeByName("application/vnd.ms-visio.drawing.main+xml", "testVISIO.vsdx");
+ assertTypeByName("application/vnd.ms-visio.stencil.macroenabled.main+xml", "testVISIO.vssm");
+ assertTypeByName("application/vnd.ms-visio.stencil.main+xml", "testVISIO.vssx");
+ assertTypeByName("application/vnd.ms-visio.template.macroenabled.main+xml", "testVISIO.vstm");
+ assertTypeByName("application/vnd.ms-visio.template.main+xml", "testVISIO.vstx");
+
+ // By Name and Data, should get it right
+ assertTypeByNameAndData("application/vnd.visio", "testVISIO.vsd");
+ assertTypeByNameAndData("application/vnd.ms-visio.drawing.macroenabled.main+xml", "testVISIO.vsdm");
+ assertTypeByNameAndData("application/vnd.ms-visio.drawing.main+xml", "testVISIO.vsdx");
+ assertTypeByNameAndData("application/vnd.ms-visio.stencil.macroenabled.main+xml", "testVISIO.vssm");
+ assertTypeByNameAndData("application/vnd.ms-visio.stencil.main+xml", "testVISIO.vssx");
+ assertTypeByNameAndData("application/vnd.ms-visio.template.macroenabled.main+xml", "testVISIO.vstm");
+ assertTypeByNameAndData("application/vnd.ms-visio.template.main+xml", "testVISIO.vstx");
+
+ // By Data only, will get the container parent
+ assertTypeByData("application/x-tika-msoffice", "testVISIO.vsd");
+ assertTypeByData("application/x-tika-ooxml", "testVISIO.vsdm");
+ assertTypeByData("application/x-tika-ooxml", "testVISIO.vsdx");
+ assertTypeByData("application/x-tika-ooxml", "testVISIO.vssm");
+ assertTypeByData("application/x-tika-ooxml", "testVISIO.vssx");
+ assertTypeByData("application/x-tika-ooxml", "testVISIO.vstm");
+ assertTypeByData("application/x-tika-ooxml", "testVISIO.vstx");
+ }
/**
* Note - detecting container formats by mime magic is very very