You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/03/11 17:49:19 UTC
svn commit: r1665940 - in /tika/trunk:
tika-core/src/main/resources/org/apache/tika/mime/
tika-parsers/src/main/java/org/apache/tika/parser/pkg/
tika-parsers/src/test/java/org/apache/tika/detect/
tika-parsers/src/test/java/org/apache/tika/mime/
Author: nick
Date: Wed Mar 11 16:49:18 2015
New Revision: 1665940
URL: http://svn.apache.org/r1665940
Log:
TIKA-1286 Bring the overall file mime types into line with the other OOXML formats, and add container-aware detection + tests for the Visio OOXML types
Modified:
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1665940&r1=1665939&r2=1665940&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Wed Mar 11 16:49:18 2015
@@ -2389,32 +2389,32 @@
<sub-class-of type="application/x-tika-msoffice"/>
</mime-type>
- <mime-type type="application/vnd.ms-visio.drawing.main+xml">
+ <mime-type type="application/vnd.ms-visio.drawing">
<_comment>Office Open XML Visio Drawing (macro-free)</_comment>
<glob pattern="*.vsdx"/>
<sub-class-of type="application/x-tika-visio-ooxml"/>
</mime-type>
- <mime-type type="application/vnd.ms-visio.template.main+xml">
+ <mime-type type="application/vnd.ms-visio.template">
<_comment>Office Open XML Visio Template (macro-free)</_comment>
<glob pattern="*.vstx"/>
<sub-class-of type="application/x-tika-visio-ooxml"/>
</mime-type>
- <mime-type type="application/vnd.ms-visio.stencil.main+xml">
+ <mime-type type="application/vnd.ms-visio.stencil">
<_comment>Office Open XML Visio Stencil (macro-free)</_comment>
<glob pattern="*.vssx"/>
<sub-class-of type="application/x-tika-visio-ooxml"/>
</mime-type>
- <mime-type type="application/vnd.ms-visio.drawing.macroEnabled.main+xml">
+ <mime-type type="application/vnd.ms-visio.drawing.macroEnabled.12">
<_comment>Office Open XML Visio Drawing (macro-enabled)</_comment>
<glob pattern="*.vsdm"/>
<sub-class-of type="application/x-tika-visio-ooxml"/>
</mime-type>
- <mime-type type="application/vnd.ms-visio.template.macroEnabled.main+xml">
+ <mime-type type="application/vnd.ms-visio.template.macroEnabled.12">
<_comment>Office Open XML Visio Template (macro-enabled)</_comment>
<glob pattern="*.vstm"/>
<sub-class-of type="application/x-tika-visio-ooxml"/>
</mime-type>
- <mime-type type="application/vnd.ms-visio.stencil.macroEnabled.main+xml">
+ <mime-type type="application/vnd.ms-visio.stencil.macroEnabled.12">
<_comment>Office Open XML Visio Stencil (macro-enabled)</_comment>
<glob pattern="*.vssm"/>
<sub-class-of type="application/x-tika-visio-ooxml"/>
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java?rev=1665940&r1=1665939&r2=1665940&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java Wed Mar 11 16:49:18 2015
@@ -58,6 +58,10 @@ import org.apache.tika.parser.iwork.IWor
public class ZipContainerDetector implements Detector {
private static final Pattern MACRO_TEMPLATE_PATTERN = Pattern.compile("macroenabledtemplate$", Pattern.CASE_INSENSITIVE);
+ // TODO Remove this constant once we upgrade to POI 3.12 beta 2, it is defined in ExtractorFactory there
+ private static final String VISIO_DOCUMENT_REL =
+ "http://schemas.microsoft.com/visio/2010/relationships/document";
+
/** Serial version UID */
private static final long serialVersionUID = 2891763938430295453L;
@@ -231,8 +235,15 @@ public class ZipContainerDetector implem
* opened Package
*/
public static MediaType detectOfficeOpenXML(OPCPackage pkg) {
+ // Check for the normal Office core document
PackageRelationshipCollection core =
pkg.getRelationshipsByType(ExtractorFactory.CORE_DOCUMENT_REL);
+ // Otherwise check for some other Office core document types
+ if (core.size() == 0) {
+ core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
+ }
+
+ // If we didn't find a single core document of any type, skip detection
if (core.size() != 1) {
// Invalid OOXML Package received
return null;
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1665940&r1=1665939&r2=1665940&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java Wed Mar 11 16:49:18 2015
@@ -215,15 +215,12 @@ public class TestContainerAwareDetector
assertTypeByData("testDOTM.dotm", "application/vnd.ms-word.template.macroEnabled.12");
assertTypeByData("testPPT.xps", "application/vnd.ms-xpsdocument");
- // TODO Support detecting the Visio OOXML files
-/*
- assertTypeByData("testVISIO.vsdm", "application/vnd.ms-visio.drawing.macroenabled.main+xml");
- assertTypeByData("testVISIO.vsdx", "application/vnd.ms-visio.drawing.main+xml");
- assertTypeByData("testVISIO.vssm", "application/vnd.ms-visio.stencil.macroenabled.main+xml");
- assertTypeByData("testVISIO.vssx", "application/vnd.ms-visio.stencil.main+xml");
- assertTypeByData("testVISIO.vstm", "application/vnd.ms-visio.template.macroenabled.main+xml");
- assertTypeByData("testVISIO.vstx", "application/vnd.ms-visio.template.main+xml");
-*/
+ assertTypeByData("testVISIO.vsdm", "application/vnd.ms-visio.drawing.macroenabled.12");
+ assertTypeByData("testVISIO.vsdx", "application/vnd.ms-visio.drawing");
+ assertTypeByData("testVISIO.vssm", "application/vnd.ms-visio.stencil.macroenabled.12");
+ assertTypeByData("testVISIO.vssx", "application/vnd.ms-visio.stencil");
+ assertTypeByData("testVISIO.vstm", "application/vnd.ms-visio.template.macroenabled.12");
+ assertTypeByData("testVISIO.vstx", "application/vnd.ms-visio.template");
// .xlsb is an OOXML file containing the binary parts, and not
// an OLE2 file as you might initially expect!
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=1665940&r1=1665939&r2=1665940&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java Wed Mar 11 16:49:18 2015
@@ -291,21 +291,21 @@ public class TestMimeTypes {
public void testVisioDetection() throws Exception {
// By Name, should get it right
assertTypeByName("application/vnd.visio", "testVISIO.vsd");
- assertTypeByName("application/vnd.ms-visio.drawing.macroenabled.main+xml", "testVISIO.vsdm");
- assertTypeByName("application/vnd.ms-visio.drawing.main+xml", "testVISIO.vsdx");
- assertTypeByName("application/vnd.ms-visio.stencil.macroenabled.main+xml", "testVISIO.vssm");
- assertTypeByName("application/vnd.ms-visio.stencil.main+xml", "testVISIO.vssx");
- assertTypeByName("application/vnd.ms-visio.template.macroenabled.main+xml", "testVISIO.vstm");
- assertTypeByName("application/vnd.ms-visio.template.main+xml", "testVISIO.vstx");
+ assertTypeByName("application/vnd.ms-visio.drawing.macroenabled.12", "testVISIO.vsdm");
+ assertTypeByName("application/vnd.ms-visio.drawing", "testVISIO.vsdx");
+ assertTypeByName("application/vnd.ms-visio.stencil.macroenabled.12", "testVISIO.vssm");
+ assertTypeByName("application/vnd.ms-visio.stencil", "testVISIO.vssx");
+ assertTypeByName("application/vnd.ms-visio.template.macroenabled.12", "testVISIO.vstm");
+ assertTypeByName("application/vnd.ms-visio.template", "testVISIO.vstx");
// By Name and Data, should get it right
assertTypeByNameAndData("application/vnd.visio", "testVISIO.vsd");
- assertTypeByNameAndData("application/vnd.ms-visio.drawing.macroenabled.main+xml", "testVISIO.vsdm");
- assertTypeByNameAndData("application/vnd.ms-visio.drawing.main+xml", "testVISIO.vsdx");
- assertTypeByNameAndData("application/vnd.ms-visio.stencil.macroenabled.main+xml", "testVISIO.vssm");
- assertTypeByNameAndData("application/vnd.ms-visio.stencil.main+xml", "testVISIO.vssx");
- assertTypeByNameAndData("application/vnd.ms-visio.template.macroenabled.main+xml", "testVISIO.vstm");
- assertTypeByNameAndData("application/vnd.ms-visio.template.main+xml", "testVISIO.vstx");
+ assertTypeByNameAndData("application/vnd.ms-visio.drawing.macroenabled.12", "testVISIO.vsdm");
+ assertTypeByNameAndData("application/vnd.ms-visio.drawing", "testVISIO.vsdx");
+ assertTypeByNameAndData("application/vnd.ms-visio.stencil.macroenabled.12", "testVISIO.vssm");
+ assertTypeByNameAndData("application/vnd.ms-visio.stencil", "testVISIO.vssx");
+ assertTypeByNameAndData("application/vnd.ms-visio.template.macroenabled.12", "testVISIO.vstm");
+ assertTypeByNameAndData("application/vnd.ms-visio.template", "testVISIO.vstx");
// By Data only, will get the container parent
assertTypeByData("application/x-tika-msoffice", "testVISIO.vsd");