You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2016/06/23 13:27:40 UTC

tika git commit: Detection magic for POI-generated OOXML files, which have _rels before content type, plus test

Repository: tika
Updated Branches:
  refs/heads/master d6981ad81 -> 52ea9ba7c


Detection magic for POI-generated OOXML files, which have _rels before content type, plus test


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/52ea9ba7
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/52ea9ba7
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/52ea9ba7

Branch: refs/heads/master
Commit: 52ea9ba7c2e3c99e7a2d4fb38875caa996438857
Parents: d6981ad
Author: Nick Burch <ni...@gagravarr.org>
Authored: Thu Jun 23 14:27:14 2016 +0100
Committer: Nick Burch <ni...@gagravarr.org>
Committed: Thu Jun 23 14:27:14 2016 +0100

----------------------------------------------------------------------
 .../org/apache/tika/mime/tika-mimetypes.xml         |   3 ++-
 .../java/org/apache/tika/mime/TestMimeTypes.java    |   5 +++++
 .../resources/test-documents/testEXCEL_poi.xlsx     | Bin 0 -> 3360 bytes
 3 files changed, 7 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/52ea9ba7/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index a94f188..b39f529 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -3989,10 +3989,11 @@
   <!-- =================================================================== -->
   <mime-type type="application/x-tika-ooxml">
     <sub-class-of type="application/zip"/>
-    <!-- Only works if the Content Types file is the first zip entry -->
+    <!-- Only works if the Content Types or rels file is the first zip entry -->
     <magic priority="50">
       <match value="PK\003\004" type="string" offset="0">
         <match value="[Content_Types].xml" type="string" offset="30"/>
+        <match value="_rels/.rels" type="string" offset="30"/>
       </match>
     </magic>
   </mime-type>

http://git-wip-us.apache.org/repos/asf/tika/blob/52ea9ba7/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index 102b005..81b154c 100644
--- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -283,12 +283,17 @@ public class TestMimeTypes {
         // As such, our mime magic can't figure it out...
         assertTypeByData("application/zip", "testWORD.docx");
         
+        // POI-generated files have the rels first not Content Types
+        assertTypeByData("application/x-tika-ooxml", "testEXCEL_poi.xlsx");
+        
         // If we give the filename as well as the data, we can
         //  specialise the ooxml generic one to the correct type
         assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "testEXCEL.xlsx");
         assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.presentationml.presentation", "testPPT.pptx");
         assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "testWORD.docx");
         
+        assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "testEXCEL_poi.xlsx");
+        
         // Test a few of the less usual ones
         assertTypeByNameAndData("application/vnd.ms-excel.sheet.binary.macroenabled.12","testEXCEL.xlsb");
         assertTypeByNameAndData("application/vnd.ms-powerpoint.presentation.macroenabled.12", "testPPT.pptm");

http://git-wip-us.apache.org/repos/asf/tika/blob/52ea9ba7/tika-parsers/src/test/resources/test-documents/testEXCEL_poi.xlsx
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testEXCEL_poi.xlsx b/tika-parsers/src/test/resources/test-documents/testEXCEL_poi.xlsx
new file mode 100644
index 0000000..713fb2e
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEXCEL_poi.xlsx differ