You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/22 15:56:45 UTC

tika git commit: TIKA-1358 add preliminary detection for iWorks 2013 file types

Repository: tika
Updated Branches:
  refs/heads/master a46ffacf1 -> d6981ad81


TIKA-1358 add preliminary detection for iWorks 2013 file types


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/d6981ad8
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/d6981ad8
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/d6981ad8

Branch: refs/heads/master
Commit: d6981ad81334eb20174004d7c0d96acd9f1d2f12
Parents: a46ffac
Author: tballison <ta...@mitre.org>
Authored: Wed Jun 22 11:56:37 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Jun 22 11:56:37 2016 -0400

----------------------------------------------------------------------
 .../iwork/iwana/IWork13PackageParser.java       |  86 +++++++++++++++++++
 .../tika/parser/pkg/ZipContainerDetector.java   |  12 +++
 .../tika/detect/TestContainerAwareDetector.java |  11 +++
 .../test-documents/testKeynote2013.key          | Bin 0 -> 274397 bytes
 .../resources/test-documents/testKeynoteNew.key | Bin 274397 -> 0 bytes
 .../test-documents/testNumbers2013.numbers      | Bin 0 -> 179147 bytes
 .../test-documents/testNumbersNew.numbers       | Bin 179147 -> 0 bytes
 .../test-documents/testPages2013.pages          | Bin 0 -> 237567 bytes
 .../resources/test-documents/testPagesNew.pages | Bin 237567 -> 0 bytes
 9 files changed, 109 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/d6981ad8/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java
new file mode 100644
index 0000000..637b51b
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.iwork.iwana;
+
+import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipFile;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+/** Parser stub for iWork 2013 packages: parse() is a no-op for now; only type detection is implemented. */
+public class IWork13PackageParser extends AbstractParser {
+    public enum IWork13DocumentType {
+        KEYNOTE13(MediaType.application("vnd.apple.keynote.13")),
+        NUMBERS13(MediaType.application("vnd.apple.numbers.13")),
+        PAGES13(MediaType.application("vnd.apple.pages.13")),
+        UNKNOWN13(MediaType.application("vnd.apple.unknown.13"));
+
+        private final MediaType mediaType;
+
+        IWork13DocumentType(MediaType mediaType) {
+            this.mediaType = mediaType;
+        }
+
+        public MediaType getType() {
+            return mediaType;
+        }
+
+        /** Returns the Keynote type if the zip has a MasterSlide or Slide index entry, else UNKNOWN13. */
+        public static MediaType detect(ZipFile zipFile) {
+            if (zipFile.getEntry("Index/MasterSlide.iwa") != null ||
+                    zipFile.getEntry("Index/Slide.iwa") != null) {
+                return KEYNOTE13.getType();
+            }
+            //TODO: figure out how to distinguish numbers from pages
+            return UNKNOWN13.getType();
+        }
+    }
+
+    /**
+     * All iWork 13 files contain this, so we can detect based on it
+     */
+    public final static String IWORK13_COMMON_ENTRY = "Metadata/BuildVersionHistory.plist";
+
+    private final static Set<MediaType> supportedTypes = Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+            IWork13DocumentType.KEYNOTE13.getType(),
+            IWork13DocumentType.NUMBERS13.getType(),
+            IWork13DocumentType.PAGES13.getType()
+            )));
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return supportedTypes;
+    }
+
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
+        //no-op for now
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/d6981ad8/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
index 12f22bc..d43a17c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
@@ -50,6 +50,7 @@ import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.iwork.IWorkPackageParser;
 import org.apache.tika.parser.iwork.IWorkPackageParser.IWORKDocumentType;
+import org.apache.tika.parser.iwork.iwana.IWork13PackageParser;
 
 import static java.nio.charset.StandardCharsets.UTF_8;
 
@@ -147,6 +148,9 @@ public class ZipContainerDetector implements Detector {
                     type = detectOPCBased(zip, tis);
                 }
                 if (type == null) {
+                    type = detectIWork13(zip);
+                }
+                if (type == null) {
                     type = detectIWork(zip);
                 }
                 if (type == null) {
@@ -300,6 +304,14 @@ public class ZipContainerDetector implements Detector {
         }
     }
 
+    /** Returns the detected iWork 2013 media type, or null if the common iWork 13 entry is absent. */
+    private static MediaType detectIWork13(ZipFile zip) {
+        if (zip.getEntry(IWork13PackageParser.IWORK13_COMMON_ENTRY) == null) {
+            return null;
+        }
+        return IWork13PackageParser.IWork13DocumentType.detect(zip);
+    }
+
     private static MediaType detectIWork(ZipFile zip) {
         if (zip.getEntry(IWorkPackageParser.IWORK_COMMON_ENTRY) != null) {
             // Locate the appropriate index file entry, and reads from that

http://git-wip-us.apache.org/repos/asf/tika/blob/d6981ad8/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
index 5787408..828c55c 100644
--- a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
+++ b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
@@ -31,6 +31,7 @@ import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.mime.MimeTypes;
+import org.apache.tika.parser.iwork.iwana.IWork13PackageParser;
 import org.junit.Test;
 
 /**
@@ -316,6 +317,16 @@ public class TestContainerAwareDetector {
     }
 
     @Test
+    public void testDetectIWork2013() throws Exception {
+        String keynote = IWork13PackageParser.IWork13DocumentType.KEYNOTE13.getType().toString();
+        String unknown = IWork13PackageParser.IWork13DocumentType.UNKNOWN13.getType().toString();
+        // Numbers and Pages are not told apart by detection yet, so both are expected as UNKNOWN13.
+        assertTypeByData("testKeynote2013.key", keynote);
+        assertTypeByData("testNumbers2013.numbers", unknown);
+        assertTypeByData("testPages2013.pages", unknown);
+    }
+
+    @Test
     public void testDetectKMZ() throws Exception {
        assertTypeByData("testKMZ.kmz", "application/vnd.google-earth.kmz");
     }

http://git-wip-us.apache.org/repos/asf/tika/blob/d6981ad8/tika-parsers/src/test/resources/test-documents/testKeynote2013.key
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testKeynote2013.key b/tika-parsers/src/test/resources/test-documents/testKeynote2013.key
new file mode 100644
index 0000000..d0dd416
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testKeynote2013.key differ

http://git-wip-us.apache.org/repos/asf/tika/blob/d6981ad8/tika-parsers/src/test/resources/test-documents/testKeynoteNew.key
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testKeynoteNew.key b/tika-parsers/src/test/resources/test-documents/testKeynoteNew.key
deleted file mode 100644
index d0dd416..0000000
Binary files a/tika-parsers/src/test/resources/test-documents/testKeynoteNew.key and /dev/null differ

http://git-wip-us.apache.org/repos/asf/tika/blob/d6981ad8/tika-parsers/src/test/resources/test-documents/testNumbers2013.numbers
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testNumbers2013.numbers b/tika-parsers/src/test/resources/test-documents/testNumbers2013.numbers
new file mode 100644
index 0000000..3f9a013
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testNumbers2013.numbers differ

http://git-wip-us.apache.org/repos/asf/tika/blob/d6981ad8/tika-parsers/src/test/resources/test-documents/testNumbersNew.numbers
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testNumbersNew.numbers b/tika-parsers/src/test/resources/test-documents/testNumbersNew.numbers
deleted file mode 100644
index 3f9a013..0000000
Binary files a/tika-parsers/src/test/resources/test-documents/testNumbersNew.numbers and /dev/null differ

http://git-wip-us.apache.org/repos/asf/tika/blob/d6981ad8/tika-parsers/src/test/resources/test-documents/testPages2013.pages
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testPages2013.pages b/tika-parsers/src/test/resources/test-documents/testPages2013.pages
new file mode 100644
index 0000000..b82ac7a
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPages2013.pages differ

http://git-wip-us.apache.org/repos/asf/tika/blob/d6981ad8/tika-parsers/src/test/resources/test-documents/testPagesNew.pages
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testPagesNew.pages b/tika-parsers/src/test/resources/test-documents/testPagesNew.pages
deleted file mode 100644
index b82ac7a..0000000
Binary files a/tika-parsers/src/test/resources/test-documents/testPagesNew.pages and /dev/null differ