You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/22 15:56:45 UTC
tika git commit: TIKA-1358 add preliminary detection for iWork 2013 file types
Repository: tika
Updated Branches:
refs/heads/master a46ffacf1 -> d6981ad81
TIKA-1358 add preliminary detection for iWork 2013 file types
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/d6981ad8
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/d6981ad8
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/d6981ad8
Branch: refs/heads/master
Commit: d6981ad81334eb20174004d7c0d96acd9f1d2f12
Parents: a46ffac
Author: tballison <ta...@mitre.org>
Authored: Wed Jun 22 11:56:37 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Jun 22 11:56:37 2016 -0400
----------------------------------------------------------------------
.../iwork/iwana/IWork13PackageParser.java | 86 +++++++++++++++++++
.../tika/parser/pkg/ZipContainerDetector.java | 12 +++
.../tika/detect/TestContainerAwareDetector.java | 11 +++
.../test-documents/testKeynote2013.key | Bin 0 -> 274397 bytes
.../resources/test-documents/testKeynoteNew.key | Bin 274397 -> 0 bytes
.../test-documents/testNumbers2013.numbers | Bin 0 -> 179147 bytes
.../test-documents/testNumbersNew.numbers | Bin 179147 -> 0 bytes
.../test-documents/testPages2013.pages | Bin 0 -> 237567 bytes
.../resources/test-documents/testPagesNew.pages | Bin 237567 -> 0 bytes
9 files changed, 109 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/d6981ad8/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java
new file mode 100644
index 0000000..637b51b
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.iwork.iwana;
+
+import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipFile;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Placeholder parser for iWork 2013 ("iWork 13") zip packages.
+ * <p>
+ * Only media type detection is implemented so far, via
+ * {@link IWork13DocumentType#detect(ZipFile)}; {@link #parse} currently does nothing.
+ */
+public class IWork13PackageParser extends AbstractParser {
+
+    /**
+     * Document types of the iWork 13 family, each bound to its media type.
+     */
+    public enum IWork13DocumentType {
+        KEYNOTE13(MediaType.application("vnd.apple.keynote.13")),
+        NUMBERS13(MediaType.application("vnd.apple.numbers.13")),
+        PAGES13(MediaType.application("vnd.apple.pages.13")),
+        /** Returned by {@link #detect(ZipFile)} when the package is not recognised as Keynote. */
+        UNKNOWN13(MediaType.application("vnd.apple.unknown.13"));
+
+        private final MediaType mediaType;
+
+        IWork13DocumentType(MediaType mediaType) {
+            this.mediaType = mediaType;
+        }
+
+        /**
+         * @return the media type associated with this document type
+         */
+        public MediaType getType() {
+            return mediaType;
+        }
+
+        /**
+         * Determines the media type of an iWork 13 package from the names of its entries.
+         * <p>
+         * This method does not itself look for
+         * {@link IWork13PackageParser#IWORK13_COMMON_ENTRY}; it only checks for the
+         * slide index entries that identify a Keynote presentation.
+         *
+         * @param zipFile the opened zip package to inspect
+         * @return the Keynote media type if a slide index entry is present, otherwise
+         *         the {@link #UNKNOWN13} media type; never {@code NUMBERS13} or
+         *         {@code PAGES13} at present
+         */
+        public static MediaType detect(ZipFile zipFile) {
+            // Either slide index entry marks the package as a Keynote presentation
+            if (zipFile.getEntry("Index/MasterSlide.iwa") != null ||
+                    zipFile.getEntry("Index/Slide.iwa") != null) {
+                return KEYNOTE13.getType();
+            }
+            //TODO: figure out how to distinguish numbers from pages
+            return UNKNOWN13.getType();
+        }
+    }
+
+    /**
+     * All iWork 13 files contain this, so we can detect based on it
+     */
+    public final static String IWORK13_COMMON_ENTRY = "Metadata/BuildVersionHistory.plist";
+
+    /**
+     * Media types advertised by this parser. The {@code UNKNOWN13} type is
+     * not part of this set.
+     */
+    private final static Set<MediaType> supportedTypes = Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+            IWork13DocumentType.KEYNOTE13.getType(),
+            IWork13DocumentType.NUMBERS13.getType(),
+            IWork13DocumentType.PAGES13.getType()
+    )));
+
+    /**
+     * @param context the parse context (not consulted)
+     * @return the unmodifiable set of Keynote, Numbers and Pages iWork 13 media types
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return supportedTypes;
+    }
+
+    /**
+     * Currently a no-op: nothing is read from the stream and nothing is
+     * written to the handler or the metadata.
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
+        //no-op for now
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/d6981ad8/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
index 12f22bc..d43a17c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
@@ -50,6 +50,7 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.iwork.IWorkPackageParser;
import org.apache.tika.parser.iwork.IWorkPackageParser.IWORKDocumentType;
+import org.apache.tika.parser.iwork.iwana.IWork13PackageParser;
import static java.nio.charset.StandardCharsets.UTF_8;
@@ -147,6 +148,9 @@ public class ZipContainerDetector implements Detector {
type = detectOPCBased(zip, tis);
}
if (type == null) {
+ type = detectIWork13(zip);
+ }
+ if (type == null) {
type = detectIWork(zip);
}
if (type == null) {
@@ -300,6 +304,14 @@ public class ZipContainerDetector implements Detector {
}
}
+ /**
+  * Detects iWork 2013 packages by the presence of the entry that all of
+  * them share.
+  *
+  * @param zip the opened zip container
+  * @return the media type chosen by
+  *         {@code IWork13PackageParser.IWork13DocumentType.detect}, or
+  *         {@code null} if the common iWork 13 entry is absent
+  */
+ private static MediaType detectIWork13(ZipFile zip) {
+     boolean hasCommonEntry = zip.getEntry(IWork13PackageParser.IWORK13_COMMON_ENTRY) != null;
+     if (!hasCommonEntry) {
+         // Not an iWork 13 package; let the remaining detectors have a go
+         return null;
+     }
+     return IWork13PackageParser.IWork13DocumentType.detect(zip);
+ }
+
private static MediaType detectIWork(ZipFile zip) {
if (zip.getEntry(IWorkPackageParser.IWORK_COMMON_ENTRY) != null) {
// Locate the appropriate index file entry, and reads from that
http://git-wip-us.apache.org/repos/asf/tika/blob/d6981ad8/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
index 5787408..828c55c 100644
--- a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
+++ b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
@@ -31,6 +31,7 @@ import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MimeTypes;
+import org.apache.tika.parser.iwork.iwana.IWork13PackageParser;
import org.junit.Test;
/**
@@ -316,6 +317,16 @@ public class TestContainerAwareDetector {
}
@Test
+ public void testDetectIWork2013() throws Exception {
+     // Keynote is the only iWork 2013 flavour the detector can single out so far
+     String keynoteType =
+             IWork13PackageParser.IWork13DocumentType.KEYNOTE13.getType().toString();
+     assertTypeByData("testKeynote2013.key", keynoteType);
+
+     // Numbers and Pages are both expected to come back as the generic unknown type
+     String unknownType =
+             IWork13PackageParser.IWork13DocumentType.UNKNOWN13.getType().toString();
+     assertTypeByData("testNumbers2013.numbers", unknownType);
+     assertTypeByData("testPages2013.pages", unknownType);
+ }
+
+ @Test
public void testDetectKMZ() throws Exception {
// Expects the testKMZ.kmz sample to be typed as application/vnd.google-earth.kmz
assertTypeByData("testKMZ.kmz", "application/vnd.google-earth.kmz");
}
http://git-wip-us.apache.org/repos/asf/tika/blob/d6981ad8/tika-parsers/src/test/resources/test-documents/testKeynote2013.key
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testKeynote2013.key b/tika-parsers/src/test/resources/test-documents/testKeynote2013.key
new file mode 100644
index 0000000..d0dd416
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testKeynote2013.key differ
http://git-wip-us.apache.org/repos/asf/tika/blob/d6981ad8/tika-parsers/src/test/resources/test-documents/testKeynoteNew.key
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testKeynoteNew.key b/tika-parsers/src/test/resources/test-documents/testKeynoteNew.key
deleted file mode 100644
index d0dd416..0000000
Binary files a/tika-parsers/src/test/resources/test-documents/testKeynoteNew.key and /dev/null differ
http://git-wip-us.apache.org/repos/asf/tika/blob/d6981ad8/tika-parsers/src/test/resources/test-documents/testNumbers2013.numbers
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testNumbers2013.numbers b/tika-parsers/src/test/resources/test-documents/testNumbers2013.numbers
new file mode 100644
index 0000000..3f9a013
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testNumbers2013.numbers differ
http://git-wip-us.apache.org/repos/asf/tika/blob/d6981ad8/tika-parsers/src/test/resources/test-documents/testNumbersNew.numbers
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testNumbersNew.numbers b/tika-parsers/src/test/resources/test-documents/testNumbersNew.numbers
deleted file mode 100644
index 3f9a013..0000000
Binary files a/tika-parsers/src/test/resources/test-documents/testNumbersNew.numbers and /dev/null differ
http://git-wip-us.apache.org/repos/asf/tika/blob/d6981ad8/tika-parsers/src/test/resources/test-documents/testPages2013.pages
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testPages2013.pages b/tika-parsers/src/test/resources/test-documents/testPages2013.pages
new file mode 100644
index 0000000..b82ac7a
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPages2013.pages differ
http://git-wip-us.apache.org/repos/asf/tika/blob/d6981ad8/tika-parsers/src/test/resources/test-documents/testPagesNew.pages
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testPagesNew.pages b/tika-parsers/src/test/resources/test-documents/testPagesNew.pages
deleted file mode 100644
index b82ac7a..0000000
Binary files a/tika-parsers/src/test/resources/test-documents/testPagesNew.pages and /dev/null differ