You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2017/10/18 14:51:56 UTC
[tika] 02/03: Have the iWorks 13 parser set the content type on the
metadata if possible, otherwise remains no-op
This is an automated email from the ASF dual-hosted git repository.
nick pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 5c7547bac9208082920859a5040a8b9fa31da642
Author: Nick Burch <ni...@apache.org>
AuthorDate: Wed Oct 18 14:59:35 2017 +0100
Have the iWorks 13 parser set the content type on the metadata if possible, otherwise remains no-op
---
.../parser/iwork/iwana/IWork13PackageParser.java | 81 +++++++++++++++++++---
.../tika/parser/pkg/ZipContainerDetector.java | 1 -
.../tika/parser/iwork/iwana/IWork13ParserTest.java | 6 +-
3 files changed, 77 insertions(+), 11 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java
index 637b51b..b96cc39 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java
@@ -20,6 +20,7 @@ package org.apache.tika.parser.iwork.iwana;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipFile;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
@@ -31,8 +32,11 @@ import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
+import java.util.Enumeration;
import java.util.HashSet;
import java.util.Set;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
public class IWork13PackageParser extends AbstractParser {
@@ -53,13 +57,35 @@ public class IWork13PackageParser extends AbstractParser {
}
public static MediaType detect(ZipFile zipFile) {
- ZipArchiveEntry entry = zipFile.getEntry("Index/MasterSlide.iwa");
- if (zipFile.getEntry("Index/MasterSlide.iwa") != null ||
- zipFile.getEntry("Index/Slide.iwa") != null) {
- return KEYNOTE13.getType();
- }
- //TODO: figure out how to distinguish numbers from pages
- return UNKNOWN13.getType();
+ MediaType type = null;
+ Enumeration<? extends ZipEntry> entries = zipFile.getEntries();
+ while (entries.hasMoreElements()) {
+ ZipEntry entry = entries.nextElement();
+ type = IWork13DocumentType.detectIfPossible(entry);
+ if (type != null) return type;
+ }
+ return UNKNOWN13.getType();
+ }
+
+ /**
+ * @return Specific type if this identifies one, otherwise null
+ */
+ public static MediaType detectIfPossible(ZipEntry entry) {
+ String name = entry.getName();
+ if (! name.endsWith(".iwa")) return null;
+
+ if (name.equals("Index/MasterSlide.iwa") ||
+ name.startsWith("Index/MasterSlide-")) {
+ return KEYNOTE13.getType();
+ }
+ if (name.equals("Index/Slide.iwa") ||
+ name.startsWith("Index/Slide-")) {
+ return KEYNOTE13.getType();
+ }
+ //TODO: figure out how to distinguish numbers from pages
+
+ // Unknown
+ return null;
}
}
@@ -81,6 +107,45 @@ public class IWork13PackageParser extends AbstractParser {
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
- //no-op for now
+ // Open the Zip stream
+ // Use a File if we can, and an already open zip is even better
+ ZipFile zipFile = null;
+ ZipInputStream zipStream = null;
+ if (stream instanceof TikaInputStream) {
+ TikaInputStream tis = (TikaInputStream) stream;
+ Object container = ((TikaInputStream) stream).getOpenContainer();
+ if (container instanceof ZipFile) {
+ zipFile = (ZipFile) container;
+ } else if (tis.hasFile()) {
+ zipFile = new ZipFile(tis.getFile());
+ } else {
+ zipStream = new ZipInputStream(stream);
+ }
+ } else {
+ zipStream = new ZipInputStream(stream);
+ }
+
+ // For now, just detect
+ MediaType type = null;
+ if (zipFile != null) {
+ Enumeration<? extends ZipEntry> entries = zipFile.getEntries();
+ while (entries.hasMoreElements()) {
+ ZipEntry entry = entries.nextElement();
+ if (type == null) {
+ type = IWork13DocumentType.detectIfPossible(entry);
+ }
+ }
+ } else {
+ ZipEntry entry = zipStream.getNextEntry();
+ while (entry != null) {
+ if (type == null) {
+ type = IWork13DocumentType.detectIfPossible(entry);
+ }
+ entry = zipStream.getNextEntry();
+ }
+ }
+ if (type != null) {
+ metadata.add(Metadata.CONTENT_TYPE, type.toString());
+ }
}
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
index 3f9211b..9a5befa 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
@@ -292,7 +292,6 @@ public class ZipContainerDetector implements Detector {
return IWork13PackageParser.IWork13DocumentType.detect(zip);
}
return null;
-
}
private static MediaType detectIWork(ZipFile zip) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/iwork/iwana/IWork13ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/iwork/iwana/IWork13ParserTest.java
index c671253..4bbbcbf 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/iwork/iwana/IWork13ParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/iwork/iwana/IWork13ParserTest.java
@@ -57,9 +57,11 @@ public class IWork13ParserTest {
iWorkParser.parse(input, handler, metadata, parseContext);
// Currently parsing is a no-op
- // TODO Test properly when a full Parser is added
- assertEquals(0, metadata.size());
+ // Will only get type
+ assertEquals(1, metadata.size());
assertEquals("", handler.toString());
+ assertEquals(IWork13PackageParser.IWork13DocumentType.KEYNOTE13.getType().toString(),
+ metadata.get(Metadata.CONTENT_TYPE));
}
@Test
--
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.