You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2017/10/18 14:51:57 UTC
[tika] 03/03: Add notes on why we can't get the Numbers or Pages
type just yet - need to call out to another library or decode the
Document.iwa snappy stream ourselves
This is an automated email from the ASF dual-hosted git repository.
nick pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 0d92bc862c3c344d65d3f6c260b0f5ea4c389fc0
Author: Nick Burch <ni...@apache.org>
AuthorDate: Wed Oct 18 15:50:59 2017 +0100
Add notes on why we can't get the Numbers or Pages type just yet - need to call out to another library or decode the Document.iwa snappy stream ourselves
---
.../parser/iwork/iwana/IWork13PackageParser.java | 34 +++++++++++++---------
.../tika/detect/TestContainerAwareDetector.java | 2 ++
.../tika/parser/iwork/iwana/IWork13ParserTest.java | 14 ++++-----
3 files changed, 28 insertions(+), 22 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java
index b96cc39..a090e84 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java
@@ -17,17 +17,6 @@
package org.apache.tika.parser.iwork.iwana;
-import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
-import org.apache.commons.compress.archivers.zip.ZipFile;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
@@ -38,6 +27,16 @@ import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
+import org.apache.commons.compress.archivers.zip.ZipFile;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
public class IWork13PackageParser extends AbstractParser {
public enum IWork13DocumentType {
@@ -64,16 +63,19 @@ public class IWork13PackageParser extends AbstractParser {
type = IWork13DocumentType.detectIfPossible(entry);
if (type != null) return type;
}
+
+ // If we get here, we don't know what it is
return UNKNOWN13.getType();
}
/**
* @return Specific type if this identifies one, otherwise null
*/
- public static MediaType detectIfPossible(ZipEntry entry) {
+ protected static MediaType detectIfPossible(ZipEntry entry) {
String name = entry.getName();
if (! name.endsWith(".iwa")) return null;
+ // Is it a uniquely identifying filename?
if (name.equals("Index/MasterSlide.iwa") ||
name.startsWith("Index/MasterSlide-")) {
return KEYNOTE13.getType();
@@ -82,7 +84,13 @@ public class IWork13PackageParser extends AbstractParser {
name.startsWith("Index/Slide-")) {
return KEYNOTE13.getType();
}
- //TODO: figure out how to distinguish numbers from pages
+
+ // Is it the main document?
+ if (name.equals("Index/Document.iwa")) {
+ // TODO Decode the snappy stream, and check the Message Type:
+ // = 2 (TN::SheetArchive) means it is a Numbers file;
+ // = 10000 (TP::DocumentArchive) means it is a Pages file
+ }
// Unknown
return null;
diff --git a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
index b6a79eb..e4117c4 100644
--- a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
+++ b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
@@ -320,6 +320,8 @@ public class TestContainerAwareDetector {
public void testDetectIWork2013() throws Exception {
assertTypeByData("testKeynote2013.key",
IWork13PackageParser.IWork13DocumentType.KEYNOTE13.getType().toString());
+ // Without decoding the Document.iwa snappy stream, we can't tell the
+ // difference between these two just based on the zip entries
assertTypeByData("testNumbers2013.numbers",
IWork13PackageParser.IWork13DocumentType.UNKNOWN13.getType().toString());
assertTypeByData("testPages2013.pages",
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/iwork/iwana/IWork13ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/iwork/iwana/IWork13ParserTest.java
index 4bbbcbf..60477a5 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/iwork/iwana/IWork13ParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/iwork/iwana/IWork13ParserTest.java
@@ -16,16 +16,11 @@
*/
package org.apache.tika.parser.iwork.iwana;
-import static org.apache.tika.TikaTest.assertContains;
import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
import java.io.InputStream;
-import java.util.Arrays;
-import java.util.List;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
@@ -56,8 +51,7 @@ public class IWork13ParserTest {
ContentHandler handler = new BodyContentHandler();
iWorkParser.parse(input, handler, metadata, parseContext);
- // Currently parsing is a no-op
- // Will only get type
+ // Currently parsing is a no-op, so will only get the Type
assertEquals(1, metadata.size());
assertEquals("", handler.toString());
assertEquals(IWork13PackageParser.IWork13DocumentType.KEYNOTE13.getType().toString(),
@@ -71,7 +65,8 @@ public class IWork13ParserTest {
ContentHandler handler = new BodyContentHandler();
iWorkParser.parse(input, handler, metadata, parseContext);
- // Currently parsing is a no-op
+ // Currently parsing is a no-op, and we can't get the type without
+ // decoding the Snappy stream
// TODO Test properly when a full Parser is added
assertEquals(0, metadata.size());
assertEquals("", handler.toString());
@@ -84,7 +79,8 @@ public class IWork13ParserTest {
ContentHandler handler = new BodyContentHandler();
iWorkParser.parse(input, handler, metadata, parseContext);
- // Currently parsing is a no-op
+ // Currently parsing is a no-op, and we can't get the type without
+ // decoding the Snappy stream
// TODO Test properly when a full Parser is added
assertEquals(0, metadata.size());
assertEquals("", handler.toString());
--
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.