You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2017/10/18 14:51:56 UTC

[tika] 02/03: Have the iWorks 13 parser set the content type on the metadata if possible, otherwise remains no-op

This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 5c7547bac9208082920859a5040a8b9fa31da642
Author: Nick Burch <ni...@apache.org>
AuthorDate: Wed Oct 18 14:59:35 2017 +0100

    Have the iWorks 13 parser set the content type on the metadata if possible, otherwise remains no-op
---
 .../parser/iwork/iwana/IWork13PackageParser.java   | 81 +++++++++++++++++++---
 .../tika/parser/pkg/ZipContainerDetector.java      |  1 -
 .../tika/parser/iwork/iwana/IWork13ParserTest.java |  6 +-
 3 files changed, 77 insertions(+), 11 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java
index 637b51b..b96cc39 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java
@@ -20,6 +20,7 @@ package org.apache.tika.parser.iwork.iwana;
 import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
 import org.apache.commons.compress.archivers.zip.ZipFile;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
@@ -31,8 +32,11 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.Enumeration;
 import java.util.HashSet;
 import java.util.Set;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
 
 public class IWork13PackageParser extends AbstractParser {
 
@@ -53,13 +57,35 @@ public class IWork13PackageParser extends AbstractParser {
         }
 
         public static MediaType detect(ZipFile zipFile) {
-            ZipArchiveEntry entry = zipFile.getEntry("Index/MasterSlide.iwa");
-            if (zipFile.getEntry("Index/MasterSlide.iwa") != null ||
-                    zipFile.getEntry("Index/Slide.iwa") != null) {
-                return KEYNOTE13.getType();
-            }
-            //TODO: figure out how to distinguish numbers from pages
-            return UNKNOWN13.getType();
+           MediaType type = null;
+           Enumeration<? extends ZipEntry> entries = zipFile.getEntries();
+           while (entries.hasMoreElements()) {
+              ZipEntry entry = entries.nextElement();
+              type = IWork13DocumentType.detectIfPossible(entry);
+              if (type != null) return type;
+           }
+           return UNKNOWN13.getType();
+        }
+        
+        /**
+         * @return Specific type if this identifies one, otherwise null
+         */
+        public static MediaType detectIfPossible(ZipEntry entry) {
+           String name = entry.getName();
+           if (! name.endsWith(".iwa")) return null;
+
+           if (name.equals("Index/MasterSlide.iwa") ||
+               name.startsWith("Index/MasterSlide-")) {
+              return KEYNOTE13.getType();
+           }
+           if (name.equals("Index/Slide.iwa") ||
+               name.startsWith("Index/Slide-")) {
+              return KEYNOTE13.getType();
+           }
+           //TODO: figure out how to distinguish numbers from pages
+
+           // Unknown
+           return null;
         }
     }
 
@@ -81,6 +107,45 @@ public class IWork13PackageParser extends AbstractParser {
 
     @Override
     public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
-        //no-op for now
+       // Open the Zip stream
+       // Use a File if we can, and an already open zip is even better
+       ZipFile zipFile = null;
+       ZipInputStream zipStream = null;
+       if (stream instanceof TikaInputStream) {
+          TikaInputStream tis = (TikaInputStream) stream;
+          Object container = ((TikaInputStream) stream).getOpenContainer();
+          if (container instanceof ZipFile) {
+             zipFile = (ZipFile) container;
+          } else if (tis.hasFile()) {
+             zipFile = new ZipFile(tis.getFile());
+          } else {
+             zipStream = new ZipInputStream(stream);
+          }
+       } else {
+          zipStream = new ZipInputStream(stream);
+       }
+
+       // For now, just detect
+       MediaType type = null;
+       if (zipFile != null) {
+          Enumeration<? extends ZipEntry> entries = zipFile.getEntries();
+          while (entries.hasMoreElements()) {
+             ZipEntry entry = entries.nextElement();
+             if (type == null) {
+                type = IWork13DocumentType.detectIfPossible(entry);
+             }
+          }
+       } else {
+          ZipEntry entry = zipStream.getNextEntry();
+          while (entry != null) {
+             if (type == null) {
+                type = IWork13DocumentType.detectIfPossible(entry);
+             }
+             entry = zipStream.getNextEntry();
+          }
+       }
+       if (type != null) {
+          metadata.add(Metadata.CONTENT_TYPE, type.toString());
+       }
     }
 }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
index 3f9211b..9a5befa 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
@@ -292,7 +292,6 @@ public class ZipContainerDetector implements Detector {
             return IWork13PackageParser.IWork13DocumentType.detect(zip);
         }
         return null;
-
     }
 
     private static MediaType detectIWork(ZipFile zip) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/iwork/iwana/IWork13ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/iwork/iwana/IWork13ParserTest.java
index c671253..4bbbcbf 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/iwork/iwana/IWork13ParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/iwork/iwana/IWork13ParserTest.java
@@ -57,9 +57,11 @@ public class IWork13ParserTest {
         iWorkParser.parse(input, handler, metadata, parseContext);
         
         // Currently parsing is a no-op
-        // TODO Test properly when a full Parser is added
-        assertEquals(0, metadata.size());
+        // Will only get type
+        assertEquals(1, metadata.size());
         assertEquals("", handler.toString());
+        assertEquals(IWork13PackageParser.IWork13DocumentType.KEYNOTE13.getType().toString(),
+                     metadata.get(Metadata.CONTENT_TYPE));
     }
     
     @Test

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.