You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/06/16 18:13:57 UTC

[tika] 01/13: TIKA-3104 -- add bplist subtype detector

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 38e393e7d1b48a1490a67baf854972f30d1ed7b3
Author: tallison <ta...@apache.org>
AuthorDate: Wed Jun 3 14:02:38 2020 -0400

    TIKA-3104 -- add bplist subtype detector
---
 .../org/apache/tika/mime/tika-mimetypes.xml        |   5 +
 .../apache/tika/parser/apple/BPListDetector.java   | 114 +++++++++++++++++++++
 .../apple/{PListParser.java => BPListParser.java}  |  33 ++++--
 .../services/org.apache.tika.detect.Detector       |   1 +
 .../services/org.apache.tika.parser.Parser         |   2 +-
 .../tika/detect/TestContainerAwareDetector.java    |   7 ++
 ...{PListParserTest.java => BPListParserTest.java} |  18 +++-
 .../resources/test-documents/testMemgraph.memgraph | Bin 0 -> 646412 bytes
 8 files changed, 169 insertions(+), 11 deletions(-)

diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index f16ae5a..cf95c0d 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -3702,6 +3702,11 @@
     <glob pattern="*.iso"/>
   </mime-type>
 
+  <mime-type type="application/x-itunes-bplist">
+    <_comment>Apple iTunes Binary Property List</_comment>
+    <sub-class-of type="application/x-bplist"/>
+  </mime-type>
+
   <mime-type type="application/x-itunes-ipa">
     <sub-class-of type="application/zip"/>
     <_comment>Apple iOS IPA AppStore file</_comment>
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/apple/BPListDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/apple/BPListDetector.java
new file mode 100644
index 0000000..6631fa7
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/apple/BPListDetector.java
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.apple;
+
+import com.dd.plist.NSDictionary;
+import com.dd.plist.NSObject;
+import com.dd.plist.PropertyListFormatException;
+import com.dd.plist.PropertyListParser;
+import org.apache.poi.util.IOUtils;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.io.IOExceptionWithCause;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.xml.sax.SAXException;
+
+import javax.xml.parsers.ParserConfigurationException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.text.ParseException;
+import java.util.Set;
+
+/**
+ * Detector that wraps com.dd.plist's PList parser to identify
+ * subtypes of binary property lists (memgraph, webarchive, iTunes)
+ */
+public class BPListDetector implements Detector {
+
+
+    MediaType MEMGRAPH = MediaType.application("x-memgraph");
+    MediaType WEBARCHIVE = MediaType.application("x-webarchive");
+    MediaType BPLIST = MediaType.application("x-bplist");
+    MediaType ITUNES = MediaType.application("x-itunes-bplist");
+
+    /**
+     * @param input    input stream must support mark/reset; may be null
+     * @param metadata input metadata for the document (not read by this method)
+     * @return OCTET_STREAM if input is null or lacks the "bplist" magic, otherwise a bplist (sub)type
+     * @throws IOException if the root object cannot be parsed, or if resetting/reading the stream fails
+     */
+    @Override
+    public MediaType detect(InputStream input, Metadata metadata) throws IOException {
+        if (input == null) {
+            return MediaType.OCTET_STREAM;
+        }
+        input.mark(8); //so the 8-byte header probe below can be rewound
+        byte[] bytes = new byte[8];
+
+        try {
+            int read = IOUtils.readFully(input, bytes);
+            if (read < 6) { //fewer bytes than the 6-byte "bplist" magic
+                return MediaType.OCTET_STREAM;
+            }
+        } catch (IOException e) { //an unreadable header is reported as "not a bplist"
+            return MediaType.OCTET_STREAM;
+        } finally {
+            input.reset(); //rewind the header probe on every path
+        }
+
+        int i = 0;
+        if (bytes[i++] != 'b' || bytes[i++] != 'p'
+                || bytes[i++] != 'l' || bytes[i++] != 'i'
+                || bytes[i++] != 's' || bytes[i++] != 't') {
+            return MediaType.OCTET_STREAM;
+        }
+        //TODO: extract the version with the next two bytes if they were read
+        NSObject rootObj = null;
+        try {
+            if (input instanceof TikaInputStream && ((TikaInputStream) input).hasFile()) {
+                rootObj = PropertyListParser.parse(((TikaInputStream) input).getFile());
+            } else {
+                rootObj = PropertyListParser.parse(input); //NOTE(review): no mark/reset around this parse -- confirm callers tolerate it
+            }
+            if (input instanceof TikaInputStream) {
+                ((TikaInputStream) input).setOpenContainer(rootObj); //BPListParser reads this back via getOpenContainer()
+            }
+        } catch (PropertyListFormatException | ParseException | ParserConfigurationException | SAXException e) {
+            throw new IOExceptionWithCause("problem parsing root", e);
+        }
+        if (rootObj instanceof NSDictionary) {
+            return detectOnKeys(((NSDictionary) rootObj).getHashMap().keySet());
+        }
+        return BPLIST; //root is not a dictionary: no subtype can be inferred
+    }
+
+    private MediaType detectOnKeys(Set<String> keySet) { //maps top-level dictionary keys to a subtype
+        if (keySet.contains("nodes") && keySet.contains("edges")
+                && keySet.contains("graphEncodingVersion")) {
+            return MEMGRAPH;
+        } else if (keySet.contains("WebMainResource") //&& keySet.contains("WebSubresources") should we require this?
+        ) {
+            return WEBARCHIVE;
+        } else if (keySet.contains("Playlists") && keySet.contains("Tracks")
+                && keySet.contains("Music Folder")) {
+            return ITUNES;
+        } //if it contains $archiver and $objects, it is a bplist inside a webarchive
+
+        return BPLIST; //no known key signature matched
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/apple/PListParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/apple/BPListParser.java
similarity index 84%
rename from tika-parsers/src/main/java/org/apache/tika/parser/apple/PListParser.java
rename to tika-parsers/src/main/java/org/apache/tika/parser/apple/BPListParser.java
index 5d4cc3e..29d0fb9 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/apple/PListParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/apple/BPListParser.java
@@ -26,6 +26,7 @@ import com.dd.plist.NSSet;
 import com.dd.plist.NSString;
 import com.dd.plist.PropertyListFormatException;
 import com.dd.plist.PropertyListParser;
+import com.dd.plist.UID;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
@@ -54,7 +55,7 @@ import java.util.Set;
  * Parser for Apple's plist and bplist.  This is a wrapper around
  *       com.googlecode.plist:dd-plist
  */
-public class PListParser extends AbstractParser {
+public class BPListParser extends AbstractParser {
 
     private static final String ARR = "array";
     private static final String DATA = "data";
@@ -65,6 +66,7 @@ public class PListParser extends AbstractParser {
     private static final String PLIST = "plist";
     private static final String SET = "set";
     private static final String STRING = "string";
+    private static final String UID = "uid";
 
 
     private static final Set<MediaType> SUPPORTED_TYPES =
@@ -82,14 +84,22 @@ public class PListParser extends AbstractParser {
                 EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
         DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US);
         NSObject rootObj = null;
-        try {
-            if (stream instanceof TikaInputStream && ((TikaInputStream) stream).hasFile()) {
-                rootObj = PropertyListParser.parse(((TikaInputStream) stream).getFile());
-            } else {
-                rootObj = PropertyListParser.parse(stream);
+        //if this already went through the BPListDetector,
+        //there should be an NSObject in the open container
+        if (stream instanceof TikaInputStream) {
+            rootObj = (NSObject) ((TikaInputStream)stream).getOpenContainer();
+        }
+
+        if (rootObj == null) {
+            try {
+                if (stream instanceof TikaInputStream && ((TikaInputStream) stream).hasFile()) {
+                    rootObj = PropertyListParser.parse(((TikaInputStream) stream).getFile());
+                } else {
+                    rootObj = PropertyListParser.parse(stream);
+                }
+            } catch (PropertyListFormatException | ParseException | ParserConfigurationException e) {
+                throw new TikaException("problem parsing root", e);
             }
-        } catch (PropertyListFormatException|ParseException|ParserConfigurationException e) {
-            throw new TikaException("problem parsing root", e);
         }
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         State state = new State(xhtml, metadata, embeddedDocumentExtractor, df);
@@ -133,8 +143,13 @@ public class PListParser extends AbstractParser {
             state.xhtml.startElement(SET);
             parseSet((NSSet)obj, state);
             state.xhtml.endElement(SET);
+        } else if (obj instanceof UID) {
+            //do we want to do anything with obj.getBytes()
+            state.xhtml.element(UID, ((UID)obj).getName());
         } else {
-            throw new UnsupportedOperationException("don't yet support this type of object: "+obj.getClass());
+            throw new UnsupportedOperationException(
+                    "don't yet support this type of object: "+obj.getClass() +
+                    " Please open an issue on our tracker");
         }
     }
 
diff --git a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.detect.Detector b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
index 8a3d85f..5e766c6 100644
--- a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
+++ b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
@@ -16,3 +16,4 @@
 org.apache.tika.detect.OverrideDetector
 org.apache.tika.parser.microsoft.POIFSContainerDetector
 org.apache.tika.parser.pkg.ZipContainerDetector
+org.apache.tika.parser.apple.BPListDetector
diff --git a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index 028de26..ceb1399 100644
--- a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -14,7 +14,7 @@
 #  limitations under the License.
 
 org.apache.tika.parser.apple.AppleSingleFileParser
-org.apache.tika.parser.apple.PListParser
+org.apache.tika.parser.apple.BPListParser
 org.apache.tika.parser.asm.ClassParser
 org.apache.tika.parser.audio.AudioParser
 org.apache.tika.parser.audio.MidiParser
diff --git a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
index 2fa274a..2b4c39a 100644
--- a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
+++ b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
@@ -563,4 +563,11 @@ public class TestContainerAwareDetector extends MultiThreadedTikaTest {
         assertEquals("application/vnd.oasis.opendocument.presentation",
                 metadataList.get(2).get(Metadata.CONTENT_TYPE));
     }
+
+    @Test
+    public void testBPList() throws Exception { //each bplist fixture must be detected as its specific subtype
+        assertTypeByData("testMemgraph.memgraph", "application/x-memgraph");
+        assertTypeByData("testWEBARCHIVE.webarchive", "application/x-webarchive");
+        assertTypeByData("testBPList.bplist", "application/x-itunes-bplist");
+    }
 }
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/apple/PListParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/apple/BPListParserTest.java
similarity index 68%
rename from tika-parsers/src/test/java/org/apache/tika/parser/apple/PListParserTest.java
rename to tika-parsers/src/test/java/org/apache/tika/parser/apple/BPListParserTest.java
index 9d78548..9fad311 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/apple/PListParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/apple/BPListParserTest.java
@@ -17,7 +17,9 @@
 package org.apache.tika.parser.apple;
 
 import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.HttpHeaders;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
 import org.junit.Test;
 
@@ -26,7 +28,7 @@ import java.util.List;
 import static org.junit.Assert.assertEquals;
 
 
-public class PListParserTest extends TikaTest {
+public class BPListParserTest extends TikaTest {
 
     @Test
     public void testBasicBinaryPList() throws Exception {
@@ -35,10 +37,24 @@ public class PListParserTest extends TikaTest {
         List<Metadata> metadataList = getRecursiveMetadata("testBPList.bplist");
         assertEquals(21, metadataList.size());
         Metadata m = metadataList.get(0);
+        assertEquals("application/x-itunes-bplist", m.get(Metadata.CONTENT_TYPE));
         String content = m.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
         assertContains("<key>Application Version</key><string>9.0", content);
 
         //TODO -- bad encoding right after this...smart quote?
         assertContains("<string>90", content);
     }
+
+    @Test
+    public void testWebArchive() throws Exception { //checks type of the first entry and content of the second
+        List<Metadata> metadataList = getRecursiveMetadata("testWEBARCHIVE.webarchive");
+        assertEquals(12, metadataList.size());
+        Metadata m0 = metadataList.get(0); //first entry: must carry the webarchive content type
+        assertEquals("application/x-webarchive", m0.get(Metadata.CONTENT_TYPE));
+        Metadata m1 = metadataList.get(1); //second entry: its extracted content is checked below
+        String content = m1.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
+        assertContains("December 2008: Apache Tika Release", content);
+    }
+
+    //TODO -- add unit tests for memgraph
 }
diff --git a/tika-parsers/src/test/resources/test-documents/testMemgraph.memgraph b/tika-parsers/src/test/resources/test-documents/testMemgraph.memgraph
new file mode 100644
index 0000000..cb7df3e
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testMemgraph.memgraph differ