You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/06/25 21:03:00 UTC

[tika] branch master updated: TIKA-3104 -- add detection and parsing for xml based plist files

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new 62fe4ad  TIKA-3104 -- add detection and parsing for xml based plist files
62fe4ad is described below

commit 62fe4ada48f961ec49a41f6b67f28a64681c9b93
Author: tallison <ta...@apache.org>
AuthorDate: Thu Jun 25 17:02:29 2020 -0400

    TIKA-3104 -- add detection and parsing for xml based plist files
---
 .../org/apache/tika/mime/tika-mimetypes.xml        |  7 +++
 .../apache/tika/parser/apple/BPListDetector.java   | 50 ++++++++++++++++------
 .../apple/{BPListParser.java => PListParser.java}  | 26 +++++++++--
 .../services/org.apache.tika.parser.Parser         |  2 +-
 .../tika/detect/TestContainerAwareDetector.java    |  6 +--
 ...{BPListParserTest.java => PListParserTest.java} |  9 ++--
 6 files changed, 75 insertions(+), 25 deletions(-)

diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index d284718..5dbcf99 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -3308,6 +3308,7 @@
   </mime-type>
 
   <mime-type type="application/x-bplist">
+    <!-- this is the binary version of x-plist -->
     <!-- Check for well-known bplist versions -->
     <magic priority="70">
       <match value="bplist\000\000" type="string" offset="0"/>
@@ -3329,6 +3330,12 @@
     </magic>
   </mime-type>
 
+  <mime-type type="application/x-plist">
+    <!-- this is the xml version of x-plist -->
+    <root-XML localName="plist"/>
+    <sub-class-of type="application/xml"/>
+  </mime-type>
+
   <mime-type type="application/x-gtar">
     <_comment>GNU tar Compressed File Archive (GNU Tape Archive)</_comment>
     <magic priority="50">
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/apple/BPListDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/apple/BPListDetector.java
index 6631fa7..24b6bb8 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/apple/BPListDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/apple/BPListDetector.java
@@ -32,19 +32,41 @@ import javax.xml.parsers.ParserConfigurationException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.text.ParseException;
+import java.util.HashMap;
+import java.util.Map;
 import java.util.Set;
 
 /**
- * Parser that wraps com.dd.plist's PList parser to handle
- * binary property lists
+ * Detector for BPList with utility functions for PList.
+ *
+ * Without significant refactoring, this can't easily work as a true
+ * detector on plist subtypes.  Rather, for now, we require the file to be
+ * parsed and then the parser adds the subtype for xml-based plists.
+ * @since 1.25
  */
 public class BPListDetector implements Detector {
 
+    //xml versions
+    static MediaType MEMGRAPH = MediaType.application("x-plist-memgraph");
+    static MediaType WEBARCHIVE = MediaType.application("x-plist-webarchive");
+    static MediaType PLIST = MediaType.application("x-plist");
+    static MediaType ITUNES = MediaType.application("x-plist-itunes");
+
+
+    //binary versions
+    static MediaType BMEMGRAPH = MediaType.application("x-bplist-memgraph");
+    static MediaType BWEBARCHIVE = MediaType.application("x-bplist-webarchive");
+    static MediaType BPLIST = MediaType.application("x-bplist");
+    static MediaType BITUNES = MediaType.application("x-bplist-itunes");
+
+    private static Map<MediaType, MediaType> BINARY_TO_XML = new HashMap<>();
 
-    MediaType MEMGRAPH = MediaType.application("x-memgraph");
-    MediaType WEBARCHIVE = MediaType.application("x-webarchive");
-    MediaType BPLIST = MediaType.application("x-bplist");
-    MediaType ITUNES = MediaType.application("x-itunes-bplist");
+    static {
+        BINARY_TO_XML.put(BMEMGRAPH, MEMGRAPH);
+        BINARY_TO_XML.put(BWEBARCHIVE, WEBARCHIVE);
+        BINARY_TO_XML.put(BPLIST, PLIST);
+        BINARY_TO_XML.put(BITUNES, ITUNES);
+    }
 
     /**
      * @param input    input stream must support reset
@@ -97,18 +119,20 @@ public class BPListDetector implements Detector {
         return BPLIST;
     }
 
-    private MediaType detectOnKeys(Set<String> keySet) {
+    static MediaType detectOnKeys(Set<String> keySet) {
         if (keySet.contains("nodes") && keySet.contains("edges")
                 && keySet.contains("graphEncodingVersion")) {
-            return MEMGRAPH;
-        } else if (keySet.contains("WebMainResource") //&& keySet.contains("WebSubresources") should we require this?
-        ) {
-            return WEBARCHIVE;
+            return BMEMGRAPH;
+        } else if (keySet.contains("WebMainResource")){ //&& keySet.contains("WebSubresources") should we require this?
+            return BWEBARCHIVE;
         } else if (keySet.contains("Playlists") && keySet.contains("Tracks")
                 && keySet.contains("Music Folder")) {
-            return ITUNES;
+            return BITUNES;
         } //if it contains $archiver and $objects, it is a bplist inside a webarchive
-
         return BPLIST;
     }
+
+    static MediaType detectXMLOnKeys(Set<String> keySet) {
+        return BINARY_TO_XML.get(detectOnKeys(keySet));
+    }
 }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/apple/BPListParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/apple/PListParser.java
similarity index 89%
rename from tika-parsers/src/main/java/org/apache/tika/parser/apple/BPListParser.java
rename to tika-parsers/src/main/java/org/apache/tika/parser/apple/PListParser.java
index 29d0fb9..a05f03c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/apple/BPListParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/apple/PListParser.java
@@ -46,16 +46,23 @@ import java.io.InputStream;
 import java.text.DateFormat;
 import java.text.ParseException;
 import java.text.SimpleDateFormat;
+import java.util.Arrays;
 import java.util.Collections;
+import java.util.HashSet;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
 
 /**
- * Parser for Apple's plist and bplist.  This is a wrapper around
+ * Parser for Apple's plist and bplist. This is a wrapper around
  *       com.googlecode.plist:dd-plist
+ *
+ * As of 1.25, Tika does not have detection for the text-based plist,
+ * so those files will not be directed to this parser.
+ *
+ * @since 1.25
  */
-public class BPListParser extends AbstractParser {
+public class PListParser extends AbstractParser {
 
     private static final String ARR = "array";
     private static final String DATA = "data";
@@ -70,7 +77,13 @@ public class BPListParser extends AbstractParser {
 
 
     private static final Set<MediaType> SUPPORTED_TYPES =
-            Collections.singleton(MediaType.application("x-bplist"));
+            Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
+                    BPListDetector.BITUNES,
+                    BPListDetector.BMEMGRAPH,
+                    BPListDetector.BPLIST,
+                    BPListDetector.BWEBARCHIVE,
+                    BPListDetector.PLIST)));
+
     @Override
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         return SUPPORTED_TYPES;
@@ -101,6 +114,13 @@ public class BPListParser extends AbstractParser {
                 throw new TikaException("problem parsing root", e);
             }
         }
+        String contentType = metadata.get(Metadata.CONTENT_TYPE);
+        if (BPListDetector.PLIST.toString().equals(contentType)) {
+            if (rootObj instanceof NSDictionary) {
+                MediaType subtype = BPListDetector.detectXMLOnKeys(((NSDictionary) rootObj).keySet());
+                metadata.set(Metadata.CONTENT_TYPE, subtype.toString());
+            }
+        }
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         State state = new State(xhtml, metadata, embeddedDocumentExtractor, df);
         xhtml.startDocument();
diff --git a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index 7725f8c..72595d0 100644
--- a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -14,7 +14,7 @@
 #  limitations under the License.
 
 org.apache.tika.parser.apple.AppleSingleFileParser
-org.apache.tika.parser.apple.BPListParser
+org.apache.tika.parser.apple.PListParser
 org.apache.tika.parser.asm.ClassParser
 org.apache.tika.parser.audio.AudioParser
 org.apache.tika.parser.audio.MidiParser
diff --git a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
index 7e197a7..18d8697 100644
--- a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
+++ b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
@@ -567,8 +567,8 @@ public class TestContainerAwareDetector extends MultiThreadedTikaTest {
 
     @Test
     public void testBPList() throws Exception {
-        assertTypeByData("testMemgraph.memgraph", "application/x-memgraph");
-        assertTypeByData("testWEBARCHIVE.webarchive", "application/x-webarchive");
-        assertTypeByData("testBPList.bplist", "application/x-itunes-bplist");
+        assertTypeByData("testMemgraph.memgraph", "application/x-bplist-memgraph");
+        assertTypeByData("testWEBARCHIVE.webarchive", "application/x-bplist-webarchive");
+        assertTypeByData("testBPList.bplist", "application/x-bplist-itunes");
     }
 }
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/apple/BPListParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/apple/PListParserTest.java
similarity index 88%
rename from tika-parsers/src/test/java/org/apache/tika/parser/apple/BPListParserTest.java
rename to tika-parsers/src/test/java/org/apache/tika/parser/apple/PListParserTest.java
index 9fad311..9c60d5b 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/apple/BPListParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/apple/PListParserTest.java
@@ -17,9 +17,7 @@
 package org.apache.tika.parser.apple;
 
 import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.HttpHeaders;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
 import org.junit.Test;
 
@@ -28,7 +26,7 @@ import java.util.List;
 import static org.junit.Assert.assertEquals;
 
 
-public class BPListParserTest extends TikaTest {
+public class PListParserTest extends TikaTest {
 
     @Test
     public void testBasicBinaryPList() throws Exception {
@@ -37,7 +35,7 @@ public class BPListParserTest extends TikaTest {
         List<Metadata> metadataList = getRecursiveMetadata("testBPList.bplist");
         assertEquals(21, metadataList.size());
         Metadata m = metadataList.get(0);
-        assertEquals("application/x-itunes-bplist", m.get(Metadata.CONTENT_TYPE));
+        assertEquals("application/x-bplist-itunes", m.get(Metadata.CONTENT_TYPE));
         String content = m.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
         assertContains("<key>Application Version</key><string>9.0", content);
 
@@ -50,11 +48,12 @@ public class BPListParserTest extends TikaTest {
         List<Metadata> metadataList = getRecursiveMetadata("testWEBARCHIVE.webarchive");
         assertEquals(12, metadataList.size());
         Metadata m0 = metadataList.get(0);
-        assertEquals("application/x-webarchive", m0.get(Metadata.CONTENT_TYPE));
+        assertEquals("application/x-bplist-webarchive", m0.get(Metadata.CONTENT_TYPE));
         Metadata m1 = metadataList.get(1);
         String content = m1.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
         assertContains("December 2008: Apache Tika Release", content);
     }
 
     //TODO -- add unit tests for memgraph
+    //TODO -- convert existing unit tests to xml plist and add unit tests.
 }