You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/06/01 20:44:09 UTC

[tika] branch master updated: TIKA-3104 -- addition of rudimentary bplist parser

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new 48884e5  TIKA-3104 -- addition of rudimentary bplist parser
48884e5 is described below

commit 48884e597aac219e5e97635f5297bb0d005e607f
Author: tallison <ta...@apache.org>
AuthorDate: Mon Jun 1 16:43:09 2020 -0400

    TIKA-3104 -- addition of rudimentary bplist parser
---
 CHANGES.txt                                        |   4 +
 LICENSE.txt                                        |  22 ++++
 tika-bundle/pom.xml                                |   2 +
 tika-parsers/pom.xml                               |   5 +
 .../org/apache/tika/parser/apple/PListParser.java  | 122 +++++++++++++++++++++
 .../services/org.apache.tika.parser.Parser         |   1 +
 .../apache/tika/parser/apple/PListParserTest.java  |  38 +++++++
 .../resources/test-documents/testBPList.bplist     | Bin 0 -> 24433 bytes
 8 files changed, 194 insertions(+)

diff --git a/CHANGES.txt b/CHANGES.txt
index 19d1985..fe236dd 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -7,6 +7,10 @@ Release 2.0.0 - ???
 
 Release 1.25 - ???
 
+   * Add a basic parser for binary plist (bplist) files based on com.googlecode.plist:dd-plist (TIKA-3104).
+
+Release 1.24.1 - 4/17/2020
+
    * Allow gzip compression of input and output streams for tika-server (TIKA-3073).
 
 Release 1.24 - 3/11/2019
diff --git a/LICENSE.txt b/LICENSE.txt
index e998546..17ea384 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -437,3 +437,25 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
+
+com.googlecode.plist:dd-plist
+dd-plist - An open source library to parse and generate property lists
+Copyright (C) 2016 Daniel Dreibrodt
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/tika-bundle/pom.xml b/tika-bundle/pom.xml
index 81ae4e3..0c47cd1 100644
--- a/tika-bundle/pom.xml
+++ b/tika-bundle/pom.xml
@@ -199,6 +199,7 @@
               commons-io|
               commons-exec|
               commons-collections4|
+              dd-plist|
               junrar|
               pdfbox|
               pdfbox-tools|
@@ -280,6 +281,7 @@
               com.adobe.xmp;resolution:=optional,
               com.adobe.xmp.impl;resolution:=optional,
               com.adobe.xmp.options;resolution:=optional,
+              com.dd.plist;resolution:=optional,
               com.adobe.xmp.properties;resolution:=optional,
               com.github.luben.zstd;resolution:=optional,
               com.github.openjson;resolution:=optional,
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index fa65e78..932964e 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -160,6 +160,11 @@
       <version>${mime4j.version}</version>
     </dependency>
     <dependency>
+      <groupId>com.googlecode.plist</groupId>
+      <artifactId>dd-plist</artifactId>
+      <version>1.23</version>
+    </dependency>
+    <dependency>
       <groupId>org.apache.commons</groupId>
       <artifactId>commons-compress</artifactId>
       <version>${commons.compress.version}</version>
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/apple/PListParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/apple/PListParser.java
new file mode 100644
index 0000000..ff56efe
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/apple/PListParser.java
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.apple;
+
+import com.dd.plist.NSArray;
+import com.dd.plist.NSData;
+import com.dd.plist.NSDate;
+import com.dd.plist.NSDictionary;
+import com.dd.plist.NSNumber;
+import com.dd.plist.NSObject;
+import com.dd.plist.NSSet;
+import com.dd.plist.NSString;
+import com.dd.plist.PropertyListFormatException;
+import com.dd.plist.PropertyListParser;
+import com.lexicalscope.jewelcli.internal.cglib.asm.$MethodAdapter;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+
+import javax.xml.parsers.ParserConfigurationException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.text.ParseException;
+import java.util.Collections;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Parser for Apple's plist and bplist.  This is a wrapper around
+ *       <groupId>com.googlecode.plist</groupId>
+ *       <artifactId>dd-plist</artifactId>
+ *       <version>1.23</version>
+ */
+public class PListParser extends AbstractParser {
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MediaType.application("x-bplist"));
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
+
+        NSObject rootObj = null;
+        try {
+            if (stream instanceof TikaInputStream && ((TikaInputStream) stream).hasFile()) {
+                rootObj = PropertyListParser.parse(((TikaInputStream) stream).getFile());
+            } else {
+                rootObj = PropertyListParser.parse(stream);
+            }
+        } catch (PropertyListFormatException|ParseException|ParserConfigurationException e) {
+            throw new TikaException("problem parsing root", e);
+        }
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        parseObject(rootObj, xhtml, metadata);
+        xhtml.endDocument();
+    }
+
+    private void parseObject(NSObject obj, XHTMLContentHandler handler, Metadata metadata)
+            throws SAXException {
+
+        if (obj instanceof NSDictionary) {
+            parseDict((NSDictionary)obj, handler, metadata);
+        } else if (obj instanceof NSArray) {
+            NSArray nsArray = (NSArray)obj;
+            for (NSObject child : nsArray.getArray()) {
+                parseObject(child, handler, metadata);
+            }
+        } else if (obj instanceof NSString) {
+            handler.characters(((NSString)obj).toString());
+        } else if (obj instanceof NSNumber) {
+            handler.characters(((NSNumber) obj).toString());
+        } else if (obj instanceof NSData) {
+            handleData((NSData) obj, handler, metadata);
+        } else if (obj instanceof NSDate) {
+            handler.characters(((NSDate)obj).toString());
+        } else{
+            throw new UnsupportedOperationException("don't know baout: "+obj.getClass());
+
+        }
+    }
+
+    private void parseDict(NSDictionary obj, XHTMLContentHandler xhtml, Metadata metadata) throws SAXException {
+        for (Map.Entry<String, NSObject> mapEntry : obj.getHashMap().entrySet()) {
+            String key = mapEntry.getKey();
+            NSObject value = mapEntry.getValue();
+            xhtml.startElement("div", "class", key);
+            parseObject(value, xhtml, metadata);
+            xhtml.endElement("div");
+        }
+    }
+
+    private void handleData(NSData value, XHTMLContentHandler handler, Metadata metadata) {
+        byte[] bytes = value.bytes();
+        //TODO handle embedded file
+    }
+}
diff --git a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index 79d5f5d..75087ab 100644
--- a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -14,6 +14,7 @@
 #  limitations under the License.
 
 org.apache.tika.parser.apple.AppleSingleFileParser
+org.apache.tika.parser.apple.PListParser
 org.apache.tika.parser.asm.ClassParser
 org.apache.tika.parser.audio.AudioParser
 org.apache.tika.parser.audio.MidiParser
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/apple/PListParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/apple/PListParserTest.java
new file mode 100644
index 0000000..534f65b
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/apple/PListParserTest.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.apple;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
+import org.junit.Test;
+
+import java.util.List;
+
+
+/** Smoke test for {@link PListParser} using a small binary property list fixture. */
+public class PListParserTest extends TikaTest {
+    @Test
+    public void testBasicBinaryPList() throws Exception {
+        // Fixture is MIT licensed and comes from:
+        // https://github.com/joeferner/node-bplist-parser/blob/master/test/iTunes-small.bplist
+        List<Metadata> extracted = getRecursiveMetadata("testBPList.bplist");
+        String xhtml = extracted.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
+        // A dictionary key shows up as the class attribute of the div around its value.
+        assertContains("<div class=\"Application Version\">9.0.3</div>", xhtml);
+    }
+}
diff --git a/tika-parsers/src/test/resources/test-documents/testBPList.bplist b/tika-parsers/src/test/resources/test-documents/testBPList.bplist
new file mode 100644
index 0000000..b7edb14
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testBPList.bplist differ