You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/06/25 21:03:00 UTC
[tika] branch master updated: TIKA-3104 -- add detection and
parsing for xml based plist files
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 62fe4ad TIKA-3104 -- add detection and parsing for xml based plist files
62fe4ad is described below
commit 62fe4ada48f961ec49a41f6b67f28a64681c9b93
Author: tallison <ta...@apache.org>
AuthorDate: Thu Jun 25 17:02:29 2020 -0400
TIKA-3104 -- add detection and parsing for xml based plist files
---
.../org/apache/tika/mime/tika-mimetypes.xml | 7 +++
.../apache/tika/parser/apple/BPListDetector.java | 50 ++++++++++++++++------
.../apple/{BPListParser.java => PListParser.java} | 26 +++++++++--
.../services/org.apache.tika.parser.Parser | 2 +-
.../tika/detect/TestContainerAwareDetector.java | 6 +--
...{BPListParserTest.java => PListParserTest.java} | 9 ++--
6 files changed, 75 insertions(+), 25 deletions(-)
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index d284718..5dbcf99 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -3308,6 +3308,7 @@
</mime-type>
<mime-type type="application/x-bplist">
+ <!-- this is the binary version of x-plist -->
<!-- Check for well-known bplist versions -->
<magic priority="70">
<match value="bplist\000\000" type="string" offset="0"/>
@@ -3329,6 +3330,12 @@
</magic>
</mime-type>
+ <mime-type type="application/x-plist">
+ <!-- this is the xml version of x-plist -->
+ <root-XML localName="plist"/>
+ <sub-class-of type="application/xml"/>
+ </mime-type>
+
<mime-type type="application/x-gtar">
<_comment>GNU tar Compressed File Archive (GNU Tape Archive)</_comment>
<magic priority="50">
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/apple/BPListDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/apple/BPListDetector.java
index 6631fa7..24b6bb8 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/apple/BPListDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/apple/BPListDetector.java
@@ -32,19 +32,41 @@ import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
+import java.util.HashMap;
+import java.util.Map;
import java.util.Set;
/**
- * Parser that wraps com.dd.plist's PList parser to handle
- * binary property lists
+ * Detector for BPList with utility functions for PList.
+ *
+ * Without significant refactoring, this can't easily work as a true
+ * detector on plist subtypes. Rather, for now, we require the file to be
+ * parsed and then the parser adds the subtype for xml-based plists.
+ * @since 1.25
*/
public class BPListDetector implements Detector {
+ //xml versions
+ static MediaType MEMGRAPH = MediaType.application("x-plist-memgraph");
+ static MediaType WEBARCHIVE = MediaType.application("x-plist-webarchive");
+ static MediaType PLIST = MediaType.application("x-plist");
+ static MediaType ITUNES = MediaType.application("x-plist-itunes");
+
+
+ //binary versions
+ static MediaType BMEMGRAPH = MediaType.application("x-bplist-memgraph");
+ static MediaType BWEBARCHIVE = MediaType.application("x-bplist-webarchive");
+ static MediaType BPLIST = MediaType.application("x-bplist");
+ static MediaType BITUNES = MediaType.application("x-bplist-itunes");
+
+ private static Map<MediaType, MediaType> BINARY_TO_XML = new HashMap<>();
- MediaType MEMGRAPH = MediaType.application("x-memgraph");
- MediaType WEBARCHIVE = MediaType.application("x-webarchive");
- MediaType BPLIST = MediaType.application("x-bplist");
- MediaType ITUNES = MediaType.application("x-itunes-bplist");
+ static {
+ BINARY_TO_XML.put(BMEMGRAPH, MEMGRAPH);
+ BINARY_TO_XML.put(BWEBARCHIVE, WEBARCHIVE);
+ BINARY_TO_XML.put(BPLIST, PLIST);
+ BINARY_TO_XML.put(BITUNES, ITUNES);
+ }
/**
* @param input input stream must support reset
@@ -97,18 +119,20 @@ public class BPListDetector implements Detector {
return BPLIST;
}
- private MediaType detectOnKeys(Set<String> keySet) {
+ static MediaType detectOnKeys(Set<String> keySet) {
if (keySet.contains("nodes") && keySet.contains("edges")
&& keySet.contains("graphEncodingVersion")) {
- return MEMGRAPH;
- } else if (keySet.contains("WebMainResource") //&& keySet.contains("WebSubresources") should we require this?
- ) {
- return WEBARCHIVE;
+ return BMEMGRAPH;
+ } else if (keySet.contains("WebMainResource")){ //&& keySet.contains("WebSubresources") should we require this?
+ return BWEBARCHIVE;
} else if (keySet.contains("Playlists") && keySet.contains("Tracks")
&& keySet.contains("Music Folder")) {
- return ITUNES;
+ return BITUNES;
} //if it contains $archiver and $objects, it is a bplist inside a webarchive
-
return BPLIST;
}
+
+ static MediaType detectXMLOnKeys(Set<String> keySet) {
+ return BINARY_TO_XML.get(detectOnKeys(keySet));
+ }
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/apple/BPListParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/apple/PListParser.java
similarity index 89%
rename from tika-parsers/src/main/java/org/apache/tika/parser/apple/BPListParser.java
rename to tika-parsers/src/main/java/org/apache/tika/parser/apple/PListParser.java
index 29d0fb9..a05f03c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/apple/BPListParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/apple/PListParser.java
@@ -46,16 +46,23 @@ import java.io.InputStream;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
+import java.util.Arrays;
import java.util.Collections;
+import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
/**
- * Parser for Apple's plist and bplist. This is a wrapper around
+ * Parser for Apple's plist and bplist. This is a wrapper around
* com.googlecode.plist:dd-plist
+ *
+ * As of 1.25, Tika does not have detection for the text based plist,
+ * so those files will not be directed to this parser
+ *
+ * @since 1.25
*/
-public class BPListParser extends AbstractParser {
+public class PListParser extends AbstractParser {
private static final String ARR = "array";
private static final String DATA = "data";
@@ -70,7 +77,13 @@ public class BPListParser extends AbstractParser {
private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.singleton(MediaType.application("x-bplist"));
+ Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
+ BPListDetector.BITUNES,
+ BPListDetector.BMEMGRAPH,
+ BPListDetector.BPLIST,
+ BPListDetector.BWEBARCHIVE,
+ BPListDetector.PLIST)));
+
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
@@ -101,6 +114,13 @@ public class BPListParser extends AbstractParser {
throw new TikaException("problem parsing root", e);
}
}
+ String contentType = metadata.get(Metadata.CONTENT_TYPE);
+ if (BPListDetector.PLIST.toString().equals(contentType)) {
+ if (rootObj instanceof NSDictionary) {
+ MediaType subtype = BPListDetector.detectXMLOnKeys(((NSDictionary) rootObj).keySet());
+ metadata.set(Metadata.CONTENT_TYPE, subtype.toString());
+ }
+ }
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
State state = new State(xhtml, metadata, embeddedDocumentExtractor, df);
xhtml.startDocument();
diff --git a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index 7725f8c..72595d0 100644
--- a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -14,7 +14,7 @@
# limitations under the License.
org.apache.tika.parser.apple.AppleSingleFileParser
-org.apache.tika.parser.apple.BPListParser
+org.apache.tika.parser.apple.PListParser
org.apache.tika.parser.asm.ClassParser
org.apache.tika.parser.audio.AudioParser
org.apache.tika.parser.audio.MidiParser
diff --git a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
index 7e197a7..18d8697 100644
--- a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
+++ b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
@@ -567,8 +567,8 @@ public class TestContainerAwareDetector extends MultiThreadedTikaTest {
@Test
public void testBPList() throws Exception {
- assertTypeByData("testMemgraph.memgraph", "application/x-memgraph");
- assertTypeByData("testWEBARCHIVE.webarchive", "application/x-webarchive");
- assertTypeByData("testBPList.bplist", "application/x-itunes-bplist");
+ assertTypeByData("testMemgraph.memgraph", "application/x-bplist-memgraph");
+ assertTypeByData("testWEBARCHIVE.webarchive", "application/x-bplist-webarchive");
+ assertTypeByData("testBPList.bplist", "application/x-bplist-itunes");
}
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/apple/BPListParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/apple/PListParserTest.java
similarity index 88%
rename from tika-parsers/src/test/java/org/apache/tika/parser/apple/BPListParserTest.java
rename to tika-parsers/src/test/java/org/apache/tika/parser/apple/PListParserTest.java
index 9fad311..9c60d5b 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/apple/BPListParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/apple/PListParserTest.java
@@ -17,9 +17,7 @@
package org.apache.tika.parser.apple;
import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.HttpHeaders;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
import org.junit.Test;
@@ -28,7 +26,7 @@ import java.util.List;
import static org.junit.Assert.assertEquals;
-public class BPListParserTest extends TikaTest {
+public class PListParserTest extends TikaTest {
@Test
public void testBasicBinaryPList() throws Exception {
@@ -37,7 +35,7 @@ public class BPListParserTest extends TikaTest {
List<Metadata> metadataList = getRecursiveMetadata("testBPList.bplist");
assertEquals(21, metadataList.size());
Metadata m = metadataList.get(0);
- assertEquals("application/x-itunes-bplist", m.get(Metadata.CONTENT_TYPE));
+ assertEquals("application/x-bplist-itunes", m.get(Metadata.CONTENT_TYPE));
String content = m.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
assertContains("<key>Application Version</key><string>9.0", content);
@@ -50,11 +48,12 @@ public class BPListParserTest extends TikaTest {
List<Metadata> metadataList = getRecursiveMetadata("testWEBARCHIVE.webarchive");
assertEquals(12, metadataList.size());
Metadata m0 = metadataList.get(0);
- assertEquals("application/x-webarchive", m0.get(Metadata.CONTENT_TYPE));
+ assertEquals("application/x-bplist-webarchive", m0.get(Metadata.CONTENT_TYPE));
Metadata m1 = metadataList.get(1);
String content = m1.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
assertContains("December 2008: Apache Tika Release", content);
}
//TODO -- add unit tests for memgraph
+ //TODO -- convert existing unit tests to xml plist and add unit tests.
}