You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/06/02 12:51:15 UTC
[tika] branch master updated: TIKA-3104 -- fix markup to be closer
to plist xml, add embedded document extractor
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 424161d TIKA-3104 -- fix markup to be closer to plist xml, add embedded document extractor
424161d is described below
commit 424161d7cbe3d3615324f689db964b9b0c1f89a8
Author: tallison <ta...@apache.org>
AuthorDate: Tue Jun 2 08:50:54 2020 -0400
TIKA-3104 -- fix markup to be closer to plist xml, add embedded document extractor
---
.../org/apache/tika/parser/apple/PListParser.java | 116 ++++++++++++++++-----
.../apache/tika/parser/apple/PListParserTest.java | 8 +-
2 files changed, 99 insertions(+), 25 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/apple/PListParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/apple/PListParser.java
index ff56efe..643a611 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/apple/PListParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/apple/PListParser.java
@@ -26,8 +26,9 @@ import com.dd.plist.NSSet;
import com.dd.plist.NSString;
import com.dd.plist.PropertyListFormatException;
import com.dd.plist.PropertyListParser;
-import com.lexicalscope.jewelcli.internal.cglib.asm.$MethodAdapter;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -41,30 +42,44 @@ import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.io.InputStream;
+import java.text.DateFormat;
import java.text.ParseException;
+import java.text.SimpleDateFormat;
import java.util.Collections;
import java.util.Map;
import java.util.Set;
/**
* Parser for Apple's plist and bplist. This is a wrapper around
- * <groupId>com.googlecode.plist</groupId>
- * <artifactId>dd-plist</artifactId>
- * <version>1.23</version>
+ * com.googlecode.plist:dd-plist
*/
public class PListParser extends AbstractParser {
+ private static final String ARR = "array";
+ private static final String DATA = "data";
+ private static final String DATE = "date";
+ private static final String DICT = "dict";
+ private static final String KEY = "key";
+ private static final String NUMBER = "number";
+ private static final String PLIST = "plist";
+ private static final String SET = "set";
+ private static final String STRING = "string";
+
+
private static final Set<MediaType> SUPPORTED_TYPES =
Collections.singleton(MediaType.application("x-bplist"));
-
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
@Override
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+ ParseContext context) throws IOException, SAXException, TikaException {
+ EmbeddedDocumentExtractor embeddedDocumentExtractor =
+ EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+ DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ");
NSObject rootObj = null;
try {
if (stream instanceof TikaInputStream && ((TikaInputStream) stream).hasFile()) {
@@ -76,47 +91,100 @@ public class PListParser extends AbstractParser {
throw new TikaException("problem parsing root", e);
}
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ State state = new State(xhtml, metadata, embeddedDocumentExtractor, df);
xhtml.startDocument();
- parseObject(rootObj, xhtml, metadata);
+ xhtml.startElement(PLIST);
+ parseObject(rootObj, state);
+ xhtml.endElement(PLIST);
xhtml.endDocument();
}
- private void parseObject(NSObject obj, XHTMLContentHandler handler, Metadata metadata)
- throws SAXException {
+ private void parseObject(NSObject obj, State state)
+ throws SAXException, IOException {
if (obj instanceof NSDictionary) {
- parseDict((NSDictionary)obj, handler, metadata);
+ parseDict((NSDictionary)obj, state);
} else if (obj instanceof NSArray) {
NSArray nsArray = (NSArray)obj;
+ state.xhtml.startElement(ARR);
for (NSObject child : nsArray.getArray()) {
- parseObject(child, handler, metadata);
+ parseObject(child, state);
}
+ state.xhtml.endElement(ARR);
} else if (obj instanceof NSString) {
- handler.characters(((NSString)obj).toString());
+ state.xhtml.startElement(STRING);
+ state.xhtml.characters(((NSString)obj).getContent());
+ state.xhtml.endElement(STRING);
} else if (obj instanceof NSNumber) {
- handler.characters(((NSNumber) obj).toString());
+ state.xhtml.startElement(NUMBER);
+ state.xhtml.characters(((NSNumber) obj).toString());
+ state.xhtml.endElement(NUMBER);
} else if (obj instanceof NSData) {
- handleData((NSData) obj, handler, metadata);
+ state.xhtml.startElement(DATA);
+ handleData((NSData) obj, state);
+ state.xhtml.endElement(DATA);
} else if (obj instanceof NSDate) {
- handler.characters(((NSDate)obj).toString());
- } else{
- throw new UnsupportedOperationException("don't know baout: "+obj.getClass());
+ state.xhtml.startElement(DATE);
+ String dateString = state.dateFormat.format(((NSDate)obj).getDate());
+ state.xhtml.characters(dateString);
+ state.xhtml.endElement(DATE);
+ } else if (obj instanceof NSSet) {
+ state.xhtml.startElement(SET);
+ parseSet((NSSet)obj, state);
+ state.xhtml.endElement(SET);
+ } else {
+ throw new UnsupportedOperationException("don't yet support this type of object: "+obj.getClass());
+ }
+ }
+ private void parseSet(NSSet obj, State state)
+ throws SAXException, IOException {
+ state.xhtml.startElement(SET);
+ for (NSObject child : obj.allObjects()) {
+ parseObject(child, state);
}
+ state.xhtml.endElement(SET);
}
- private void parseDict(NSDictionary obj, XHTMLContentHandler xhtml, Metadata metadata) throws SAXException {
+ private void parseDict(NSDictionary obj, State state)
+ throws SAXException, IOException {
+ state.xhtml.startElement(DICT);
for (Map.Entry<String, NSObject> mapEntry : obj.getHashMap().entrySet()) {
String key = mapEntry.getKey();
NSObject value = mapEntry.getValue();
- xhtml.startElement("div", "class", key);
- parseObject(value, xhtml, metadata);
- xhtml.endElement("div");
+ state.xhtml.element(KEY, key);
+ parseObject(value, state);
+ }
+ state.xhtml.endElement(DICT);
+ }
+
+ private void handleData(NSData value, State state) throws IOException,
+ SAXException {
+ state.xhtml.characters(value.getBase64EncodedData());
+ Metadata embeddedMetadata = new Metadata();
+ if (! state.embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
+ return;
+ }
+
+ try (TikaInputStream tis = TikaInputStream.get(value.bytes())) {
+ state.embeddedDocumentExtractor.parseEmbedded(tis, state.xhtml, embeddedMetadata, false);
}
}
- private void handleData(NSData value, XHTMLContentHandler handler, Metadata metadata) {
- byte[] bytes = value.bytes();
- //TODO handle embedded file
+ private static class State {
+ final XHTMLContentHandler xhtml;
+ final Metadata metadata;
+ final EmbeddedDocumentExtractor embeddedDocumentExtractor;
+ final DateFormat dateFormat;
+
+ public State(XHTMLContentHandler xhtml,
+ Metadata metadata,
+ EmbeddedDocumentExtractor embeddedDocumentExtractor,
+ DateFormat df) {
+ this.xhtml = xhtml;
+ this.metadata = metadata;
+ this.embeddedDocumentExtractor = embeddedDocumentExtractor;
+ this.dateFormat = df;
+ }
}
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/apple/PListParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/apple/PListParserTest.java
index 534f65b..9d78548 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/apple/PListParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/apple/PListParserTest.java
@@ -23,6 +23,8 @@ import org.junit.Test;
import java.util.List;
+import static org.junit.Assert.assertEquals;
+
public class PListParserTest extends TikaTest {
@@ -31,8 +33,12 @@ public class PListParserTest extends TikaTest {
//test file is MIT licensed:
// https://github.com/joeferner/node-bplist-parser/blob/master/test/iTunes-small.bplist
List<Metadata> metadataList = getRecursiveMetadata("testBPList.bplist");
+ assertEquals(21, metadataList.size());
Metadata m = metadataList.get(0);
String content = m.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
- assertContains("<div class=\"Application Version\">9.0.3</div>", content);
+ assertContains("<key>Application Version</key><string>9.0", content);
+
+ //TODO -- bad encoding right after this...smart quote?
+ assertContains("<string>90", content);
}
}