You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/02/21 15:41:59 UTC
[tika] branch branch_1x updated: TIKA-3045 -- Added XMLProfiler as
an optional parser to profile XFA and XMP in PDFs
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new d5af2cf TIKA-3045 -- Added XMLProfiler as an optional parser to profile XFA and XMP in PDFs
d5af2cf is described below
commit d5af2cf72dd38b9ce10f7beeca94e2922df7a7c3
Author: tallison <ta...@apache.org>
AuthorDate: Fri Feb 21 10:33:05 2020 -0500
TIKA-3045 -- Added XMLProfiler as an optional parser to profile XFA and XMP in PDFs
# Conflicts:
# tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
---
CHANGES.txt | 4 +
.../apache/tika/metadata/TikaCoreProperties.java | 9 +-
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 72 ++++++++++
.../org/apache/tika/parser/xml/XMLProfiler.java | 151 +++++++++++++++++++++
.../org/apache/tika/parser/pdf/PDFParserTest.java | 50 ++++++-
.../tika/parser/pdf/tika-xml-profiler-config.xml | 24 ++++
6 files changed, 305 insertions(+), 5 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 800bf54..6703627 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,8 @@
Release 1.24 - ???
+
+ * Added XMLProfiler as an optional parser to profile XFA and XMP
+ in PDFs (TIKA-3045).
+
* Extract inline images that rely on the DCT filter from PDFs (TIKA-3041).
* Upgrade to PDFBox 2.0.18 (TIKA-3021).
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index 9f78a99..5f63cae 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -54,9 +54,12 @@ public interface TikaCoreProperties {
*
*/
public enum EmbeddedResourceType {
- INLINE,
- ATTACHMENT,
- MACRO
+ INLINE, //image that is intended to be displayed in a rendering of the file
+ ATTACHMENT,//standard attachment as in email
+ MACRO, //any code that is intended to be run by the application
+ METADATA, //embedded metadata streams, e.g. xmp, xfa
+ FONT;//embedded font files
+ //TODO: decide which other resource types are needed
};
/**
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 8f55086..2e58123 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -33,6 +33,7 @@ import java.nio.file.Path;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
+import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.ListIterator;
@@ -94,7 +95,9 @@ import org.apache.tika.metadata.Font;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.PDF;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.ocr.TesseractOCRParser;
import org.apache.tika.sax.EmbeddedContentHandler;
@@ -140,6 +143,9 @@ class AbstractPDF2XHTML extends PDFTextStripper {
private final static TesseractOCRConfig DEFAULT_TESSERACT_CONFIG = new TesseractOCRConfig();
+ //content type set on the metadata of the xfa (XML Forms Architecture) data before it is handed to the embedded document extractor
+ private static final MediaType XFA_MEDIA_TYPE = MediaType.application("vnd.adobe.xdp+xml");
+ //content type set on the metadata of the xmp packet before it is handed to the embedded document extractor
+ private static final MediaType XMP_MEDIA_TYPE = MediaType.application("rdf+xml");
+
/**
* Format used for signature dates
* TODO Make this thread-safe
@@ -189,6 +195,70 @@ class AbstractPDF2XHTML extends PDFTextStripper {
writeParagraphStart();
}
+    /**
+     * Offers the XMP packet and the XFA form data of the document, when present, to the
+     * embedded document extractor as embedded resources of type
+     * {@link TikaCoreProperties.EmbeddedResourceType#METADATA}.
+     * <p>
+     * A resource is only handed over if the extractor agrees to parse it and the parser
+     * registered in the context supports its media type; without such a parser nothing
+     * is extracted.
+     * </p>
+     *
+     * @param pdfDocument    document whose catalog is inspected for xmp and xfa
+     * @param parentMetadata metadata of the PDF; failures to read an embedded stream are recorded here
+     * @param context        parse context that is queried for the embedded {@link Parser}
+     * @throws IOException  if parsing an embedded resource fails with a non-catchable io problem
+     * @throws SAXException if the embedded parse fails while writing to the handler
+     */
+    private void extractXMPXFA(PDDocument pdfDocument, Metadata parentMetadata, ParseContext context) throws IOException, SAXException {
+        //typed empty set (instead of the raw Collections.EMPTY_SET) avoids an unchecked assignment
+        Set<MediaType> supportedTypes = Collections.emptySet();
+        Parser embeddedParser = context.get(Parser.class);
+        if (embeddedParser != null) {
+            supportedTypes = embeddedParser.getSupportedTypes(context);
+        }
+
+        if (pdfDocument.getDocumentCatalog().getMetadata() != null) {
+            Metadata xmpMetadata = new Metadata();
+            xmpMetadata.set(Metadata.CONTENT_TYPE, XMP_MEDIA_TYPE.toString());
+            xmpMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.METADATA.toString());
+            if (embeddedDocumentExtractor.shouldParseEmbedded(xmpMetadata) &&
+                    supportedTypes.contains(XMP_MEDIA_TYPE)) {
+                InputStream is = null;
+                try {
+                    is = pdfDocument.getDocumentCatalog().getMetadata().exportXMPMetadata();
+                } catch (IOException e) {
+                    //a broken xmp stream must not abort the parse of the PDF itself
+                    EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
+                }
+                if (is != null) {
+                    try {
+                        parseMetadata(is, xmpMetadata);
+                    } finally {
+                        org.apache.tika.io.IOUtils.closeQuietly(is);
+                    }
+                }
+            }
+        }
+
+        //now try the xfa
+        if (pdfDocument.getDocumentCatalog().getAcroForm() != null &&
+                pdfDocument.getDocumentCatalog().getAcroForm().getXFA() != null) {
+
+            Metadata xfaMetadata = new Metadata();
+            xfaMetadata.set(Metadata.CONTENT_TYPE, XFA_MEDIA_TYPE.toString());
+            xfaMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.METADATA.toString());
+            if (embeddedDocumentExtractor.shouldParseEmbedded(xfaMetadata) &&
+                    supportedTypes.contains(XFA_MEDIA_TYPE)) {
+                byte[] bytes = null;
+                try {
+                    bytes = pdfDocument.getDocumentCatalog().getAcroForm().getXFA().getBytes();
+                } catch (IOException e) {
+                    //a broken xfa stream must not abort the parse of the PDF itself
+                    EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
+                }
+                if (bytes != null) {
+                    try (InputStream is = new ByteArrayInputStream(bytes)) {
+                        parseMetadata(is, xfaMetadata);
+                    }
+                }
+            }
+        }
+    }
+
+ private void parseMetadata(InputStream stream, Metadata embeddedMetadata) throws IOException, SAXException {
+ try {
+ embeddedDocumentExtractor.parseEmbedded(
+ stream,
+ new EmbeddedContentHandler(xhtml),
+ embeddedMetadata, false);
+ } catch (IOException e) {
+ handleCatchableIOE(e);
+ }
+ }
+
private void extractEmbeddedDocuments(PDDocument document)
throws IOException, SAXException, TikaException {
PDDocumentNameDictionary namesDictionary =
@@ -581,6 +651,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
handleCatchableIOE(e);
}
+ extractXMPXFA(pdf, metadata, context);
+
//extract acroform data at end of doc
if (config.getExtractAcroFormContent() == true) {
try {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/xml/XMLProfiler.java b/tika-parsers/src/main/java/org/apache/tika/parser/xml/XMLProfiler.java
new file mode 100644
index 0000000..30bb8f0
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/xml/XMLProfiler.java
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.TreeSet;
+
+
+/**
+ * Parser that profiles XML rather than extracting its text content.
+ * <p>
+ * It records the qualified name of the root element, the namespace uris of
+ * all elements and, per uri, the local names of the elements found in that
+ * namespace. The uris and the local names are stored in parallel arrays.
+ * </p>
+ * <p>
+ * This parser is not part of the default set of parsers and must be "turned on"
+ * via a tika config:
+ * </p>
+ * <pre>{@code
+ * <properties>
+ *   <parsers>
+ *     <parser class="org.apache.tika.parser.DefaultParser"/>
+ *     <parser class="org.apache.tika.parser.xml.XMLProfiler"/>
+ *   </parsers>
+ * </properties>
+ * }</pre>
+ * <p>
+ * This was initially designed to profile xmp and xfa in PDFs. Further
+ * work would need to be done to extract other types of xml and/or
+ * xmp in other file formats. Please open a ticket.
+ * </p>
+ */
+public class XMLProfiler extends AbstractParser {
+
+    /** Qualified name of the root element. */
+    public static final Property ROOT_ENTITY = Property.internalText("xmlprofiler:root_entity");
+    /** Namespace uris of the elements; parallel to {@link #ENTITY_LOCAL_NAMES}. */
+    public static final Property ENTITY_URIS = Property.internalTextBag("xmlprofiler:entity_uris");
+    /** Space-delimited element local names per uri; parallel to {@link #ENTITY_URIS}. */
+    public static final Property ENTITY_LOCAL_NAMES = Property.internalTextBag("xmlprofiler:entity_local_names");
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
+                    MediaType.application("xml"),
+                    //https://wwwimages2.adobe.com/content/dam/acom/en/devnet/xmp/pdfs/XMP%20SDK%20Release%20cc-2016-08/XMPSpecificationPart3.pdf
+                    //"If a MIME type is needed, use application/rdf+xml."
+                    MediaType.application("rdf+xml"),//xmp
+                    //xfa: https://en.wikipedia.org/wiki/XFA
+                    MediaType.application("vnd.adobe.xdp+xml")
+            )));
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Profiles the xml in {@code stream} and stores the result in {@code metadata}.
+     * Nothing is written to {@code handler}.
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        //shield the stream so that the xml reader cannot close it
+        XMLReaderUtils.parseSAX(
+                new CloseShieldInputStream(stream),
+                new OfflineContentHandler(new XMLProfileHandler(metadata)), context);
+    }
+
+    /**
+     * SAX handler that collects the root element name and, per namespace uri,
+     * the local names of all elements, and writes them to the metadata when
+     * the document ends.
+     */
+    private static class XMLProfileHandler extends DefaultHandler {
+        private final Metadata metadata;
+
+        //sorted map and sorted sets keep the reported order deterministic
+        private final Map<String, Set<String>> entities = new TreeMap<>();
+
+        //a flag instead of an element counter: it cannot overflow on huge documents
+        private boolean rootSeen = false;
+
+        public XMLProfileHandler(Metadata metadata) {
+            this.metadata = metadata;
+        }
+
+        @Override
+        public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+            if (!rootSeen) {
+                //the first element that is started is the root
+                metadata.set(ROOT_ENTITY, qName);
+                rootSeen = true;
+            }
+            Set<String> localNames = entities.get(uri);
+            if (localNames == null) {
+                localNames = new TreeSet<>();
+                entities.put(uri, localNames);
+            }
+            localNames.add(localName);
+        }
+
+        @Override
+        public void endDocument() throws SAXException {
+            //index i of both arrays describes the same namespace
+            String[] uris = new String[entities.size()];
+            String[] localNames = new String[entities.size()];
+            int i = 0;
+            for (Map.Entry<String, Set<String>> e : entities.entrySet()) {
+                uris[i] = e.getKey();
+                localNames[i] = joinWith(" ", e.getValue());
+                i++;
+            }
+            metadata.set(ENTITY_URIS, uris);
+            metadata.set(ENTITY_LOCAL_NAMES, localNames);
+        }
+
+        /**
+         * Concatenates the strings in iteration order, separated by the delimiter.
+         *
+         * @return the joined string; empty if {@code strings} is empty
+         */
+        static String joinWith(String delimiter, Collection<String> strings) {
+            StringBuilder sb = new StringBuilder();
+            int i = 0;
+            for (String s : strings) {
+                if (i > 0) {
+                    sb.append(delimiter);
+                }
+                sb.append(s);
+                i++;
+            }
+            return sb.toString();
+        }
+    }
+}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 6fa268d..edcd513 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -26,8 +26,6 @@ import static org.junit.Assert.fail;
import static org.junit.Assume.assumeTrue;
import java.io.InputStream;
-import java.nio.file.Files;
-import java.nio.file.Paths;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
@@ -66,6 +64,7 @@ import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.ocr.TesseractOCRParser;
+import org.apache.tika.parser.xml.XMLProfiler;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerDecorator;
@@ -1146,6 +1145,53 @@ public class PDFParserTest extends TikaTest {
}
@Test
+    public void testXMLProfiler() throws Exception {
+        final String pdf = "testPDF_XFA_govdocs1_258578.pdf";
+
+        //with the default parser, the xml profiler must not be triggered
+        List<Metadata> defaultResults = getRecursiveMetadata(pdf);
+        assertEquals(1, defaultResults.size());
+
+        //once the profiler is added next to the default parser via the config
+        //it must be triggered; tesseract should skip this file because it is too large
+        InputStream configStream = getClass().getResourceAsStream("/org/apache/tika/parser/pdf/tika-xml-profiler-config.xml");
+        assertNotNull(configStream);
+        TikaConfig profilerConfig = new TikaConfig(configStream);
+        Parser profilingParser = new AutoDetectParser(profilerConfig);
+
+        List<Metadata> profiledResults = getRecursiveMetadata(pdf, profilingParser);
+        assertEquals(3, profiledResults.size());
+
+        //count the X-Parsed-By values that name the profiler
+        int profilerHits = 0;
+        for (Metadata m : profiledResults) {
+            for (String parsedBy : m.getValues("X-Parsed-By")) {
+                if (parsedBy.equals(XMLProfiler.class.getCanonicalName())) {
+                    profilerHits++;
+                }
+            }
+        }
+        assertEquals(2, profilerHits);
+
+        //check xmp first
+        Metadata xmp = profiledResults.get(1);
+        String[] xmpUris = xmp.getValues(XMLProfiler.ENTITY_URIS);
+        String[] xmpLocalNames = xmp.getValues(XMLProfiler.ENTITY_LOCAL_NAMES);
+        assertEquals(8, xmpUris.length);
+        assertEquals(xmpUris.length, xmpLocalNames.length);
+        assertEquals("adobe:ns:meta/", xmpUris[0]);
+        assertEquals("CreateDate CreatorTool MetadataDate ModifyDate Thumbnails", xmpLocalNames[2]);
+        assertEquals("x:xmpmeta", xmp.get(XMLProfiler.ROOT_ENTITY));
+
+        //check xfa
+        Metadata xfa = profiledResults.get(2);
+        String[] xfaUris = xfa.getValues(XMLProfiler.ENTITY_URIS);
+        String[] xfaLocalNames = xfa.getValues(XMLProfiler.ENTITY_LOCAL_NAMES);
+        assertEquals(8, xfaUris.length);
+        assertEquals(xfaUris.length, xfaLocalNames.length);
+        assertEquals("http://ns.adobe.com/xdp/", xfaUris[1]);
+        assertEquals("field form instanceManager subform value", xfaLocalNames[5]);
+        assertEquals("xdp:xdp", xfa.get(XMLProfiler.ROOT_ENTITY));
+    }
+
+ @Test
public void testXMPMM() throws Exception {
Metadata m = getXML("testPDF_twoAuthors.pdf").metadata;
diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-xml-profiler-config.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-xml-profiler-config.xml
new file mode 100644
index 0000000..20adbf2
--- /dev/null
+++ b/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-xml-profiler-config.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser">
+ </parser>
+ <parser class="org.apache.tika.parser.xml.XMLProfiler"/>
+ </parsers>
+</properties>