You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/02/21 15:41:59 UTC

[tika] branch branch_1x updated: TIKA-3045 -- Added XMLProfiler as an optional parser to profile XFA and XMP in PDFs

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new d5af2cf  TIKA-3045 -- Added XMLProfiler as an optional parser to profile XFA and XMP in PDFs
d5af2cf is described below

commit d5af2cf72dd38b9ce10f7beeca94e2922df7a7c3
Author: tallison <ta...@apache.org>
AuthorDate: Fri Feb 21 10:33:05 2020 -0500

    TIKA-3045 -- Added XMLProfiler as an optional parser to profile XFA and XMP in PDFs
    
    # Conflicts:
    #	tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
---
 CHANGES.txt                                        |   4 +
 .../apache/tika/metadata/TikaCoreProperties.java   |   9 +-
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  |  72 ++++++++++
 .../org/apache/tika/parser/xml/XMLProfiler.java    | 151 +++++++++++++++++++++
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |  50 ++++++-
 .../tika/parser/pdf/tika-xml-profiler-config.xml   |  24 ++++
 6 files changed, 305 insertions(+), 5 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 800bf54..6703627 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,8 @@
 Release 1.24 - ???
+
+   * Added XMLProfiler as an optional parser to profile XFA and XMP
+     in PDFs (TIKA-3045).
+
    * Extract inline images that rely on the DCT filter from PDFs (TIKA-3041).
 
    * Upgrade to PDFBox 2.0.18 (TIKA-3021).
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index 9f78a99..5f63cae 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -54,9 +54,12 @@ public interface TikaCoreProperties {
      *
      */
     public enum EmbeddedResourceType {
-        INLINE,
-        ATTACHMENT,
-        MACRO
+        INLINE, // image that is intended to be displayed in a rendering of the file
+        ATTACHMENT,// standard attachment, as in an email
+        MACRO, // any code that is intended to be run by the application
+        METADATA, // embedded metadata, e.g. xmp, xfa
+        FONT;// embedded font files
+        // NOTE(review): open question from the author -- which other resource types belong here?
     };
 
     /**
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 8f55086..2e58123 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -33,6 +33,7 @@ import java.nio.file.Path;
 import java.text.SimpleDateFormat;
 import java.util.ArrayList;
 import java.util.Calendar;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.List;
 import java.util.ListIterator;
@@ -94,7 +95,9 @@ import org.apache.tika.metadata.Font;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.PDF;
 import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.ocr.TesseractOCRConfig;
 import org.apache.tika.parser.ocr.TesseractOCRParser;
 import org.apache.tika.sax.EmbeddedContentHandler;
@@ -140,6 +143,9 @@ class AbstractPDF2XHTML extends PDFTextStripper {
 
     private final static TesseractOCRConfig DEFAULT_TESSERACT_CONFIG = new TesseractOCRConfig();
 
+    private static final MediaType XFA_MEDIA_TYPE = MediaType.application("vnd.adobe.xdp+xml");
+    private static final MediaType XMP_MEDIA_TYPE = MediaType.application("rdf+xml");
+
     /**
      * Format used for signature dates
      * TODO Make this thread-safe
@@ -189,6 +195,70 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         writeParagraphStart();
     }
 
+    /**
+     * Sends the document's XMP metadata and XFA data, when present, to the embedded document extractor;
+     * each is parsed only if the Parser found in the ParseContext lists the corresponding media type.
+     */
+    private void extractXMPXFA(PDDocument pdfDocument, Metadata parentMetadata, ParseContext context) throws IOException, SAXException {
+        Set<MediaType> supportedTypes = Collections.emptySet();//typed, unlike the raw EMPTY_SET constant
+        Parser embeddedParser = context.get(Parser.class);
+        if (embeddedParser != null) {
+            supportedTypes = embeddedParser.getSupportedTypes(context);
+        }
+
+        if (pdfDocument.getDocumentCatalog().getMetadata() != null) {
+            Metadata xmpMetadata = new Metadata();
+            xmpMetadata.set(Metadata.CONTENT_TYPE, XMP_MEDIA_TYPE.toString());
+            xmpMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.METADATA.toString());
+            if (embeddedDocumentExtractor.shouldParseEmbedded(xmpMetadata) && supportedTypes.contains(XMP_MEDIA_TYPE)) {
+                InputStream is = null;
+                try {
+                    is = pdfDocument.getDocumentCatalog().getMetadata().exportXMPMetadata();
+                } catch (IOException e) {//not rethrown: handed to the recorder, and the xmp is skipped
+                    EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
+                }
+                if (is != null) {
+                    try {
+                        parseMetadata(is, xmpMetadata);
+                    } finally {
+                        org.apache.tika.io.IOUtils.closeQuietly(is);
+                    }
+                }
+            }
+        }
+
+        //now try the xfa
+        if (pdfDocument.getDocumentCatalog().getAcroForm() != null && pdfDocument.getDocumentCatalog().getAcroForm().getXFA() != null) {
+            Metadata xfaMetadata = new Metadata();
+            xfaMetadata.set(Metadata.CONTENT_TYPE, XFA_MEDIA_TYPE.toString());
+            xfaMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.METADATA.toString());
+            if (embeddedDocumentExtractor.shouldParseEmbedded(xfaMetadata) && supportedTypes.contains(XFA_MEDIA_TYPE)) {
+                byte[] bytes = null;
+                try {
+                    bytes = pdfDocument.getDocumentCatalog().getAcroForm().getXFA().getBytes();
+                } catch (IOException e) {//not rethrown: handed to the recorder, and the xfa is skipped
+                    EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
+                }
+                if (bytes != null) {
+                    try (InputStream is = new ByteArrayInputStream(bytes)) {
+                        parseMetadata(is, xfaMetadata);
+                    }
+                }
+            }
+        }
+    }
+
+    //Wraps xhtml in an EmbeddedContentHandler and hands it, with the stream and its metadata, to the
+    //embedded document extractor; an IOException from that call goes to handleCatchableIOE.
+    private void parseMetadata(InputStream stream, Metadata embeddedMetadata) throws IOException, SAXException {
+        try {
+            EmbeddedContentHandler embeddedHandler = new EmbeddedContentHandler(xhtml);
+            embeddedDocumentExtractor.parseEmbedded(stream, embeddedHandler, embeddedMetadata, false);
+        } catch (IOException e) {
+            handleCatchableIOE(e);
+        }
+    }
+
     private void extractEmbeddedDocuments(PDDocument document)
             throws IOException, SAXException, TikaException {
             PDDocumentNameDictionary namesDictionary =
@@ -581,6 +651,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
                 handleCatchableIOE(e);
             }
 
+            extractXMPXFA(pdf, metadata, context);
+
             //extract acroform data at end of doc
             if (config.getExtractAcroFormContent() == true) {
                 try {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/xml/XMLProfiler.java b/tika-parsers/src/main/java/org/apache/tika/parser/xml/XMLProfiler.java
new file mode 100644
index 0000000..30bb8f0
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/xml/XMLProfiler.java
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.TreeSet;
+
+
+/**
+ * This parser enables profiling of XML.  It records the qualified name of the root
+ * element and, for each namespace uri seen on an element, the local names of the
+ * elements in that namespace; uris and local names are stored as parallel arrays.
+ * <p>
+ * This parser is not part of the default set of parsers and must be "turned on"
+ * via a tika config:
+ * <pre>
+ * &lt;properties&gt;
+ *     &lt;parsers&gt;
+ *         &lt;parser class="org.apache.tika.parser.DefaultParser"/&gt;
+ *         &lt;parser class="org.apache.tika.parser.xml.XMLProfiler"/&gt;
+ *     &lt;/parsers&gt;
+ * &lt;/properties&gt;
+ * </pre>
+ * <p>
+ * This was initially designed to profile xmp and xfa in PDFs.  Further
+ * work would need to be done to extract other types of xml and/or
+ * xmp in other file formats.  Please open a ticket.
+ */
+public class XMLProfiler extends AbstractParser {
+
+    /** Qualified name of the first element encountered in the document. */
+    public static final Property ROOT_ENTITY = Property.internalText("xmlprofiler:root_entity");
+    /** Namespace uris of the elements seen, in sorted order. */
+    public static final Property ENTITY_URIS = Property.internalTextBag("xmlprofiler:entity_uris");
+    /** Parallel to {@link #ENTITY_URIS}: sorted local names per uri, joined by single spaces. */
+    public static final Property ENTITY_LOCAL_NAMES = Property.internalTextBag("xmlprofiler:entity_local_names");
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
+                    MediaType.application("xml"),
+                    //https://wwwimages2.adobe.com/content/dam/acom/en/devnet/xmp/pdfs/XMP%20SDK%20Release%20cc-2016-08/XMPSpecificationPart3.pdf
+                    //"If a MIME type is needed, use application/rdf+xml."
+                    MediaType.application("rdf+xml"),//xmp
+                    //xfa: https://en.wikipedia.org/wiki/XFA
+                    MediaType.application("vnd.adobe.xdp+xml")
+            )));
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /** SAX-parses the stream and writes the profile to metadata; the handler argument is unused. */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        XMLReaderUtils.parseSAX(
+                new CloseShieldInputStream(stream),
+                new OfflineContentHandler(new XMLProfileHandler(metadata)), context);
+    }
+
+    /** Collects element names during the parse and writes them to the metadata at endDocument. */
+    private static class XMLProfileHandler extends DefaultHandler {
+        private final Metadata metadata;
+        //flips to true once the first (root) element has been recorded
+        private boolean rootSeen = false;
+        //namespace uri -> local names; TreeMap/TreeSet keep the reported order sorted
+        private final Map<String, Set<String>> entities = new TreeMap<>();
+
+        public XMLProfileHandler(Metadata metadata) {
+            this.metadata = metadata;
+        }
+
+        @Override
+        public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+            if (!rootSeen) {
+                metadata.set(ROOT_ENTITY, qName);
+                rootSeen = true;
+            }
+            Set<String> localNames = entities.get(uri);
+            if (localNames == null) {
+                localNames = new TreeSet<>();
+                entities.put(uri, localNames);
+            }
+            localNames.add(localName);
+        }
+
+        @Override
+        public void endDocument() throws SAXException {
+            String[] uris = new String[entities.size()];
+            String[] localNames = new String[entities.size()];
+            int i = 0;
+            for (Map.Entry<String, Set<String>> e : entities.entrySet()) {
+                uris[i] = e.getKey();
+                localNames[i] = joinWith(" ", e.getValue());
+                i++;
+            }
+            metadata.set(ENTITY_URIS, uris);
+            metadata.set(ENTITY_LOCAL_NAMES, localNames);
+        }
+
+        static String joinWith(String delimiter, Collection<String> strings) {
+            StringBuilder sb = new StringBuilder();
+            int i = 0;
+            for (String s : strings) {
+                if (i > 0) {
+                    sb.append(delimiter);
+                }
+                sb.append(s);
+                i++;
+            }
+            return sb.toString();
+        }
+    }
+}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 6fa268d..edcd513 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -26,8 +26,6 @@ import static org.junit.Assert.fail;
 import static org.junit.Assume.assumeTrue;
 
 import java.io.InputStream;
-import java.nio.file.Files;
-import java.nio.file.Paths;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
@@ -66,6 +64,7 @@ import org.apache.tika.parser.PasswordProvider;
 import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.parser.ocr.TesseractOCRConfig;
 import org.apache.tika.parser.ocr.TesseractOCRParser;
+import org.apache.tika.parser.xml.XMLProfiler;
 import org.apache.tika.sax.BasicContentHandlerFactory;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.ContentHandlerDecorator;
@@ -1146,6 +1145,53 @@ public class PDFParserTest extends TikaTest {
     }
 
     @Test
+    public void testXMLProfiler() throws Exception {
+        //test that the xml profiler is not triggered by default
+        List<Metadata> metadataList = getRecursiveMetadata("testPDF_XFA_govdocs1_258578.pdf");
+        assertEquals(1, metadataList.size());
+
+        //test that it is triggered when added to the default parser
+        //via the config, tesseract should skip this file because it is too large
+        Parser p;
+        //close the config stream once the parser has been built from it
+        try (InputStream is = getClass().getResourceAsStream("/org/apache/tika/parser/pdf/tika-xml-profiler-config.xml")) {
+            assertNotNull(is);
+            p = new AutoDetectParser(new TikaConfig(is));
+        }
+
+        metadataList = getRecursiveMetadata("testPDF_XFA_govdocs1_258578.pdf", p);
+        assertEquals(3, metadataList.size());
+
+        int xmlProfilers = 0;
+        for (Metadata metadata : metadataList) {
+            for (String parsedBy : metadata.getValues("X-Parsed-By")) {
+                if (parsedBy.equals(XMLProfiler.class.getCanonicalName())) {
+                    xmlProfilers++;
+                }
+            }
+        }
+        assertEquals(2, xmlProfilers);
+
+        //check xmp first
+        String[] uris = metadataList.get(1).getValues(XMLProfiler.ENTITY_URIS);
+        String[] localNames = metadataList.get(1).getValues(XMLProfiler.ENTITY_LOCAL_NAMES);
+        assertEquals(8, uris.length);
+        assertEquals(uris.length, localNames.length);
+        assertEquals("adobe:ns:meta/", uris[0]);
+        assertEquals("CreateDate CreatorTool MetadataDate ModifyDate Thumbnails", localNames[2]);
+        assertEquals("x:xmpmeta", metadataList.get(1).get(XMLProfiler.ROOT_ENTITY));
+
+        //check xfa
+        uris = metadataList.get(2).getValues(XMLProfiler.ENTITY_URIS);
+        localNames = metadataList.get(2).getValues(XMLProfiler.ENTITY_LOCAL_NAMES);
+        assertEquals(8, uris.length);
+        assertEquals(uris.length, localNames.length);
+        assertEquals("http://ns.adobe.com/xdp/", uris[1]);
+        assertEquals("field form instanceManager subform value", localNames[5]);
+        assertEquals("xdp:xdp", metadataList.get(2).get(XMLProfiler.ROOT_ENTITY));
+    }
+
+    @Test
     public void testXMPMM() throws Exception {
 
         Metadata m = getXML("testPDF_twoAuthors.pdf").metadata;
diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-xml-profiler-config.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-xml-profiler-config.xml
new file mode 100644
index 0000000..20adbf2
--- /dev/null
+++ b/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-xml-profiler-config.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.DefaultParser">
+        </parser>
+        <parser class="org.apache.tika.parser.xml.XMLProfiler"/>
+    </parsers>
+</properties>