You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/02/28 19:27:16 UTC

[tika] branch master updated: TIKA-3055 -- add an optional Preflight parser for PDFs

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new 73eacc2  TIKA-3055 -- add an optional Preflight parser for PDFs
73eacc2 is described below

commit 73eacc2d9bf77d962245b0832ba5d4fe0045ee56
Author: tallison <ta...@apache.org>
AuthorDate: Fri Feb 28 14:26:54 2020 -0500

    TIKA-3055 -- add an optional Preflight parser for PDFs
---
 CHANGES.txt                                        |   2 +
 .../main/java/org/apache/tika/metadata/PDF.java    |  12 ++
 .../src/test/java/org/apache/tika/TikaTest.java    |  10 +-
 tika-parsers/pom.xml                               |   5 +
 .../java/org/apache/tika/parser/pdf/PDFParser.java |  19 ++-
 .../apache/tika/parser/pdf/PDFPreflightParser.java | 180 +++++++++++++++++++++
 .../tika/parser/pdf/PDFPreflightParserTest.java    |  58 +++++++
 .../tika/parser/pdf/tika-preflight-config.xml      |  25 +++
 8 files changed, 307 insertions(+), 4 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index d382061..23c8fd6 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -7,6 +7,8 @@ Release 2.0.0 - ???
 
 Release 1.24 - ???
 
+   * Add an optional Preflight parser for PDFs (TIKA-3055).
+
    * Improve detection of some zip-based formats (TIKA-3057).
 
    * Upgrade metadata-extractor to 2.13.0 (TIKA-2952).
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
index f129f84..64b521b 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
@@ -26,6 +26,7 @@ public interface PDF {
     String PDF_PREFIX = "pdf"+TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
     String PDFA_PREFIX = "pdfa"+TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
     String PDFAID_PREFIX = "pdfaid"+TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
+    String PDF_PREFLIGHT_PREFIX = "pdf-preflight"+TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
 
     /**
      * Prefix to be used for properties that record what was stored
@@ -92,4 +93,15 @@ public interface PDF {
     Property HAS_ACROFORM_FIELDS = Property.internalBoolean(PDF_PREFIX+"hasAcroFormFields");
 
     Property HAS_MARKED_CONTENT = Property.internalBoolean(PDF_PREFIX+"hasMarkedContent");
+
+    Property PREFLIGHT_IS_VALID = Property.internalBoolean(PDF_PREFLIGHT_PREFIX+"isValid");
+    Property PREFLIGHT_PARSE_EXCEPTION = Property.internalText(PDF_PREFLIGHT_PREFIX+"parseException");
+    Property PREFLIGHT_VALIDATION_ERRORS = Property.internalTextBag(PDF_PREFLIGHT_PREFIX+"validationErrors");
+    Property PREFLIGHT_SPECIFICATION = Property.internalText(PDF_PREFLIGHT_PREFIX+"specification");
+    Property PREFLIGHT_TRAILER_COUNT = Property.internalInteger(PDF_PREFLIGHT_PREFIX+"trailerCount");
+    Property PREFLIGHT_XREF_TYPE = Property.internalText(PDF_PREFLIGHT_PREFIX+"xrefType");
+    Property PREFLIGHT_ICC_PROFILE = Property.internalText(PDF_PREFLIGHT_PREFIX+"iccProfile");
+    Property PREFLIGHT_IS_LINEARIZED = Property.internalBoolean(PDF_PREFLIGHT_PREFIX+"isLinearized");
+
+    Property PREFLIGHT_INCREMENTAL_UPDATES = Property.internalBoolean(PDF_PREFLIGHT_PREFIX+"hasIncrementalUpdates");
 }
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index c863e1b..5966e82 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -30,7 +30,9 @@ import java.net.URISyntaxException;
 import java.net.URL;
 import java.nio.file.Path;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collection;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
@@ -448,7 +450,9 @@ public abstract class TikaTest {
     public static void debug(List<Metadata> list) {
         int i = 0;
         for (Metadata m : list) {
-            for (String n : m.names()) {
+            List<String> names = Arrays.asList(m.names());
+            Collections.sort(names);
+            for (String n : names) {
                 for (String v : m.getValues(n)) {
                     System.out.println(i + ": "+n + " : "+v);
                 }
@@ -458,7 +462,9 @@ public abstract class TikaTest {
     }
 
     public static void debug(Metadata metadata) {
-        for (String n : metadata.names()) {
+        List<String> names = Arrays.asList(metadata.names());
+        Collections.sort(names);
+        for (String n : names) {
             for (String v : metadata.getValues(n)) {
                 System.out.println(n + " : "+v);
             }
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index 463e60f..07f84c0 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -219,6 +219,11 @@
     </dependency>
     <dependency>
       <groupId>org.apache.pdfbox</groupId>
+      <artifactId>preflight</artifactId>
+      <version>${pdfbox.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.pdfbox</groupId>
       <artifactId>jempbox</artifactId>
       <version>${jempbox.version}</version>
     </dependency>
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 41644bf..ac57724 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -20,6 +20,7 @@ import javax.xml.stream.XMLStreamException;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.nio.file.Path;
 import java.util.Arrays;
 import java.util.Calendar;
 import java.util.Collections;
@@ -137,9 +138,11 @@ public class PDFParser extends AbstractParser implements Initializable {
             }
             if (tstream != null && tstream.hasFile()) {
                 // File based -- send file directly to PDFBox
-                pdfDocument = PDDocument.load(tstream.getPath().toFile(), password, memoryUsageSetting);
+                pdfDocument = getPDDocument(tstream, password, memoryUsageSetting, metadata,
+                        context);
             } else {
-                pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), password, memoryUsageSetting);
+                pdfDocument = getPDDocument(new CloseShieldInputStream(stream), password,
+                        memoryUsageSetting, metadata, context);
             }
             metadata.set(PDF.IS_ENCRYPTED, Boolean.toString(pdfDocument.isEncrypted()));
 
@@ -176,6 +179,18 @@ public class PDFParser extends AbstractParser implements Initializable {
         }
     }
 
+    /**
+     * Loads the PDF from a stream. Subclasses can override this hook to
+     * change how the document is opened; this implementation hands the
+     * stream straight to PDFBox and does not use {@code metadata} or
+     * {@code parseContext}.
+     *
+     * @param inputStream stream containing the PDF
+     * @param password password passed to PDFBox when loading
+     * @param memoryUsageSetting memory settings passed to PDFBox when loading
+     * @param metadata not used by this implementation
+     * @param parseContext not used by this implementation
+     * @return the document loaded by PDFBox
+     * @throws IOException if PDFBox fails to load the document
+     */
+    protected PDDocument getPDDocument(InputStream inputStream, String password,
+                                       MemoryUsageSetting memoryUsageSetting,
+                                       Metadata metadata, ParseContext parseContext)
+            throws IOException {
+        final PDDocument loaded = PDDocument.load(inputStream, password, memoryUsageSetting);
+        return loaded;
+    }
+
+    /**
+     * Loads the PDF from a file. Subclasses can override this hook to
+     * change how the document is opened; this implementation converts the
+     * path to a file, hands it to PDFBox and does not use {@code metadata}
+     * or {@code parseContext}.
+     *
+     * @param path location of the PDF
+     * @param password password passed to PDFBox when loading
+     * @param memoryUsageSetting memory settings passed to PDFBox when loading
+     * @param metadata not used by this implementation
+     * @param parseContext not used by this implementation
+     * @return the document loaded by PDFBox
+     * @throws IOException if PDFBox fails to load the document
+     */
+    protected PDDocument getPDDocument(Path path, String password,
+                                       MemoryUsageSetting memoryUsageSetting,
+                                       Metadata metadata, ParseContext parseContext)
+            throws IOException {
+        final PDDocument loaded = PDDocument.load(path.toFile(), password, memoryUsageSetting);
+        return loaded;
+    }
+
     private boolean hasMarkedContent(PDDocument pdDocument) {
         PDStructureTreeRoot root = pdDocument.getDocumentCatalog().getStructureTreeRoot();
         if (root == null) {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFPreflightParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFPreflightParser.java
new file mode 100644
index 0000000..3676bf6
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFPreflightParser.java
@@ -0,0 +1,180 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.pdf;
+
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSDictionary;
+import org.apache.pdfbox.cos.COSDocument;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSObject;
+import org.apache.pdfbox.io.MemoryUsageSetting;
+import org.apache.pdfbox.pdfparser.XrefTrailerResolver;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.preflight.Format;
+import org.apache.pdfbox.preflight.PreflightConfiguration;
+import org.apache.pdfbox.preflight.PreflightContext;
+import org.apache.pdfbox.preflight.PreflightDocument;
+import org.apache.pdfbox.preflight.ValidationResult;
+import org.apache.pdfbox.preflight.exception.SyntaxValidationException;
+import org.apache.pdfbox.preflight.parser.PreflightParser;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.PDF;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.utils.ExceptionUtils;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Path;
+import java.util.List;
+
+import static org.apache.pdfbox.preflight.PreflightConstants.DICTIONARY_KEY_LINEARIZED;
+
+public class PDFPreflightParser extends PDFParser {
+
+    private static final PDFPreflightParserConfig DEFAULT = new PDFPreflightParserConfig();
+
+    /**
+     * Wraps the stream in a {@link TikaInputStream}, asks it for a file path
+     * and delegates to the path-based overload, which runs preflight against
+     * a file.
+     *
+     * @param inputStream stream containing the PDF
+     * @param password forwarded to the path-based overload
+     * @param memoryUsageSetting forwarded to the path-based overload
+     * @param metadata forwarded to the path-based overload
+     * @param parseContext forwarded to the path-based overload
+     * @return whatever the path-based overload returns
+     * @throws IOException if the path cannot be obtained or loading fails
+     */
+    @Override
+    protected PDDocument getPDDocument(InputStream inputStream, String password,
+                                       MemoryUsageSetting memoryUsageSetting,
+                                       Metadata metadata, ParseContext parseContext) throws IOException {
+        // NOTE(review): the TikaInputStream is closed when this block exits, i.e. before
+        // the caller uses the returned document -- confirm the document no longer needs
+        // the file behind getPath() at that point.
+        try (TikaInputStream tikaStream = TikaInputStream.get(inputStream)) {
+            final Path filePath = tikaStream.getPath();
+            return getPDDocument(filePath, password, memoryUsageSetting, metadata, parseContext);
+        }
+    }
+
+    /**
+     * Runs the PDFBox preflight parser over the file, records the validation
+     * results in {@code metadata} and returns the resulting document.
+     * <p>
+     * If preflight parsing fails with a {@link SyntaxValidationException},
+     * the stack trace is recorded in the metadata and the file is loaded
+     * with the regular {@link PDDocument} loader instead.
+     *
+     * @param path file to parse
+     * @param password only used by the fallback loader
+     * @param memoryUsageSetting only used by the fallback loader
+     * @param metadata receives the preflight properties
+     * @param context parse context; a config found here replaces the defaults
+     * @return the preflight document, or a normally loaded document on fallback
+     * @throws IOException if parsing, validation or the fallback load fails
+     */
+    @Override
+    protected PDDocument getPDDocument(Path path, String password,
+                                       MemoryUsageSetting memoryUsageSetting,
+                                       Metadata metadata, ParseContext context) throws IOException {
+        PDFPreflightParserConfig pppConfig = context.get(PDFPreflightParserConfig.class, DEFAULT);
+
+        PreflightConfiguration configuration = new PreflightConfiguration();
+        configuration.setMaxErrors(pppConfig.getMaxErrors());
+        PreflightParser preflightParser = new PreflightParser(path.toFile());
+
+        //read leniency through the accessor, like the other config values
+        preflightParser.setLenient(pppConfig.isLenient());
+        try {
+            preflightParser.parse(pppConfig.getFormat(), configuration);
+        } catch (SyntaxValidationException e) {
+            //back off to try to load the file normally
+            return handleSyntaxException(path, password, memoryUsageSetting, metadata, e);
+        }
+
+        PreflightDocument preflightDocument = preflightParser.getPreflightDocument();
+        try {
+            preflightDocument.validate();
+            extractPreflight(preflightDocument, metadata);
+        } catch (IOException | RuntimeException failure) {
+            //the caller never receives the document on this path,
+            //so it has to be closed here to avoid leaking it
+            try {
+                preflightDocument.close();
+            } catch (IOException closeFailure) {
+                failure.addSuppressed(closeFailure);
+            }
+            throw failure;
+        }
+
+        //the document has to be returned so that the caller closes it;
+        //the preflight document can keep some other resources open.
+        return preflightDocument;
+    }
+
+    /**
+     * Copies the preflight outcome into the metadata: specification,
+     * validity flag, one entry per validation error, trailer count, xref
+     * type, the ICC profile when one is present, and whether the file is
+     * linearized and/or has incremental updates.
+     *
+     * @param preflightDocument document whose validation result is read
+     * @param metadata metadata to populate
+     */
+    private void extractPreflight(PreflightDocument preflightDocument, Metadata metadata) {
+        ValidationResult validation = preflightDocument.getResult();
+        metadata.set(PDF.PREFLIGHT_SPECIFICATION, preflightDocument.getSpecification().toString());
+        metadata.set(PDF.PREFLIGHT_IS_VALID, Boolean.toString(validation.isValid()));
+
+        for (ValidationResult.ValidationError error : validation.getErrorsList()) {
+            String entry = error.getErrorCode() + " : " + error.getDetails();
+            metadata.add(PDF.PREFLIGHT_VALIDATION_ERRORS, entry);
+        }
+
+        PreflightContext preflightContext = preflightDocument.getContext();
+        XrefTrailerResolver trailerResolver = preflightContext.getXrefTrailerResolver();
+        int trailerCount = trailerResolver.getTrailerCount();
+        metadata.set(PDF.PREFLIGHT_TRAILER_COUNT, trailerCount);
+        metadata.set(PDF.PREFLIGHT_XREF_TYPE, trailerResolver.getXrefType().toString());
+
+        boolean hasIccProfile = preflightContext.getIccProfileWrapper() != null
+                && preflightContext.getIccProfileWrapper().getProfile() != null;
+        if (hasIccProfile) {
+            metadata.set(PDF.PREFLIGHT_ICC_PROFILE,
+                    preflightContext.getIccProfileWrapper().getProfile().toString());
+        }
+
+        boolean linearized = getLinearizedDictionary(preflightDocument) != null;
+        //a linearized file is only reported as incrementally updated when it has
+        //more than two trailers; any other file when it has more than one
+        int trailerThreshold = linearized ? 2 : 1;
+        metadata.set(PDF.PREFLIGHT_IS_LINEARIZED, Boolean.toString(linearized));
+        metadata.set(PDF.PREFLIGHT_INCREMENTAL_UPDATES,
+                Boolean.toString(trailerCount > trailerThreshold));
+    }
+
+    /**
+     * Adapted from PDFBox.
+     *
+     * According to the PDF Reference, a linearized PDF contains a dictionary as its first object (the linearized
+     * dictionary) and only this one in the first section. This method walks the document's objects and returns
+     * the first dictionary that carries the linearized key.
+     *
+     * @param document the document to inspect.
+     * @return the linearization dictionary or null.
+     */
+    protected static COSDictionary getLinearizedDictionary(PDDocument document) {
+        COSDocument cosDocument = document.getDocument();
+        COSName linearizedKey = COSName.getPDFName(DICTIONARY_KEY_LINEARIZED);
+        for (Object entry : cosDocument.getObjects()) {
+            COSBase resolved = ((COSObject) entry).getObject();
+            if (!(resolved instanceof COSDictionary)) {
+                continue;
+            }
+            COSDictionary candidate = (COSDictionary) resolved;
+            if (candidate.keySet().contains(linearizedKey)) {
+                return candidate;
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Fallback used when preflight parsing throws a syntax validation
+     * exception: adds the exception's stack trace to the metadata under
+     * {@link PDF#PREFLIGHT_PARSE_EXCEPTION} and loads the file with the
+     * regular PDFBox loader.
+     *
+     * @param path file to load
+     * @param password password passed to PDFBox when loading
+     * @param memoryUsageSetting memory settings passed to PDFBox when loading
+     * @param metadata receives the stack trace
+     * @param e the exception thrown by the preflight parser
+     * @return the document loaded by PDFBox
+     * @throws IOException if the regular load fails as well
+     */
+    private PDDocument handleSyntaxException(Path path,
+                                             String password,
+                                             MemoryUsageSetting memoryUsageSetting, Metadata metadata,
+                                             SyntaxValidationException e)
+            throws IOException {
+        String stackTrace = ExceptionUtils.getStackTrace(e);
+        metadata.add(PDF.PREFLIGHT_PARSE_EXCEPTION, stackTrace);
+        PDDocument fallback = PDDocument.load(path.toFile(), password, memoryUsageSetting);
+        return fallback;
+    }
+
+    /**
+     * Settings for a preflight run: the maximum number of errors handed to
+     * the preflight configuration, whether the preflight parser is lenient,
+     * and the format to validate against. The class has no setters, so only
+     * the defaults below are available.
+     */
+    private static class PDFPreflightParserConfig {
+        private final int maxErrors = 100;
+        private final boolean isLenient = true;
+        private final Format format = Format.PDF_A1B;
+
+        /** @return the maximum number of validation errors to collect */
+        public int getMaxErrors() {
+            return this.maxErrors;
+        }
+
+        /** @return whether the preflight parser should be lenient */
+        public boolean isLenient() {
+            return this.isLenient;
+        }
+
+        /** @return the format passed to the preflight parser */
+        public Format getFormat() {
+            return this.format;
+        }
+    }
+}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFPreflightParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFPreflightParserTest.java
new file mode 100644
index 0000000..3330e97
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFPreflightParserTest.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.PDF;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.io.InputStream;
+import java.util.List;
+
+import static junit.framework.TestCase.assertEquals;
+
+/**
+ * Checks the preflight metadata reported for a sample PDF when parsing
+ * with a parser built from {@code tika-preflight-config.xml}.
+ */
+public class PDFPreflightParserTest extends TikaTest {
+
+    private static Parser PREFLIGHT_AUTO_DETECT_PARSER;
+
+    /** Builds the shared parser from the preflight test config. */
+    @BeforeClass
+    public static void setUp() throws Exception {
+        try (InputStream configStream =
+                     PDFPreflightParser.class.getResourceAsStream("tika-preflight-config.xml")) {
+            TikaConfig tikaConfig = new TikaConfig(configStream);
+            PREFLIGHT_AUTO_DETECT_PARSER = new AutoDetectParser(tikaConfig.getParser());
+        }
+    }
+
+    /** Verifies the preflight properties on the container document. */
+    @Test
+    public void testBasic() throws Exception {
+        List<Metadata> extracted = getRecursiveMetadata("testPDFFileEmbInAnnotation.pdf",
+                PREFLIGHT_AUTO_DETECT_PARSER);
+        assertEquals(2, extracted.size());
+
+        //the first entry is the container PDF itself
+        Metadata container = extracted.get(0);
+        assertEquals("true", container.get(PDF.PREFLIGHT_IS_LINEARIZED));
+        assertEquals("true", container.get(PDF.PREFLIGHT_IS_VALID));
+        assertEquals("PDF_A1B", container.get(PDF.PREFLIGHT_SPECIFICATION));
+        assertEquals("2", container.get(PDF.PREFLIGHT_TRAILER_COUNT));
+        assertEquals("STREAM", container.get(PDF.PREFLIGHT_XREF_TYPE));
+        assertEquals("false", container.get(PDF.PREFLIGHT_INCREMENTAL_UPDATES));
+    }
+}
diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-preflight-config.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-preflight-config.xml
new file mode 100644
index 0000000..67c02f4
--- /dev/null
+++ b/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-preflight-config.xml
@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <parsers>
+        <!-- the default parser set, with the standard PDFParser excluded -->
+        <parser class="org.apache.tika.parser.DefaultParser">
+            <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+        </parser>
+        <!-- the preflight parser is registered alongside it -->
+        <parser class="org.apache.tika.parser.pdf.PDFPreflightParser"/>
+    </parsers>
+</properties>