You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/02/28 19:27:16 UTC
[tika] branch master updated: TIKA-3055 -- add an optional
Preflight parser for PDFs
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 73eacc2 TIKA-3055 -- add an optional Preflight parser for PDFs
73eacc2 is described below
commit 73eacc2d9bf77d962245b0832ba5d4fe0045ee56
Author: tallison <ta...@apache.org>
AuthorDate: Fri Feb 28 14:26:54 2020 -0500
TIKA-3055 -- add an optional Preflight parser for PDFs
---
CHANGES.txt | 2 +
.../main/java/org/apache/tika/metadata/PDF.java | 12 ++
.../src/test/java/org/apache/tika/TikaTest.java | 10 +-
tika-parsers/pom.xml | 5 +
.../java/org/apache/tika/parser/pdf/PDFParser.java | 19 ++-
.../apache/tika/parser/pdf/PDFPreflightParser.java | 180 +++++++++++++++++++++
.../tika/parser/pdf/PDFPreflightParserTest.java | 58 +++++++
.../tika/parser/pdf/tika-preflight-config.xml | 25 +++
8 files changed, 307 insertions(+), 4 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index d382061..23c8fd6 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -7,6 +7,8 @@ Release 2.0.0 - ???
Release 1.24 - ???
+ * Add an optional Preflight parser for PDFs (TIKA-3055).
+
* Improve detection of some zip-based formats (TIKA-3057).
* Upgrade metadata-extractor to 2.13.0 (TIKA-2952).
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
index f129f84..64b521b 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
@@ -26,6 +26,7 @@ public interface PDF {
String PDF_PREFIX = "pdf"+TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
String PDFA_PREFIX = "pdfa"+TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
String PDFAID_PREFIX = "pdfaid"+TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
+ String PDF_PREFLIGHT_PREFIX = "pdf-preflight"+TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
/**
* Prefix to be used for properties that record what was stored
@@ -92,4 +93,15 @@ public interface PDF {
Property HAS_ACROFORM_FIELDS = Property.internalBoolean(PDF_PREFIX+"hasAcroFormFields");
Property HAS_MARKED_CONTENT = Property.internalBoolean(PDF_PREFIX+"hasMarkedContent");
+
+ /** Whether the preflight validation result reported the document as valid. */
+ Property PREFLIGHT_IS_VALID = Property.internalBoolean(PDF_PREFLIGHT_PREFIX+"isValid");
+ /** Stack trace of the syntax validation exception caught while preflight-parsing, if any. */
+ Property PREFLIGHT_PARSE_EXCEPTION = Property.internalText(PDF_PREFLIGHT_PREFIX+"parseException");
+ /** One entry per preflight validation error, written as "errorCode : details". */
+ Property PREFLIGHT_VALIDATION_ERRORS = Property.internalTextBag(PDF_PREFLIGHT_PREFIX+"validationErrors");
+ /** String form of the specification reported by the preflight document. */
+ Property PREFLIGHT_SPECIFICATION = Property.internalText(PDF_PREFLIGHT_PREFIX+"specification");
+ /** Number of trailers reported by the preflight xref/trailer resolver. */
+ Property PREFLIGHT_TRAILER_COUNT = Property.internalInteger(PDF_PREFLIGHT_PREFIX+"trailerCount");
+ /** String form of the xref type reported by the preflight xref/trailer resolver. */
+ Property PREFLIGHT_XREF_TYPE = Property.internalText(PDF_PREFLIGHT_PREFIX+"xrefType");
+ /** String form of the ICC profile found by preflight; only set when a profile is present. */
+ Property PREFLIGHT_ICC_PROFILE = Property.internalText(PDF_PREFLIGHT_PREFIX+"iccProfile");
+ /** Whether a linearization dictionary was found among the document's objects. */
+ Property PREFLIGHT_IS_LINEARIZED = Property.internalBoolean(PDF_PREFLIGHT_PREFIX+"isLinearized");
+
+ /**
+  * Whether the trailer count exceeds the baseline used by PDFPreflightParser
+  * (more than 2 trailers for a linearized document, more than 1 otherwise).
+  */
+ Property PREFLIGHT_INCREMENTAL_UPDATES = Property.internalBoolean(PDF_PREFLIGHT_PREFIX+"hasIncrementalUpdates");
}
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index c863e1b..5966e82 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -30,7 +30,9 @@ import java.net.URISyntaxException;
import java.net.URL;
import java.nio.file.Path;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Collection;
+import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
@@ -448,7 +450,9 @@ public abstract class TikaTest {
public static void debug(List<Metadata> list) {
int i = 0;
for (Metadata m : list) {
- for (String n : m.names()) {
+ List<String> names = Arrays.asList(m.names());
+ Collections.sort(names);
+ for (String n : names) {
for (String v : m.getValues(n)) {
System.out.println(i + ": "+n + " : "+v);
}
@@ -458,7 +462,9 @@ public abstract class TikaTest {
}
public static void debug(Metadata metadata) {
- for (String n : metadata.names()) {
+ List<String> names = Arrays.asList(metadata.names());
+ Collections.sort(names);
+ for (String n : names) {
for (String v : metadata.getValues(n)) {
System.out.println(n + " : "+v);
}
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index 463e60f..07f84c0 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -219,6 +219,11 @@
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
+ <artifactId>preflight</artifactId>
+ <version>${pdfbox.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.pdfbox</groupId>
<artifactId>jempbox</artifactId>
<version>${jempbox.version}</version>
</dependency>
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 41644bf..ac57724 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -20,6 +20,7 @@ import javax.xml.stream.XMLStreamException;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.nio.file.Path;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Collections;
@@ -137,9 +138,11 @@ public class PDFParser extends AbstractParser implements Initializable {
}
if (tstream != null && tstream.hasFile()) {
// File based -- send file directly to PDFBox
- pdfDocument = PDDocument.load(tstream.getPath().toFile(), password, memoryUsageSetting);
+ // Pass the Path explicitly: handing over tstream itself would bind to the
+ // InputStream overload, so the file-based getPDDocument would never be used.
+ pdfDocument = getPDDocument(tstream.getPath(), password, memoryUsageSetting, metadata,
+ context);
} else {
- pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), password, memoryUsageSetting);
+ pdfDocument = getPDDocument(new CloseShieldInputStream(stream), password,
+ memoryUsageSetting, metadata, context);
}
metadata.set(PDF.IS_ENCRYPTED, Boolean.toString(pdfDocument.isEncrypted()));
@@ -176,6 +179,18 @@ public class PDFParser extends AbstractParser implements Initializable {
}
}
+    /**
+     * Loads a PDF from a stream. This is an overridable hook; the default implementation
+     * delegates to PDFBox and does not use the metadata or the parse context.
+     *
+     * @param inputStream stream containing the PDF
+     * @param password password handed to PDFBox
+     * @param memoryUsageSetting memory settings handed to PDFBox
+     * @param metadata metadata of the document being parsed; unused here
+     * @param parseContext parse context; unused here
+     * @return the loaded document
+     * @throws IOException if PDFBox fails to load the document
+     */
+    protected PDDocument getPDDocument(InputStream inputStream, String password,
+                                       MemoryUsageSetting memoryUsageSetting,
+                                       Metadata metadata, ParseContext parseContext) throws IOException {
+        final PDDocument loadedFromStream =
+                PDDocument.load(inputStream, password, memoryUsageSetting);
+        return loadedFromStream;
+    }
+
+    /**
+     * Loads a PDF from a file. This is an overridable hook; the default implementation
+     * delegates to PDFBox and does not use the metadata or the parse context.
+     *
+     * @param path location of the PDF file
+     * @param password password handed to PDFBox
+     * @param memoryUsageSetting memory settings handed to PDFBox
+     * @param metadata metadata of the document being parsed; unused here
+     * @param parseContext parse context; unused here
+     * @return the loaded document
+     * @throws IOException if PDFBox fails to load the document
+     */
+    protected PDDocument getPDDocument(Path path, String password,
+                                       MemoryUsageSetting memoryUsageSetting,
+                                       Metadata metadata, ParseContext parseContext) throws IOException {
+        final PDDocument loadedFromFile =
+                PDDocument.load(path.toFile(), password, memoryUsageSetting);
+        return loadedFromFile;
+    }
+
private boolean hasMarkedContent(PDDocument pdDocument) {
PDStructureTreeRoot root = pdDocument.getDocumentCatalog().getStructureTreeRoot();
if (root == null) {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFPreflightParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFPreflightParser.java
new file mode 100644
index 0000000..3676bf6
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFPreflightParser.java
@@ -0,0 +1,180 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.pdf;
+
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSDictionary;
+import org.apache.pdfbox.cos.COSDocument;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSObject;
+import org.apache.pdfbox.io.MemoryUsageSetting;
+import org.apache.pdfbox.pdfparser.XrefTrailerResolver;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.preflight.Format;
+import org.apache.pdfbox.preflight.PreflightConfiguration;
+import org.apache.pdfbox.preflight.PreflightContext;
+import org.apache.pdfbox.preflight.PreflightDocument;
+import org.apache.pdfbox.preflight.ValidationResult;
+import org.apache.pdfbox.preflight.exception.SyntaxValidationException;
+import org.apache.pdfbox.preflight.parser.PreflightParser;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.PDF;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.utils.ExceptionUtils;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Path;
+import java.util.List;
+
+import static org.apache.pdfbox.preflight.PreflightConstants.DICTIONARY_KEY_LINEARIZED;
+
+/**
+ * A {@link PDFParser} that loads documents through the PDFBox preflight parser,
+ * validates them and records the outcome in the <code>PREFLIGHT_*</code>
+ * properties of {@link PDF}. If preflight parsing fails with a
+ * {@link SyntaxValidationException}, the exception is recorded in the metadata and
+ * the file is loaded with the regular PDFBox loader instead.
+ */
+public class PDFPreflightParser extends PDFParser {
+
+    //NOTE(review): PDFPreflightParserConfig is a private class without setters, so as far
+    //as this file shows callers cannot put their own instance into the ParseContext and
+    //these defaults always apply -- confirm whether the config should be made public.
+    private static final PDFPreflightParserConfig DEFAULT = new PDFPreflightParserConfig();
+
+    /**
+     * Wraps the stream in a {@link TikaInputStream}, obtains a file path for it and
+     * delegates to the path-based overload.
+     *
+     * @param inputStream stream containing the PDF
+     * @param password password used if the file has to be loaded without preflight
+     * @param memoryUsageSetting memory settings used if the file has to be loaded without preflight
+     * @param metadata metadata that receives the preflight results
+     * @param parseContext parse context consulted for the preflight configuration
+     * @return the loaded document
+     * @throws IOException if the stream cannot be spooled or the document cannot be loaded
+     */
+    @Override
+    protected PDDocument getPDDocument(InputStream inputStream, String password,
+                                       MemoryUsageSetting memoryUsageSetting,
+                                       Metadata metadata, ParseContext parseContext) throws IOException {
+        //NOTE(review): tis is closed as soon as this method returns, i.e. before the caller
+        //uses the returned document. Presumably that also closes the wrapped stream and
+        //removes any temporary file behind getPath() -- confirm the document stays readable.
+        try (TikaInputStream tis = TikaInputStream.get(inputStream)) {
+            return getPDDocument(tis.getPath(), password, memoryUsageSetting, metadata, parseContext);
+        }
+    }
+
+    /**
+     * Parses and validates the file with the preflight parser and copies the results
+     * into the metadata.
+     *
+     * @param path location of the PDF file
+     * @param password password used if the file has to be loaded without preflight
+     * @param memoryUsageSetting memory settings used if the file has to be loaded without preflight
+     * @param metadata metadata that receives the preflight results
+     * @param context parse context consulted for the preflight configuration
+     * @return the preflight document, or a normally loaded document if preflight parsing
+     *         failed with a syntax validation exception
+     * @throws IOException if parsing, validation or the fallback load fails
+     */
+    @Override
+    protected PDDocument getPDDocument(Path path, String password,
+                                       MemoryUsageSetting memoryUsageSetting,
+                                       Metadata metadata, ParseContext context) throws IOException {
+        PDFPreflightParserConfig pppConfig = context.get(PDFPreflightParserConfig.class, DEFAULT);
+
+        PreflightConfiguration configuration = new PreflightConfiguration();
+        configuration.setMaxErrors(pppConfig.getMaxErrors());
+
+        PreflightParser preflightParser = new PreflightParser(path.toFile());
+        preflightParser.setLenient(pppConfig.isLenient());
+        try {
+            preflightParser.parse(pppConfig.getFormat(), configuration);
+        } catch (SyntaxValidationException e) {
+            //back off to try to load the file normally
+            return handleSyntaxException(path, password, memoryUsageSetting, metadata, e);
+        }
+
+        //need to return this to ensure that it gets closed
+        //the preflight document can keep some other resources open.
+        PreflightDocument preflightDocument = preflightParser.getPreflightDocument();
+        try {
+            preflightDocument.validate();
+            extractPreflight(preflightDocument, metadata);
+        } catch (IOException | RuntimeException e) {
+            //the caller never receives the document on this path, so it has to be closed here
+            closeAfterFailure(preflightDocument, e);
+            throw e;
+        }
+        return preflightDocument;
+    }
+
+    /**
+     * Closes a document that is being abandoned because of <code>failure</code>; a problem
+     * while closing is attached to the original failure instead of replacing it.
+     */
+    private static void closeAfterFailure(PDDocument document, Exception failure) {
+        try {
+            document.close();
+        } catch (IOException closeException) {
+            failure.addSuppressed(closeException);
+        }
+    }
+
+    /**
+     * Copies the validation result and the structural information gathered by preflight
+     * into the metadata.
+     *
+     * @param preflightDocument a parsed and validated preflight document
+     * @param metadata metadata that receives the preflight results
+     */
+    private void extractPreflight(PreflightDocument preflightDocument, Metadata metadata) {
+        ValidationResult result = preflightDocument.getResult();
+        metadata.set(PDF.PREFLIGHT_SPECIFICATION, preflightDocument.getSpecification().toString());
+        metadata.set(PDF.PREFLIGHT_IS_VALID, Boolean.toString(result.isValid()));
+
+        for (ValidationResult.ValidationError err : result.getErrorsList()) {
+            metadata.add(PDF.PREFLIGHT_VALIDATION_ERRORS,
+                    err.getErrorCode() + " : " + err.getDetails());
+        }
+
+        PreflightContext preflightContext = preflightDocument.getContext();
+        XrefTrailerResolver resolver = preflightContext.getXrefTrailerResolver();
+        int trailerCount = resolver.getTrailerCount();
+        metadata.set(PDF.PREFLIGHT_TRAILER_COUNT, trailerCount);
+        //defensive: skip the property rather than fail with an NPE if no xref type is reported
+        if (resolver.getXrefType() != null) {
+            metadata.set(PDF.PREFLIGHT_XREF_TYPE, resolver.getXrefType().toString());
+        }
+
+        if (preflightContext.getIccProfileWrapper() != null &&
+                preflightContext.getIccProfileWrapper().getProfile() != null) {
+            metadata.set(
+                    PDF.PREFLIGHT_ICC_PROFILE,
+                    preflightContext.getIccProfileWrapper().getProfile().toString());
+        }
+
+        //two trailers are treated as the baseline for a linearized file and one for any
+        //other file; anything beyond the baseline is reported as incremental updates
+        boolean linearized = getLinearizedDictionary(preflightDocument) != null;
+        int baselineTrailerCount = linearized ? 2 : 1;
+        metadata.set(PDF.PREFLIGHT_IS_LINEARIZED, Boolean.toString(linearized));
+        metadata.set(PDF.PREFLIGHT_INCREMENTAL_UPDATES,
+                Boolean.toString(trailerCount > baselineTrailerCount));
+    }
+
+    /**
+     * Adapted from PDFBox.
+     *
+     * According to the PDF Reference, a linearized PDF contains a dictionary as its first
+     * object (the linearization dictionary), and only this one in the first section.
+     * This method returns the first dictionary object that has the linearized key.
+     *
+     * @param document the document to inspect.
+     * @return the linearization dictionary or null.
+     */
+    protected static COSDictionary getLinearizedDictionary(PDDocument document) {
+        //the key is the same for every object, so look it up once
+        COSName linearizedKey = COSName.getPDFName(DICTIONARY_KEY_LINEARIZED);
+        COSDocument cDoc = document.getDocument();
+        List<?> lObj = cDoc.getObjects();
+        for (Object object : lObj) {
+            COSBase curObj = ((COSObject) object).getObject();
+            if (curObj instanceof COSDictionary
+                    && ((COSDictionary) curObj).keySet().contains(linearizedKey)) {
+                return (COSDictionary) curObj;
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Records the stack trace of the preflight syntax failure in the metadata and loads
+     * the file with the regular PDFBox loader.
+     *
+     * @return the document as loaded without preflight
+     * @throws IOException if the regular load fails as well
+     */
+    private PDDocument handleSyntaxException(Path path,
+                                             String password,
+                                             MemoryUsageSetting memoryUsageSetting, Metadata metadata,
+                                             SyntaxValidationException e)
+            throws IOException {
+        metadata.add(PDF.PREFLIGHT_PARSE_EXCEPTION, ExceptionUtils.getStackTrace(e));
+        return PDDocument.load(path.toFile(), password, memoryUsageSetting);
+    }
+
+    /**
+     * Settings handed to the preflight parser: the maximum number of errors, whether
+     * parsing is lenient and the format to validate against.
+     */
+    private static class PDFPreflightParserConfig {
+        private final int maxErrors = 100;
+        private final boolean isLenient = true;
+        private final Format format = Format.PDF_A1B;
+
+        public int getMaxErrors() {
+            return maxErrors;
+        }
+
+        public boolean isLenient() {
+            return isLenient;
+        }
+
+        public Format getFormat() {
+            return format;
+        }
+    }
+}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFPreflightParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFPreflightParserTest.java
new file mode 100644
index 0000000..3330e97
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFPreflightParserTest.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.PDF;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.io.InputStream;
+import java.util.List;
+
+import static junit.framework.TestCase.assertEquals;
+
+/**
+ * Runs a sample PDF through an auto-detect parser built from
+ * <code>tika-preflight-config.xml</code> and checks the preflight metadata it reports.
+ */
+public class PDFPreflightParserTest extends TikaTest {
+
+    /** Parser built once for all tests from the preflight test configuration. */
+    private static Parser PREFLIGHT_AUTO_DETECT_PARSER;
+
+    @BeforeClass
+    public static void setUp() throws Exception {
+        try (InputStream configStream =
+                     PDFPreflightParser.class.getResourceAsStream("tika-preflight-config.xml")) {
+            TikaConfig preflightConfig = new TikaConfig(configStream);
+            PREFLIGHT_AUTO_DETECT_PARSER = new AutoDetectParser(preflightConfig.getParser());
+        }
+    }
+
+    @Test
+    public void testBasic() throws Exception {
+        List<Metadata> extracted = getRecursiveMetadata(
+                "testPDFFileEmbInAnnotation.pdf", PREFLIGHT_AUTO_DETECT_PARSER);
+        assertEquals(2, extracted.size());
+
+        //the preflight properties are checked on the first metadata object in the list
+        Metadata container = extracted.get(0);
+        assertEquals("true", container.get(PDF.PREFLIGHT_IS_LINEARIZED));
+        assertEquals("true", container.get(PDF.PREFLIGHT_IS_VALID));
+        assertEquals("PDF_A1B", container.get(PDF.PREFLIGHT_SPECIFICATION));
+        assertEquals("2", container.get(PDF.PREFLIGHT_TRAILER_COUNT));
+        assertEquals("STREAM", container.get(PDF.PREFLIGHT_XREF_TYPE));
+        assertEquals("false", container.get(PDF.PREFLIGHT_INCREMENTAL_UPDATES));
+    }
+}
diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-preflight-config.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-preflight-config.xml
new file mode 100644
index 0000000..67c02f4
--- /dev/null
+++ b/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-preflight-config.xml
@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <!-- Exclude the stock PDFParser from the DefaultParser and register
+ PDFPreflightParser alongside it; presumably this routes PDFs to the
+ preflight parser. -->
+ <parser class="org.apache.tika.parser.DefaultParser">
+ <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+ </parser>
+ <parser class="org.apache.tika.parser.pdf.PDFPreflightParser"/>
+ </parsers>
+</properties>