You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ti...@apache.org on 2020/12/20 04:58:44 UTC

[tika] branch main updated: TIKA-3246: call tailored fixup when getting AcroForm the first time to avoid the creation of appearances which aren't needed in tika (newly needed in PDFBox 2.0.22)

This is an automated email from the ASF dual-hosted git repository.

tilman pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 3a4c529  TIKA-3246: call tailored fixup when getting AcroForm the first time to avoid the creation of appearances which aren't needed in tika (newly needed in PDFBox 2.0.22)
3a4c529 is described below

commit 3a4c529a201c9c3d9b56cbdf8c2f8b702d74768e
Author: THausherr <ti...@snafu.de>
AuthorDate: Sun Dec 20 05:55:16 2020 +0100

    TIKA-3246: call tailored fixup when getting AcroForm the first time to avoid the creation of appearances which aren't needed in tika (newly needed in PDFBox 2.0.22)
---
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  |  8 ++---
 .../java/org/apache/tika/parser/pdf/PDFParser.java | 36 ++++++++++++++++++----
 2 files changed, 34 insertions(+), 10 deletions(-)

diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 2ba928a..27415d3 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -233,8 +233,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         }
 
         //now try the xfa
-        if (pdfDocument.getDocumentCatalog().getAcroForm() != null &&
-            pdfDocument.getDocumentCatalog().getAcroForm().getXFA() != null) {
+        if (pdfDocument.getDocumentCatalog().getAcroForm(null) != null &&
+            pdfDocument.getDocumentCatalog().getAcroForm(null).getXFA() != null) {
 
             Metadata xfaMetadata = new Metadata();
             xfaMetadata.set(Metadata.CONTENT_TYPE, XFA_MEDIA_TYPE.toString());
@@ -243,7 +243,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
                     supportedTypes.contains(XFA_MEDIA_TYPE)) {
                 byte[] bytes = null;
                 try {
-                    bytes = pdfDocument.getDocumentCatalog().getAcroForm().getXFA().getBytes();
+                    bytes = pdfDocument.getDocumentCatalog().getAcroForm(null).getXFA().getBytes();
                 } catch (IOException e) {
                     EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
                 }
@@ -743,7 +743,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         if (catalog == null)
             return;
 
-        PDAcroForm form = catalog.getAcroForm();
+        PDAcroForm form = catalog.getAcroForm(null);
         if (form == null)
             return;
 
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index dd18464..9ca5ce9 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -39,6 +39,10 @@ import org.apache.pdfbox.pdmodel.PDDocumentInformation;
 import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot;
 import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
 import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
+import org.apache.pdfbox.pdmodel.fixup.AbstractFixup;
+import org.apache.pdfbox.pdmodel.fixup.PDDocumentFixup;
+import org.apache.pdfbox.pdmodel.fixup.processor.AcroFormDefaultsProcessor;
+import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
 import org.apache.tika.config.Field;
 import org.apache.tika.config.Initializable;
 import org.apache.tika.config.InitializableProblemHandler;
@@ -260,9 +264,13 @@ public class PDFParser extends AbstractParser implements Initializable {
         if (document.getDocumentCatalog().getLanguage() != null) {
             metadata.set(TikaCoreProperties.LANGUAGE, document.getDocumentCatalog().getLanguage());
         }
-        if (document.getDocumentCatalog().getAcroForm() != null &&
-            document.getDocumentCatalog().getAcroForm().getFields() != null &&
-            document.getDocumentCatalog().getAcroForm().getFields().size() > 0) {
+        // TIKA-3246: Do this for the first call of getAcroForm(),
+        // subsequent calls should use the same fixup or null to avoid a default fixup.
+        // Do not call without parameters (would mean default fixup which is slower because
+        // it creates annotation appearances)
+        PDDocumentFixup fixup = new TikaAcroFormFixup(document);
+        PDAcroForm acroForm = document.getDocumentCatalog().getAcroForm(fixup);
+        if (acroForm != null && acroForm.getFields() != null && !acroForm.getFields().isEmpty()) {
             metadata.set(PDF.HAS_ACROFORM_FIELDS, "true");
         }
         PDMetadataExtractor.extract(document.getDocumentCatalog().getMetadata(), metadata, context);
@@ -353,8 +361,8 @@ public class PDFParser extends AbstractParser implements Initializable {
 
     private boolean hasXFA(PDDocument pdDocument) {
         return pdDocument.getDocumentCatalog() != null &&
-                pdDocument.getDocumentCatalog().getAcroForm() != null &&
-                pdDocument.getDocumentCatalog().getAcroForm().hasXFA();
+                pdDocument.getDocumentCatalog().getAcroForm(null) != null &&
+                pdDocument.getDocumentCatalog().getAcroForm(null).hasXFA();
     }
 
     private boolean shouldHandleXFAOnly(boolean hasXFA, PDFParserConfig config) {
@@ -368,7 +376,7 @@ public class PDFParser extends AbstractParser implements Initializable {
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();
         try (InputStream is = new ByteArrayInputStream(
-                pdDocument.getDocumentCatalog().getAcroForm().getXFA().getBytes())) {
+                pdDocument.getDocumentCatalog().getAcroForm(null).getXFA().getBytes())) {
             ex.extract(is, xhtml, metadata, context);
         } catch (XMLStreamException e) {
             throw new TikaException("XML error in XFA", e);
@@ -639,4 +647,20 @@ public class PDFParser extends AbstractParser implements Initializable {
             HAS_WARNED = true;
         }
     }
+
+    /**
+     * Copied from AcroFormDefaultFixup minus generation of appearances and handling of orphan
+     * widgets, which we don't need; apply() only runs the AcroFormDefaultsProcessor.
+     */
+    class TikaAcroFormFixup extends AbstractFixup
+    {
+        TikaAcroFormFixup(PDDocument document) {
+            super(document);
+        }
+
+        @Override
+        public void apply() {
+            new AcroFormDefaultsProcessor(document).process();
+        }
+    }
 }