You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ti...@apache.org on 2023/08/12 18:06:54 UTC

[tika] branch main updated (83689cc14 -> 7c3831715)

This is an automated email from the ASF dual-hosted git repository.

tilman pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


    from 83689cc14 TIKA-4114: add comment
     new 9ddb2de3e TIKA-4114: add comment
     new 4d711ca48 TIKA-4114: add comment
     new e0b56b321 TIKA-4114: include exception and add dummy throw so that code will work with PDFBox 2.0 and 3.0
     new 7c3831715 TIKA-4114: avoid methods that no longer exist in PDFBox 3.0

The 4 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../org/apache/tika/fuzzing/pdf/EvilCOSWriter.java    |  2 ++
 .../tika/parser/pdf/PDFMarkedContent2XHTML.java       | 18 ++++++++++++------
 .../apache/tika/parser/xmp/XMPMetadataExtractor.java  | 19 ++++++++++++++-----
 .../org/apache/tika/parser/crypto/TSDParserTest.java  |  1 +
 4 files changed, 29 insertions(+), 11 deletions(-)


[tika] 01/04: TIKA-4114: add comment

Posted by ti...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tilman pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 9ddb2de3e978d45fb92ef0ff8940d2702b2c7eb5
Author: Tilman Hausherr <ti...@apache.org>
AuthorDate: Sat Aug 12 19:03:08 2023 +0200

    TIKA-4114: add comment
---
 .../src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java       | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java
index 00cccbdc9..6848647e0 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java
@@ -43,6 +43,7 @@ public class TSDParserTest extends TikaTest {
         assertEquals(2, list.size());
         assertEquals("application/pdf", list.get(1).get(Metadata.CONTENT_TYPE));
         assertNotNull(list.get(1).get(TikaCoreProperties.EMBEDDED_EXCEPTION));
+        //TODO PDFBOX30 adjust the assertion below, compare the old and new stack traces
         assertContains("org.apache.pdfbox.pdmodel.PDDocument.load",
                 list.get(1).get(TikaCoreProperties.EMBEDDED_EXCEPTION));
     }


[tika] 04/04: TIKA-4114: avoid methods that no longer exist in PDFBox 3.0

Posted by ti...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tilman pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 7c383171529f03ca5c6be8d3398c94a28bb53acf
Author: Tilman Hausherr <ti...@apache.org>
AuthorDate: Sat Aug 12 20:06:39 2023 +0200

    TIKA-4114: avoid methods that no longer exist in PDFBox 3.0
---
 .../apache/tika/parser/pdf/PDFMarkedContent2XHTML.java | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
index a3a49a367..1bdbebc09 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
@@ -100,7 +100,9 @@ public class PDFMarkedContent2XHTML extends PDF2XHTML {
      *
      * @param pdDocument PDF document
      * @param handler    SAX content handler
+     * @param context
      * @param metadata   PDF metadata
+     * @param config
      * @throws SAXException  if the content handler fails to process SAX events
      * @throws TikaException if there was an exception outside of per page processing
      */
@@ -273,25 +275,29 @@ public class PDFMarkedContent2XHTML extends PDF2XHTML {
             for (COSBase k : ((COSArray) kids)) {
                 recurse(k, currentPageRef, depth, paragraphs, roleMap);
             }
-        } else if (kids instanceof COSObject) {
-            COSBase cosType = ((COSObject) kids).getItem(COSName.TYPE);
+        } else if (kids instanceof COSObject && 
+                ((COSObject) kids).getObject() instanceof COSDictionary) {
+            //TODO should be merged with COSDictionary segment below?
+            // and maybe dereference COSObject first, i.e. before the first "if"?
+            COSDictionary dict = (COSDictionary) ((COSObject) kids).getObject();
+            COSBase cosType = dict.getItem(COSName.TYPE);
             if (cosType != null && cosType instanceof COSName) {
                 if ("OBJR".equals(((COSName) cosType).getName())) {
-                    recurse(((COSObject) kids).getDictionaryObject(COSName.OBJ), currentPageRef,
+                    recurse(dict.getDictionaryObject(COSName.OBJ), currentPageRef,
                             depth + 1, paragraphs, roleMap);
                 }
             }
 
-            COSBase n = ((COSObject) kids).getItem(COSName.S);
+            COSBase n = dict.getItem(COSName.S);
             String name = "";
             if (n instanceof COSName) {
                 name = ((COSName) n).getName();
             }
-            COSBase grandkids = ((COSObject) kids).getItem(COSName.K);
+            COSBase grandkids = dict.getItem(COSName.K);
             if (grandkids == null) {
                 return;
             }
-            COSBase pageBase = ((COSObject) kids).getItem(COSName.PG);
+            COSBase pageBase = dict.getItem(COSName.PG);
 
             if (pageBase != null && pageBase instanceof COSObject) {
                 currentPageRef = new ObjectRef(((COSObject) pageBase).getObjectNumber(),


[tika] 02/04: TIKA-4114: add comment

Posted by ti...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tilman pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 4d711ca48af8ca82e30921a6f02377c7201301e8
Author: Tilman Hausherr <ti...@apache.org>
AuthorDate: Sat Aug 12 19:50:38 2023 +0200

    TIKA-4114: add comment
---
 .../src/main/java/org/apache/tika/fuzzing/pdf/EvilCOSWriter.java        | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/EvilCOSWriter.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/EvilCOSWriter.java
index 558905bac..697022215 100644
--- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/EvilCOSWriter.java
+++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/EvilCOSWriter.java
@@ -88,6 +88,8 @@ import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 
+//TODO PDFBOX30 replace COSWriterXRefEntry with XReferenceEntry (and much more)
+
 public class EvilCOSWriter implements ICOSVisitor, Closeable {
 
     /**


[tika] 03/04: TIKA-4114: include exception and add dummy throw so that code will work with PDFBox 2.0 and 3.0

Posted by ti...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tilman pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit e0b56b321dcb3c380218b43f77a293a79e511012
Author: Tilman Hausherr <ti...@apache.org>
AuthorDate: Sat Aug 12 20:03:37 2023 +0200

    TIKA-4114: include exception and add dummy throw so that code will work with PDFBox 2.0 and 3.0
---
 .../apache/tika/parser/xmp/XMPMetadataExtractor.java  | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xmp-commons/src/main/java/org/apache/tika/parser/xmp/XMPMetadataExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xmp-commons/src/main/java/org/apache/tika/parser/xmp/XMPMetadataExtractor.java
index d4a3b4001..157aedb90 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xmp-commons/src/main/java/org/apache/tika/parser/xmp/XMPMetadataExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xmp-commons/src/main/java/org/apache/tika/parser/xmp/XMPMetadataExtractor.java
@@ -32,6 +32,7 @@ import org.apache.tika.metadata.DublinCore;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Property;
 import org.apache.tika.metadata.XMP;
+import org.apache.xmpbox.type.BadFieldValueException;
 
 /**
  * XMP Metadata Extractor based on Apache XmpBox.
@@ -79,11 +80,19 @@ public class XMPMetadataExtractor {
             return;
         }
         if (schemaDublinCore != null) {
-            addMetadata(metadata, DublinCore.TITLE, schemaDublinCore.getTitle());
-            addMetadata(metadata, DublinCore.FORMAT, schemaDublinCore.getFormat());
-            addMetadata(metadata, DublinCore.DESCRIPTION, schemaDublinCore.getDescription());
-            addMetadata(metadata, DublinCore.CREATOR, schemaDublinCore.getCreators());
-            addMetadata(metadata, DublinCore.SUBJECT, schemaDublinCore.getSubjects());
+            try {
+                addMetadata(metadata, DublinCore.TITLE, schemaDublinCore.getTitle());
+                addMetadata(metadata, DublinCore.FORMAT, schemaDublinCore.getFormat());
+                addMetadata(metadata, DublinCore.DESCRIPTION, schemaDublinCore.getDescription());
+                addMetadata(metadata, DublinCore.CREATOR, schemaDublinCore.getCreators());
+                addMetadata(metadata, DublinCore.SUBJECT, schemaDublinCore.getSubjects());
+                //TODO PDFBOX30 this segment no longer needed with 3.0
+                if (false != false)
+                    throw new BadFieldValueException("");
+            }
+            catch (BadFieldValueException ex) {
+                throw new IOException(ex);
+            }
         }
     }