You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ti...@apache.org on 2023/08/12 18:06:58 UTC

[tika] 04/04: TIKA-4114: avoid methods that no longer exist in PDFBox 3.0

This is an automated email from the ASF dual-hosted git repository.

tilman pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 7c383171529f03ca5c6be8d3398c94a28bb53acf
Author: Tilman Hausherr <ti...@apache.org>
AuthorDate: Sat Aug 12 20:06:39 2023 +0200

    TIKA-4114: avoid methods that no longer exist in PDFBox 3.0
---
 .../apache/tika/parser/pdf/PDFMarkedContent2XHTML.java | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
index a3a49a367..1bdbebc09 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
@@ -100,7 +100,9 @@ public class PDFMarkedContent2XHTML extends PDF2XHTML {
      *
      * @param pdDocument PDF document
      * @param handler    SAX content handler
+     * @param context
      * @param metadata   PDF metadata
+     * @param config
      * @throws SAXException  if the content handler fails to process SAX events
      * @throws TikaException if there was an exception outside of per page processing
      */
@@ -273,25 +275,29 @@ public class PDFMarkedContent2XHTML extends PDF2XHTML {
             for (COSBase k : ((COSArray) kids)) {
                 recurse(k, currentPageRef, depth, paragraphs, roleMap);
             }
-        } else if (kids instanceof COSObject) {
-            COSBase cosType = ((COSObject) kids).getItem(COSName.TYPE);
+        } else if (kids instanceof COSObject && 
+                ((COSObject) kids).getObject() instanceof COSDictionary) {
+            //TODO should be merged with COSDictionary segment below?
+            // and maybe dereference COSObject first, i.e. before the first "if"?
+            COSDictionary dict = (COSDictionary) ((COSObject) kids).getObject();
+            COSBase cosType = dict.getItem(COSName.TYPE);
             if (cosType != null && cosType instanceof COSName) {
                 if ("OBJR".equals(((COSName) cosType).getName())) {
-                    recurse(((COSObject) kids).getDictionaryObject(COSName.OBJ), currentPageRef,
+                    recurse(dict.getDictionaryObject(COSName.OBJ), currentPageRef,
                             depth + 1, paragraphs, roleMap);
                 }
             }
 
-            COSBase n = ((COSObject) kids).getItem(COSName.S);
+            COSBase n = dict.getItem(COSName.S);
             String name = "";
             if (n instanceof COSName) {
                 name = ((COSName) n).getName();
             }
-            COSBase grandkids = ((COSObject) kids).getItem(COSName.K);
+            COSBase grandkids = dict.getItem(COSName.K);
             if (grandkids == null) {
                 return;
             }
-            COSBase pageBase = ((COSObject) kids).getItem(COSName.PG);
+            COSBase pageBase = dict.getItem(COSName.PG);
 
             if (pageBase != null && pageBase instanceof COSObject) {
                 currentPageRef = new ObjectRef(((COSObject) pageBase).getObjectNumber(),