You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/04/06 14:58:21 UTC

[tika] branch TIKA-3347 updated: TIKA-3347 -- fix logic in PDFMarkedContent2XHTML

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-3347
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/TIKA-3347 by this push:
     new b064dc0  TIKA-3347 -- fix logic in PDFMarkedContent2XHTML
b064dc0 is described below

commit b064dc091257cbb42da57bf222406e37d32b9ae8
Author: tballison <ta...@apache.org>
AuthorDate: Tue Apr 6 10:58:11 2021 -0400

    TIKA-3347 -- fix logic in PDFMarkedContent2XHTML
---
 .../tika/parser/pdf/PDFMarkedContent2XHTML.java    | 37 +++++++++++-----------
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
index b2c5a82..39a6c3a 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
@@ -277,6 +277,17 @@ public class PDFMarkedContent2XHTML extends PDF2XHTML {
                     currentPageRef, depth++, paragraphs, roleMap);
         } else if (kids instanceof COSDictionary) {
             COSDictionary kidsDictionary = (COSDictionary)kids;
+            //short circuit look for anchor/uri
+            if (kidsDictionary.containsKey(COSName.A)) {
+                COSDictionary anchor = kidsDictionary.getCOSDictionary(COSName.A);
+                //check for subtype /Link ?
+                //COSName subtype = obj.getCOSName(COSName.SUBTYPE);
+                if (anchor != null) {
+                    state.uri = anchor.getString(COSName.URI);
+                }
+                return;
+            }
+            //try the other types of dicts
             COSBase cosType = kidsDictionary.getItem(COSName.TYPE);
             if (cosType != null && cosType instanceof COSName) {
                 if ("OBJR".equals(((COSName) cosType).getName())) {
@@ -292,7 +303,13 @@ public class PDFMarkedContent2XHTML extends PDF2XHTML {
             }
             COSBase grandkids = kidsDictionary.getItem(COSName.K);
             if (grandkids == null) {
-                return;
+                //if grandkids object doesn't exist, try straight obj
+                if (kidsDictionary.containsKey(COSName.OBJ)) {
+                    recurse(kidsDictionary.getDictionaryObject(COSName.OBJ), currentPageRef,
+                            depth + 1, paragraphs, roleMap);
+                } else {
+                    return;
+                }
             }
             COSBase pageBase = kidsDictionary.getItem(COSName.PG);
 
@@ -350,24 +367,6 @@ public class PDFMarkedContent2XHTML extends PDF2XHTML {
             } else {
                 //TODO: log can't find mcid
             }
-        } else if (kids instanceof COSDictionary) {
-            //TODO: check for other types of dictionary?
-            COSDictionary dict = (COSDictionary) kids;
-            COSDictionary anchor = dict.getCOSDictionary(COSName.A);
-            //check for subtype /Link ?
-            //COSName subtype = obj.getCOSName(COSName.SUBTYPE);
-            if (anchor != null) {
-                state.uri = anchor.getString(COSName.URI);
-            } else {
-                if (dict.containsKey(COSName.K)) {
-                    recurse(dict.getDictionaryObject(COSName.K), currentPageRef, depth + 1,
-                            paragraphs, roleMap);
-                } else if (dict.containsKey(COSName.OBJ)) {
-                    recurse(dict.getDictionaryObject(COSName.OBJ), currentPageRef, depth + 1,
-                            paragraphs, roleMap);
-
-                }
-            }
         } else {
             //TODO: handle a different object?
         }