You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/04/06 14:58:21 UTC
[tika] branch TIKA-3347 updated: TIKA-3347 -- fix logic in PDFMarkedContent2XHTML
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-3347
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/TIKA-3347 by this push:
new b064dc0 TIKA-3347 -- fix logic in PDFMarkedContent2XHTML
b064dc0 is described below
commit b064dc091257cbb42da57bf222406e37d32b9ae8
Author: tballison <ta...@apache.org>
AuthorDate: Tue Apr 6 10:58:11 2021 -0400
TIKA-3347 -- fix logic in PDFMarkedContent2XHTML
---
.../tika/parser/pdf/PDFMarkedContent2XHTML.java | 37 +++++++++++-----------
1 file changed, 18 insertions(+), 19 deletions(-)
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
index b2c5a82..39a6c3a 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
@@ -277,6 +277,17 @@ public class PDFMarkedContent2XHTML extends PDF2XHTML {
currentPageRef, depth++, paragraphs, roleMap);
} else if (kids instanceof COSDictionary) {
COSDictionary kidsDictionary = (COSDictionary)kids;
+ //short circuit look for anchor/uri
+ if (kidsDictionary.containsKey(COSName.A)) {
+ COSDictionary anchor = kidsDictionary.getCOSDictionary(COSName.A);
+ //check for subtype /Link ?
+ //COSName subtype = obj.getCOSName(COSName.SUBTYPE);
+ if (anchor != null) {
+ state.uri = anchor.getString(COSName.URI);
+ }
+ return;
+ }
+ //try the other types of dicts
COSBase cosType = kidsDictionary.getItem(COSName.TYPE);
if (cosType != null && cosType instanceof COSName) {
if ("OBJR".equals(((COSName) cosType).getName())) {
@@ -292,7 +303,13 @@ public class PDFMarkedContent2XHTML extends PDF2XHTML {
}
COSBase grandkids = kidsDictionary.getItem(COSName.K);
if (grandkids == null) {
- return;
+ //if grandkids object doesn't exist, try straight obj
+ if (kidsDictionary.containsKey(COSName.OBJ)) {
+ recurse(kidsDictionary.getDictionaryObject(COSName.OBJ), currentPageRef,
+ depth + 1, paragraphs, roleMap);
+ } else {
+ return;
+ }
}
COSBase pageBase = kidsDictionary.getItem(COSName.PG);
@@ -350,24 +367,6 @@ public class PDFMarkedContent2XHTML extends PDF2XHTML {
} else {
//TODO: log can't find mcid
}
- } else if (kids instanceof COSDictionary) {
- //TODO: check for other types of dictionary?
- COSDictionary dict = (COSDictionary) kids;
- COSDictionary anchor = dict.getCOSDictionary(COSName.A);
- //check for subtype /Link ?
- //COSName subtype = obj.getCOSName(COSName.SUBTYPE);
- if (anchor != null) {
- state.uri = anchor.getString(COSName.URI);
- } else {
- if (dict.containsKey(COSName.K)) {
- recurse(dict.getDictionaryObject(COSName.K), currentPageRef, depth + 1,
- paragraphs, roleMap);
- } else if (dict.containsKey(COSName.OBJ)) {
- recurse(dict.getDictionaryObject(COSName.OBJ), currentPageRef, depth + 1,
- paragraphs, roleMap);
-
- }
- }
} else {
//TODO: handle a different object?
}