You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ti...@apache.org on 2023/08/12 18:06:58 UTC
[tika] 04/04: TIKA-4114: avoid methods that no longer exist in PDFBox 3.0
This is an automated email from the ASF dual-hosted git repository.
tilman pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 7c383171529f03ca5c6be8d3398c94a28bb53acf
Author: Tilman Hausherr <ti...@apache.org>
AuthorDate: Sat Aug 12 20:06:39 2023 +0200
TIKA-4114: avoid methods that no longer exist in PDFBox 3.0
---
.../apache/tika/parser/pdf/PDFMarkedContent2XHTML.java | 18 ++++++++++++------
1 file changed, 12 insertions(+), 6 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
index a3a49a367..1bdbebc09 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
@@ -100,7 +100,9 @@ public class PDFMarkedContent2XHTML extends PDF2XHTML {
*
* @param pdDocument PDF document
* @param handler SAX content handler
+ * @param context
* @param metadata PDF metadata
+ * @param config
* @throws SAXException if the content handler fails to process SAX events
* @throws TikaException if there was an exception outside of per page processing
*/
@@ -273,25 +275,29 @@ public class PDFMarkedContent2XHTML extends PDF2XHTML {
for (COSBase k : ((COSArray) kids)) {
recurse(k, currentPageRef, depth, paragraphs, roleMap);
}
- } else if (kids instanceof COSObject) {
- COSBase cosType = ((COSObject) kids).getItem(COSName.TYPE);
+ } else if (kids instanceof COSObject &&
+ ((COSObject) kids).getObject() instanceof COSDictionary) {
+ //TODO should be merged with COSDictionary segment below?
+ // and maybe dereference COSObject first, i.e. before the first "if"?
+ COSDictionary dict = (COSDictionary) ((COSObject) kids).getObject();
+ COSBase cosType = dict.getItem(COSName.TYPE);
if (cosType != null && cosType instanceof COSName) {
if ("OBJR".equals(((COSName) cosType).getName())) {
- recurse(((COSObject) kids).getDictionaryObject(COSName.OBJ), currentPageRef,
+ recurse(dict.getDictionaryObject(COSName.OBJ), currentPageRef,
depth + 1, paragraphs, roleMap);
}
}
- COSBase n = ((COSObject) kids).getItem(COSName.S);
+ COSBase n = dict.getItem(COSName.S);
String name = "";
if (n instanceof COSName) {
name = ((COSName) n).getName();
}
- COSBase grandkids = ((COSObject) kids).getItem(COSName.K);
+ COSBase grandkids = dict.getItem(COSName.K);
if (grandkids == null) {
return;
}
- COSBase pageBase = ((COSObject) kids).getItem(COSName.PG);
+ COSBase pageBase = dict.getItem(COSName.PG);
if (pageBase != null && pageBase instanceof COSObject) {
currentPageRef = new ObjectRef(((COSObject) pageBase).getObjectNumber(),