You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ti...@apache.org on 2023/08/13 09:23:18 UTC
[tika] branch main updated: TIKA-4114: remove unneeded code
This is an automated email from the ASF dual-hosted git repository.
tilman pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 48e5e72c7 TIKA-4114: remove unneeded code
48e5e72c7 is described below
commit 48e5e72c73e8f497fbd3ec50cbbcdedc9a584915
Author: Tilman Hausherr <ti...@apache.org>
AuthorDate: Sun Aug 13 11:23:08 2023 +0200
TIKA-4114: remove unneeded code
---
.../apache/tika/parser/pdf/PDFMarkedContent2XHTML.java | 15 +++++++--------
1 file changed, 7 insertions(+), 8 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
index 1bdbebc09..bb65d294c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
@@ -37,6 +37,7 @@ import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
+import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDObjectReference;
import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.apache.pdfbox.text.PDFMarkedContentExtractor;
@@ -280,12 +281,11 @@ public class PDFMarkedContent2XHTML extends PDF2XHTML {
//TODO should be merged with COSDictionary segment below?
// and maybe dereference COSObject first, i.e. before the first "if"?
COSDictionary dict = (COSDictionary) ((COSObject) kids).getObject();
- COSBase cosType = dict.getItem(COSName.TYPE);
- if (cosType != null && cosType instanceof COSName) {
- if ("OBJR".equals(((COSName) cosType).getName())) {
- recurse(dict.getDictionaryObject(COSName.OBJ), currentPageRef,
- depth + 1, paragraphs, roleMap);
- }
+ COSName type = dict.getCOSName(COSName.TYPE);
+ if (COSName.getPDFName(PDObjectReference.TYPE).equals(type)) // OBJR
+ {
+ recurse(dict.getDictionaryObject(COSName.OBJ), currentPageRef,depth + 1, paragraphs,
+ roleMap);
}
COSBase n = dict.getItem(COSName.S);
@@ -299,7 +299,7 @@ public class PDFMarkedContent2XHTML extends PDF2XHTML {
}
COSBase pageBase = dict.getItem(COSName.PG);
- if (pageBase != null && pageBase instanceof COSObject) {
+ if (pageBase instanceof COSObject) {
currentPageRef = new ObjectRef(((COSObject) pageBase).getObjectNumber(),
((COSObject) pageBase).getGenerationNumber());
}
@@ -368,7 +368,6 @@ public class PDFMarkedContent2XHTML extends PDF2XHTML {
} else if (dict.containsKey(COSName.OBJ)) {
recurse(dict.getDictionaryObject(COSName.OBJ), currentPageRef, depth + 1,
paragraphs, roleMap);
-
}
}
} else {