You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/08/15 15:18:50 UTC
[tika] branch TIKA-4091 updated: TIKA-4091 -- small fixes to surprises from the initial commits for TIKA-4091
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-4091
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/TIKA-4091 by this push:
new c7860a408 TIKA-4091 -- small fixes to surprises from the initial commits for TIKA-4091
c7860a408 is described below
commit c7860a408a3e35cb1a247122c35405c1ee28d1ba
Author: tallison <ta...@apache.org>
AuthorDate: Tue Aug 15 11:18:33 2023 -0400
TIKA-4091 -- small fixes to surprises from the initial commits for TIKA-4091
---
.../parser/microsoft/AbstractPOIFSExtractor.java | 42 +++++++++++++++-------
1 file changed, 30 insertions(+), 12 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index 45beee741..df4668eb8 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -260,21 +260,19 @@ abstract class AbstractPOIFSExtractor {
//getCommand() and getFileName() exist for OLE 2.0 to populate
//TikaCoreProperties.ORIGINAL_RESOURCE_NAME
+ String contentsEntryName = getContentsEntryName(dir);
+ if (contentsEntryName == null) {
+ //log or record exception?
+ return;
+ }
// Grab the contents and process
DocumentEntry contentsEntry;
- /*if (dir.hasEntry("CorelDRAW")) {
- contentsEntry = (DocumentEntry) dir.getEntry("CorelDRAW");}
- */
- //TODO: modify getEntry to case insensitive when available in POI
+
try {
- contentsEntry = (DocumentEntry) dir.getEntry("CONTENTS");
- } catch (FileNotFoundException fnfe1) {
- try {
- contentsEntry = (DocumentEntry) dir.getEntry("Contents");
- } catch (FileNotFoundException fnfe2) {
- EmbeddedDocumentUtil.recordEmbeddedStreamException(fnfe2, parentMetadata);
- return;
- }
+ contentsEntry = (DocumentEntry) dir.getEntry(contentsEntryName);
+ } catch (FileNotFoundException fnfe) {
+ EmbeddedDocumentUtil.recordEmbeddedStreamException(fnfe, parentMetadata);
+ return;
}
int length = contentsEntry.getSize();
@@ -308,6 +306,26 @@ abstract class AbstractPOIFSExtractor {
}
}
+ private String getContentsEntryName(DirectoryEntry dir) { // finds the name of dir's "contents" entry in whatever casing it is stored; null if none
+ /* Disabled (commented-out) CorelDRAW lookup:
+ if (dir.hasEntry("CorelDRAW")) {
+ contentsEntry = (DocumentEntry) dir.getEntry("CorelDRAW");}
+ */
+ //TODO: switch to a case-insensitive getEntry once one is available in POI
+ if (dir.hasEntry("CONTENTS")) { // exact-case check first: all upper case
+ return "CONTENTS";
+ } else if (dir.hasEntry("Contents")) { // exact-case check: capitalized
+ return "Contents";
+ } else { // neither exact spelling present: scan all entry names, ignoring case
+ for (String n : dir.getEntryNames()) {
+ if ("contents".equalsIgnoreCase(n)) {
+ return n; // return the name exactly as listed by the directory, not a normalized form
+ }
+ }
+ }
+ return null; // no entry named "contents" in any casing
+ }
+
private void handleOLENative(DirectoryEntry dir, POIFSDocumentType type, String rName,
Metadata metadata, XHTMLContentHandler xhtml, boolean outputHtml)