You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/07/14 15:41:32 UTC
[tika] 01/02: TIKA-3477 -- don't close embedded word doc because
that in turn closes the root. Unrelated issue,
avoid NPE if ooxml part doesn't exist.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 070222a6a968aca6aff6fdf438379dfe4723e7e0
Author: tallison <ta...@apache.org>
AuthorDate: Wed Jul 14 10:19:25 2021 -0400
TIKA-3477 -- don't close embedded word doc because that in turn closes the root. Unrelated issue, avoid NPE if ooxml part doesn't exist.
---
.../apache/tika/parser/microsoft/WordExtractor.java | 19 ++++++++++---------
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 3 +++
2 files changed, 13 insertions(+), 9 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
index f8ff5e1..7816073 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
@@ -85,11 +85,8 @@ public class WordExtractor extends AbstractPOIFSExtractor {
private final Deque<FormattingUtils.Tag> formattingState = new ArrayDeque<>();
- private final Metadata metadata;
-
public WordExtractor(ParseContext context, Metadata metadata) {
- super(context);
- this.metadata = metadata;
+ super(context, metadata);
}
private static int countParagraphs(Range... ranges) {
@@ -218,14 +215,16 @@ public class WordExtractor extends AbstractPOIFSExtractor {
p = pictures.nextUnclaimed();
}
- // Handle any embeded office documents
+ // Handle any embedded office documents
try {
+
DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool");
for (Entry entry : op) {
if (entry.getName().startsWith("_") && entry instanceof DirectoryEntry) {
handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml);
}
}
+
} catch (FileNotFoundException e) {
//swallow
}
@@ -237,7 +236,7 @@ public class WordExtractor extends AbstractPOIFSExtractor {
return;
}
for (SavedByEntry sbe : savedByTable.getEntries()) {
- metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, sbe.getSaveLocation());
+ parentMetadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, sbe.getSaveLocation());
}
}
@@ -596,9 +595,11 @@ public class WordExtractor extends AbstractPOIFSExtractor {
protected void parseWord6(DirectoryNode root, XHTMLContentHandler xhtml)
throws IOException, SAXException {
Word6Extractor extractor;
- try (HWPFOldDocument doc = new HWPFOldDocument(root)) {
- extractor = new Word6Extractor(doc);
- }
+ //DO NOT put this in a try/autoclose. This will close the root
+ //which we don't want to do because there may be other
+ //documents in the root.
+ HWPFOldDocument doc = new HWPFOldDocument(root);
+ extractor = new Word6Extractor(doc);
for (String p : extractor.getParagraphText()) {
xhtml.element("p", p);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index cd6c6e5..f23ae12 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -167,6 +167,9 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
for (PackageRelationship rel : opcPackage
.getRelationshipsByType(PackageRelationshipTypes.THUMBNAIL)) {
PackagePart tPart = opcPackage.getPart(rel);
+ if (tPart == null) {
+ continue;
+ }
InputStream tStream = tPart.getInputStream();
Metadata thumbnailMetadata = new Metadata();
String thumbName = tPart.getPartName().getName();