You are viewing a plain text version of this content. The canonical link for it was not preserved in this plain-text copy.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/08/30 13:18:28 UTC
[tika] branch main updated: TIKA-3841 -- add defensive try/catch until we can fix the underlying POI issue.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 437c852b2 TIKA-3841 -- add defensive try/catch until we can fix the underlying POI issue.
437c852b2 is described below
commit 437c852b2c5fc8c4bbb4aa21747fabb634897ff7
Author: tballison <ta...@apache.org>
AuthorDate: Tue Aug 30 09:18:17 2022 -0400
TIKA-3841 -- add defensive try/catch until we can fix the underlying POI issue.
---
.../org/apache/tika/parser/microsoft/WordExtractor.java | 14 +++++++++++---
1 file changed, 11 insertions(+), 3 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
index 95bb4a20e..5207ec05b 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
@@ -52,6 +52,8 @@ import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
@@ -72,6 +74,7 @@ public class WordExtractor extends AbstractPOIFSExtractor {
private static final String LIST_DELIMITER = " ";
private static final Map<String, TagAndStyle> fixedParagraphStyles = new HashMap<>();
private static final TagAndStyle defaultParagraphStyle = new TagAndStyle("p", null);
+ private static final Logger LOG = LoggerFactory.getLogger(WordExtractor.class);
static {
fixedParagraphStyles.put("Default", defaultParagraphStyle);
@@ -249,12 +252,17 @@ public class WordExtractor extends AbstractPOIFSExtractor {
ListManager listManager = new ListManager(document);
for (Range r : ranges) {
if (r != null) {
- for (int i = 0; i < r.numParagraphs(); i++) {
- Paragraph p = r.getParagraph(i);
+ try {
+ for (int i = 0; i < r.numParagraphs(); i++) {
+ Paragraph p = r.getParagraph(i);
- i += handleParagraph(p, 0, r, document, FieldsDocumentPart.HEADER, pictures,
+ i += handleParagraph(p, 0, r, document, FieldsDocumentPart.HEADER, pictures,
pictureTable, listManager, xhtml);
+ }
+ } catch (ArrayIndexOutOfBoundsException e) {
+ LOG.warn("TIKA-3841 -- content may be missing from this header/footer");
}
+
}
}
xhtml.endElement("div");