You are viewing a plain-text version of this content; the canonical link to the original message was not preserved in this copy.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/08/30 13:18:28 UTC

[tika] branch main updated: TIKA-3841 -- add defensive try/catch until we can fix the underlying POI issue.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 437c852b2 TIKA-3841 -- add defensive try/catch until we can fix the underlying POI issue.
437c852b2 is described below

commit 437c852b2c5fc8c4bbb4aa21747fabb634897ff7
Author: tballison <ta...@apache.org>
AuthorDate: Tue Aug 30 09:18:17 2022 -0400

    TIKA-3841 -- add defensive try/catch until we can fix the underlying POI issue.
---
 .../org/apache/tika/parser/microsoft/WordExtractor.java    | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
index 95bb4a20e..5207ec05b 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
@@ -52,6 +52,8 @@ import org.apache.poi.poifs.filesystem.DirectoryEntry;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
 
@@ -72,6 +74,7 @@ public class WordExtractor extends AbstractPOIFSExtractor {
     private static final String LIST_DELIMITER = " ";
     private static final Map<String, TagAndStyle> fixedParagraphStyles = new HashMap<>();
     private static final TagAndStyle defaultParagraphStyle = new TagAndStyle("p", null);
+    private static final Logger LOG = LoggerFactory.getLogger(WordExtractor.class);
 
     static {
         fixedParagraphStyles.put("Default", defaultParagraphStyle);
@@ -249,12 +252,17 @@ public class WordExtractor extends AbstractPOIFSExtractor {
             ListManager listManager = new ListManager(document);
             for (Range r : ranges) {
                 if (r != null) {
-                    for (int i = 0; i < r.numParagraphs(); i++) {
-                        Paragraph p = r.getParagraph(i);
+                    try {
+                        for (int i = 0; i < r.numParagraphs(); i++) {
+                            Paragraph p = r.getParagraph(i);
 
-                        i += handleParagraph(p, 0, r, document, FieldsDocumentPart.HEADER, pictures,
+                            i += handleParagraph(p, 0, r, document, FieldsDocumentPart.HEADER, pictures,
                                 pictureTable, listManager, xhtml);
+                        }
+                    } catch (ArrayIndexOutOfBoundsException e) {
+                        LOG.warn("TIKA-3841 -- content may be missing from this header/footer");
                     }
+
                 }
             }
             xhtml.endElement("div");