You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/01/13 13:47:01 UTC

tika git commit: TIKA-2134 -- handle missing parts more robustly

Repository: tika
Updated Branches:
  refs/heads/master 8eb7d352f -> 526fc08f2


TIKA-2134 -- handle missing parts more robustly


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/526fc08f
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/526fc08f
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/526fc08f

Branch: refs/heads/master
Commit: 526fc08f2a0a4c44ea191a10748ca9ef0e834cb3
Parents: 8eb7d35
Author: tballison <ta...@mitre.org>
Authored: Fri Jan 13 08:46:54 2017 -0500
Committer: tballison <ta...@mitre.org>
Committed: Fri Jan 13 08:46:54 2017 -0500

----------------------------------------------------------------------
 .../microsoft/ooxml/AbstractOOXMLExtractor.java     |  4 ++++
 .../ooxml/XSSFExcelExtractorDecorator.java          | 16 +++++++++++++---
 2 files changed, 17 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/526fc08f/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 9c12fc5..426092e 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -181,6 +181,10 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
         Set<String> seen = new HashSet<>();
         try {
             for (PackagePart source : getMainDocumentParts()) {
+                if (source == null) {
+                    //parts can go missing; silently ignore --  TIKA-2134
+                    continue;
+                }
                 for (PackageRelationship rel : source.getRelationships()) {
                     try {
                         handleEmbeddedPart(source, rel, handler, metadata, seen);

http://git-wip-us.apache.org/repos/asf/tika/blob/526fc08f/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index 0f6957c..45a6a84 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -159,7 +159,13 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
             for (String footer : sheetExtractor.footers) {
                 extractHeaderFooter(footer, xhtml);
             }
-            processShapes(iter.getShapes(), xhtml);
+            List<XSSFShape> shapes = null;
+            try {
+                shapes = iter.getShapes();
+            } catch (NullPointerException e) {
+                //missing shape
+            }
+            processShapes(shapes, xhtml);
 
             //for now dump sheet hyperlinks at bottom of page
             //consider a double-pass of the inputstream to reunite hyperlinks with cells/textboxes
@@ -176,8 +182,12 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
             for (PackageRelationship rel : sheetPart.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation())) {
                 if (rel.getTargetMode() == TargetMode.INTERNAL) {
                     PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
-                    for (PackageRelationship drawRel : rel.getPackage()
-                            .getPart(relName)
+                    PackagePart part = rel.getPackage().getPart(relName);
+                    //parts can go missing, and Excel quietly ignores missing images -- TIKA-2134
+                    if (part == null) {
+                        continue;
+                    }
+                    for (PackageRelationship drawRel : part
                             .getRelationshipsByType(XSSFRelation.SHEET_HYPERLINKS.getRelation())) {
                         drawingHyperlinks.put(drawRel.getId(), drawRel.getTargetURI().toString());
                     }