You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/08/03 14:22:10 UTC

[tika] branch branch_1x updated: TIKA-2703 make sure to process shape's parent drawing only once.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new 2cf0a96  TIKA-2703 make sure to process shape's parent drawing only once.
2cf0a96 is described below

commit 2cf0a964c3b1eb200864d3cfec881b4a9bad45fe
Author: TALLISON <ta...@apache.org>
AuthorDate: Fri Aug 3 10:21:06 2018 -0400

    TIKA-2703 make sure to process shape's parent drawing only once.
---
 .../ooxml/XSSFExcelExtractorDecorator.java         | 62 +++++++++++++---------
 1 file changed, 37 insertions(+), 25 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index 3141148..4560153 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -16,14 +16,16 @@
  */
 package org.apache.tika.parser.microsoft.ooxml;
 
-import javax.xml.parsers.SAXParser;
+
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
+import java.util.Set;
 
 import org.apache.poi.POIXMLTextExtractor;
 import org.apache.poi.hssf.extractor.ExcelExtractor;
@@ -254,6 +256,12 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
         if (shapes == null) {
             return;
         }
+        //We don't currently have an obvious way to get drawings
+        //directly from sheetIter. Therefore, we grab the shapes and process those.
+        //To get the diagrams and charts, we need to get the parent drawing for each
+        //shape, and we need to make sure that we only process each parent drawing once!
+        //SEE TIKA-2703 TODO: add unit test
+        Set<String> seenParentDrawings = new HashSet<>();
         for (XSSFShape shape : shapes) {
             if (shape instanceof XSSFSimpleShape) {
                 String sText = ((XSSFSimpleShape) shape).getText();
@@ -262,30 +270,34 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
                 }
                 extractHyperLinksFromShape(((XSSFSimpleShape)shape).getCTShape(), xhtml);
             }
-            XSSFDrawing drawing = shape.getDrawing();
-            if (drawing != null) {
-                //dump diagram data
-                handleGeneralTextContainingPart(
-                        AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA,
-                        "diagram-data",
-                        drawing.getPackagePart(),
-                        metadata,
-                        new OOXMLWordAndPowerPointTextHandler(
-                                new OOXMLTikaBodyPartHandler(xhtml),
-                                new HashMap<String, String>()//empty
-                        )
-                );
-                //dump chart data
-                handleGeneralTextContainingPart(
-                        XSSFRelation.CHART.getRelation(),
-                        "chart",
-                        drawing.getPackagePart(),
-                        metadata,
-                        new OOXMLWordAndPowerPointTextHandler(
-                                new OOXMLTikaBodyPartHandler(xhtml),
-                                new HashMap<String, String>()//empty
-                        )
-                );
+
+            XSSFDrawing parentDrawing = shape.getDrawing();
+            if (parentDrawing != null) {
+                if (! seenParentDrawings.contains(parentDrawing.getPackagePart().getPartName().toString())) {
+                    //dump diagram data
+                    handleGeneralTextContainingPart(
+                            AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA,
+                            "diagram-data",
+                            parentDrawing.getPackagePart(),
+                            metadata,
+                            new OOXMLWordAndPowerPointTextHandler(
+                                    new OOXMLTikaBodyPartHandler(xhtml),
+                                    new HashMap<String, String>()//empty
+                            )
+                    );
+                    //dump chart data
+                    handleGeneralTextContainingPart(
+                            XSSFRelation.CHART.getRelation(),
+                            "chart",
+                            parentDrawing.getPackagePart(),
+                            metadata,
+                            new OOXMLWordAndPowerPointTextHandler(
+                                    new OOXMLTikaBodyPartHandler(xhtml),
+                                    new HashMap<String, String>()//empty
+                            )
+                    );
+                }
+                seenParentDrawings.add(parentDrawing.getPackagePart().getPartName().toString());
             }
         }
     }