You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/08/03 14:22:10 UTC
[tika] branch branch_1x updated: TIKA-2703 make sure to process
shape's parent drawing only once.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new 2cf0a96 TIKA-2703 make sure to process shape's parent drawing only once.
2cf0a96 is described below
commit 2cf0a964c3b1eb200864d3cfec881b4a9bad45fe
Author: TALLISON <ta...@apache.org>
AuthorDate: Fri Aug 3 10:21:06 2018 -0400
TIKA-2703 make sure to process shape's parent drawing only once.
---
.../ooxml/XSSFExcelExtractorDecorator.java | 62 +++++++++++++---------
1 file changed, 37 insertions(+), 25 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index 3141148..4560153 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -16,14 +16,16 @@
*/
package org.apache.tika.parser.microsoft.ooxml;
-import javax.xml.parsers.SAXParser;
+
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
+import java.util.Set;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
@@ -254,6 +256,12 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
if (shapes == null) {
return;
}
+ //We don't currently have an obvious way to get drawings
+ //directly from sheetIter. Therefore, we grab the shapes and process those.
+ //To get the diagrams and charts, we need to get the parent drawing for each
+ //shape, and we need to make sure that we only process each parent drawing once!
+ //SEE TIKA-2703 TODO: add unit test
+ Set<String> seenParentDrawings = new HashSet<>();
for (XSSFShape shape : shapes) {
if (shape instanceof XSSFSimpleShape) {
String sText = ((XSSFSimpleShape) shape).getText();
@@ -262,30 +270,34 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
}
extractHyperLinksFromShape(((XSSFSimpleShape)shape).getCTShape(), xhtml);
}
- XSSFDrawing drawing = shape.getDrawing();
- if (drawing != null) {
- //dump diagram data
- handleGeneralTextContainingPart(
- AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA,
- "diagram-data",
- drawing.getPackagePart(),
- metadata,
- new OOXMLWordAndPowerPointTextHandler(
- new OOXMLTikaBodyPartHandler(xhtml),
- new HashMap<String, String>()//empty
- )
- );
- //dump chart data
- handleGeneralTextContainingPart(
- XSSFRelation.CHART.getRelation(),
- "chart",
- drawing.getPackagePart(),
- metadata,
- new OOXMLWordAndPowerPointTextHandler(
- new OOXMLTikaBodyPartHandler(xhtml),
- new HashMap<String, String>()//empty
- )
- );
+
+ XSSFDrawing parentDrawing = shape.getDrawing();
+ if (parentDrawing != null) {
+ if (! seenParentDrawings.contains(parentDrawing.getPackagePart().getPartName().toString())) {
+ //dump diagram data
+ handleGeneralTextContainingPart(
+ AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA,
+ "diagram-data",
+ parentDrawing.getPackagePart(),
+ metadata,
+ new OOXMLWordAndPowerPointTextHandler(
+ new OOXMLTikaBodyPartHandler(xhtml),
+ new HashMap<String, String>()//empty
+ )
+ );
+ //dump chart data
+ handleGeneralTextContainingPart(
+ XSSFRelation.CHART.getRelation(),
+ "chart",
+ parentDrawing.getPackagePart(),
+ metadata,
+ new OOXMLWordAndPowerPointTextHandler(
+ new OOXMLTikaBodyPartHandler(xhtml),
+ new HashMap<String, String>()//empty
+ )
+ );
+ }
+ seenParentDrawings.add(parentDrawing.getPackagePart().getPartName().toString());
}
}
}