You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/06/09 01:03:57 UTC
[tika] branch master updated: TIKA-2254 -- extract text from charts in ooxml.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new d2820ce TIKA-2254 -- extract text from charts in ooxml.
d2820ce is described below
commit d2820ce62545c847a2d3e79b7b4b8a3f2022a619
Author: tballison <ta...@mitre.org>
AuthorDate: Thu Jun 8 21:03:48 2017 -0400
TIKA-2254 -- extract text from charts in ooxml.
---
CHANGES.txt | 3 ++
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 2 ++
.../ooxml/OOXMLWordAndPowerPointTextHandler.java | 17 ++++++++++-
.../ooxml/SXSLFPowerPointExtractorDecorator.java | 5 ++++
.../ooxml/SXWPFWordExtractorDecorator.java | 3 +-
.../ooxml/XSLFPowerPointExtractorDecorator.java | 12 +++++++-
.../ooxml/XSSFBExcelExtractorDecorator.java | 17 ++++++++++-
.../ooxml/XSSFExcelExtractorDecorator.java | 16 ++++++++++-
.../ooxml/XWPFWordExtractorDecorator.java | 11 +++++++
.../parser/microsoft/ooxml/OOXMLParserTest.java | 32 +++++++++++++++++++++
.../parser/microsoft/ooxml/SXSLFExtractorTest.java | 9 ++++++
.../parser/microsoft/ooxml/SXWPFExtractorTest.java | 9 ++++++
.../resources/test-documents/testEXCEL_charts.xlsb | Bin 0 -> 12835 bytes
.../resources/test-documents/testEXCEL_charts.xlsx | Bin 0 -> 31677 bytes
.../resources/test-documents/testPPT_charts.pptx | Bin 0 -> 34766 bytes
.../resources/test-documents/testWORD_charts.docx | Bin 0 -> 15586 bytes
16 files changed, 131 insertions(+), 5 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index f770182..7c2eac2 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
Release 1.15.1 - ??/??/????
+ * Extract text from charts in .docx, .pptx, .xlsx and .xlsb
+ (TIKA-2254).
+
* Extract text from diagrams in .docx, .pptx, .xlsx and .xlsb
(TIKA-1945).
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 67c5f15..0b3bbd6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -83,6 +83,8 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
static final String RELATION_MACRO = "http://schemas.microsoft.com/office/2006/relationships/vbaProject";
static final String RELATION_OFFICE_DOCUMENT = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument";
static final String RELATION_DIAGRAM_DATA = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/diagramData";
+ // TODO: once a chart relation is added to XWPFRelation, use that instead and remove this constant
+ static final String RELATION_CHART = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/chart";
private static final String TYPE_OLE_OBJECT =
"application/vnd.openxmlformats-officedocument.oleObject";
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
index c430f7d..c4afd00 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
@@ -32,10 +32,14 @@ import org.xml.sax.helpers.DefaultHandler;
*
* <p/>
*
- * This class does not check for namespaces, and it can be applied
+ * This class does not generally check for namespaces, and it can be applied
* to PPTX and DOCX for text extraction.
*
* <p/>
+ * This can be used to scrape content from charts. It currently ignores
+ * formula ({@code <c:f/>}) elements.
+ *
+ * <p/>
* This does not work with .xlsx or .vsdx.
*
* TODO: move this into POI?
@@ -80,6 +84,7 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
private final static String FALLBACK = "Fallback";
private final static String OLE_OBJECT = "OLEObject";
private final static String CR = "cr";
+ private final static String V = "v";
public final static String W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
private final static String MC_NS = "http://schemas.openxmlformats.org/markup-compatibility/2006";
@@ -87,6 +92,7 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
private final static String PIC_NS = "http://schemas.openxmlformats.org/drawingml/2006/picture";
private final static String DRAWING_MAIN_NS = "http://schemas.openxmlformats.org/drawingml/2006/main";
private final static String V_NS = "urn:schemas-microsoft-com:vml";
+ private final static String C_NS = "http://schemas.openxmlformats.org/drawingml/2006/chart";
private final static String OFFICE_DOC_RELATIONSHIP_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships";
@@ -144,6 +150,7 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
private boolean inDelText = false;
private boolean inHlinkClick = false;
private boolean inTextBox = false;
+ private boolean inV = false; //in c:v in chart file
private OOXMLWordAndPowerPointTextHandler.EditType editType = OOXMLWordAndPowerPointTextHandler.EditType.NONE;
@@ -319,6 +326,8 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
} else if (ENDNOTE_REFERENCE.equals(localName)) {
String id = atts.getValue(W_NS, "id");
bodyContentsHandler.endnoteReference(id);
+ } else if (V.equals(localName) && C_NS.equals(uri)) { // in value in a chart
+ inV = true;
}
}
@@ -405,6 +414,9 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
bodyContentsHandler.hyperlinkEnd();
} else if (PICT.equals(localName)) {
handlePict();
+ } else if (V.equals(localName) && C_NS.equals(uri)) { // leaving a value (c:v) in a chart
+ inV = false;
+ handleEndOfRun();
}
}
@@ -448,6 +460,9 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
runBuffer.append(ch, start, length);
} else if (bodyContentsHandler.getIncludeDeletedText() && editType.equals(EditType.DELETE)) {
runBuffer.append(ch, start, length);
+ } else if (inV) {
+ runBuffer.append(ch, start, length);
+ runBuffer.append(TAB_CHAR, 0, 1);
}
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
index 69526f9..47d0a07 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
@@ -221,6 +221,11 @@ public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
new OOXMLWordAndPowerPointTextHandler(
new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships));
+ handleGeneralTextContainingPart(XSLFRelation.CHART.getRelation(),
+ "chart", slidePart,
+ metadata,
+ new OOXMLWordAndPowerPointTextHandler(
+ new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships));
}
/**
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index 3525794..1f0eb77 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -157,7 +157,8 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
XWPFRelation.COMMENT.getRelation(),
XWPFRelation.FOOTER.getRelation(),
XWPFRelation.ENDNOTE.getRelation(),
- AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA
+ AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA,
+ AbstractOOXMLExtractor.RELATION_CHART
}) {
try {
PackageRelationshipCollection prc = documentPart.getRelationshipsByType(rel);
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
index 159ec00..35dba6d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
@@ -178,7 +178,17 @@ public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
new HashMap<String, String>()//empty
)
);
-
+ //now dump chart data
+ handleGeneralTextContainingPart(
+ XSLFRelation.CHART.getRelation(),
+ "chart",
+ slide.getPackagePart(),
+ metadata,
+ new OOXMLWordAndPowerPointTextHandler(
+ new OOXMLTikaBodyPartHandler(xhtml),
+ new HashMap<String, String>()//empty
+ )
+ );
}
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java
index 13eb6b4..14744d1 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java
@@ -19,6 +19,7 @@ package org.apache.tika.parser.microsoft.ooxml;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
+import java.util.HashMap;
import java.util.List;
import java.util.Locale;
@@ -191,7 +192,21 @@ public class XSSFBExcelExtractorDecorator extends XSSFExcelExtractorDecorator {
"diagram-data",
drawing.getPackagePart(),
metadata,
- xhtml
+ new OOXMLWordAndPowerPointTextHandler(
+ new OOXMLTikaBodyPartHandler(xhtml),
+ new HashMap<String, String>()//empty
+ )
+ );
+ //dump chart data
+ handleGeneralTextContainingPart(
+ XSSFRelation.CHART.getRelation(),
+ "chart",
+ drawing.getPackagePart(),
+ metadata,
+ new OOXMLWordAndPowerPointTextHandler(
+ new OOXMLTikaBodyPartHandler(xhtml),
+ new HashMap<String, String>()//empty
+ )
);
}
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index ccfaf6f..b554354 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -263,7 +263,21 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
"diagram-data",
drawing.getPackagePart(),
metadata,
- xhtml
+ new OOXMLWordAndPowerPointTextHandler(
+ new OOXMLTikaBodyPartHandler(xhtml),
+ new HashMap<String, String>()//empty
+ )
+ );
+ //dump chart data
+ handleGeneralTextContainingPart(
+ XSSFRelation.CHART.getRelation(),
+ "chart",
+ drawing.getPackagePart(),
+ metadata,
+ new OOXMLWordAndPowerPointTextHandler(
+ new OOXMLTikaBodyPartHandler(xhtml),
+ new HashMap<String, String>()//empty
+ )
);
}
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
index 7e658a3..e7893c8 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
@@ -130,6 +130,17 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
new HashMap<String, String>()//empty
)
);
+ //handle chart data
+ handleGeneralTextContainingPart(
+ AbstractOOXMLExtractor.RELATION_CHART,
+ "chart",
+ document.getPackagePart(),
+ metadata,
+ new OOXMLWordAndPowerPointTextHandler(
+ new OOXMLTikaBodyPartHandler(xhtml),
+ new HashMap<String, String>()//empty
+ )
+ );
// then all document footers
if (hfPolicy != null) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index ee66e5a..bb18e46 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -1580,6 +1580,38 @@ public class OOXMLParserTest extends TikaTest {
assertContains("President", getXML("testPPT_diagramData.pptx").xml);
}
+ @Test
+ public void testXLSXChartData() throws Exception {
+ String xml = getXML("testEXCEL_charts.xlsx").xml;
+ assertContains("peach", xml);
+ assertContains("March\tApril", xml);
+ assertNotContained("chartSpace", xml);
+ }
+
+ @Test
+ public void testXLSBChartData() throws Exception {
+ String xml = getXML("testEXCEL_charts.xlsb").xml;
+ assertContains("peach", xml);
+ assertContains("March\tApril", xml);
+ assertNotContained("chartSpace", xml);
+ }
+
+ @Test
+ public void testDOCXChartData() throws Exception {
+ String xml = getXML("testWORD_charts.docx").xml;
+ assertContains("peach", xml);
+ assertContains("March\tApril", xml);
+ assertNotContained("chartSpace", xml);
+ }
+
+ @Test
+ public void testPPTXChartData() throws Exception {
+ String xml = getXML("testPPT_charts.pptx").xml;
+ assertContains("peach", xml);
+ assertContains("March\tApril", xml);
+ assertNotContained("chartSpace", xml);
+ }
+
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
index 1de2b52..cad4913 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
@@ -571,4 +571,13 @@ public class SXSLFExtractorTest extends TikaTest {
public void testDiagramData() throws Exception {
assertContains("President", getXML("testPPT_diagramData.pptx", parseContext).xml);
}
+
+ @Test
+ public void testPPTXChartData() throws Exception {
+ String xml = getXML("testPPT_charts.pptx", parseContext).xml;
+ assertContains("peach", xml);
+ assertContains("March\tApril", xml);
+ assertNotContained("chartSpace", xml);
+ }
+
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index 8e7ab6d..6994a3a 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -801,4 +801,13 @@ public class SXWPFExtractorTest extends TikaTest {
public void testDiagramData() throws Exception {
assertContains("From here", getXML("testWORD_diagramData.docx", parseContext).xml);
}
+
+ @Test
+ public void testDOCXChartData() throws Exception {
+ String xml = getXML("testWORD_charts.docx", parseContext).xml;
+ assertContains("peach", xml);
+ assertContains("March\tApril", xml);
+ assertNotContained("chartSpace", xml);
+ }
+
}
diff --git a/tika-parsers/src/test/resources/test-documents/testEXCEL_charts.xlsb b/tika-parsers/src/test/resources/test-documents/testEXCEL_charts.xlsb
new file mode 100644
index 0000000..6d71c88
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEXCEL_charts.xlsb differ
diff --git a/tika-parsers/src/test/resources/test-documents/testEXCEL_charts.xlsx b/tika-parsers/src/test/resources/test-documents/testEXCEL_charts.xlsx
new file mode 100644
index 0000000..2c5d0df
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEXCEL_charts.xlsx differ
diff --git a/tika-parsers/src/test/resources/test-documents/testPPT_charts.pptx b/tika-parsers/src/test/resources/test-documents/testPPT_charts.pptx
new file mode 100644
index 0000000..af549b1
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPPT_charts.pptx differ
diff --git a/tika-parsers/src/test/resources/test-documents/testWORD_charts.docx b/tika-parsers/src/test/resources/test-documents/testWORD_charts.docx
new file mode 100644
index 0000000..7e9d985
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testWORD_charts.docx differ
--
To stop receiving notification emails like this one, please contact
['"commits@tika.apache.org" <co...@tika.apache.org>'].