You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/06/09 01:03:57 UTC

[tika] branch master updated: TIKA-2254 -- extract text from charts in ooxml.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new d2820ce  TIKA-2254 -- extract text from charts in ooxml.
d2820ce is described below

commit d2820ce62545c847a2d3e79b7b4b8a3f2022a619
Author: tballison <ta...@mitre.org>
AuthorDate: Thu Jun 8 21:03:48 2017 -0400

    TIKA-2254 -- extract text from charts in ooxml.
---
 CHANGES.txt                                        |   3 ++
 .../microsoft/ooxml/AbstractOOXMLExtractor.java    |   2 ++
 .../ooxml/OOXMLWordAndPowerPointTextHandler.java   |  17 ++++++++++-
 .../ooxml/SXSLFPowerPointExtractorDecorator.java   |   5 ++++
 .../ooxml/SXWPFWordExtractorDecorator.java         |   3 +-
 .../ooxml/XSLFPowerPointExtractorDecorator.java    |  12 +++++++-
 .../ooxml/XSSFBExcelExtractorDecorator.java        |  17 ++++++++++-
 .../ooxml/XSSFExcelExtractorDecorator.java         |  16 ++++++++++-
 .../ooxml/XWPFWordExtractorDecorator.java          |  11 +++++++
 .../parser/microsoft/ooxml/OOXMLParserTest.java    |  32 +++++++++++++++++++++
 .../parser/microsoft/ooxml/SXSLFExtractorTest.java |   9 ++++++
 .../parser/microsoft/ooxml/SXWPFExtractorTest.java |   9 ++++++
 .../resources/test-documents/testEXCEL_charts.xlsb | Bin 0 -> 12835 bytes
 .../resources/test-documents/testEXCEL_charts.xlsx | Bin 0 -> 31677 bytes
 .../resources/test-documents/testPPT_charts.pptx   | Bin 0 -> 34766 bytes
 .../resources/test-documents/testWORD_charts.docx  | Bin 0 -> 15586 bytes
 16 files changed, 131 insertions(+), 5 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index f770182..7c2eac2 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 1.15.1 - ??/??/????
 
+  * Extract text from charts in .docx, .pptx, .xlsx and .xlsb
+    (TIKA-2254).
+
   * Extract text from diagrams in .docx, .pptx, .xlsx and .xlsb
     (TIKA-1945).
 
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 67c5f15..0b3bbd6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -83,6 +83,8 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
     static final String RELATION_MACRO = "http://schemas.microsoft.com/office/2006/relationships/vbaProject";
     static final String RELATION_OFFICE_DOCUMENT = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument";
     static final String RELATION_DIAGRAM_DATA = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/diagramData";
+    //once we add this to XWPFRelation, we should swap that out and remove this
+    static final String RELATION_CHART = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/chart";
 
     private static final String TYPE_OLE_OBJECT =
             "application/vnd.openxmlformats-officedocument.oleObject";
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
index c430f7d..c4afd00 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
@@ -32,10 +32,14 @@ import org.xml.sax.helpers.DefaultHandler;
  *
  * <p/>
  *
- * This class does not check for namespaces, and it can be applied
+ * This class does not generally check for namespaces, and it can be applied
  * to PPTX and DOCX for text extraction.
  *
  * <p/>
+ * This can be used to scrape content from charts.  It currently ignores
+ * formula (&lt;c:f/&gt;) elements.
+ *
+ * <p/>
  * This does not work with .xlsx or .vsdx.
  *
  * TODO: move this into POI?
@@ -80,6 +84,7 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
     private final static String FALLBACK = "Fallback";
     private final static String OLE_OBJECT = "OLEObject";
     private final static String CR = "cr";
+    private final static String V = "v";
 
     public final static String W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
     private final static String MC_NS = "http://schemas.openxmlformats.org/markup-compatibility/2006";
@@ -87,6 +92,7 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
     private final static String PIC_NS = "http://schemas.openxmlformats.org/drawingml/2006/picture";
     private final static String DRAWING_MAIN_NS = "http://schemas.openxmlformats.org/drawingml/2006/main";
     private final static String V_NS = "urn:schemas-microsoft-com:vml";
+    private final static String C_NS = "http://schemas.openxmlformats.org/drawingml/2006/chart";
 
     private final static String OFFICE_DOC_RELATIONSHIP_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships";
 
@@ -144,6 +150,7 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
     private boolean inDelText = false;
     private boolean inHlinkClick = false;
     private boolean inTextBox = false;
+    private boolean inV = false; //in c:v in chart file
 
     private OOXMLWordAndPowerPointTextHandler.EditType editType = OOXMLWordAndPowerPointTextHandler.EditType.NONE;
 
@@ -319,6 +326,8 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
         } else if (ENDNOTE_REFERENCE.equals(localName)) {
             String id = atts.getValue(W_NS, "id");
             bodyContentsHandler.endnoteReference(id);
+        } else if (V.equals(localName) && C_NS.equals(uri)) { // in value in a chart
+            inV = true;
         }
 
     }
@@ -405,6 +414,9 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
             bodyContentsHandler.hyperlinkEnd();
         } else if (PICT.equals(localName)) {
             handlePict();
+        } else if (V.equals(localName) && C_NS.equals(uri)) { // leaving a value in a chart
+            inV = false;
+            handleEndOfRun();
         }
     }
 
@@ -448,6 +460,9 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
             runBuffer.append(ch, start, length);
         } else if (bodyContentsHandler.getIncludeDeletedText() && editType.equals(EditType.DELETE)) {
             runBuffer.append(ch, start, length);
+        } else if (inV) {
+            runBuffer.append(ch, start, length);
+            runBuffer.append(TAB_CHAR, 0, 1);
         }
     }
 
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
index 69526f9..47d0a07 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
@@ -221,6 +221,11 @@ public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
                 new OOXMLWordAndPowerPointTextHandler(
                         new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships));
 
+        handleGeneralTextContainingPart(XSLFRelation.CHART.getRelation(),
+                "chart", slidePart,
+                metadata,
+                new OOXMLWordAndPowerPointTextHandler(
+                        new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships));
     }
 
     /**
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index 3525794..1f0eb77 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -157,7 +157,8 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
                 XWPFRelation.COMMENT.getRelation(),
                 XWPFRelation.FOOTER.getRelation(),
                 XWPFRelation.ENDNOTE.getRelation(),
-                AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA
+                AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA,
+                AbstractOOXMLExtractor.RELATION_CHART
         }) {
             try {
                 PackageRelationshipCollection prc = documentPart.getRelationshipsByType(rel);
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
index 159ec00..35dba6d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
@@ -178,7 +178,17 @@ public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
                             new HashMap<String, String>()//empty
                     )
             );
-
+            //now dump chart data
+            handleGeneralTextContainingPart(
+                    XSLFRelation.CHART.getRelation(),
+                    "chart",
+                    slide.getPackagePart(),
+                    metadata,
+                    new OOXMLWordAndPowerPointTextHandler(
+                            new OOXMLTikaBodyPartHandler(xhtml),
+                            new HashMap<String, String>()//empty
+                    )
+            );
         }
     }
 
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java
index 13eb6b4..14744d1 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java
@@ -19,6 +19,7 @@ package org.apache.tika.parser.microsoft.ooxml;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
 import java.util.Locale;
 
@@ -191,7 +192,21 @@ public class XSSFBExcelExtractorDecorator extends XSSFExcelExtractorDecorator {
                         "diagram-data",
                         drawing.getPackagePart(),
                         metadata,
-                        xhtml
+                        new OOXMLWordAndPowerPointTextHandler(
+                                new OOXMLTikaBodyPartHandler(xhtml),
+                                new HashMap<String, String>()//empty
+                        )
+                );
+                //dump chart data
+                handleGeneralTextContainingPart(
+                        XSSFRelation.CHART.getRelation(),
+                        "chart",
+                        drawing.getPackagePart(),
+                        metadata,
+                        new OOXMLWordAndPowerPointTextHandler(
+                                new OOXMLTikaBodyPartHandler(xhtml),
+                                new HashMap<String, String>()//empty
+                        )
                 );
             }
         }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index ccfaf6f..b554354 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -263,7 +263,21 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
                         "diagram-data",
                         drawing.getPackagePart(),
                         metadata,
-                        xhtml
+                        new OOXMLWordAndPowerPointTextHandler(
+                                new OOXMLTikaBodyPartHandler(xhtml),
+                                new HashMap<String, String>()//empty
+                        )
+                );
+                //dump chart data
+                handleGeneralTextContainingPart(
+                        XSSFRelation.CHART.getRelation(),
+                        "chart",
+                        drawing.getPackagePart(),
+                        metadata,
+                        new OOXMLWordAndPowerPointTextHandler(
+                                new OOXMLTikaBodyPartHandler(xhtml),
+                                new HashMap<String, String>()//empty
+                        )
                 );
             }
         }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
index 7e658a3..e7893c8 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
@@ -130,6 +130,17 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
                         new HashMap<String, String>()//empty
                 )
         );
+        //handle chart data
+        handleGeneralTextContainingPart(
+                AbstractOOXMLExtractor.RELATION_CHART,
+                "chart",
+                document.getPackagePart(),
+                metadata,
+                new OOXMLWordAndPowerPointTextHandler(
+                        new OOXMLTikaBodyPartHandler(xhtml),
+                        new HashMap<String, String>()//empty
+                )
+        );
 
         // then all document footers
         if (hfPolicy != null) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index ee66e5a..bb18e46 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -1580,6 +1580,38 @@ public class OOXMLParserTest extends TikaTest {
         assertContains("President", getXML("testPPT_diagramData.pptx").xml);
     }
 
+    @Test
+    public void testXLSXChartData() throws Exception {
+        String xml = getXML("testEXCEL_charts.xlsx").xml;
+        assertContains("peach", xml);
+        assertContains("March\tApril", xml);
+        assertNotContained("chartSpace", xml);
+    }
+
+    @Test
+    public void testXLSBChartData() throws Exception {
+        String xml = getXML("testEXCEL_charts.xlsb").xml;
+        assertContains("peach", xml);
+        assertContains("March\tApril", xml);
+        assertNotContained("chartSpace", xml);
+    }
+
+    @Test
+    public void testDOCXChartData() throws Exception {
+        String xml = getXML("testWORD_charts.docx").xml;
+        assertContains("peach", xml);
+        assertContains("March\tApril", xml);
+        assertNotContained("chartSpace", xml);
+    }
+
+    @Test
+    public void testPPTXChartData() throws Exception {
+        String xml = getXML("testPPT_charts.pptx").xml;
+        assertContains("peach", xml);
+        assertContains("March\tApril", xml);
+        assertNotContained("chartSpace", xml);
+    }
+
 }
 
 
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
index 1de2b52..cad4913 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
@@ -571,4 +571,13 @@ public class SXSLFExtractorTest extends TikaTest {
     public void testDiagramData() throws Exception {
         assertContains("President", getXML("testPPT_diagramData.pptx", parseContext).xml);
     }
+
+    @Test
+    public void testPPTXChartData() throws Exception {
+        String xml = getXML("testPPT_charts.pptx", parseContext).xml;
+        assertContains("peach", xml);
+        assertContains("March\tApril", xml);
+        assertNotContained("chartSpace", xml);
+    }
+
 }
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index 8e7ab6d..6994a3a 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -801,4 +801,13 @@ public class SXWPFExtractorTest extends TikaTest {
     public void testDiagramData() throws Exception {
         assertContains("From here", getXML("testWORD_diagramData.docx", parseContext).xml);
     }
+
+    @Test
+    public void testDOCXChartData() throws Exception {
+        String xml = getXML("testWORD_charts.docx", parseContext).xml;
+        assertContains("peach", xml);
+        assertContains("March\tApril", xml);
+        assertNotContained("chartSpace", xml);
+    }
+
 }
diff --git a/tika-parsers/src/test/resources/test-documents/testEXCEL_charts.xlsb b/tika-parsers/src/test/resources/test-documents/testEXCEL_charts.xlsb
new file mode 100644
index 0000000..6d71c88
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEXCEL_charts.xlsb differ
diff --git a/tika-parsers/src/test/resources/test-documents/testEXCEL_charts.xlsx b/tika-parsers/src/test/resources/test-documents/testEXCEL_charts.xlsx
new file mode 100644
index 0000000..2c5d0df
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEXCEL_charts.xlsx differ
diff --git a/tika-parsers/src/test/resources/test-documents/testPPT_charts.pptx b/tika-parsers/src/test/resources/test-documents/testPPT_charts.pptx
new file mode 100644
index 0000000..af549b1
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPPT_charts.pptx differ
diff --git a/tika-parsers/src/test/resources/test-documents/testWORD_charts.docx b/tika-parsers/src/test/resources/test-documents/testWORD_charts.docx
new file mode 100644
index 0000000..7e9d985
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testWORD_charts.docx differ

-- 
To stop receiving notification emails like this one, please contact
['"commits@tika.apache.org" <co...@tika.apache.org>'].