You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/02/09 18:27:22 UTC
[tika] branch master updated: TIKA-2569 -- Extract text from
grouped text boxes in PPT.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 4c510d6 TIKA-2569 -- Extract text from grouped text boxes in PPT.
4c510d6 is described below
commit 4c510d6a9910044825c6ee8df87c419a3370ab4e
Author: tballison <ta...@mitre.org>
AuthorDate: Fri Feb 9 13:27:12 2018 -0500
TIKA-2569 -- Extract text from grouped text boxes in PPT.
---
CHANGES.txt | 2 +
.../tika/parser/microsoft/HSLFExtractor.java | 33 +++++++++++++
.../parser/microsoft/PowerPointParserTest.java | 51 ++++++++++++++++++++-
.../parser/microsoft/ooxml/OOXMLParserTest.java | 26 +++++++++++
.../parser/microsoft/ooxml/SXSLFExtractorTest.java | 27 +++++++++++
.../resources/test-documents/testPPT_groups.ppt | Bin 0 -> 161792 bytes
.../resources/test-documents/testPPT_groups.pptx | Bin 0 -> 59888 bytes
7 files changed, 138 insertions(+), 1 deletion(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 1fe0948..9f6eabf 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -5,6 +5,8 @@ Release 2.0.0 - ???
Other changes
+ * Extract text from grouped text boxes in PPT (TIKA-2569).
+
* Extract language metadata item from PDF files via Matt Sheppard (TIKA-2559)
* RFC822 with multipart/mixed, first text element should be treated
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
index 1f5bad7..d0a1abe 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
@@ -18,6 +18,7 @@ package org.apache.tika.parser.microsoft;
import java.io.IOException;
import java.io.InputStream;
+import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
@@ -30,15 +31,18 @@ import org.apache.poi.hslf.record.DocInfoListContainer;
import org.apache.poi.hslf.record.RecordTypes;
import org.apache.poi.hslf.record.VBAInfoAtom;
import org.apache.poi.hslf.record.VBAInfoContainer;
+import org.apache.poi.hslf.usermodel.HSLFGroupShape;
import org.apache.poi.hslf.usermodel.HSLFMasterSheet;
import org.apache.poi.hslf.usermodel.HSLFNotes;
import org.apache.poi.hslf.usermodel.HSLFObjectData;
import org.apache.poi.hslf.usermodel.HSLFPictureData;
+import org.apache.poi.hslf.usermodel.HSLFPictureShape;
import org.apache.poi.hslf.usermodel.HSLFShape;
import org.apache.poi.hslf.usermodel.HSLFSlide;
import org.apache.poi.hslf.usermodel.HSLFSlideShow;
import org.apache.poi.hslf.usermodel.HSLFTable;
import org.apache.poi.hslf.usermodel.HSLFTableCell;
+import org.apache.poi.hslf.usermodel.HSLFTextBox;
import org.apache.poi.hslf.usermodel.HSLFTextParagraph;
import org.apache.poi.hslf.usermodel.HSLFTextRun;
import org.apache.poi.hslf.usermodel.HSLFTextShape;
@@ -117,6 +121,8 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
}
}
+ extractGroupText(xhtml, slide.getShapes(), 0);
+
// Slide footer, if present
if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) {
xhtml.startElement("p", "class", "slide-footer");
@@ -218,6 +224,33 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
xhtml.endElement("div");
}
+ //Extract any text that's within an HSLFTextShape that's a descendant of
+ //an HSLFGroupShape.
+ private void extractGroupText(XHTMLContentHandler xhtml, List<HSLFShape> shapes, int depth) throws SAXException {
+
+ if (shapes == null) {
+ return;
+ }
+
+ //Only process items with depth > 0; items at depth 0 should already have
+ //been included via slide.getTextParagraphs above.
+
+ //However, table cells are considered grouped within their table, so ignore them.
+ //I don't believe that cells can be inside a text box or other
+ //grouped text-containing object, so always ignore them.
+ List<List<HSLFTextParagraph>> paragraphList = new ArrayList<>();
+ for (HSLFShape shape : shapes) {
+ if (shape instanceof HSLFGroupShape) {
+ //work recursively, HSLFGroupShape can contain HSLFGroupShape
+ extractGroupText(xhtml, ((HSLFGroupShape)shape).getShapes(), depth+1);
+ } else if (shape instanceof HSLFTextShape
+ && ! (shape instanceof HSLFTableCell) && depth > 0) {
+ paragraphList.add(((HSLFTextShape)shape).getTextParagraphs());
+ }
+ }
+ textRunsToText(xhtml, paragraphList);
+ }
+
private void extractMacros(HSLFSlideShow ppt, XHTMLContentHandler xhtml) {
//get macro persist id
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
index 68df743..4ad6bd7 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
@@ -32,6 +32,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Ignore;
import org.junit.Test;
import org.xml.sax.ContentHandler;
@@ -64,7 +65,7 @@ public class PowerPointParserTest extends TikaTest {
assertContains("<p>[1] This is a footnote.", xml);
assertContains("<p>This is the header text.</p>", xml);
assertContains("<p>This is the footer text.</p>", xml);
- assertContains("<p>Here is a text box</p>", xml);
+ assertContainsCount("<p>Here is a text box</p>", xml, 1);
assertContains("<p>Bold ", xml);
assertContains("italic underline superscript subscript", xml);
assertContains("underline", xml);
@@ -294,4 +295,52 @@ public class PowerPointParserTest extends TikaTest {
public void testEncrypted() throws Exception {
getXML("testPPT_protected_passtika.ppt");
}
+
+ @Test
+ public void testGroups() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata("testPPT_groups.ppt");
+ assertEquals(3, metadataList.size());
+ String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
+ //this tests that we're ignoring text shapes at depth=0
+ //i.e. POI has already included them in the slide's getTextParagraphs()
+ assertContainsCount("Text box1", content, 1);
+
+
+ //the WordArt and text box count tests will fail
+ //if this content becomes available via getTextParagraphs() of the slide in POI,
+ //i.e. when POI is fixed, the text will be extracted twice, these tests will fail, and
+ //we'll have to remove the workaround in HSLFExtractor's extractGroupText(...)
+ assertContainsCount("WordArt1", content, 1);
+ assertContainsCount("WordArt2", content, 1);
+ assertContainsCount("Ungrouped text box", content, 1);//should only be 1
+ assertContains("Text box2", content);
+ assertContains("Text box3", content);
+ assertContains("Text box4", content);
+ assertContains("Text box5", content);
+
+ //see testHyperlinksInTextBoxes below -- still need to extract hyperlinks from text boxes
+ assertContains("tika", content);
+ assertContains("MyTitle", content);
+
+ assertEquals("/embedded-1",
+ metadataList.get(1).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
+
+ assertEquals("/embedded-2",
+ metadataList.get(2).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
+
+ }
+
+ @Ignore("until we add smart text extraction")
+ @Test
+ public void testSmartArtText() throws Exception {
+ String content = getXML("testPPT_groups.ppt").xml;
+ assertContains("smart1", content);
+ }
+
+ @Ignore("until we fix hyperlink extraction from text boxes")
+ @Test
+ public void testHyperlinksInTextBoxes() throws Exception {
+ String content = getXML("testPPT_groups.ppt").xml;
+ assertContains("href=\"http://tika.apache.org", content);
+ }
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 8529892..f860e0a 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -1729,6 +1729,32 @@ public class OOXMLParserTest extends TikaTest {
}
@Test
+ public void testPPTXGroups() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata("testPPT_groups.pptx");
+ assertEquals(3, metadataList.size());
+ String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
+ assertContains("WordArt1", content);
+ assertContains("WordArt2", content);
+ assertContainsCount("Ungrouped text box", content, 1);//should only be 1
+ assertContains("Text box1", content);
+ assertContains("Text box2", content);
+ assertContains("Text box3", content);
+ assertContains("Text box4", content);
+ assertContains("Text box5", content);
+
+
+ assertContains("href=\"http://tika.apache.org", content);
+ assertContains("smart1", content);
+ assertContains("MyTitle", content);
+
+ assertEquals("/image1.jpg",
+ metadataList.get(1).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
+
+ assertEquals("/thumbnail.jpeg",
+ metadataList.get(2).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
+ }
+
+ @Test
public void testXLSXPhoneticStrings() throws Exception {
//This unit test and test file come from Apache POI 51519.xlsx
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
index 1da0864..859cdfb 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
@@ -582,4 +582,31 @@ public class SXSLFExtractorTest extends TikaTest {
assertEquals("image/jpeg", metadataList.get(3).get(Metadata.CONTENT_TYPE));
}
+
+ @Test
+ public void testPPTXGroups() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata("testPPT_groups.pptx", parseContext);
+ assertEquals(3, metadataList.size());
+ String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
+ assertContains("WordArt1", content);
+ assertContains("WordArt2", content);
+ assertContainsCount("Ungrouped text box", content, 1);//should only be 1
+ assertContains("Text box1", content);
+ assertContains("Text box2", content);
+ assertContains("Text box3", content);
+ assertContains("Text box4", content);
+ assertContains("Text box5", content);
+
+
+ assertContains("href=\"http://tika.apache.org", content);
+ assertContains("smart1", content);
+ assertContains("MyTitle", content);
+
+ assertEquals("/image1.jpg",
+ metadataList.get(1).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
+
+ assertEquals("/thumbnail.jpeg",
+ metadataList.get(2).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
+ }
+
}
diff --git a/tika-parsers/src/test/resources/test-documents/testPPT_groups.ppt b/tika-parsers/src/test/resources/test-documents/testPPT_groups.ppt
new file mode 100644
index 0000000..7690495
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPPT_groups.ppt differ
diff --git a/tika-parsers/src/test/resources/test-documents/testPPT_groups.pptx b/tika-parsers/src/test/resources/test-documents/testPPT_groups.pptx
new file mode 100644
index 0000000..b2fd0ed
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPPT_groups.pptx differ
--
To stop receiving notification emails like this one, please contact
tallison@apache.org.