You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/02/12 15:29:04 UTC

[tika] branch branch_1x updated (856a90d -> fd7ec73)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from 856a90d  TIKA-2571 -- rethrow SecurityException
     new d9be32c  TIKA-2569 -- Extract text from grouped text boxes in PPT.
     new fd7ec73  Remove java 8 String.join

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 CHANGES.txt                                        |   2 +
 .../org/apache/tika/sax/StandardOrganizations.java |  15 ++++--
 .../tika/parser/microsoft/HSLFExtractor.java       |  33 +++++++++++++
 .../parser/microsoft/PowerPointParserTest.java     |  51 ++++++++++++++++++++-
 .../parser/microsoft/ooxml/OOXMLParserTest.java    |  26 +++++++++++
 .../parser/microsoft/ooxml/SXSLFExtractorTest.java |  27 +++++++++++
 .../resources/test-documents/testPPT_groups.ppt    | Bin 0 -> 161792 bytes
 .../resources/test-documents/testPPT_groups.pptx   | Bin 0 -> 59888 bytes
 8 files changed, 150 insertions(+), 4 deletions(-)
 create mode 100644 tika-parsers/src/test/resources/test-documents/testPPT_groups.ppt
 create mode 100644 tika-parsers/src/test/resources/test-documents/testPPT_groups.pptx

-- 
To stop receiving notification emails like this one, please contact
tallison@apache.org.

[tika] 02/02: Remove java 8 String.join

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit fd7ec73a19bfdfb3918faae28126ae6662bf1823
Author: tballison <ta...@mitre.org>
AuthorDate: Mon Feb 12 10:28:46 2018 -0500

    Remove java 8 String.join
---
 .../java/org/apache/tika/sax/StandardOrganizations.java   | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/sax/StandardOrganizations.java b/tika-core/src/main/java/org/apache/tika/sax/StandardOrganizations.java
index 39ace2d..cff186a 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/StandardOrganizations.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/StandardOrganizations.java
@@ -159,8 +159,17 @@ public class StandardOrganizations {
 	 * @return the regular expression containing the most important technical standard organizations.
 	 */
 	public static String getOrganzationsRegex() {
-		String regex = "(" + String.join("|", organizations.keySet()) + ")";
-		
-		return regex;
+		StringBuilder sb = new StringBuilder();
+		sb.append("(");
+		int i = 0;
+		for (String org : organizations.keySet()) {
+			if (i > 0) {
+				sb.append("|");
+			}
+			sb.append(org);
+			i++;
+		}
+		sb.append(")");
+		return sb.toString();
 	}
 }
\ No newline at end of file

-- 
To stop receiving notification emails like this one, please contact
tallison@apache.org.

[tika] 01/02: TIKA-2569 -- Extract text from grouped text boxes in PPT.

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit d9be32cfa28ea442013e81f1c60c61ab37a13ccf
Author: tballison <ta...@mitre.org>
AuthorDate: Fri Feb 9 13:27:12 2018 -0500

    TIKA-2569 -- Extract text from grouped text boxes in PPT.
---
 CHANGES.txt                                        |   2 +
 .../tika/parser/microsoft/HSLFExtractor.java       |  33 +++++++++++++
 .../parser/microsoft/PowerPointParserTest.java     |  51 ++++++++++++++++++++-
 .../parser/microsoft/ooxml/OOXMLParserTest.java    |  26 +++++++++++
 .../parser/microsoft/ooxml/SXSLFExtractorTest.java |  27 +++++++++++
 .../resources/test-documents/testPPT_groups.ppt    | Bin 0 -> 161792 bytes
 .../resources/test-documents/testPPT_groups.pptx   | Bin 0 -> 59888 bytes
 7 files changed, 138 insertions(+), 1 deletion(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index e4faabd..250d2e8 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 1.18 - ???
 
+   * Extract text from grouped text boxes in PPT (TIKA-2569).
+
    * Extract language metadata item from PDF files via Matt Sheppard (TIKA-2559)
 
    * RFC822 with multipart/mixed, first text element should be treated
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
index 1f5bad7..d0a1abe 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
@@ -18,6 +18,7 @@ package org.apache.tika.parser.microsoft;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.List;
 
@@ -30,15 +31,18 @@ import org.apache.poi.hslf.record.DocInfoListContainer;
 import org.apache.poi.hslf.record.RecordTypes;
 import org.apache.poi.hslf.record.VBAInfoAtom;
 import org.apache.poi.hslf.record.VBAInfoContainer;
+import org.apache.poi.hslf.usermodel.HSLFGroupShape;
 import org.apache.poi.hslf.usermodel.HSLFMasterSheet;
 import org.apache.poi.hslf.usermodel.HSLFNotes;
 import org.apache.poi.hslf.usermodel.HSLFObjectData;
 import org.apache.poi.hslf.usermodel.HSLFPictureData;
+import org.apache.poi.hslf.usermodel.HSLFPictureShape;
 import org.apache.poi.hslf.usermodel.HSLFShape;
 import org.apache.poi.hslf.usermodel.HSLFSlide;
 import org.apache.poi.hslf.usermodel.HSLFSlideShow;
 import org.apache.poi.hslf.usermodel.HSLFTable;
 import org.apache.poi.hslf.usermodel.HSLFTableCell;
+import org.apache.poi.hslf.usermodel.HSLFTextBox;
 import org.apache.poi.hslf.usermodel.HSLFTextParagraph;
 import org.apache.poi.hslf.usermodel.HSLFTextRun;
 import org.apache.poi.hslf.usermodel.HSLFTextShape;
@@ -117,6 +121,8 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
                 }
             }
 
+            extractGroupText(xhtml, slide.getShapes(), 0);
+
             // Slide footer, if present
             if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) {
                 xhtml.startElement("p", "class", "slide-footer");
@@ -218,6 +224,33 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
         xhtml.endElement("div");
     }
 
+    //Extract any text that's within an HSLFTextShape that's a descendant of
+    //an HSLFGroupShape.
+    private void extractGroupText(XHTMLContentHandler xhtml, List<HSLFShape> shapes, int depth) throws SAXException {
+
+        if (shapes == null) {
+            return;
+        }
+
+        //Only process text shapes at depth > 0 (i.e. inside a group); shapes at depth 0
+        //should already have been included via slide.getTextParagraphs above.
+
+        //However, cells are considered grouped within their table, so ignore them.
+        //I don't believe that cells can be inside a text box or any other
+        //grouped, text-containing object, so always ignore them.
+        List<List<HSLFTextParagraph>> paragraphList = new ArrayList<>();
+        for (HSLFShape shape : shapes) {
+            if (shape instanceof HSLFGroupShape) {
+                //work recursively, HSLFGroupShape can contain HSLFGroupShape
+                extractGroupText(xhtml, ((HSLFGroupShape)shape).getShapes(), depth+1);
+            } else if (shape instanceof HSLFTextShape
+                    && ! (shape instanceof HSLFTableCell) && depth > 0) {
+                paragraphList.add(((HSLFTextShape)shape).getTextParagraphs());
+            }
+        }
+        textRunsToText(xhtml, paragraphList);
+    }
+
     private void extractMacros(HSLFSlideShow ppt, XHTMLContentHandler xhtml) {
 
         //get macro persist id
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
index d2fb110..e4e4332 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
@@ -31,6 +31,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Ignore;
 import org.junit.Test;
 import org.xml.sax.ContentHandler;
 
@@ -64,7 +65,7 @@ public class PowerPointParserTest extends TikaTest {
         assertContains("<p>[1] This is a footnote.", xml);
         assertContains("<p>This is the header text.</p>", xml);
         assertContains("<p>This is the footer text.</p>", xml);
-        assertContains("<p>Here is a text box</p>", xml);
+        assertContainsCount("<p>Here is a text box</p>", xml, 1);
         assertContains("<p>Bold ", xml);
         assertContains("italic underline superscript subscript", xml);
         assertContains("underline", xml);
@@ -293,4 +294,52 @@ public class PowerPointParserTest extends TikaTest {
     public void testEncrypted() throws Exception {
         getXML("testPPT_protected_passtika.ppt");
     }
+
+    @Test
+    public void testGroups() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testPPT_groups.ppt");
+        assertEquals(3, metadataList.size());
+        String content =  metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
+        //this tests that we're ignoring text shapes at depth=0
+        //i.e. POI has already included them in the slide's getTextParagraphs()
+        assertContainsCount("Text box1", content, 1);
+
+
+        //the WordArt and text box count tests will fail
+        //if this content is available via getTextParagraphs() of the slide in POI
+        //i.e. when POI is fixed, these tests will fail, and
+        //we'll have to remove the workaround in HSLFExtractor's extractGroupText(...)
+        assertContainsCount("WordArt1", content, 1);
+        assertContainsCount("WordArt2", content, 1);
+        assertContainsCount("Ungrouped text box", content, 1);//should only be 1
+        assertContains("Text box2", content);
+        assertContains("Text box3", content);
+        assertContains("Text box4", content);
+        assertContains("Text box5", content);
+
+        //see below -- need to extract hyperlinks
+        assertContains("tika", content);
+        assertContains("MyTitle", content);
+
+        assertEquals("/embedded-1",
+                metadataList.get(1).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
+
+        assertEquals("/embedded-2",
+                metadataList.get(2).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
+
+    }
+
+    @Ignore("until we add smart text extraction")
+    @Test
+    public void testSmartArtText() throws Exception {
+        String content = getXML("testPPT_groups.ppt").xml;
+        assertContains("smart1", content);
+    }
+
+    @Ignore("until we fix hyperlink extraction from text boxes")
+    @Test
+    public void testHyperlinksInTextBoxes() throws Exception {
+        String content = getXML("testPPT_groups.ppt").xml;
+        assertContains("href=\"http://tika.apache.org", content);
+    }
 }
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 847d347..3cd5c65 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -1741,6 +1741,32 @@ public class OOXMLParserTest extends TikaTest {
     }
 
     @Test
+    public void testPPTXGroups() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testPPT_groups.pptx");
+        assertEquals(3, metadataList.size());
+        String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
+        assertContains("WordArt1", content);
+        assertContains("WordArt2", content);
+        assertContainsCount("Ungrouped text box", content, 1);//should only be 1
+        assertContains("Text box1", content);
+        assertContains("Text box2", content);
+        assertContains("Text box3", content);
+        assertContains("Text box4", content);
+        assertContains("Text box5", content);
+
+
+        assertContains("href=\"http://tika.apache.org", content);
+        assertContains("smart1", content);
+        assertContains("MyTitle", content);
+
+        assertEquals("/image1.jpg",
+                metadataList.get(1).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
+
+        assertEquals("/thumbnail.jpeg",
+                metadataList.get(2).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
+    }
+
+    @Test
     public void testXLSXPhoneticStrings() throws Exception {
         //This unit test and test file come from Apache POI 51519.xlsx
 
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
index 8385263..cb935c5 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
@@ -591,4 +591,31 @@ public class SXSLFExtractorTest extends TikaTest {
         assertEquals("image/jpeg", metadataList.get(3).get(Metadata.CONTENT_TYPE));
 
     }
+
+    @Test
+    public void testPPTXGroups() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testPPT_groups.pptx", parseContext);
+        assertEquals(3, metadataList.size());
+        String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
+        assertContains("WordArt1", content);
+        assertContains("WordArt2", content);
+        assertContainsCount("Ungrouped text box", content, 1);//should only be 1
+        assertContains("Text box1", content);
+        assertContains("Text box2", content);
+        assertContains("Text box3", content);
+        assertContains("Text box4", content);
+        assertContains("Text box5", content);
+
+
+        assertContains("href=\"http://tika.apache.org", content);
+        assertContains("smart1", content);
+        assertContains("MyTitle", content);
+
+        assertEquals("/image1.jpg",
+                metadataList.get(1).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
+
+        assertEquals("/thumbnail.jpeg",
+                metadataList.get(2).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
+    }
+
 }
diff --git a/tika-parsers/src/test/resources/test-documents/testPPT_groups.ppt b/tika-parsers/src/test/resources/test-documents/testPPT_groups.ppt
new file mode 100644
index 0000000..7690495
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPPT_groups.ppt differ
diff --git a/tika-parsers/src/test/resources/test-documents/testPPT_groups.pptx b/tika-parsers/src/test/resources/test-documents/testPPT_groups.pptx
new file mode 100644
index 0000000..b2fd0ed
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPPT_groups.pptx differ

-- 
To stop receiving notification emails like this one, please contact
tallison@apache.org.