You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/01/07 16:07:38 UTC
[tika] branch master updated: TIKA-2807 -- extract sdt content from within textbox in docx
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 06cf66c TIKA-2807 -- extract sdt content from within textbox in docx
06cf66c is described below
commit 06cf66cef14863fee0111dddefaebaa051a40c72
Author: TALLISON <ta...@apache.org>
AuthorDate: Mon Jan 7 11:07:25 2019 -0500
TIKA-2807 -- extract sdt content from within textbox in docx
---
CHANGES.txt | 1 +
.../microsoft/ooxml/XWPFWordExtractorDecorator.java | 6 ++++--
.../tika/parser/microsoft/ooxml/OOXMLParserTest.java | 9 +++++++++
.../tika/parser/microsoft/ooxml/SXWPFExtractorTest.java | 7 +++++++
.../resources/test-documents/testWORD_sdtInTextBox.docx | Bin 0 -> 21117 bytes
5 files changed, 21 insertions(+), 2 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 376a109..48107a1 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -7,6 +7,7 @@ Release 2.0.0 - ???
Release 1.21 - ????
+ * Extract text from SDT element within textboxes in .docx files (TIKA-2807).
* Try to handle truncated OOXML files more robustly (TIKA-2765).
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
index 25c5a7c..511c805 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
@@ -328,9 +328,11 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
xhtml.characters(footnameText + "\n");
}
- // Also extract any paragraphs embedded in text boxes:
+ // Also extract any paragraphs embedded in text boxes.
+ // Note: the XPath uses "w:txbxContent//w:p" so that all descendant paragraphs are matched,
+ // not just the immediate children of txbxContent -- TIKA-2807
if (config.getIncludeShapeBasedContent()) {
- for (XmlObject embeddedParagraph : paragraph.getCTP().selectPath("declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare namespace wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' .//*/wps:txbx/w:txbxContent/w:p")) {
+ for (XmlObject embeddedParagraph : paragraph.getCTP().selectPath("declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare namespace wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' .//*/wps:txbx/w:txbxContent//w:p")) {
extractParagraph(new XWPFParagraph(CTP.Factory.parse(embeddedParagraph.xmlText()), paragraph.getBody()), listManager, xhtml);
}
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 38e3581..6896f04 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -1058,6 +1058,15 @@ public class OOXMLParserTest extends TikaTest {
assertContains("This text is inside of a text box in the footer of the document.", xml);
}
+ //TIKA-2807
+ @Test
+ public void testSDTInTextBox() throws Exception {
+ String xml = getXML("testWORD_sdtInTextBox.docx").xml;
+ System.out.println(xml);
+ assertContains("rich-text-content-control_inside-text-box", xml);
+ assertContainsCount("inside-text", xml, 1);
+ }
+
//TIKA-2346
@Test
public void testTurningOffTextBoxExtraction() throws Exception {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index 35f5716..9eeb2de 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -860,5 +860,12 @@ public class SXWPFExtractorTest extends TikaTest {
assertNotContained("unde ", txt);
}
+ //TIKA-2807
+ @Test
+ public void testSDTInTextBox() throws Exception {
+ String xml = getXML("testWORD_sdtInTextBox.docx", parseContext).xml;
+ assertContains("rich-text-content-control_inside-text-box", xml);
+ assertContainsCount("inside-text", xml, 1);
+ }
}
diff --git a/tika-parsers/src/test/resources/test-documents/testWORD_sdtInTextBox.docx b/tika-parsers/src/test/resources/test-documents/testWORD_sdtInTextBox.docx
new file mode 100644
index 0000000..2e66d23
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testWORD_sdtInTextBox.docx differ