You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2017/04/27 17:02:50 UTC

[tika] 02/02: TIKA-2346 OfficeParserConfig control extraction from shapes from DOCX

This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 0876aa909ffb77dfbd384ebe2f5de0a873ab489a
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Thu Apr 27 18:02:06 2017 +0100

    TIKA-2346 OfficeParserConfig control extraction from shapes from DOCX
---
 .../tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java     | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
index a9eb93f..39a72c6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
@@ -290,8 +290,10 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
         }
 
         // Also extract any paragraphs embedded in text boxes:
-        for (XmlObject embeddedParagraph : paragraph.getCTP().selectPath("declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare namespace wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' .//*/wps:txbx/w:txbxContent/w:p")) {
-            extractParagraph(new XWPFParagraph(CTP.Factory.parse(embeddedParagraph.xmlText()), paragraph.getBody()), listManager, xhtml);
+        if (config.getIncludeShapeBasedContent()) {
+            for (XmlObject embeddedParagraph : paragraph.getCTP().selectPath("declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare namespace wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' .//*/wps:txbx/w:txbxContent/w:p")) {
+                extractParagraph(new XWPFParagraph(CTP.Factory.parse(embeddedParagraph.xmlText()), paragraph.getBody()), listManager, xhtml);
+            }
         }
 
         // Finish this paragraph

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.