You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2017/04/27 17:02:48 UTC

[tika] branch master updated (d77fb59 -> 0876aa9)

This is an automated email from the ASF dual-hosted git repository.

nick pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git.

      from  d77fb59   Merge branch 'master' of https://github.com/apache/tika
       new  aa4954f   TIKA-2346 Add OfficeParserConfig support to control extraction from shapes from non-shape-based formats
       new  0876aa9   TIKA-2346 OfficeParserConfig control extraction from shapes from DOCX

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "adds" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../tika/parser/microsoft/AbstractOfficeParser.java   |  5 +++++
 .../tika/parser/microsoft/OfficeParserConfig.java     | 19 +++++++++++++++++++
 .../microsoft/ooxml/AbstractOOXMLExtractor.java       |  5 +++++
 .../parser/microsoft/ooxml/OOXMLExtractorFactory.java |  5 ++---
 .../microsoft/ooxml/POIXMLTextExtractorDecorator.java |  5 +++++
 .../microsoft/ooxml/SXWPFWordExtractorDecorator.java  |  3 +--
 .../microsoft/ooxml/XSSFExcelExtractorDecorator.java  |  9 +++++++--
 .../microsoft/ooxml/XWPFWordExtractorDecorator.java   |  6 ++++--
 8 files changed, 48 insertions(+), 9 deletions(-)

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.

[tika] 01/02: TIKA-2346 Add OfficeParserConfig support to control extraction from shapes from non-shape-based formats

Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit aa4954fb44f707779693faea785acc219739ccd5
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Thu Apr 27 17:58:35 2017 +0100

    TIKA-2346 Add OfficeParserConfig support to control extraction from shapes from non-shape-based formats
---
 .../tika/parser/microsoft/AbstractOfficeParser.java   |  5 +++++
 .../tika/parser/microsoft/OfficeParserConfig.java     | 19 +++++++++++++++++++
 .../microsoft/ooxml/AbstractOOXMLExtractor.java       |  5 +++++
 .../parser/microsoft/ooxml/OOXMLExtractorFactory.java |  5 ++---
 .../microsoft/ooxml/POIXMLTextExtractorDecorator.java |  5 +++++
 .../microsoft/ooxml/SXWPFWordExtractorDecorator.java  |  3 +--
 .../microsoft/ooxml/XSSFExcelExtractorDecorator.java  |  9 +++++++--
 7 files changed, 44 insertions(+), 7 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
index 48a756e..489a16d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
@@ -67,6 +67,11 @@ public abstract class AbstractOfficeParser extends AbstractParser {
     public void setIncludeMoveFromContent(boolean includeMoveFromContent) {
         defaultOfficeParserConfig.setIncludeMoveFromContent(includeMoveFromContent);
     }
+    
+    @Field
+    public void setIncludeShapeBasedContent(boolean includeShapeBasedContent) {
+        defaultOfficeParserConfig.setIncludeShapeBasedContent(includeShapeBasedContent);
+    }
 
     @Field
     public void setUseSAXDocxExtractor(boolean useSAXDocxExtractor) {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
index e1947a5..8f0f975 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
@@ -25,6 +25,7 @@ public class OfficeParserConfig implements Serializable {
 
     private boolean includeDeletedContent = false;
     private boolean includeMoveFromContent = false;
+    private boolean includeShapeBasedContent = true;
 
     private boolean useSAXDocxExtractor = false;
     private boolean useSAXPptxExtractor = false;
@@ -82,6 +83,24 @@ public class OfficeParserConfig implements Serializable {
         return includeMoveFromContent;
     }
 
+    /**
+     * In Excel and Word, there can be text stored within drawing shapes.
+     * (In PowerPoint everything is in a Shape)
+     * <p/>
+     * If you'd like to skip processing these to look for text, set this to
+     *  <code>false</code>
+     * <p/>
+     * Default: <code>true</code>
+     * @param includeShapeBasedContent
+     */
+    public void setIncludeShapeBasedContent(boolean includeShapeBasedContent) {
+        this.includeShapeBasedContent = includeShapeBasedContent;
+    }
+    
+    public boolean getIncludeShapeBasedContent() {
+        return includeShapeBasedContent;
+    }
+
     public boolean getUseSAXDocxExtractor() {
         return useSAXDocxExtractor;
     }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 26711b2..ff586ba 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -93,12 +93,17 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
 
     private final EmbeddedDocumentExtractor embeddedExtractor;
     private final ParseContext context;
+    protected OfficeParserConfig config;
     protected POIXMLTextExtractor extractor;
 
     public AbstractOOXMLExtractor(ParseContext context, POIXMLTextExtractor extractor) {
         this.context = context;
         this.extractor = extractor;
         embeddedExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+        
+        // This has already been set by OOXMLParser's call to configure()
+        // We can rely on this being non-null.
+        this.config = context.get(OfficeParserConfig.class);
     }
 
     /**
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index 92963a8..f4366cc 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -91,8 +91,8 @@ public class OOXMLExtractorFactory {
 
             // Have the appropriate OOXML text extractor picked
             POIXMLTextExtractor poiExtractor = null;
-            //This has already been set by OOXMLParser's call to configure()
-            //We can rely on this being non-null.
+            // This has already been set by OOXMLParser's call to configure()
+            // We can rely on this being non-null.
             OfficeParserConfig config = context.get(OfficeParserConfig.class);
             if (config.getUseSAXDocxExtractor()) {
                 poiExtractor = trySXWPF(pkg);
@@ -107,7 +107,6 @@ public class OOXMLExtractorFactory {
             POIXMLDocument document = poiExtractor.getDocument();
             if (poiExtractor instanceof XSSFBEventBasedExcelExtractor) {
                 extractor = new XSSFBExcelExtractorDecorator(context, poiExtractor, locale);
-
             } else if (poiExtractor instanceof XSSFEventBasedExcelExtractor) {
                 extractor = new XSSFExcelExtractorDecorator(
                         context, poiExtractor, locale);
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java
index ff44176..f6ec3bf 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java
@@ -21,6 +21,7 @@ import java.util.List;
 
 import org.apache.poi.POIXMLTextExtractor;
 import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
@@ -29,6 +30,10 @@ public class POIXMLTextExtractorDecorator extends AbstractOOXMLExtractor {
 
     public POIXMLTextExtractorDecorator(ParseContext context, POIXMLTextExtractor extractor) {
         super(context, extractor);
+        
+        if (extractor instanceof XSSFExcelExtractor) {
+            ((XSSFExcelExtractor)extractor).setIncludeTextBoxes(config.getIncludeShapeBasedContent());
+        }
     }
 
     @Override
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index 89ad4e5..d923a2c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -36,7 +36,6 @@ import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.microsoft.OfficeParserConfig;
 import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
 import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFNumberingShim;
 import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFStylesShim;
@@ -184,7 +183,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
                     new OfflineContentHandler(new EmbeddedContentHandler(
                             new OOXMLWordAndPowerPointTextHandler(
                                     new OOXMLTikaBodyPartHandler(xhtml, styles, listManager,
-                                            context.get(OfficeParserConfig.class)), linkedRelationships))));
+                                            config), linkedRelationships))));
         } catch (TikaException e) {
             metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                     ExceptionUtils.getStackTrace(e));
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index dbf21d1..11277d5 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -93,6 +93,7 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
     }
 
     protected void configureExtractor(POIXMLTextExtractor extractor, Locale locale) {
+        ((XSSFEventBasedExcelExtractor)extractor).setIncludeTextBoxes(config.getIncludeShapeBasedContent());
         ((XSSFEventBasedExcelExtractor)extractor).setFormulasNotResults(false);
         ((XSSFEventBasedExcelExtractor)extractor).setLocale(locale);
     }
@@ -163,8 +164,12 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
             for (String footer : sheetExtractor.footers) {
                 extractHeaderFooter(footer, xhtml);
             }
-            List<XSSFShape> shapes = iter.getShapes();
-            processShapes(shapes, xhtml);
+            
+            // Do text held in shapes, if required
+            if (config.getIncludeShapeBasedContent()) {
+                List<XSSFShape> shapes = iter.getShapes();
+                processShapes(shapes, xhtml);
+            }
 
             //for now dump sheet hyperlinks at bottom of page
             //consider a double-pass of the inputstream to reunite hyperlinks with cells/textboxes

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.

[tika] 02/02: TIKA-2346 OfficeParserConfig control extraction from shapes from DOCX

Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 0876aa909ffb77dfbd384ebe2f5de0a873ab489a
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Thu Apr 27 18:02:06 2017 +0100

    TIKA-2346 OfficeParserConfig control extraction from shapes from DOCX
---
 .../tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java     | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
index a9eb93f..39a72c6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
@@ -290,8 +290,10 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
         }
 
         // Also extract any paragraphs embedded in text boxes:
-        for (XmlObject embeddedParagraph : paragraph.getCTP().selectPath("declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare namespace wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' .//*/wps:txbx/w:txbxContent/w:p")) {
-            extractParagraph(new XWPFParagraph(CTP.Factory.parse(embeddedParagraph.xmlText()), paragraph.getBody()), listManager, xhtml);
+        if (config.getIncludeShapeBasedContent()) {
+            for (XmlObject embeddedParagraph : paragraph.getCTP().selectPath("declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare namespace wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' .//*/wps:txbx/w:txbxContent/w:p")) {
+                extractParagraph(new XWPFParagraph(CTP.Factory.parse(embeddedParagraph.xmlText()), paragraph.getBody()), listManager, xhtml);
+            }
         }
 
         // Finish this paragraph

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.