You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2017/04/27 17:02:49 UTC
[tika] 01/02: TIKA-2346 Add OfficeParserConfig support to control
extraction from shapes from non-shape-based formats
This is an automated email from the ASF dual-hosted git repository.
nick pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
commit aa4954fb44f707779693faea785acc219739ccd5
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Thu Apr 27 17:58:35 2017 +0100
TIKA-2346 Add OfficeParserConfig support to control extraction from shapes from non-shape-based formats
---
.../tika/parser/microsoft/AbstractOfficeParser.java | 5 +++++
.../tika/parser/microsoft/OfficeParserConfig.java | 19 +++++++++++++++++++
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 5 +++++
.../parser/microsoft/ooxml/OOXMLExtractorFactory.java | 5 ++---
.../microsoft/ooxml/POIXMLTextExtractorDecorator.java | 5 +++++
.../microsoft/ooxml/SXWPFWordExtractorDecorator.java | 3 +--
.../microsoft/ooxml/XSSFExcelExtractorDecorator.java | 9 +++++++--
7 files changed, 44 insertions(+), 7 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
index 48a756e..489a16d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
@@ -67,6 +67,11 @@ public abstract class AbstractOfficeParser extends AbstractParser {
public void setIncludeMoveFromContent(boolean includeMoveFromContent) {
defaultOfficeParserConfig.setIncludeMoveFromContent(includeMoveFromContent);
}
+
+ @Field
+ public void setIncludeShapeBasedContent(boolean includeShapeBasedContent) {
+ defaultOfficeParserConfig.setIncludeShapeBasedContent(includeShapeBasedContent);
+ }
@Field
public void setUseSAXDocxExtractor(boolean useSAXDocxExtractor) {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
index e1947a5..8f0f975 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
@@ -25,6 +25,7 @@ public class OfficeParserConfig implements Serializable {
private boolean includeDeletedContent = false;
private boolean includeMoveFromContent = false;
+ private boolean includeShapeBasedContent = true;
private boolean useSAXDocxExtractor = false;
private boolean useSAXPptxExtractor = false;
@@ -82,6 +83,24 @@ public class OfficeParserConfig implements Serializable {
return includeMoveFromContent;
}
+ /**
+ * In Excel and Word, there can be text stored within drawing shapes.
+ * (In PowerPoint, everything is in a Shape.)
+ * <p>
+ * If you'd like to skip processing these shapes when looking for text,
+ * set this to <code>false</code>.
+ * <p>
+ * Default: <code>true</code>
+ * @param includeShapeBasedContent whether to extract text held in drawing shapes
+ */
+ public void setIncludeShapeBasedContent(boolean includeShapeBasedContent) {
+ this.includeShapeBasedContent = includeShapeBasedContent;
+ }
+
+ public boolean getIncludeShapeBasedContent() {
+ return includeShapeBasedContent;
+ }
+
public boolean getUseSAXDocxExtractor() {
return useSAXDocxExtractor;
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 26711b2..ff586ba 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -93,12 +93,17 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
private final EmbeddedDocumentExtractor embeddedExtractor;
private final ParseContext context;
+ protected OfficeParserConfig config;
protected POIXMLTextExtractor extractor;
public AbstractOOXMLExtractor(ParseContext context, POIXMLTextExtractor extractor) {
this.context = context;
this.extractor = extractor;
embeddedExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+
+ // This has already been set by OOXMLParser's call to configure()
+ // We can rely on this being non-null.
+ this.config = context.get(OfficeParserConfig.class);
}
/**
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index 92963a8..f4366cc 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -91,8 +91,8 @@ public class OOXMLExtractorFactory {
// Have the appropriate OOXML text extractor picked
POIXMLTextExtractor poiExtractor = null;
- //This has already been set by OOXMLParser's call to configure()
- //We can rely on this being non-null.
+ // This has already been set by OOXMLParser's call to configure()
+ // We can rely on this being non-null.
OfficeParserConfig config = context.get(OfficeParserConfig.class);
if (config.getUseSAXDocxExtractor()) {
poiExtractor = trySXWPF(pkg);
@@ -107,7 +107,6 @@ public class OOXMLExtractorFactory {
POIXMLDocument document = poiExtractor.getDocument();
if (poiExtractor instanceof XSSFBEventBasedExcelExtractor) {
extractor = new XSSFBExcelExtractorDecorator(context, poiExtractor, locale);
-
} else if (poiExtractor instanceof XSSFEventBasedExcelExtractor) {
extractor = new XSSFExcelExtractorDecorator(
context, poiExtractor, locale);
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java
index ff44176..f6ec3bf 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java
@@ -21,6 +21,7 @@ import java.util.List;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
@@ -29,6 +30,10 @@ public class POIXMLTextExtractorDecorator extends AbstractOOXMLExtractor {
public POIXMLTextExtractorDecorator(ParseContext context, POIXMLTextExtractor extractor) {
super(context, extractor);
+
+ if (extractor instanceof XSSFExcelExtractor) {
+ ((XSSFExcelExtractor)extractor).setIncludeTextBoxes(config.getIncludeShapeBasedContent());
+ }
}
@Override
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index 89ad4e5..d923a2c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -36,7 +36,6 @@ import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFNumberingShim;
import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFStylesShim;
@@ -184,7 +183,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
new OfflineContentHandler(new EmbeddedContentHandler(
new OOXMLWordAndPowerPointTextHandler(
new OOXMLTikaBodyPartHandler(xhtml, styles, listManager,
- context.get(OfficeParserConfig.class)), linkedRelationships))));
+ config), linkedRelationships))));
} catch (TikaException e) {
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
ExceptionUtils.getStackTrace(e));
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index dbf21d1..11277d5 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -93,6 +93,7 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
}
protected void configureExtractor(POIXMLTextExtractor extractor, Locale locale) {
+ ((XSSFEventBasedExcelExtractor)extractor).setIncludeTextBoxes(config.getIncludeShapeBasedContent());
((XSSFEventBasedExcelExtractor)extractor).setFormulasNotResults(false);
((XSSFEventBasedExcelExtractor)extractor).setLocale(locale);
}
@@ -163,8 +164,12 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
for (String footer : sheetExtractor.footers) {
extractHeaderFooter(footer, xhtml);
}
- List<XSSFShape> shapes = iter.getShapes();
- processShapes(shapes, xhtml);
+
+ // Do text held in shapes, if required
+ if (config.getIncludeShapeBasedContent()) {
+ List<XSSFShape> shapes = iter.getShapes();
+ processShapes(shapes, xhtml);
+ }
//for now dump sheet hyperlinks at bottom of page
//consider a double-pass of the inputstream to reunite hyperlinks with cells/textboxes
--
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.