You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/04/28 13:52:13 UTC
[tika] branch master updated: TIKA-2346 -- add unit tests and configurability for doc, xls and SAX docx parser.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 562e4fa TIKA-2346 -- add unit tests and configurability for doc, xls and SAX docx parser.
562e4fa is described below
commit 562e4faf95bc85b3ab2290055860d3e075a66451
Author: tballison <ta...@mitre.org>
AuthorDate: Fri Apr 28 09:52:01 2017 -0400
TIKA-2346 -- add unit tests and configurability for doc, xls and SAX docx parser.
---
.../parser/microsoft/AbstractPOIFSExtractor.java | 2 ++
.../tika/parser/microsoft/ExcelExtractor.java | 6 +++--
.../tika/parser/microsoft/WordExtractor.java | 12 +++++-----
.../ooxml/OOXMLWordAndPowerPointTextHandler.java | 26 ++++++++++++++++++++-
.../ooxml/SXWPFWordExtractorDecorator.java | 2 +-
.../tika/parser/microsoft/ExcelParserTest.java | 16 +++++++++++++
.../tika/parser/microsoft/WordParserTest.java | 11 +++++++++
.../parser/microsoft/ooxml/OOXMLParserTest.java | 26 +++++++++++++++++++++
.../parser/microsoft/ooxml/SXWPFExtractorTest.java | 15 ++++++++++++
.../resources/test-documents/testEXCEL_textbox.xls | Bin 0 -> 26112 bytes
10 files changed, 106 insertions(+), 10 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index 9c85f30..3e658fd 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -49,6 +49,7 @@ abstract class AbstractPOIFSExtractor {
private final EmbeddedDocumentUtil embeddedDocumentUtil;
private PasswordProvider passwordProvider;
protected final Metadata parentMetadata;//metadata of the parent/container document
+ protected final OfficeParserConfig officeParserConfig;
protected AbstractPOIFSExtractor(ParseContext context) {
this(context, null);
@@ -58,6 +59,7 @@ abstract class AbstractPOIFSExtractor {
embeddedDocumentUtil = new EmbeddedDocumentUtil(context);
this.passwordProvider = context.get(PasswordProvider.class);
+ this.officeParserConfig = context.get(OfficeParserConfig.class, new OfficeParserConfig());
this.parentMetadata = parentMetadata;
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
index 1f336d8..40f0b52 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
@@ -455,8 +455,10 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
break;
case TextObjectRecord.sid:
- TextObjectRecord tor = (TextObjectRecord) record;
- addTextCell(record, tor.getStr().getString());
+ if (extractor.officeParserConfig.getIncludeShapeBasedContent()) {
+ TextObjectRecord tor = (TextObjectRecord) record;
+ addTextCell(record, tor.getStr().getString());
+ }
break;
case SeriesTextRecord.sid: // Chart label or title
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
index 3180925..cc961f7 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
@@ -79,7 +79,6 @@ public class WordExtractor extends AbstractPOIFSExtractor {
fixedParagraphStyles.put("HTML Preformatted", new TagAndStyle("pre", null));
}
- private final boolean extractDeletedContent;
// True if we are currently in the named style tag:
private boolean curStrikeThrough;
private boolean curBold;
@@ -90,7 +89,6 @@ public class WordExtractor extends AbstractPOIFSExtractor {
public WordExtractor(ParseContext context, Metadata metadata) {
super(context);
this.metadata = metadata;
- extractDeletedContent = context.get(OfficeParserConfig.class).getIncludeDeletedContent();
}
private static int countParagraphs(Range... ranges) {
@@ -189,9 +187,11 @@ public class WordExtractor extends AbstractPOIFSExtractor {
i += handleParagraph(p, 0, r, document, FieldsDocumentPart.MAIN, pictures, pictureTable, listManager, xhtml);
}
- // Do everything else
- for (String paragraph : wordExtractor.getMainTextboxText()) {
- xhtml.element("p", paragraph);
+ if (officeParserConfig.getIncludeShapeBasedContent()) {
+ // Do everything else
+ for (String paragraph : wordExtractor.getMainTextboxText()) {
+ xhtml.element("p", paragraph);
+ }
}
for (String paragraph : wordExtractor.getFootnoteText()) {
@@ -668,7 +668,7 @@ public class WordExtractor extends AbstractPOIFSExtractor {
return false;
}
return !cr.isMarkedDeleted() ||
- (cr.isMarkedDeleted() && extractDeletedContent);
+ (cr.isMarkedDeleted() && officeParserConfig.getIncludeDeletedContent());
}
public static class TagAndStyle {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
index b3a9f68..c430f7d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
@@ -102,6 +102,8 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
private final static String MOVE_FROM = "moveFrom";
private final static String MOVE_TO = "moveTo";
private final static String ENDNOTE_REFERENCE = "endnoteReference";
+ private static final String TEXTBOX = "textbox";
+
private final XWPFBodyContentsHandler bodyContentsHandler;
@@ -135,20 +137,28 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
private final RunProperties currRunProperties = new RunProperties();
private final ParagraphProperties currPProperties = new ParagraphProperties();
-
+ private final boolean includeTextBox;
private final StringBuilder runBuffer = new StringBuilder();
private boolean inDelText = false;
private boolean inHlinkClick = false;
+ private boolean inTextBox = false;
private OOXMLWordAndPowerPointTextHandler.EditType editType = OOXMLWordAndPowerPointTextHandler.EditType.NONE;
private DateUtils dateUtils = new DateUtils();
+
public OOXMLWordAndPowerPointTextHandler(XWPFBodyContentsHandler bodyContentsHandler,
Map<String, String> hyperlinks) {
+ this(bodyContentsHandler, hyperlinks, true);
+ }
+
+ public OOXMLWordAndPowerPointTextHandler(XWPFBodyContentsHandler bodyContentsHandler,
+ Map<String, String> hyperlinks, boolean includeTextBox) {
this.bodyContentsHandler = bodyContentsHandler;
this.linkedRelationships = hyperlinks;
+ this.includeTextBox = includeTextBox;
}
@@ -189,6 +199,11 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
if (inACChoiceDepth > 0) {
return;
}
+
+ if (! includeTextBox && localName.equals(TEXTBOX)) {
+ inTextBox = true;
+ return;
+ }
//these are sorted descending by frequency within docx files
//in our regression corpus.
//yes, I know, likely premature optimization...
@@ -344,6 +359,10 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
return;
}
+ if (! includeTextBox && localName.equals(TEXTBOX)) {
+ inTextBox = false;
+ return;
+ }
if (PIC.equals(localName)) { //PIC_NS
handlePict();
inPic = false;
@@ -417,7 +436,10 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
if (inACChoiceDepth > 0) {
return;
+ } else if (! includeTextBox && inTextBox) {
+ return;
}
+
if (editType.equals(EditType.MOVE_FROM) && inT) {
if (bodyContentsHandler.getIncludeMoveFromText()) {
runBuffer.append(ch, start, length);
@@ -433,6 +455,8 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
if (inACChoiceDepth > 0) {
return;
+ } else if (! includeTextBox && inTextBox) {
+ return;
}
if (inT) {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index d923a2c..0ae4977 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -183,7 +183,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
new OfflineContentHandler(new EmbeddedContentHandler(
new OOXMLWordAndPowerPointTextHandler(
new OOXMLTikaBodyPartHandler(xhtml, styles, listManager,
- config), linkedRelationships))));
+ config), linkedRelationships, config.getIncludeShapeBasedContent()))));
} catch (TikaException e) {
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
ExceptionUtils.getStackTrace(e));
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index fc31958..d9abdf8 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -472,4 +472,20 @@ public class ExcelParserTest extends TikaTest {
}
+ @Test
+ public void testTextBox() throws Exception {
+ String xml = getXML("testEXCEL_textbox.xls").xml;
+ assertContains("autoshape", xml);
+ }
+
+ //TIKA-2346
+ @Test
+ public void testTurningOffTextBoxExtractionExcel() throws Exception {
+ ParseContext pc = new ParseContext();
+ OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+ officeParserConfig.setIncludeShapeBasedContent(false);
+ pc.set(OfficeParserConfig.class, officeParserConfig);
+ String xml = getXML("testEXCEL_textbox.xls", pc).xml;
+ assertNotContained("autoshape", xml);
+ }
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index df6d807..f7036a8 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -196,6 +196,17 @@ public class WordParserTest extends TikaTest {
}
}
+ //TIKA-2346
+ @Test
+ public void testTurningOffTextBox() throws Exception {
+ ParseContext pc = new ParseContext();
+ OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+ officeParserConfig.setIncludeShapeBasedContent(false);
+ pc.set(OfficeParserConfig.class, officeParserConfig);
+ String xml = getXML("testWORD_various.doc", pc).xml;
+ assertNotContained("text box", xml);
+ }
+
@Test
public void testVarious() throws Exception {
ContentHandler handler = new BodyContentHandler();
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 6420545..525913f 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -966,6 +966,20 @@ public class OOXMLParserTest extends TikaTest {
assertContains("This text is inside of a text box in the footer of the document.", xml);
}
+ //TIKA-2346
+ @Test
+ public void testTurningOffTextBoxExtraction() throws Exception {
+ ParseContext pc = new ParseContext();
+ OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+ officeParserConfig.setIncludeShapeBasedContent(false);
+ pc.set(OfficeParserConfig.class, officeParserConfig);
+ String xml = getXML("testWORD_text_box.docx", pc).xml;
+ assertContains("This text is directly in the body of the document.", xml);
+ assertNotContained("This text is inside of a text box in the body of the document.", xml);
+ assertNotContained("This text is inside of a text box in the header of the document.", xml);
+ assertNotContained("This text is inside of a text box in the footer of the document.", xml);
+ }
+
// TIKA-1032:
@Test
public void testEmbeddedPPTXTwoSlides() throws Exception {
@@ -1003,6 +1017,18 @@ public class OOXMLParserTest extends TikaTest {
assertContains("some autoshape", r.xml);
}
+ //TIKA-2346
+ @Test
+ public void testTurningOffTextBoxExtractionExcel() throws Exception {
+
+ ParseContext pc = new ParseContext();
+ OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+ officeParserConfig.setIncludeShapeBasedContent(false);
+ pc.set(OfficeParserConfig.class, officeParserConfig);
+ String xml = getXML("testEXCEL_textbox.xlsx", pc).xml;
+ assertNotContained("autoshape", xml);
+ }
+
//TIKA-792; with room for future missing bean tests
@Test
public void testWordMissingOOXMLBeans() throws Exception {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index 206385a..f1e867a 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -504,6 +504,21 @@ public class SXWPFExtractorTest extends TikaTest {
assertContains("This text is inside of a text box in the footer of the document.", xml);
}
+ //TIKA-2346
+ @Test
+ public void testTurningOffTextBoxExtraction() throws Exception {
+ ParseContext pc = new ParseContext();
+ OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+ officeParserConfig.setIncludeShapeBasedContent(false);
+ officeParserConfig.setUseSAXDocxExtractor(true);
+ pc.set(OfficeParserConfig.class, officeParserConfig);
+ String xml = getXML("testWORD_text_box.docx", pc).xml;
+ assertContains("This text is directly in the body of the document.", xml);
+ assertNotContained("This text is inside of a text box in the body of the document.", xml);
+ assertNotContained("This text is inside of a text box in the header of the document.", xml);
+ assertNotContained("This text is inside of a text box in the footer of the document.", xml);
+ }
+
/**
* Test for missing text described in
* <a href="https://issues.apache.org/jira/browse/TIKA-1130">TIKA-1130</a>.
diff --git a/tika-parsers/src/test/resources/test-documents/testEXCEL_textbox.xls b/tika-parsers/src/test/resources/test-documents/testEXCEL_textbox.xls
new file mode 100644
index 0000000..17e4b8a
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEXCEL_textbox.xls differ
--
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.