You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/12/01 00:25:18 UTC
tika git commit: TIKA-2187 -- make "ignore deleted" the default in
the experimental SAX .docx parser and update the WordExtractor to
extract deleted text if requested by the user.
Repository: tika
Updated Branches:
refs/heads/2.x 32162f59e -> 3d08da79f
TIKA-2187 -- make "ignore deleted" the default in the experimental SAX .docx parser and update the WordExtractor to extract deleted text if requested by the user.
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/3d08da79
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/3d08da79
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/3d08da79
Branch: refs/heads/2.x
Commit: 3d08da79febc75d1ca0fd3293a5f383983057b00
Parents: 32162f5
Author: tballison <ta...@mitre.org>
Authored: Wed Nov 30 19:25:10 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Wed Nov 30 19:25:10 2016 -0500
----------------------------------------------------------------------
CHANGES.txt | 3 +++
.../parser/microsoft/OfficeParserConfig.java | 2 +-
.../tika/parser/microsoft/WordExtractor.java | 8 +++++++-
.../tika/parser/microsoft/WordParserTest.java | 20 +++++++++++++++++++
.../ooxml/xwpf/ml2006/Word2006MLParserTest.java | 8 ++++----
.../ooxml/xwpf/SXWPFExtractorTest.java | 8 ++++----
.../test-documents/testWORD_2006ml.doc | Bin 0 -> 265728 bytes
7 files changed, 39 insertions(+), 10 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/3d08da79/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 3e3ef8b..d948af6 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -17,6 +17,9 @@ Release 2.0 - ???
Release 1.15 -???
+ * Change default behavior in experimental .docx parser to ignore
+ deleted text to align with .doc (TIKA-2187).
+
* Added experimental SAX parser for .docx files. To select this parser,
set useSAXDocxExtractor(true) on OfficeParserConfig (TIKA-1321).
http://git-wip-us.apache.org/repos/asf/tika/blob/3d08da79/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
index 55f4673..f3cdbfe 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
@@ -21,7 +21,7 @@ import java.io.Serializable;
public class OfficeParserConfig implements Serializable {
- private boolean includeDeletedContent = true;
+ private boolean includeDeletedContent = false;
private boolean includeMoveFromContent = false;
private boolean useSAXDocxExtractor = false;
http://git-wip-us.apache.org/repos/asf/tika/blob/3d08da79/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
index 54ba55b..6fd8f8e 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
@@ -78,6 +78,7 @@ public class WordExtractor extends AbstractPOIFSExtractor {
fixedParagraphStyles.put("HTML Preformatted", new TagAndStyle("pre", null));
}
+ private final boolean extractDeletedContent;
// True if we are currently in the named style tag:
private boolean curStrikeThrough;
private boolean curBold;
@@ -88,6 +89,7 @@ public class WordExtractor extends AbstractPOIFSExtractor {
public WordExtractor(ParseContext context, Metadata metadata) {
super(context);
this.metadata = metadata;
+ extractDeletedContent = context.get(OfficeParserConfig.class).getIncludeDeletedContent();
}
private static int countParagraphs(Range... ranges) {
@@ -654,7 +656,11 @@ public class WordExtractor extends AbstractPOIFSExtractor {
* @return true if character run should be included in extraction.
*/
private boolean isRendered(final CharacterRun cr) {
- return cr == null || !cr.isMarkedDeleted();
+ if (cr == null) {
+ return false;
+ }
+ return !cr.isMarkedDeleted() ||
+ (cr.isMarkedDeleted() && extractDeletedContent);
}
public static class TagAndStyle {
http://git-wip-us.apache.org/repos/asf/tika/blob/3d08da79/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index bfb7ca1..9660363 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -534,5 +534,25 @@ public class WordParserTest extends TikaTest {
List<Metadata> metadataList = getRecursiveMetadata("testWORD_macros.doc");
assertContainsAtLeast(minExpected, metadataList);
}
+
+ @Test
+ public void testDeleted() throws Exception {
+ //test classic behavior
+ String xml = getXML("testWORD_2006ml.doc").xml;
+ assertNotContained("frog", xml);
+
+ //moveFrom is deleted in .doc files
+ assertContainsCount("Second paragraph", xml, 1);
+ //now test inclusion of deleted text
+ ParseContext context = new ParseContext();
+ OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+ officeParserConfig.setIncludeDeletedContent(true);
+ context.set(OfficeParserConfig.class, officeParserConfig);
+ XMLResult r = getXML("testWORD_2006ml.doc", context);
+ assertContains("frog", r.xml);
+
+ //moveFrom is deleted in .doc files
+ assertContainsCount("Second paragraph", r.xml, 2);
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/3d08da79/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParserTest.java
index 79f1890..c77b0fa 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParserTest.java
@@ -140,8 +140,8 @@ public class Word2006MLParserTest extends TikaTest {
assertContains("Odd page footer", content);
- //test default includes deleted
- assertContains("frog", content);
+ //test default ignores deleted
+ assertNotContained("frog", content);
assertContains("Mattmann", content);
@@ -157,12 +157,12 @@ public class Word2006MLParserTest extends TikaTest {
public void testSkipDeletedAndMoveFrom() throws Exception {
ParseContext pc = new ParseContext();
OfficeParserConfig officeParserConfig = new OfficeParserConfig();
- officeParserConfig.setIncludeDeletedContent(false);
+ officeParserConfig.setIncludeDeletedContent(true);
officeParserConfig.setIncludeMoveFromContent(true);
pc.set(OfficeParserConfig.class, officeParserConfig);
XMLResult r = getXML("testWORD_2006ml.xml", pc);
- assertNotContained("frog", r.xml);
+ assertContains("frog", r.xml);
assertContainsCount("Second paragraph", r.xml, 2);
}
http://git-wip-us.apache.org/repos/asf/tika/blob/3d08da79/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/SXWPFExtractorTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/SXWPFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/SXWPFExtractorTest.java
index f5512cb..06f0eed 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/SXWPFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/SXWPFExtractorTest.java
@@ -145,8 +145,8 @@ public class SXWPFExtractorTest extends TikaTest {
assertContains("Odd page footer", content);
- //test default includes deleted
- assertContains("frog", content);
+ //test default does not include deleted
+ assertNotContained("frog", content);
assertContains("Mattmann", content);
@@ -159,13 +159,13 @@ public class SXWPFExtractorTest extends TikaTest {
public void testSkipDeleted() throws Exception {
ParseContext pc = new ParseContext();
OfficeParserConfig officeParserConfig = new OfficeParserConfig();
- officeParserConfig.setIncludeDeletedContent(false);
+ officeParserConfig.setIncludeDeletedContent(true);
officeParserConfig.setUseSAXDocxExtractor(true);
officeParserConfig.setIncludeMoveFromContent(true);
pc.set(OfficeParserConfig.class, officeParserConfig);
XMLResult r = getXML("testWORD_2006ml.docx", pc);
- assertNotContained("frog", r.xml);
+ assertContains("frog", r.xml);
assertContainsCount("Second paragraph", r.xml, 2);
}
http://git-wip-us.apache.org/repos/asf/tika/blob/3d08da79/tika-test-resources/src/test/resources/test-documents/testWORD_2006ml.doc
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testWORD_2006ml.doc b/tika-test-resources/src/test/resources/test-documents/testWORD_2006ml.doc
new file mode 100644
index 0000000..c8f509a
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testWORD_2006ml.doc differ