You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/12/01 00:25:18 UTC

tika git commit: TIKA-2187 -- make "ignore deleted" the default in the experimental SAX .docx parser and update the WordExtractor to include extraction of deleted text if requested by the user.

Repository: tika
Updated Branches:
  refs/heads/2.x 32162f59e -> 3d08da79f


TIKA-2187 -- make "ignore deleted" the default in the experimental SAX .docx parser and update the WordExtractor to include extraction of deleted text if requested by the user.


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/3d08da79
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/3d08da79
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/3d08da79

Branch: refs/heads/2.x
Commit: 3d08da79febc75d1ca0fd3293a5f383983057b00
Parents: 32162f5
Author: tballison <ta...@mitre.org>
Authored: Wed Nov 30 19:25:10 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Wed Nov 30 19:25:10 2016 -0500

----------------------------------------------------------------------
 CHANGES.txt                                     |   3 +++
 .../parser/microsoft/OfficeParserConfig.java    |   2 +-
 .../tika/parser/microsoft/WordExtractor.java    |   8 +++++++-
 .../tika/parser/microsoft/WordParserTest.java   |  20 +++++++++++++++++++
 .../ooxml/xwpf/ml2006/Word2006MLParserTest.java |   8 ++++----
 .../ooxml/xwpf/SXWPFExtractorTest.java          |   8 ++++----
 .../test-documents/testWORD_2006ml.doc          | Bin 0 -> 265728 bytes
 7 files changed, 39 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/3d08da79/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 3e3ef8b..d948af6 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -17,6 +17,9 @@ Release 2.0 - ???
 
 Release 1.15 -???
 
+  * Change default behavior in experimental .docx parser to ignore
+    deleted text to align with .doc (TIKA-2187).
+
   * Added experimental SAX parser for .docx files. To select this parser,
     set useSAXDocxExtractor(true) on OfficeParserConfig (TIKA-1321).
 

http://git-wip-us.apache.org/repos/asf/tika/blob/3d08da79/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
index 55f4673..f3cdbfe 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
@@ -21,7 +21,7 @@ import java.io.Serializable;
 
 public class OfficeParserConfig implements Serializable {
 
-    private boolean includeDeletedContent = true;
+    private boolean includeDeletedContent = false;
     private boolean includeMoveFromContent = false;
 
     private boolean useSAXDocxExtractor = false;

http://git-wip-us.apache.org/repos/asf/tika/blob/3d08da79/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
index 54ba55b..6fd8f8e 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
@@ -78,6 +78,7 @@ public class WordExtractor extends AbstractPOIFSExtractor {
         fixedParagraphStyles.put("HTML Preformatted", new TagAndStyle("pre", null));
     }
 
+    private final boolean extractDeletedContent;
     // True if we are currently in the named style tag:
     private boolean curStrikeThrough;
     private boolean curBold;
@@ -88,6 +89,7 @@ public class WordExtractor extends AbstractPOIFSExtractor {
     public WordExtractor(ParseContext context, Metadata metadata) {
         super(context);
         this.metadata = metadata;
+        extractDeletedContent = context.get(OfficeParserConfig.class).getIncludeDeletedContent();
     }
 
     private static int countParagraphs(Range... ranges) {
@@ -654,7 +656,11 @@ public class WordExtractor extends AbstractPOIFSExtractor {
      * @return true if character run should be included in extraction.
      */
     private boolean isRendered(final CharacterRun cr) {
-        return cr == null || !cr.isMarkedDeleted();
+        if (cr == null) {
+            return false;
+        }
+        return !cr.isMarkedDeleted() ||
+                (cr.isMarkedDeleted() && extractDeletedContent);
     }
 
     public static class TagAndStyle {

http://git-wip-us.apache.org/repos/asf/tika/blob/3d08da79/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index bfb7ca1..9660363 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -534,5 +534,25 @@ public class WordParserTest extends TikaTest {
         List<Metadata> metadataList = getRecursiveMetadata("testWORD_macros.doc");
         assertContainsAtLeast(minExpected, metadataList);
     }
+
+    @Test
+    public void testDeleted() throws Exception {
+        //test classic behavior
+        String xml = getXML("testWORD_2006ml.doc").xml;
+        assertNotContained("frog", xml);
+
+        //moveFrom is deleted in .doc files
+        assertContainsCount("Second paragraph", xml, 1);
+        //now test inclusion of deleted text
+        ParseContext context = new ParseContext();
+        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+        officeParserConfig.setIncludeDeletedContent(true);
+        context.set(OfficeParserConfig.class, officeParserConfig);
+        XMLResult r = getXML("testWORD_2006ml.doc", context);
+        assertContains("frog", r.xml);
+
+        //moveFrom is deleted in .doc files
+        assertContainsCount("Second paragraph", r.xml, 2);
+    }
 }
 

http://git-wip-us.apache.org/repos/asf/tika/blob/3d08da79/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParserTest.java
index 79f1890..c77b0fa 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParserTest.java
@@ -140,8 +140,8 @@ public class Word2006MLParserTest extends TikaTest {
 
         assertContains("Odd page footer", content);
 
-        //test default includes deleted
-        assertContains("frog", content);
+        //test default ignores deleted
+        assertNotContained("frog", content);
 
         assertContains("Mattmann", content);
 
@@ -157,12 +157,12 @@ public class Word2006MLParserTest extends TikaTest {
     public void testSkipDeletedAndMoveFrom() throws Exception {
         ParseContext pc = new ParseContext();
         OfficeParserConfig officeParserConfig = new OfficeParserConfig();
-        officeParserConfig.setIncludeDeletedContent(false);
+        officeParserConfig.setIncludeDeletedContent(true);
         officeParserConfig.setIncludeMoveFromContent(true);
         pc.set(OfficeParserConfig.class, officeParserConfig);
 
         XMLResult r = getXML("testWORD_2006ml.xml", pc);
-        assertNotContained("frog", r.xml);
+        assertContains("frog", r.xml);
         assertContainsCount("Second paragraph", r.xml, 2);
 
     }

http://git-wip-us.apache.org/repos/asf/tika/blob/3d08da79/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/SXWPFExtractorTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/SXWPFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/SXWPFExtractorTest.java
index f5512cb..06f0eed 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/SXWPFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/SXWPFExtractorTest.java
@@ -145,8 +145,8 @@ public class SXWPFExtractorTest extends TikaTest {
 
         assertContains("Odd page footer", content);
 
-        //test default includes deleted
-        assertContains("frog", content);
+        //test default does not include deleted
+        assertNotContained("frog", content);
 
         assertContains("Mattmann", content);
 
@@ -159,13 +159,13 @@ public class SXWPFExtractorTest extends TikaTest {
     public void testSkipDeleted() throws Exception {
         ParseContext pc = new ParseContext();
         OfficeParserConfig officeParserConfig = new OfficeParserConfig();
-        officeParserConfig.setIncludeDeletedContent(false);
+        officeParserConfig.setIncludeDeletedContent(true);
         officeParserConfig.setUseSAXDocxExtractor(true);
         officeParserConfig.setIncludeMoveFromContent(true);
         pc.set(OfficeParserConfig.class, officeParserConfig);
 
         XMLResult r = getXML("testWORD_2006ml.docx", pc);
-        assertNotContained("frog", r.xml);
+        assertContains("frog", r.xml);
         assertContainsCount("Second paragraph", r.xml, 2);
 
     }

http://git-wip-us.apache.org/repos/asf/tika/blob/3d08da79/tika-test-resources/src/test/resources/test-documents/testWORD_2006ml.doc
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testWORD_2006ml.doc b/tika-test-resources/src/test/resources/test-documents/testWORD_2006ml.doc
new file mode 100644
index 0000000..c8f509a
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testWORD_2006ml.doc differ