You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/09/02 12:59:40 UTC

svn commit: r1164471 - /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java

Author: jukka
Date: Fri Sep  2 10:59:39 2011
New Revision: 1164471

URL: http://svn.apache.org/viewvc?rev=1164471&view=rev
Log:
TIKA-207: MS word doc containing tracked changes produces incorrect text

Patch by Curt Arnold

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1164471&r1=1164470&r2=1164471&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Fri Sep  2 10:59:39 2011
@@ -214,7 +214,7 @@ public class WordExtractor extends Abstr
     private void handleCharacterRun(CharacterRun cr, boolean skipStyling, XHTMLContentHandler xhtml) 
           throws SAXException {
        // Skip trailing newlines
-       if(cr.text().equals("\r"))
+       if(!isRendered(cr) || cr.text().equals("\r"))
           return;
        
        if(!skipStyling) {
@@ -347,7 +347,7 @@ public class WordExtractor extends Abstr
 
     private void handlePictureCharacterRun(CharacterRun cr, Picture picture, PicturesSource pictures, XHTMLContentHandler xhtml) 
           throws SAXException, IOException, TikaException {
-       if(picture == null) {
+       if(!isRendered(cr) || picture == null) {
           // Oh dear, we've run out...
           // Probably caused by multiple \u0008 images referencing
           //  the same real image
@@ -471,6 +471,17 @@ public class WordExtractor extends Abstr
     }
     
     /**
+     * Determines if a character run should be included in the extraction;
+     * runs marked as deleted are excluded, and a null run counts as rendered.
+     * @param cr character run, may be null.
+     * @return true if cr is null or is not marked as deleted.
+     */
+    private boolean isRendered(final CharacterRun cr) {
+ 	   return cr == null || !cr.isMarkedDeleted();
+    }
+    
+    
+    /**
      * Provides access to the pictures both by offset, iteration
      *  over the un-claimed, and peeking forward
      */