You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by tp...@apache.org on 2014/07/21 20:23:53 UTC

svn commit: r1612373 - in /tika/trunk: ./ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ tika-parsers/src/test/resources/test-documents/

Author: tpalsulich
Date: Mon Jul 21 18:23:52 2014
New Revision: 1612373

URL: http://svn.apache.org/r1612373
Log:
Fix for TIKA-1251: RuntimeException with certain Word docs (contributed by Vadim Roizman).

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/test_TIKA-1251.doc   (with props)
Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1612373&r1=1612372&r2=1612373&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Mon Jul 21 18:23:52 2014
@@ -1,7 +1,7 @@
 Release 1.6 - ??/??/2014
 
   * Tika now supports detection of the Persian/Farsi language.
-    (TIkA-1337)
+    (TIKA-1337)
   
   * The Tika Detector interface is now exposed through the JAX-RS
     server (TIKA-1336, TIKA-1336).
@@ -56,6 +56,8 @@ Release 1.6 - ??/??/2014
   * PDF: Images in PDF documents can now be extracted as embedded resources.
     (TIKA-1268)
 
+  * Fixed RuntimeException thrown for certain Word Documents (TIKA-1251).
+
 Release 1.5 - 02/04/2014
 
   * Fixed bug in handling of embedded file processing in PDFs (TIKA-1228).

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1612373&r1=1612372&r2=1612373&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Mon Jul 21 18:23:52 2014
@@ -165,13 +165,8 @@ public class WordExtractor extends Abstr
                     for(int i=0; i<r.numParagraphs(); i++) {
                         Paragraph p = r.getParagraph(i);
 
-                        String text = p.text();
-                        if (text.replaceAll("[\\r\\n\\s]+", "").isEmpty()) {
-                            // Skip empty header or footer paragraphs
-                        } else {
-                            i += handleParagraph(p, 0, r, document,
-                                    FieldsDocumentPart.HEADER, pictures, pictureTable, xhtml);
-                        }
+                        i += handleParagraph(p, 0, r, document,
+                                FieldsDocumentPart.HEADER, pictures, pictureTable, xhtml);
                      }
                 }
             }
@@ -208,6 +203,12 @@ public class WordExtractor extends Abstr
           return (t.numParagraphs()-1);
        }
 
+       String text = p.text();
+       if (text.replaceAll("[\\r\\n\\s]+", "").isEmpty()) {
+            // Skip empty paragraphs
+            return 0;
+       }
+
        TagAndStyle tas;
 
        if (document.getStyleSheet().numStyles()>p.getStyleIndex()) {

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1612373&r1=1612372&r2=1612373&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Mon Jul 21 18:23:52 2014
@@ -382,4 +382,19 @@ public class WordParserTest extends Tika
     public void testControlCharacter() throws Exception {
       assertContains("1. Introduzione<b> </a></b> </p>", getXML("testControlCharacters.doc").xml.replaceAll("\\s+", " "));
     }
+
+    @Test
+    public void testParagraphsAfterTables() throws Exception {
+        XMLResult result = getXML("test_TIKA-1251.doc");
+
+        String xml = result.xml;
+        Metadata metadata = result.metadata;
+
+        assertEquals(
+                "application/msword",
+                metadata.get(Metadata.CONTENT_TYPE));
+
+        assertContains("<p>1. Organisering av vakten:</p>", xml);
+
+    }
 }

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/test_TIKA-1251.doc
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/test_TIKA-1251.doc?rev=1612373&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/test_TIKA-1251.doc
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream