You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by tp...@apache.org on 2014/07/21 20:23:53 UTC
svn commit: r1612373 - in /tika/trunk: ./
tika-parsers/src/main/java/org/apache/tika/parser/microsoft/
tika-parsers/src/test/java/org/apache/tika/parser/microsoft/
tika-parsers/src/test/resources/test-documents/
Author: tpalsulich
Date: Mon Jul 21 18:23:52 2014
New Revision: 1612373
URL: http://svn.apache.org/r1612373
Log:
Fix for TIKA-1251: RuntimeException with certain Word docs (contributed by Vadim Roizman).
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/test_TIKA-1251.doc (with props)
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1612373&r1=1612372&r2=1612373&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Mon Jul 21 18:23:52 2014
@@ -1,7 +1,7 @@
Release 1.6 - ??/??/2014
* Tika now supports detection of the Persian/Farsi language.
- (TIkA-1337)
+ (TIKA-1337)
* The Tika Detector interface is now exposed through the JAX-RS
server (TIKA-1336, TIKA-1336).
@@ -56,6 +56,8 @@ Release 1.6 - ??/??/2014
* PDF: Images in PDF documents can now be extracted as embedded resources.
(TIKA-1268)
+ * Fixed RuntimeException thrown for certain Word Documents (TIKA-1251).
+
Release 1.5 - 02/04/2014
* Fixed bug in handling of embedded file processing in PDFs (TIKA-1228).
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1612373&r1=1612372&r2=1612373&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Mon Jul 21 18:23:52 2014
@@ -165,13 +165,8 @@ public class WordExtractor extends Abstr
for(int i=0; i<r.numParagraphs(); i++) {
Paragraph p = r.getParagraph(i);
- String text = p.text();
- if (text.replaceAll("[\\r\\n\\s]+", "").isEmpty()) {
- // Skip empty header or footer paragraphs
- } else {
- i += handleParagraph(p, 0, r, document,
- FieldsDocumentPart.HEADER, pictures, pictureTable, xhtml);
- }
+ i += handleParagraph(p, 0, r, document,
+ FieldsDocumentPart.HEADER, pictures, pictureTable, xhtml);
}
}
}
@@ -208,6 +203,12 @@ public class WordExtractor extends Abstr
return (t.numParagraphs()-1);
}
+ String text = p.text();
+ if (text.replaceAll("[\\r\\n\\s]+", "").isEmpty()) {
+ // Skip empty paragraphs
+ return 0;
+ }
+
TagAndStyle tas;
if (document.getStyleSheet().numStyles()>p.getStyleIndex()) {
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1612373&r1=1612372&r2=1612373&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Mon Jul 21 18:23:52 2014
@@ -382,4 +382,19 @@ public class WordParserTest extends Tika
public void testControlCharacter() throws Exception {
assertContains("1. Introduzione<b> </a></b> </p>", getXML("testControlCharacters.doc").xml.replaceAll("\\s+", " "));
}
+
+ @Test
+ public void testParagraphsAfterTables() throws Exception {
+ XMLResult result = getXML("test_TIKA-1251.doc");
+
+ String xml = result.xml;
+ Metadata metadata = result.metadata;
+
+ assertEquals(
+ "application/msword",
+ metadata.get(Metadata.CONTENT_TYPE));
+
+ assertContains("<p>1. Organisering av vakten:</p>", xml);
+
+ }
}
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/test_TIKA-1251.doc
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/test_TIKA-1251.doc?rev=1612373&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/test_TIKA-1251.doc
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream