You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2012/12/14 04:09:03 UTC

svn commit: r1421646 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/microsoft/ main/java/org/apache/tika/parser/microsoft/ooxml/ test/java/org/apache/tika/parser/microsoft/ test/java/org/apache/tika/parser/microsoft/ooxml/ test/resources/test-documents/

Author: nick
Date: Fri Dec 14 03:08:58 2012
New Revision: 1421646

URL: http://svn.apache.org/viewvc?rev=1421646&view=rev
Log:
TIKA-1044 Fix issue for Word extractors on text that lacks any styling, plus tests based on files from Jonas Wilhelmsson

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_no_format.doc   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_no_format.docx   (with props)
Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1421646&r1=1421645&r2=1421646&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Fri Dec 14 03:08:58 2012
@@ -173,7 +173,7 @@ public class WordExtractor extends Abstr
        if (document.getStyleSheet().numStyles()>p.getStyleIndex()) {
            StyleDescription style =
               document.getStyleSheet().getStyleDescription(p.getStyleIndex());
-           if (style!=null) {
+           if (style != null && style.getName() != null && style.getName().length() > 0) {
                tas = buildParagraphTagAndStyle(style.getName(), (parentTableLevel>0));
            } else {
                tas = new TagAndStyle("p", null);

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java?rev=1421646&r1=1421645&r2=1421646&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java Fri Dec 14 03:08:58 2012
@@ -125,7 +125,7 @@ public class XWPFWordExtractorDecorator 
                 paragraph.getStyleID()
           );
 
-          if (style != null) {
+          if (style != null && style.getName() != null) {
              TagAndStyle tas = WordExtractor.buildParagraphTagAndStyle(
                    style.getName(), paragraph.getPartType() == BodyType.TABLECELL
              );

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1421646&r1=1421645&r2=1421646&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Fri Dec 14 03:08:58 2012
@@ -256,6 +256,26 @@ public class WordParserTest extends Tika
     }
     
     /**
+     * TIKA-1044 - Handle documents where parts of the
+     *  text have no formatting or styles applied to them
+     */
+    public void testNoFormat() throws Exception {
+       ContentHandler handler = new BodyContentHandler();
+       Metadata metadata = new Metadata();
+
+       InputStream stream = WordParserTest.class.getResourceAsStream(
+               "/test-documents/testWORD_no_format.doc");
+       try {
+           new OfficeParser().parse(stream, handler, metadata, new ParseContext());
+       } finally {
+           stream.close();
+       }
+
+       String content = handler.toString();
+       assertContains("Will generate an exception", content);
+    }
+    
+    /**
      * Ensures that custom OLE2 (HPSF) properties are extracted
      */
     public void testCustomProperties() throws Exception {

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1421646&r1=1421645&r2=1421646&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Fri Dec 14 03:08:58 2012
@@ -36,6 +36,8 @@ import org.apache.tika.metadata.TikaMeta
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.microsoft.OfficeParser;
+import org.apache.tika.parser.microsoft.WordParserTest;
 import org.apache.tika.sax.BodyContentHandler;
 import org.xml.sax.ContentHandler;
 
@@ -874,6 +876,26 @@ public class OOXMLParserTest extends Tik
       assertContains("Test av styrt dokument", xml);
     }
 
+    /**
+     * TIKA-1044 - Handle word documents where parts of the
+     *  text have no formatting or styles applied to them
+     */
+    public void testNoFormat() throws Exception {
+       ContentHandler handler = new BodyContentHandler();
+       Metadata metadata = new Metadata();
+
+       InputStream stream = WordParserTest.class.getResourceAsStream(
+               "/test-documents/testWORD_no_format.docx");
+       try {
+          new OOXMLParser().parse(stream, handler, metadata, new ParseContext());
+       } finally {
+           stream.close();
+       }
+
+       String content = handler.toString();
+       assertContains("This is a piece of text that causes an exception", content);
+    }
+    
     // TIKA-1005:
     public void testTextInsideTextBox() throws Exception {
         String xml = getXML("testWORD_text_box.docx").xml;

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_no_format.doc
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_no_format.doc?rev=1421646&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_no_format.doc
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_no_format.docx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_no_format.docx?rev=1421646&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_no_format.docx
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream