You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2012/12/14 04:09:03 UTC
svn commit: r1421646 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/microsoft/
main/java/org/apache/tika/parser/microsoft/ooxml/
test/java/org/apache/tika/parser/microsoft/
test/java/org/apache/tika/parser/microsoft/ooxml/ test/resources/test-documents/
Author: nick
Date: Fri Dec 14 03:08:58 2012
New Revision: 1421646
URL: http://svn.apache.org/viewvc?rev=1421646&view=rev
Log:
TIKA-1044 Fix issue for Word extractors on text that lacks any styling, plus tests based on files from Jonas Wilhelmsson
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_no_format.doc (with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_no_format.docx (with props)
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1421646&r1=1421645&r2=1421646&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Fri Dec 14 03:08:58 2012
@@ -173,7 +173,7 @@ public class WordExtractor extends Abstr
if (document.getStyleSheet().numStyles()>p.getStyleIndex()) {
StyleDescription style =
document.getStyleSheet().getStyleDescription(p.getStyleIndex());
- if (style!=null) {
+ if (style != null && style.getName() != null && style.getName().length() > 0) {
tas = buildParagraphTagAndStyle(style.getName(), (parentTableLevel>0));
} else {
tas = new TagAndStyle("p", null);
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java?rev=1421646&r1=1421645&r2=1421646&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java Fri Dec 14 03:08:58 2012
@@ -125,7 +125,7 @@ public class XWPFWordExtractorDecorator
paragraph.getStyleID()
);
- if (style != null) {
+ if (style != null && style.getName() != null) {
TagAndStyle tas = WordExtractor.buildParagraphTagAndStyle(
style.getName(), paragraph.getPartType() == BodyType.TABLECELL
);
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1421646&r1=1421645&r2=1421646&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Fri Dec 14 03:08:58 2012
@@ -256,6 +256,26 @@ public class WordParserTest extends Tika
}
/**
+ * TIKA-1044 - Handle documents where parts of the
+ * text have no formatting or styles applied to them
+ */
+ public void testNoFormat() throws Exception {
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ InputStream stream = WordParserTest.class.getResourceAsStream(
+ "/test-documents/testWORD_no_format.doc");
+ try {
+ new OfficeParser().parse(stream, handler, metadata, new ParseContext());
+ } finally {
+ stream.close();
+ }
+
+ String content = handler.toString();
+ assertContains("Will generate an exception", content);
+ }
+
+ /**
* Ensures that custom OLE2 (HPSF) properties are extracted
*/
public void testCustomProperties() throws Exception {
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1421646&r1=1421645&r2=1421646&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Fri Dec 14 03:08:58 2012
@@ -36,6 +36,8 @@ import org.apache.tika.metadata.TikaMeta
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.microsoft.OfficeParser;
+import org.apache.tika.parser.microsoft.WordParserTest;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
@@ -874,6 +876,26 @@ public class OOXMLParserTest extends Tik
assertContains("Test av styrt dokument", xml);
}
+ /**
+ * TIKA-1044 - Handle word documents where parts of the
+ * text have no formatting or styles applied to them
+ */
+ public void testNoFormat() throws Exception {
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ InputStream stream = WordParserTest.class.getResourceAsStream(
+ "/test-documents/testWORD_no_format.docx");
+ try {
+ new OOXMLParser().parse(stream, handler, metadata, new ParseContext());
+ } finally {
+ stream.close();
+ }
+
+ String content = handler.toString();
+ assertContains("This is a piece of text that causes an exception", content);
+ }
+
// TIKA-1005:
public void testTextInsideTextBox() throws Exception {
String xml = getXML("testWORD_text_box.docx").xml;
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_no_format.doc
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_no_format.doc?rev=1421646&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_no_format.doc
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_no_format.docx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_no_format.docx?rev=1421646&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_no_format.docx
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream