You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2010/09/13 17:38:12 UTC
svn commit: r996577 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/microsoft/ooxml/
test/java/org/apache/tika/detect/ test/java/org/apache/tika/mime/
test/java/org/apache/tika/parser/microsoft/ooxml/
test/resources/test-documents/
Author: nick
Date: Mon Sep 13 15:38:12 2010
New Revision: 996577
URL: http://svn.apache.org/viewvc?rev=996577&view=rev
Log:
Tidy up OOXML unit tests by removing TODOs, and make the sample Word document contain a bit more content so we can later improve the unit test (TIKA-506)
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD.docx
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java?rev=996577&r1=996576&r2=996577&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java Mon Sep 13 15:38:12 2010
@@ -78,8 +78,7 @@ public class XWPFWordExtractorDecorator
XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(
new XWPFHyperlinkDecorator(paragraph, null, true));
- CTBookmark[] bookmarks = paragraph.getCTP().getBookmarkStartArray();
- for (CTBookmark bookmark : bookmarks) {
+ for (CTBookmark bookmark : paragraph.getCTP().getBookmarkStartArray()) {
xhtml.element("p", bookmark.getName());
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=996577&r1=996576&r2=996577&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java Mon Sep 13 15:38:12 2010
@@ -196,7 +196,7 @@ public class TestContainerAwareDetector
ContainerAwareDetector detector = new ContainerAwareDetector(mimeTypes);
// First up a truncated OOXML (zip) file
- InputStream input = getTestDoc("testWORD.docx");
+ InputStream input = getTestDoc("testEXCEL.xlsx");
byte [] buffer = new byte[300];
assertEquals(300,input.read(buffer));
Metadata metadata = new Metadata();
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=996577&r1=996576&r2=996577&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java Mon Sep 13 15:38:12 2010
@@ -118,10 +118,21 @@ public class TestMimeTypes extends TestC
assertTypeByName("application/vnd.ms-powerpoint.slideshow.macroenabled.12", "x.ppsm");
}
+ /**
+ * Note - detecting container formats by mime magic is very very
+ * iffy, as we can't be sure where things will end up.
+ * People really ought to use the container aware detection...
+ */
public void testOoxmlDetection() throws Exception {
- assertTypeByData("application/x-tika-ooxml", "testWORD.docx");
+ // These two do luckily have [Content_Types].xml near the start,
+ // so our mime magic will spot them
assertTypeByData("application/x-tika-ooxml", "testEXCEL.xlsx");
assertTypeByData("application/x-tika-ooxml", "testPPT.pptx");
+
+ // This one quite legitimately doesn't have its [Content_Types].xml
+ // file as one of the first couple of entries
+ // As such, our mime magic can't figure it out...
+ assertTypeByData("application/zip", "testWORD.docx");
}
public void testJpegDetection() throws Exception {
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=996577&r1=996576&r2=996577&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Mon Sep 13 15:38:12 2010
@@ -21,6 +21,9 @@ import java.util.Locale;
import junit.framework.TestCase;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.ContainerAwareDetector;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.parser.ParseContext;
@@ -31,22 +34,29 @@ import org.xml.sax.ContentHandler;
import org.apache.tika.parser.AutoDetectParser;
public class OOXMLParserTest extends TestCase {
- public void testExcel() throws Exception {
+ private Parser parser;
+
+ @Override
+ protected void setUp() throws Exception {
+ TikaConfig config = TikaConfig.getDefaultConfig();
+ ContainerAwareDetector detector = new ContainerAwareDetector(
+ config.getMimeRepository()
+ );
+ parser = new AutoDetectParser(detector);
+ }
+
+ public void testExcel() throws Exception {
InputStream input = OOXMLParserTest.class
.getResourceAsStream("/test-documents/testEXCEL.xlsx");
assertNotNull(input);
- Parser parser = new AutoDetectParser();
- Metadata metadata = new Metadata();
- // TODO: should auto-detect without the resource name
- metadata.set(Metadata.RESOURCE_NAME_KEY, "testEXCEL.xlsx");
+ Metadata metadata = new Metadata();
ContentHandler handler = new BodyContentHandler();
ParseContext context = new ParseContext();
context.set(Locale.class, Locale.US);
-
try {
- parser.parse(input, handler, metadata, context);
+ parser.parse(TikaInputStream.get(input), handler, metadata, context);
assertEquals(
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
@@ -70,16 +80,13 @@ public class OOXMLParserTest extends Tes
InputStream input = OOXMLParserTest.class
.getResourceAsStream("/test-documents/testEXCEL-formats.xlsx");
- Parser parser = new AutoDetectParser();
Metadata metadata = new Metadata();
- // TODO: should auto-detect without the resource name
- metadata.set(Metadata.RESOURCE_NAME_KEY, "testEXCEL-formats.xlsx");
ContentHandler handler = new BodyContentHandler();
ParseContext context = new ParseContext();
context.set(Locale.class, Locale.US);
try {
- parser.parse(input, handler, metadata, context);
+ parser.parse(TikaInputStream.get(input), handler, metadata, context);
assertEquals(
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
@@ -115,6 +122,10 @@ public class OOXMLParserTest extends Tes
// Date Format: d-mmm-yy
assertTrue(content.contains("17-May-07"));
+ // Currency $#,##0.00;[Red]($#,##0.00)
+ assertTrue(content.contains("$1,599.99"));
+ assertTrue(content.contains("($1,599.99)"));
+
// Below assertions represent outstanding formatting issues to be addressed
// they are included to allow the issues to be progressed with the Apache POI
// team - See TIKA-103.
@@ -126,10 +137,6 @@ public class OOXMLParserTest extends Tes
// Date/Time Format
assertTrue(content.contains("19/01/2008 04:35"));
- // Currency $#,##0.00;[Red]($#,##0.00)
- assertTrue(content.contains("$1,599.99"));
- assertTrue(content.contains("($1,599.99)"));
-
// Custom Number (0 "dollars and" .00 "cents")
assertTrue(content.contains("19 dollars and .99 cents"));
@@ -211,15 +218,12 @@ public class OOXMLParserTest extends Tes
InputStream input = OOXMLParserTest.class
.getResourceAsStream("/test-documents/testWORD.docx");
- Parser parser = new AutoDetectParser();
Metadata metadata = new Metadata();
- // TODO: should auto-detect without the resource name
- metadata.set(Metadata.RESOURCE_NAME_KEY, "testWORD.docx");
ContentHandler handler = new BodyContentHandler();
ParseContext context = new ParseContext();
try {
- parser.parse(input, handler, metadata, context);
+ parser.parse(TikaInputStream.get(input), handler, metadata, context);
assertEquals(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
Modified: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD.docx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD.docx?rev=996577&r1=996576&r2=996577&view=diff
==============================================================================
Binary files - no diff available.