You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/09/02 10:55:42 UTC

svn commit: r991862 - in /tika/trunk/tika-parsers/src/test: java/org/apache/tika/parser/microsoft/ooxml/ resources/test-documents/

Author: jukka
Date: Thu Sep  2 08:55:42 2010
New Revision: 991862

URL: http://svn.apache.org/viewvc?rev=991862&view=rev
Log:
Add missing svn:eol-style settings

Modified:
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java   (contents, props changed)
    tika/trunk/tika-parsers/src/test/resources/test-documents/big-preamble.html   (props changed)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testPBM.pbm   (props changed)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testPGM.pgm   (props changed)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testPPM.ppm   (props changed)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testSVG.svg   (props changed)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testXHTML.html   (props changed)

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=991862&r1=991861&r2=991862&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Thu Sep  2 08:55:42 2010
@@ -1,290 +1,290 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft.ooxml;
-
-import java.io.InputStream;
-import java.util.Locale;
-
-import junit.framework.TestCase;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaMetadataKeys;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.xml.sax.ContentHandler;
-
-import org.apache.tika.parser.AutoDetectParser;
-
-public class OOXMLParserTest extends TestCase {
-    public void testExcel() throws Exception {
-        InputStream input = OOXMLParserTest.class
-                .getResourceAsStream("/test-documents/testEXCEL.xlsx");
-        assertNotNull(input);
-
-        Parser parser = new AutoDetectParser();
-        Metadata metadata = new Metadata();
-        // TODO: should auto-detect without the resource name
-        metadata.set(Metadata.RESOURCE_NAME_KEY, "testEXCEL.xlsx");
-        ContentHandler handler = new BodyContentHandler();
-        ParseContext context = new ParseContext();
-        context.set(Locale.class, Locale.US);
-
-
-        try {
-            parser.parse(input, handler, metadata, context);
-
-            assertEquals(
-                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
-                    metadata.get(Metadata.CONTENT_TYPE));
-            assertEquals("Simple Excel document", metadata.get(Metadata.TITLE));
-            assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
-            String content = handler.toString();
-            assertTrue(content.contains("Sample Excel Worksheet"));
-            assertTrue(content.contains("Numbers and their Squares"));
-            assertTrue(content.contains("9"));
-            assertFalse(content.contains("9.0"));
-            assertTrue(content.contains("196"));
-            assertFalse(content.contains("196.0"));
-            assertEquals("false", metadata.get(TikaMetadataKeys.PROTECTED));
-        } finally {
-            input.close();
-        }
-    }
-
-    public void testExcelFormats() throws Exception {
-        InputStream input = OOXMLParserTest.class
-                .getResourceAsStream("/test-documents/testEXCEL-formats.xlsx");
-
-        Parser parser = new AutoDetectParser();
-        Metadata metadata = new Metadata();
-        // TODO: should auto-detect without the resource name
-        metadata.set(Metadata.RESOURCE_NAME_KEY, "testEXCEL-formats.xlsx");
-        ContentHandler handler = new BodyContentHandler();
-        ParseContext context = new ParseContext();
-        context.set(Locale.class, Locale.US);
-
-        try {
-            parser.parse(input, handler, metadata, context);
-
-            assertEquals(
-                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
-                    metadata.get(Metadata.CONTENT_TYPE));
-
-            String content = handler.toString();
-
-            // Number #,##0.00
-            assertTrue(content.contains("1,599.99"));
-            assertTrue(content.contains("-1,599.99"));
-
-            // Currency $#,##0.00;[Red]($#,##0.00)
-            assertTrue(content.contains("$1,599.99"));
-            assertTrue(content.contains("$1,599.99)"));
-
-            // Scientific 0.00E+00
-            assertTrue(content.contains("1.98E08"));
-            assertTrue(content.contains("-1.98E08"));
-
-            // Percentage
-            assertTrue(content.contains("2.50%"));
-            // Excel rounds up to 3%, but that requires Java 1.6 or later
-            if(System.getProperty("java.version").startsWith("1.5")) {
-                assertTrue(content.contains("2%"));
-            } else {
-                assertTrue(content.contains("3%"));
-            }
-
-            // Time Format: h:mm
-            assertTrue(content.contains("6:15"));
-            assertTrue(content.contains("18:15"));
-
-            // Date Format: d-mmm-yy
-            assertTrue(content.contains("17-May-07"));
-
-            // Below assertions represent outstanding formatting issues to be addressed
-            // they are included to allow the issues to be progressed with the Apache POI
-            // team - See TIKA-103.
-
-            /*************************************************************************
-            // Date Format: m/d/yy
-            assertTrue(content.contains("03/10/2009"));
-
-            // Date/Time Format
-            assertTrue(content.contains("19/01/2008 04:35"));
-
-            // Currency $#,##0.00;[Red]($#,##0.00)
-            assertTrue(content.contains("$1,599.99"));
-            assertTrue(content.contains("($1,599.99)"));
-            
-            // Custom Number (0 "dollars and" .00 "cents")
-            assertTrue(content.contains("19 dollars and .99 cents"));
-
-            // Custom Number ("At" h:mm AM/PM "on" dddd mmmm d"," yyyy)
-            assertTrue(content.contains("At 4:20 AM on Thursday May 17, 2007"));
-
-            // Fraction (2.5): # ?/?
-            assertTrue(content.contains("2 1 / 2"));
-            **************************************************************************/
-        } finally {
-            input.close();
-        }
-    }
-
-    /**
-     * We have a number of different powerpoint files,
-     *  such as presentation, macro-enabled etc
-     */
-    public void testPowerPoint() throws Exception {
-	String[] extensions = new String[] {
-		"pptx", "pptm", "ppsm", "ppsx",
-		//"thmx", // TIKA-418: Will be supported in POI 3.7 beta 2 
-		//"xps" // TIKA-418: Not yet supported by POI
-	};
-	for(String extension : extensions) {
-	    String filename = "testPPT." + extension;
-            InputStream input = OOXMLParserTest.class
-                    .getResourceAsStream("/test-documents/"+filename);
-    
-            Parser parser = new AutoDetectParser();
-            Metadata metadata = new Metadata();
-            // TODO: should auto-detect without the resource name
-            metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
-            ContentHandler handler = new BodyContentHandler();
-            ParseContext context = new ParseContext();
-    
-            try {
-                parser.parse(input, handler, metadata, context);
-    
-                assertEquals(
-                        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
-                        metadata.get(Metadata.CONTENT_TYPE));
-                assertEquals("Attachment Test", metadata.get(Metadata.TITLE));
-                assertEquals("Rajiv", metadata.get(Metadata.AUTHOR));
-                
-                String content = handler.toString();
-                // Theme files don't have the text in them
-                if(extension.equals("thmx")) {
-                    assertEquals("", content);
-                } else {
-                    assertTrue(
-                    	"Text missing for " + filename + "\n" + content, 
-                    	content.contains("Attachment Test")
-                    );
-                    assertTrue(
-                    	"Text missing for " + filename + "\n" + content, 
-                    	content.contains("This is a test file data with the same content")
-                    );
-                    assertTrue(
-                    	"Text missing for " + filename + "\n" + content, 
-                    	content.contains("content parsing")
-                    );
-                    assertTrue(
-                    	"Text missing for " + filename + "\n" + content, 
-                    	content.contains("Different words to test against")
-                    );
-                    assertTrue(
-                    	"Text missing for " + filename + "\n" + content, 
-                    	content.contains("Mystery")
-                    );
-                }
-            } finally {
-                input.close();
-            }
-	}
-    }
-    
-    public void testWord() throws Exception {
-        InputStream input = OOXMLParserTest.class
-                .getResourceAsStream("/test-documents/testWORD.docx");
-
-        Parser parser = new AutoDetectParser();
-        Metadata metadata = new Metadata();
-        // TODO: should auto-detect without the resource name
-        metadata.set(Metadata.RESOURCE_NAME_KEY, "testWORD.docx");
-        ContentHandler handler = new BodyContentHandler();
-        ParseContext context = new ParseContext();
-
-        try {
-            parser.parse(input, handler, metadata, context);
-
-            assertEquals(
-                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-                    metadata.get(Metadata.CONTENT_TYPE));
-            assertEquals("Sample Word Document", metadata.get(Metadata.TITLE));
-            assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
-            assertTrue(handler.toString().contains("Sample Word Document"));
-        } finally {
-            input.close();
-        }
-    }
-
-    /**
-     * Documents with some sheets are protected, but not all. 
-     * See TIKA-364.
-     */
-    public void testProtectedExcelSheets() throws Exception {
-        InputStream input = OOXMLParserTest.class
-                .getResourceAsStream("/test-documents/protectedSheets.xlsx");
-
-        Parser parser = new AutoDetectParser();
-        Metadata metadata = new Metadata();
-        ContentHandler handler = new BodyContentHandler();
-        ParseContext context = new ParseContext();
-
-        try {
-            parser.parse(input, handler, metadata, context);
-
-            assertEquals(
-                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
-                    metadata.get(Metadata.CONTENT_TYPE));
-
-            assertEquals("true", metadata.get(TikaMetadataKeys.PROTECTED));
-        } finally {
-            input.close();
-        }
-    }
-
-    /**
-     * An excel document which is password protected. 
-     * See TIKA-437.
-     */
-    public void testProtectedExcelFile() throws Exception {
-        InputStream input = OOXMLParserTest.class
-                .getResourceAsStream("/test-documents/protectedFile.xlsx");
-
-        Parser parser = new AutoDetectParser();
-        Metadata metadata = new Metadata();
-        ContentHandler handler = new BodyContentHandler();
-        ParseContext context = new ParseContext();
-
-        try {
-            parser.parse(input, handler, metadata, context);
-
-            assertEquals(
-                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
-                    metadata.get(Metadata.CONTENT_TYPE));
-
-            assertEquals("true", metadata.get(TikaMetadataKeys.PROTECTED));
-            
-            String content = handler.toString();
-            assertTrue(content.contains("Office"));
-        } finally {
-            input.close();
-        }
-    }
-
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.InputStream;
+import java.util.Locale;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+
+import org.apache.tika.parser.AutoDetectParser;
+
+public class OOXMLParserTest extends TestCase {
+    public void testExcel() throws Exception {
+        InputStream input = OOXMLParserTest.class
+                .getResourceAsStream("/test-documents/testEXCEL.xlsx");
+        assertNotNull(input);
+
+        Parser parser = new AutoDetectParser();
+        Metadata metadata = new Metadata();
+        // TODO: should auto-detect without the resource name
+        metadata.set(Metadata.RESOURCE_NAME_KEY, "testEXCEL.xlsx");
+        ContentHandler handler = new BodyContentHandler();
+        ParseContext context = new ParseContext();
+        context.set(Locale.class, Locale.US);
+
+
+        try {
+            parser.parse(input, handler, metadata, context);
+
+            assertEquals(
+                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+                    metadata.get(Metadata.CONTENT_TYPE));
+            assertEquals("Simple Excel document", metadata.get(Metadata.TITLE));
+            assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
+            String content = handler.toString();
+            assertTrue(content.contains("Sample Excel Worksheet"));
+            assertTrue(content.contains("Numbers and their Squares"));
+            assertTrue(content.contains("9"));
+            assertFalse(content.contains("9.0"));
+            assertTrue(content.contains("196"));
+            assertFalse(content.contains("196.0"));
+            assertEquals("false", metadata.get(TikaMetadataKeys.PROTECTED));
+        } finally {
+            input.close();
+        }
+    }
+
+    public void testExcelFormats() throws Exception {
+        InputStream input = OOXMLParserTest.class
+                .getResourceAsStream("/test-documents/testEXCEL-formats.xlsx");
+
+        Parser parser = new AutoDetectParser();
+        Metadata metadata = new Metadata();
+        // TODO: should auto-detect without the resource name
+        metadata.set(Metadata.RESOURCE_NAME_KEY, "testEXCEL-formats.xlsx");
+        ContentHandler handler = new BodyContentHandler();
+        ParseContext context = new ParseContext();
+        context.set(Locale.class, Locale.US);
+
+        try {
+            parser.parse(input, handler, metadata, context);
+
+            assertEquals(
+                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+                    metadata.get(Metadata.CONTENT_TYPE));
+
+            String content = handler.toString();
+
+            // Number #,##0.00
+            assertTrue(content.contains("1,599.99"));
+            assertTrue(content.contains("-1,599.99"));
+
+            // Currency $#,##0.00;[Red]($#,##0.00)
+            assertTrue(content.contains("$1,599.99"));
+            assertTrue(content.contains("$1,599.99)"));
+
+            // Scientific 0.00E+00
+            assertTrue(content.contains("1.98E08"));
+            assertTrue(content.contains("-1.98E08"));
+
+            // Percentage
+            assertTrue(content.contains("2.50%"));
+            // Excel rounds up to 3%, but that requires Java 1.6 or later
+            if(System.getProperty("java.version").startsWith("1.5")) {
+                assertTrue(content.contains("2%"));
+            } else {
+                assertTrue(content.contains("3%"));
+            }
+
+            // Time Format: h:mm
+            assertTrue(content.contains("6:15"));
+            assertTrue(content.contains("18:15"));
+
+            // Date Format: d-mmm-yy
+            assertTrue(content.contains("17-May-07"));
+
+            // Below assertions represent outstanding formatting issues to be addressed
+            // they are included to allow the issues to be progressed with the Apache POI
+            // team - See TIKA-103.
+
+            /*************************************************************************
+            // Date Format: m/d/yy
+            assertTrue(content.contains("03/10/2009"));
+
+            // Date/Time Format
+            assertTrue(content.contains("19/01/2008 04:35"));
+
+            // Currency $#,##0.00;[Red]($#,##0.00)
+            assertTrue(content.contains("$1,599.99"));
+            assertTrue(content.contains("($1,599.99)"));
+            
+            // Custom Number (0 "dollars and" .00 "cents")
+            assertTrue(content.contains("19 dollars and .99 cents"));
+
+            // Custom Number ("At" h:mm AM/PM "on" dddd mmmm d"," yyyy)
+            assertTrue(content.contains("At 4:20 AM on Thursday May 17, 2007"));
+
+            // Fraction (2.5): # ?/?
+            assertTrue(content.contains("2 1 / 2"));
+            **************************************************************************/
+        } finally {
+            input.close();
+        }
+    }
+
+    /**
+     * We have a number of different powerpoint files,
+     *  such as presentation, macro-enabled etc
+     */
+    public void testPowerPoint() throws Exception {
+	String[] extensions = new String[] {
+		"pptx", "pptm", "ppsm", "ppsx",
+		//"thmx", // TIKA-418: Will be supported in POI 3.7 beta 2 
+		//"xps" // TIKA-418: Not yet supported by POI
+	};
+	for(String extension : extensions) {
+	    String filename = "testPPT." + extension;
+            InputStream input = OOXMLParserTest.class
+                    .getResourceAsStream("/test-documents/"+filename);
+    
+            Parser parser = new AutoDetectParser();
+            Metadata metadata = new Metadata();
+            // TODO: should auto-detect without the resource name
+            metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
+            ContentHandler handler = new BodyContentHandler();
+            ParseContext context = new ParseContext();
+    
+            try {
+                parser.parse(input, handler, metadata, context);
+    
+                assertEquals(
+                        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+                        metadata.get(Metadata.CONTENT_TYPE));
+                assertEquals("Attachment Test", metadata.get(Metadata.TITLE));
+                assertEquals("Rajiv", metadata.get(Metadata.AUTHOR));
+                
+                String content = handler.toString();
+                // Theme files don't have the text in them
+                if(extension.equals("thmx")) {
+                    assertEquals("", content);
+                } else {
+                    assertTrue(
+                    	"Text missing for " + filename + "\n" + content, 
+                    	content.contains("Attachment Test")
+                    );
+                    assertTrue(
+                    	"Text missing for " + filename + "\n" + content, 
+                    	content.contains("This is a test file data with the same content")
+                    );
+                    assertTrue(
+                    	"Text missing for " + filename + "\n" + content, 
+                    	content.contains("content parsing")
+                    );
+                    assertTrue(
+                    	"Text missing for " + filename + "\n" + content, 
+                    	content.contains("Different words to test against")
+                    );
+                    assertTrue(
+                    	"Text missing for " + filename + "\n" + content, 
+                    	content.contains("Mystery")
+                    );
+                }
+            } finally {
+                input.close();
+            }
+	}
+    }
+    
+    public void testWord() throws Exception {
+        InputStream input = OOXMLParserTest.class
+                .getResourceAsStream("/test-documents/testWORD.docx");
+
+        Parser parser = new AutoDetectParser();
+        Metadata metadata = new Metadata();
+        // TODO: should auto-detect without the resource name
+        metadata.set(Metadata.RESOURCE_NAME_KEY, "testWORD.docx");
+        ContentHandler handler = new BodyContentHandler();
+        ParseContext context = new ParseContext();
+
+        try {
+            parser.parse(input, handler, metadata, context);
+
+            assertEquals(
+                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                    metadata.get(Metadata.CONTENT_TYPE));
+            assertEquals("Sample Word Document", metadata.get(Metadata.TITLE));
+            assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
+            assertTrue(handler.toString().contains("Sample Word Document"));
+        } finally {
+            input.close();
+        }
+    }
+
+    /**
+     * Documents with some sheets are protected, but not all. 
+     * See TIKA-364.
+     */
+    public void testProtectedExcelSheets() throws Exception {
+        InputStream input = OOXMLParserTest.class
+                .getResourceAsStream("/test-documents/protectedSheets.xlsx");
+
+        Parser parser = new AutoDetectParser();
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+        ParseContext context = new ParseContext();
+
+        try {
+            parser.parse(input, handler, metadata, context);
+
+            assertEquals(
+                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+                    metadata.get(Metadata.CONTENT_TYPE));
+
+            assertEquals("true", metadata.get(TikaMetadataKeys.PROTECTED));
+        } finally {
+            input.close();
+        }
+    }
+
+    /**
+     * An excel document which is password protected. 
+     * See TIKA-437.
+     */
+    public void testProtectedExcelFile() throws Exception {
+        InputStream input = OOXMLParserTest.class
+                .getResourceAsStream("/test-documents/protectedFile.xlsx");
+
+        Parser parser = new AutoDetectParser();
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+        ParseContext context = new ParseContext();
+
+        try {
+            parser.parse(input, handler, metadata, context);
+
+            assertEquals(
+                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+                    metadata.get(Metadata.CONTENT_TYPE));
+
+            assertEquals("true", metadata.get(TikaMetadataKeys.PROTECTED));
+            
+            String content = handler.toString();
+            assertTrue(content.contains("Office"));
+        } finally {
+            input.close();
+        }
+    }
+
 }
\ No newline at end of file

Propchange: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/big-preamble.html
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPBM.pbm
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPGM.pgm
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPPM.ppm
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testSVG.svg
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testXHTML.html
------------------------------------------------------------------------------
    svn:eol-style = native