You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/03/13 18:57:22 UTC

svn commit: r1666529 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/pkg/ test/java/org/apache/tika/detect/ test/java/org/apache/tika/parser/microsoft/ooxml/ test/resources/test-documents/

Author: nick
Date: Fri Mar 13 17:57:22 2015
New Revision: 1666529

URL: http://svn.apache.org/r1666529
Log:
Support detection of OOXML-Strict files, and add a disabled unit test for OOXML-Strict xlsx parsing (not yet supported by POI)

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL.strict.xlsx   (with props)
Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java?rev=1666529&r1=1666528&r2=1666529&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java Fri Mar 13 17:57:22 2015
@@ -35,12 +35,12 @@ import org.apache.commons.compress.archi
 import org.apache.commons.compress.compressors.CompressorException;
 import org.apache.commons.compress.compressors.CompressorInputStream;
 import org.apache.commons.compress.compressors.CompressorStreamFactory;
-import org.apache.poi.extractor.ExtractorFactory;
 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
 import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.poi.openxml4j.opc.PackageAccess;
 import org.apache.poi.openxml4j.opc.PackagePart;
 import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
+import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.IOUtils;
@@ -58,9 +58,12 @@ import org.apache.tika.parser.iwork.IWor
 public class ZipContainerDetector implements Detector {
     private static final Pattern MACRO_TEMPLATE_PATTERN = Pattern.compile("macroenabledtemplate$", Pattern.CASE_INSENSITIVE);
 
-    // TODO Remove this constant once we upgrade to POI 3.12 beta 2, it is defined in ExtractorFactory there
-    private static final String VISIO_DOCUMENT_REL =
+    // TODO Remove this constant once we upgrade to POI 3.12 beta 2, and use the equivalent from PackageRelationshipTypes instead
+    private static final String VISIO_DOCUMENT =
             "http://schemas.microsoft.com/visio/2010/relationships/document";
+    // TODO Remove this constant once we upgrade to POI 3.12 beta 2, and use the equivalent from PackageRelationshipTypes instead
+    private static final String STRICT_CORE_DOCUMENT = 
+            "http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument";
     
     /** Serial version UID */
     private static final long serialVersionUID = 2891763938430295453L;
@@ -237,10 +240,13 @@ public class ZipContainerDetector implem
     public static MediaType detectOfficeOpenXML(OPCPackage pkg) {
         // Check for the normal Office core document
         PackageRelationshipCollection core = 
-           pkg.getRelationshipsByType(ExtractorFactory.CORE_DOCUMENT_REL);
+               pkg.getRelationshipsByType(PackageRelationshipTypes.CORE_DOCUMENT);
         // Otherwise check for some other Office core document types
         if (core.size() == 0) {
-            core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
+            core = pkg.getRelationshipsByType(STRICT_CORE_DOCUMENT);
+        }
+        if (core.size() == 0) {
+            core = pkg.getRelationshipsByType(VISIO_DOCUMENT);
         }
         
         // If we didn't find a single core document of any type, skip detection

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1666529&r1=1666528&r2=1666529&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java Fri Mar 13 17:57:22 2015
@@ -213,6 +213,7 @@ public class TestContainerAwareDetector
         assertTypeByData("testPPT.ppsx", "application/vnd.openxmlformats-officedocument.presentationml.slideshow");
         assertTypeByData("testPPT.ppsm", "application/vnd.ms-powerpoint.slideshow.macroEnabled.12");
         assertTypeByData("testDOTM.dotm", "application/vnd.ms-word.template.macroEnabled.12");
+        assertTypeByData("testEXCEL.strict.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
         assertTypeByData("testPPT.xps", "application/vnd.ms-xpsdocument");
 
         assertTypeByData("testVISIO.vsdm", "application/vnd.ms-visio.drawing.macroenabled.12");

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1666529&r1=1666528&r2=1666529&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Fri Mar 13 17:57:22 2015
@@ -28,6 +28,7 @@ import java.io.StringWriter;
 import java.util.HashMap;
 import java.util.Locale;
 import java.util.Map;
+
 import org.apache.tika.TikaTest;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.io.IOUtils;
@@ -44,6 +45,7 @@ import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.PasswordProvider;
 import org.apache.tika.parser.microsoft.WordParserTest;
 import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Ignore;
 import org.junit.Test;
 import org.xml.sax.ContentHandler;
 
@@ -163,6 +165,38 @@ public class OOXMLParserTest extends Tik
         } finally {
             input.close();
         }
+    }
+
+    @Test
+    @Ignore("OOXML-Strict not currently supported by POI, see #57699")
+    public void testExcelStrict() throws Exception {
+        Metadata metadata = new Metadata(); 
+        ContentHandler handler = new BodyContentHandler();
+        ParseContext context = new ParseContext();
+        context.set(Locale.class, Locale.US);
+
+        InputStream input = getTestDocument("testEXCEL.strict.xlsx");
+        try {
+            parser.parse(input, handler, metadata, context);
+
+            assertEquals(
+                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+                    metadata.get(Metadata.CONTENT_TYPE));
+            assertEquals("Sample Spreadsheet", metadata.get(TikaCoreProperties.TITLE));
+            assertEquals("Nick Burch", metadata.get(TikaCoreProperties.CREATOR));
+            assertEquals("Spreadsheet for testing", metadata.get(TikaCoreProperties.DESCRIPTION));
+            
+            String content = handler.toString();
+            assertContains("Test spreadsheet", content);
+            assertContains("This one is red", content);
+            assertContains("cb=10", content);
+            assertNotContained("10.0", content);
+            assertContains("cb=sum", content);
+            assertNotContained("13.0", content);
+            assertEquals("false", metadata.get(TikaMetadataKeys.PROTECTED));
+        } finally {
+            input.close();
+        }
     }
 
     /**

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL.strict.xlsx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL.strict.xlsx?rev=1666529&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL.strict.xlsx
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream