You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/03/13 18:57:22 UTC
svn commit: r1666529 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/pkg/ test/java/org/apache/tika/detect/
test/java/org/apache/tika/parser/microsoft/ooxml/
test/resources/test-documents/
Author: nick
Date: Fri Mar 13 17:57:22 2015
New Revision: 1666529
URL: http://svn.apache.org/r1666529
Log:
Support detection of OOXML-Strict files, and add a disabled unit test for OOXML-Strict xlsx parsing (not yet supported by POI)
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL.strict.xlsx (with props)
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java?rev=1666529&r1=1666528&r2=1666529&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java Fri Mar 13 17:57:22 2015
@@ -35,12 +35,12 @@ import org.apache.commons.compress.archi
import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.CompressorInputStream;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
-import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
+import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.IOUtils;
@@ -58,9 +58,12 @@ import org.apache.tika.parser.iwork.IWor
public class ZipContainerDetector implements Detector {
private static final Pattern MACRO_TEMPLATE_PATTERN = Pattern.compile("macroenabledtemplate$", Pattern.CASE_INSENSITIVE);
- // TODO Remove this constant once we upgrade to POI 3.12 beta 2, it is defined in ExtractorFactory there
- private static final String VISIO_DOCUMENT_REL =
+ // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes
+ private static final String VISIO_DOCUMENT =
"http://schemas.microsoft.com/visio/2010/relationships/document";
+ // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes
+ private static final String STRICT_CORE_DOCUMENT =
+ "http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument";
/** Serial version UID */
private static final long serialVersionUID = 2891763938430295453L;
@@ -237,10 +240,13 @@ public class ZipContainerDetector implem
public static MediaType detectOfficeOpenXML(OPCPackage pkg) {
// Check for the normal Office core document
PackageRelationshipCollection core =
- pkg.getRelationshipsByType(ExtractorFactory.CORE_DOCUMENT_REL);
+ pkg.getRelationshipsByType(PackageRelationshipTypes.CORE_DOCUMENT);
// Otherwise check for some other Office core document types
if (core.size() == 0) {
- core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
+ core = pkg.getRelationshipsByType(STRICT_CORE_DOCUMENT);
+ }
+ if (core.size() == 0) {
+ core = pkg.getRelationshipsByType(VISIO_DOCUMENT);
}
// If we didn't find a single core document of any type, skip detection
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1666529&r1=1666528&r2=1666529&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java Fri Mar 13 17:57:22 2015
@@ -213,6 +213,7 @@ public class TestContainerAwareDetector
assertTypeByData("testPPT.ppsx", "application/vnd.openxmlformats-officedocument.presentationml.slideshow");
assertTypeByData("testPPT.ppsm", "application/vnd.ms-powerpoint.slideshow.macroEnabled.12");
assertTypeByData("testDOTM.dotm", "application/vnd.ms-word.template.macroEnabled.12");
+ assertTypeByData("testEXCEL.strict.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
assertTypeByData("testPPT.xps", "application/vnd.ms-xpsdocument");
assertTypeByData("testVISIO.vsdm", "application/vnd.ms-visio.drawing.macroenabled.12");
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1666529&r1=1666528&r2=1666529&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Fri Mar 13 17:57:22 2015
@@ -28,6 +28,7 @@ import java.io.StringWriter;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
+
import org.apache.tika.TikaTest;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.io.IOUtils;
@@ -44,6 +45,7 @@ import org.apache.tika.parser.Parser;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.microsoft.WordParserTest;
import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Ignore;
import org.junit.Test;
import org.xml.sax.ContentHandler;
@@ -163,6 +165,38 @@ public class OOXMLParserTest extends Tik
} finally {
input.close();
}
+ }
+
+ @Test
+ @Ignore("OOXML-Strict not currently supported by POI, see #57699")
+ public void testExcelStrict() throws Exception {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.US);
+
+ InputStream input = getTestDocument("testEXCEL.strict.xlsx");
+ try {
+ parser.parse(input, handler, metadata, context);
+
+ assertEquals(
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Sample Spreadsheet", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Nick Burch", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Spreadsheet for testing", metadata.get(TikaCoreProperties.DESCRIPTION));
+
+ String content = handler.toString();
+ assertContains("Test spreadsheet", content);
+ assertContains("This one is red", content);
+ assertContains("cb=10", content);
+ assertNotContained("10.0", content);
+ assertContains("cb=sum", content);
+ assertNotContained("13.0", content);
+ assertEquals("false", metadata.get(TikaMetadataKeys.PROTECTED));
+ } finally {
+ input.close();
+ }
}
/**
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL.strict.xlsx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL.strict.xlsx?rev=1666529&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL.strict.xlsx
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream