You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/02/28 16:39:28 UTC
[tika] branch master updated: TIKA-3057 -- improve detection of
some zip based files
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new c89fc0c TIKA-3057 -- improve detection of some zip based files
new d23602e Merge remote-tracking branch 'origin/master'
c89fc0c is described below
commit c89fc0c95937b71e9c1a1b5905f34e0dc1cb650f
Author: tallison <ta...@apache.org>
AuthorDate: Fri Feb 28 11:38:51 2020 -0500
TIKA-3057 -- improve detection of some zip based files
---
CHANGES.txt | 2 +
.../org/apache/tika/mime/tika-mimetypes.xml | 28 +++++++
.../parser/iwork/iwana/IWork13PackageParser.java | 2 +-
...ackageParser.java => IWork18PackageParser.java} | 89 +++++++++------------
.../org/apache/tika/parser/pkg/PackageParser.java | 7 ++
.../parser/pkg/StreamingZipContainerDetector.java | 31 +++++--
.../tika/parser/pkg/ZipContainerDetector.java | 25 +++++-
.../tika/parser/pkg/ZipContainerDetectorBase.java | 77 ++++++++++++++++++
.../tika/detect/TestContainerAwareDetector.java | 49 +++++++++++-
.../resources/test-documents/testKeynote2018.key | Bin 0 -> 54228 bytes
.../test-documents/testOpenOffice-autotext.bau | Bin 0 -> 14802 bytes
.../test-documents/testOpenOffice-extension.oxt | Bin 0 -> 1741 bytes
.../test-documents/testStarOffice-6.0-calc.sxc | Bin 0 -> 7406 bytes
.../test-documents/testStarOffice-6.0-draw.sxd | Bin 0 -> 14593 bytes
.../test-documents/testStarOffice-6.0-draw.sxi | Bin 0 -> 7581 bytes
.../testStarOffice-6.0-writer-template.stw | Bin 0 -> 5669 bytes
.../test-documents/testStarOffice-6.0-writer.sxw | Bin 0 -> 5200 bytes
17 files changed, 250 insertions(+), 60 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 798529a..d382061 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -7,6 +7,8 @@ Release 2.0.0 - ???
Release 1.24 - ???
+ * Improve detection of some zip-based formats (TIKA-3057).
+
* Upgrade metadata-extractor to 2.13.0 (TIKA-2952).
* Upgrade to POI 4.1.2 (TIKA-3047).
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 3211cfb..ea1f97b 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -2353,6 +2353,11 @@
<glob pattern="*.oxt"/>
</mime-type>
+ <mime-type type="application/vnd.openofficeorg.autotext">
+ <sub-class-of type="application/zip"/>
+ <glob pattern="*.bau"/>
+ </mime-type>
+
<mime-type type="application/vnd.openxmlformats-officedocument.presentationml.presentation">
<_comment>Office Open XML Presentation</_comment>
<glob pattern="*.pptx"/>
@@ -2598,6 +2603,29 @@
<sub-class-of type="application/x-tika-staroffice"/>
<glob pattern="*.vor"/>
</mime-type>
+ <mime-type type="application/vnd.sun.xml.writer">
+ <sub-class-of type="application/zip"/>
+ <glob pattern="*.sxw"/>
+ </mime-type>
+ <!-- can't currently find any difference in contents between writer
+ and template. Must rely on the file extension -->
+ <mime-type type="application/vnd.sun.xml.writer.template">
+ <sub-class-of type="application/vnd.sun.xml.writer"/>
+ <glob pattern="*.stw"/>
+ </mime-type>
+ <mime-type type="application/vnd.sun.xml.calc">
+ <sub-class-of type="application/zip"/>
+ <glob pattern="*.sxc"/>
+ </mime-type>
+ <mime-type type="application/vnd.sun.xml.draw">
+ <sub-class-of type="application/zip"/>
+ <glob pattern="*.sxd"/>
+ </mime-type>
+ <mime-type type="application/vnd.sun.xml.impress">
+ <sub-class-of type="application/zip"/>
+ <glob pattern="*.sxi"/>
+ </mime-type>
+
<mime-type type="application/vnd.stardivision.writer-global">
<glob pattern="*.sgl"/>
</mime-type>
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java
index a090e84..07b91d2 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java
@@ -71,7 +71,7 @@ public class IWork13PackageParser extends AbstractParser {
/**
* @return Specific type if this identifies one, otherwise null
*/
- protected static MediaType detectIfPossible(ZipEntry entry) {
+ public static MediaType detectIfPossible(ZipEntry entry) {
String name = entry.getName();
if (! name.endsWith(".iwa")) return null;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork18PackageParser.java
similarity index 69%
copy from tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java
copy to tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork18PackageParser.java
index a090e84..7d58fa0 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork18PackageParser.java
@@ -17,16 +17,6 @@
package org.apache.tika.parser.iwork.iwana;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.Enumeration;
-import java.util.HashSet;
-import java.util.Set;
-import java.util.zip.ZipEntry;
-import java.util.zip.ZipInputStream;
-
import org.apache.commons.compress.archivers.zip.ZipFile;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
@@ -37,17 +27,30 @@ import org.apache.tika.parser.ParseContext;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-public class IWork13PackageParser extends AbstractParser {
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
+
+/**
+ * For now, this parser isn't even registered. It contains
+ * code that will detect the newer 2018 .key, .numbers, .pages files.
+ */
+public class IWork18PackageParser extends AbstractParser {
- public enum IWork13DocumentType {
- KEYNOTE13(MediaType.application("vnd.apple.keynote.13")),
- NUMBERS13(MediaType.application("vnd.apple.numbers.13")),
- PAGES13(MediaType.application("vnd.apple.pages.13")),
- UNKNOWN13(MediaType.application("vnd.apple.unknown.13"));
+ public enum IWork18DocumentType {
+ KEYNOTE18(MediaType.application("vnd.apple.keynote.18")),
+ NUMBERS18(MediaType.application("vnd.apple.numbers.18")),
+ PAGES18(MediaType.application("vnd.apple.pages.18"));
private final MediaType mediaType;
- IWork13DocumentType(MediaType mediaType) {
+ IWork18DocumentType(MediaType mediaType) {
this.mediaType = mediaType;
}
@@ -55,57 +58,45 @@ public class IWork13PackageParser extends AbstractParser {
return mediaType;
}
+ /**
+ *
+ * @param zipFile
+ * @return mime if detected or null
+ */
public static MediaType detect(ZipFile zipFile) {
MediaType type = null;
Enumeration<? extends ZipEntry> entries = zipFile.getEntries();
while (entries.hasMoreElements()) {
ZipEntry entry = entries.nextElement();
- type = IWork13DocumentType.detectIfPossible(entry);
+ type = IWork18DocumentType.detectIfPossible(entry);
if (type != null) return type;
}
// If we get here, we don't know what it is
- return UNKNOWN13.getType();
+ return null;
}
/**
* @return Specific type if this identifies one, otherwise null
*/
- protected static MediaType detectIfPossible(ZipEntry entry) {
+ public static MediaType detectIfPossible(ZipEntry entry) {
String name = entry.getName();
- if (! name.endsWith(".iwa")) return null;
-
- // Is it a uniquely identifying filename?
- if (name.equals("Index/MasterSlide.iwa") ||
- name.startsWith("Index/MasterSlide-")) {
- return KEYNOTE13.getType();
+ if (name.endsWith(".numbers/Metadata/BuildVersionHistory.plist")) {
+ return IWork18DocumentType.NUMBERS18.getType();
+ } else if (name.endsWith(".pages/Metadata/BuildVersionHistory.plist")) {
+ return IWork18DocumentType.PAGES18.getType();
+ } else if (name.endsWith(".key/Metadata/BuildVersionHistory.plist")) {
+ return IWork18DocumentType.KEYNOTE18.getType();
}
- if (name.equals("Index/Slide.iwa") ||
- name.startsWith("Index/Slide-")) {
- return KEYNOTE13.getType();
- }
-
- // Is it the main document?
- if (name.equals("Index/Document.iwa")) {
- // TODO Decode the snappy stream, and check for the Message Type
- // = 2 (TN::SheetArchive), it is a numbers file;
- // = 10000 (TP::DocumentArchive), that's a pages file
- }
-
- // Unknown
+ // Unknown
return null;
}
}
- /**
- * All iWork 13 files contain this, so we can detect based on it
- */
- public final static String IWORK13_COMMON_ENTRY = "Metadata/BuildVersionHistory.plist";
-
private final static Set<MediaType> supportedTypes = Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
- IWork13DocumentType.KEYNOTE13.getType(),
- IWork13DocumentType.NUMBERS13.getType(),
- IWork13DocumentType.PAGES13.getType()
+ IWork18DocumentType.KEYNOTE18.getType(),
+ IWork18DocumentType.NUMBERS18.getType(),
+ IWork18DocumentType.PAGES18.getType()
)));
@Override
@@ -140,14 +131,14 @@ public class IWork13PackageParser extends AbstractParser {
while (entries.hasMoreElements()) {
ZipEntry entry = entries.nextElement();
if (type == null) {
- type = IWork13DocumentType.detectIfPossible(entry);
+ type = IWork18DocumentType.detectIfPossible(entry);
}
}
} else {
ZipEntry entry = zipStream.getNextEntry();
while (entry != null) {
if (type == null) {
- type = IWork13DocumentType.detectIfPossible(entry);
+ type = IWork18DocumentType.detectIfPossible(entry);
}
entry = zipStream.getNextEntry();
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
index 9da682c..d422484 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
@@ -160,6 +160,13 @@ public class PackageParser extends AbstractParser {
"application/x-xliff+zip",
"application/x-xmind",
"model/vnd.dwfx+xps",
+ "application/vnd.sun.xml.calc",
+ "application/vnd.sun.xml.writer",
+ "application/vnd.sun.xml.writer.template",
+ "application/vnd.sun.xml.draw",
+ "application/vnd.sun.xml.impress",
+ "application/vnd.openofficeorg.autotext",
+
"application/x-gtar" //specialization of tar
}) {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
index ac4b6e6..b55ed1a 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
@@ -18,6 +18,7 @@ package org.apache.tika.parser.pkg;
import static java.nio.charset.StandardCharsets.UTF_8;
+import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.util.HashSet;
import java.util.Map;
@@ -37,6 +38,8 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.iwork.IWorkPackageParser;
+import org.apache.tika.parser.iwork.iwana.IWork13PackageParser;
+import org.apache.tika.parser.iwork.iwana.IWork18PackageParser;
import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.utils.XMLReaderUtils;
import org.xml.sax.Attributes;
@@ -125,9 +128,29 @@ public class StreamingZipContainerDetector extends ZipContainerDetectorBase impl
return type.getType();
}
} else if (name.equals("mimetype")) {
- //odt -- TODO -- bound the read and check that the results are
- //valid
- return MediaType.parse(IOUtils.toString(zipArchiveInputStream, UTF_8));
+ //can't rely on zae.getSize to determine if there is any
+ //content here. :(
+ ByteArrayOutputStream bos = new ByteArrayOutputStream();
+ IOUtils.copy(zipArchiveInputStream, bos);
+
+ if (bos.toByteArray().length > 0) {
+ //odt -- TODO -- bound the read and check that the results are
+ //valid
+ return MediaType.parse(new String(bos.toByteArray(), UTF_8));
+ }
+ } else if (name.equals("META-INF/manifest.xml")) {
+ MediaType mt = detectStarOfficeX(zipArchiveInputStream);
+ if (mt != null) {
+ return mt;
+ }
+ }
+ MediaType mt = IWork18PackageParser.IWork18DocumentType.detectIfPossible(zae);
+ if (mt != null) {
+ return mt;
+ }
+ mt = IWork13PackageParser.IWork13DocumentType.detectIfPossible(zae);
+ if (mt != null) {
+ return mt;
}
zae = zipArchiveInputStream.getNextZipEntry();
}
@@ -242,7 +265,5 @@ public class StreamingZipContainerDetector extends ZipContainerDetectorBase impl
}
}
- private static class StoppingEarlyException extends SAXException {
- }
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
index d35668c..84ba64d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
@@ -53,6 +53,7 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.iwork.IWorkPackageParser;
import org.apache.tika.parser.iwork.IWorkPackageParser.IWORKDocumentType;
import org.apache.tika.parser.iwork.iwana.IWork13PackageParser;
+import org.apache.tika.parser.iwork.iwana.IWork18PackageParser;
/**
* A detector that works on Zip documents and other archive and compression
@@ -84,7 +85,7 @@ public class ZipContainerDetector implements Detector {
private static final String XPS_DOCUMENT =
"http://schemas.microsoft.com/xps/2005/06/fixedrepresentation";
-
+ private static final String STAR_OFFICE_6_WRITER = "application/vnd.sun.xml.writer";
/** Serial version UID */
private static final long serialVersionUID = 2891763938430295453L;
@@ -209,6 +210,9 @@ public class ZipContainerDetector implements Detector {
type = detectOpenDocument(zip);
if (type == null) {
+ type = detectIWork18(zip);
+ }
+ if (type == null) {
type = detectIWork13(zip);
}
if (type == null) {
@@ -223,6 +227,9 @@ public class ZipContainerDetector implements Detector {
if (type == null) {
type = detectIpa(zip);
}
+ if (type == null) {
+ type = detectStarOfficeX(zip);
+ }
if (type != null) {
return type;
}
@@ -255,7 +262,7 @@ public class ZipContainerDetector implements Detector {
private static MediaType detectOpenDocument(ZipFile zip) {
try {
ZipArchiveEntry mimetype = zip.getEntry("mimetype");
- if (mimetype != null) {
+ if (mimetype != null && mimetype.getSize() > 0) {
try (InputStream stream = zip.getInputStream(mimetype)) {
return MediaType.parse(IOUtils.toString(stream, UTF_8));
}
@@ -384,6 +391,10 @@ public class ZipContainerDetector implements Detector {
return null;
}
+ private static MediaType detectIWork18(ZipFile zip) {
+ return IWork18PackageParser.IWork18DocumentType.detect(zip);
+ }
+
private static MediaType detectIWork(ZipFile zip) {
if (zip.getEntry(IWorkPackageParser.IWORK_COMMON_ENTRY) != null) {
// Locate the appropriate index file entry, and reads from that
@@ -456,6 +467,16 @@ public class ZipContainerDetector implements Detector {
}
}
+
+ private static MediaType detectStarOfficeX(ZipFile zip) throws IOException {
+ ZipArchiveEntry zae = zip.getEntry("META-INF/manifest.xml");
+ if (zae == null) {
+ return null;
+ }
+ try (InputStream is = zip.getInputStream(zae)) {
+ return ZipContainerDetectorBase.detectStarOfficeX(is);
+ }
+ }
/**
* To be considered as an IPA file, it needs to match all of these
*/
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetectorBase.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetectorBase.java
index a033d33..f18fc90 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetectorBase.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetectorBase.java
@@ -16,13 +16,22 @@
*/
package org.apache.tika.parser.pkg;
+import java.io.InputStream;
import java.util.Collections;
+import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
+import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
abstract class ZipContainerDetectorBase {
@@ -65,6 +74,9 @@ abstract class ZipContainerDetectorBase {
static final MediaType XPS =
MediaType.application("vnd.ms-xpsdocument");
+ static final MediaType BAU =
+ MediaType.application("vnd.openofficeorg.autotext");
+
static final Set<String> OOXML_HINTS = fillSet(
"word/document.xml",
"_rels/.rels",
@@ -76,6 +88,20 @@ abstract class ZipContainerDetectorBase {
"xl/worksheets/sheet1.xml"
);
+ static final Map<String, MediaType> STAR_OFFICE_X = new HashMap<>();
+
+ static {
+ STAR_OFFICE_X.put("application/vnd.sun.xml.writer",
+ MediaType.application("vnd.sun.xml.writer"));
+ STAR_OFFICE_X.put("application/vnd.sun.xml.calc",
+ MediaType.application("vnd.sun.xml.calc"));
+ STAR_OFFICE_X.put("application/vnd.sun.xml.draw",
+ MediaType.application("vnd.sun.xml.draw"));
+ STAR_OFFICE_X.put("application/vnd.sun.xml.impress",
+ MediaType.application("vnd.sun.xml.impress"));
+ STAR_OFFICE_X.put("application/vnd.sun.star.configuration-data",
+ MediaType.application("vnd.openofficeorg.extension"));
+ }
private static Set<String> fillSet(String ... args) {
Set<String> tmp = new HashSet<>();
for (String arg : args) {
@@ -167,4 +193,55 @@ abstract class ZipContainerDetectorBase {
return null;
}
+ //parse the META-INF/manifest.xml file
+ static MediaType detectStarOfficeX(InputStream is) {
+ StarOfficeXHandler handler = new StarOfficeXHandler();
+ try {
+ XMLReaderUtils.parseSAX(is,
+ new OfflineContentHandler(handler),
+ new ParseContext());
+ } catch (SecurityException e) {
+ throw e;
+ } catch (Exception e) {
+ }
+ return handler.mediaType;
+ }
+
+ private static class StarOfficeXHandler extends DefaultHandler {
+
+ private MediaType mediaType = null;
+
+ @Override
+ public void startElement(String uri, String localName,
+ String name, Attributes attrs) throws SAXException {
+ if (! "file-entry".equals(localName)) {
+ return;
+ }
+ String mediaTypeString = null;
+ String fullPath = null;
+ for (int i = 0; i < attrs.getLength(); i++) {
+ String attrName = attrs.getLocalName(i);
+ if (attrName.equals("media-type")) {
+ mediaTypeString = attrs.getValue(i);
+ if (STAR_OFFICE_X.containsKey(mediaTypeString)) {
+ mediaType = STAR_OFFICE_X.get(mediaTypeString);
+ throw new StoppingEarlyException();
+ }
+ } else if (attrName.equals("full-path")) {
+ fullPath = attrs.getValue(i);
+ }
+ }
+ if ("".equals(mediaTypeString) && "/".equals(fullPath)) {
+ mediaType = BAU;
+ throw new StoppingEarlyException();
+ }
+ }
+ }
+
+ /**
+ * sentinel exception to stop parsing xml once target is found
+ */
+ static class StoppingEarlyException extends SAXException {
+
+ }
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
index 4548cb4..b700f7c 100644
--- a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
+++ b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
@@ -39,6 +39,7 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.iwork.iwana.IWork13PackageParser;
+import org.apache.tika.parser.iwork.iwana.IWork18PackageParser;
import org.apache.tika.parser.pkg.StreamingZipContainerDetector;
import org.apache.tika.utils.XMLReaderUtils;
import org.junit.After;
@@ -97,11 +98,10 @@ public class TestContainerAwareDetector extends MultiThreadedTikaTest {
expected, MediaType.APPLICATION_ZIP) &&
! expected.toString().contains("tika-ooxml-protected")) {
- assertEquals(
+ assertEquals("streaming zip detector failed",
expected,
streamingZipDetector.detect(stream, m));
}
-
}
}
@@ -188,7 +188,39 @@ public class TestContainerAwareDetector extends MultiThreadedTikaTest {
assertType("testVORWriterTemplate.vor",
"application/vnd.stardivision.writer",
"application/vnd.stardivision.writer");
-
+ //file from open office bug tracker issue #6452
+ //star office >6.0
+ assertType("testStarOffice-6.0-writer.sxw",
+ "application/vnd.sun.xml.writer",
+ "application/vnd.sun.xml.writer");
+ //ooo bug #5116
+ //can't find a diff in contents btwn sxw and stw...need to rely on file extension
+ assertTypeByNameAndData("testStarOffice-6.0-writer-template.stw",
+ "application/vnd.sun.xml.writer.template",
+ "application/vnd.sun.xml.writer",
+ "application/zip");
+
+ //ooo bug #1151
+ assertType("testStarOffice-6.0-calc.sxc",
+ "application/vnd.sun.xml.calc",
+ "application/vnd.sun.xml.calc");
+ //ooo bug #261
+ assertType("testStarOffice-6.0-draw.sxd",
+ "application/vnd.sun.xml.draw",
+ "application/vnd.sun.xml.draw");
+ //ooo bug #5336
+ assertType("testStarOffice-6.0-draw.sxi",
+ "application/vnd.sun.xml.impress",
+ "application/vnd.sun.xml.impress");
+
+ //ooo bug #67431 -- had to manually fix the name spacing in the manifest.xml
+ assertType("testOpenOffice-autotext.bau",
+ "application/vnd.openofficeorg.autotext",
+ "application/vnd.openofficeorg.autotext");
+ //ooo bug #110760
+ assertType("testOpenOffice-extension.oxt",
+ "application/vnd.openofficeorg.extension",
+ "application/vnd.openofficeorg.extension");
}
@Test
@@ -361,6 +393,17 @@ public class TestContainerAwareDetector extends MultiThreadedTikaTest {
}
@Test
+ public void testDetectIWork2018() throws Exception {
+ //file from libre office issue tracker, issue #123573
+ //manually removed jpegs for the sake of space
+ assertTypeByData("testKeynote2018.key",
+ IWork18PackageParser.IWork18DocumentType.KEYNOTE18.getType().toString());
+ //see https://bugs.documentfoundation.org/show_bug.cgi?id=120709 for a 2018 numbers file
+ //see https://bugs.documentfoundation.org/show_bug.cgi?id=120707 for a 2018 pages file
+ }
+
+
+ @Test
public void testDetectKMZ() throws Exception {
assertTypeByData("testKMZ.kmz", "application/vnd.google-earth.kmz");
}
diff --git a/tika-parsers/src/test/resources/test-documents/testKeynote2018.key b/tika-parsers/src/test/resources/test-documents/testKeynote2018.key
new file mode 100644
index 0000000..1e13336
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testKeynote2018.key differ
diff --git a/tika-parsers/src/test/resources/test-documents/testOpenOffice-autotext.bau b/tika-parsers/src/test/resources/test-documents/testOpenOffice-autotext.bau
new file mode 100644
index 0000000..7d5f555
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testOpenOffice-autotext.bau differ
diff --git a/tika-parsers/src/test/resources/test-documents/testOpenOffice-extension.oxt b/tika-parsers/src/test/resources/test-documents/testOpenOffice-extension.oxt
new file mode 100644
index 0000000..4f40c11
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testOpenOffice-extension.oxt differ
diff --git a/tika-parsers/src/test/resources/test-documents/testStarOffice-6.0-calc.sxc b/tika-parsers/src/test/resources/test-documents/testStarOffice-6.0-calc.sxc
new file mode 100644
index 0000000..cdbfc9f
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testStarOffice-6.0-calc.sxc differ
diff --git a/tika-parsers/src/test/resources/test-documents/testStarOffice-6.0-draw.sxd b/tika-parsers/src/test/resources/test-documents/testStarOffice-6.0-draw.sxd
new file mode 100644
index 0000000..b94f6dd
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testStarOffice-6.0-draw.sxd differ
diff --git a/tika-parsers/src/test/resources/test-documents/testStarOffice-6.0-draw.sxi b/tika-parsers/src/test/resources/test-documents/testStarOffice-6.0-draw.sxi
new file mode 100644
index 0000000..8ebb441
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testStarOffice-6.0-draw.sxi differ
diff --git a/tika-parsers/src/test/resources/test-documents/testStarOffice-6.0-writer-template.stw b/tika-parsers/src/test/resources/test-documents/testStarOffice-6.0-writer-template.stw
new file mode 100644
index 0000000..be448ad
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testStarOffice-6.0-writer-template.stw differ
diff --git a/tika-parsers/src/test/resources/test-documents/testStarOffice-6.0-writer.sxw b/tika-parsers/src/test/resources/test-documents/testStarOffice-6.0-writer.sxw
new file mode 100644
index 0000000..a515c2e
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testStarOffice-6.0-writer.sxw differ