You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/02/28 16:39:28 UTC

[tika] branch master updated: TIKA-3057 -- improve detection of some zip based files

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new c89fc0c  TIKA-3057 -- improve detection of some zip based files
     new d23602e  Merge remote-tracking branch 'origin/master'
c89fc0c is described below

commit c89fc0c95937b71e9c1a1b5905f34e0dc1cb650f
Author: tallison <ta...@apache.org>
AuthorDate: Fri Feb 28 11:38:51 2020 -0500

    TIKA-3057 -- improve detection of some zip based files
---
 CHANGES.txt                                        |   2 +
 .../org/apache/tika/mime/tika-mimetypes.xml        |  28 +++++++
 .../parser/iwork/iwana/IWork13PackageParser.java   |   2 +-
 ...ackageParser.java => IWork18PackageParser.java} |  89 +++++++++------------
 .../org/apache/tika/parser/pkg/PackageParser.java  |   7 ++
 .../parser/pkg/StreamingZipContainerDetector.java  |  31 +++++--
 .../tika/parser/pkg/ZipContainerDetector.java      |  25 +++++-
 .../tika/parser/pkg/ZipContainerDetectorBase.java  |  77 ++++++++++++++++++
 .../tika/detect/TestContainerAwareDetector.java    |  49 +++++++++++-
 .../resources/test-documents/testKeynote2018.key   | Bin 0 -> 54228 bytes
 .../test-documents/testOpenOffice-autotext.bau     | Bin 0 -> 14802 bytes
 .../test-documents/testOpenOffice-extension.oxt    | Bin 0 -> 1741 bytes
 .../test-documents/testStarOffice-6.0-calc.sxc     | Bin 0 -> 7406 bytes
 .../test-documents/testStarOffice-6.0-draw.sxd     | Bin 0 -> 14593 bytes
 .../test-documents/testStarOffice-6.0-draw.sxi     | Bin 0 -> 7581 bytes
 .../testStarOffice-6.0-writer-template.stw         | Bin 0 -> 5669 bytes
 .../test-documents/testStarOffice-6.0-writer.sxw   | Bin 0 -> 5200 bytes
 17 files changed, 250 insertions(+), 60 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 798529a..d382061 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -7,6 +7,8 @@ Release 2.0.0 - ???
 
 Release 1.24 - ???
 
+   * Improve detection of some zip-based formats (TIKA-3057).
+
    * Upgrade metadata-extractor to 2.13.0 (TIKA-2952).
 
    * Upgrade to POI 4.1.2 (TIKA-3047).
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 3211cfb..ea1f97b 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -2353,6 +2353,11 @@
     <glob pattern="*.oxt"/>
   </mime-type>
 
+  <mime-type type="application/vnd.openofficeorg.autotext">
+    <sub-class-of type="application/zip"/>
+    <glob pattern="*.bau"/>
+  </mime-type>
+
   <mime-type type="application/vnd.openxmlformats-officedocument.presentationml.presentation">
     <_comment>Office Open XML Presentation</_comment>
     <glob pattern="*.pptx"/>
@@ -2598,6 +2603,29 @@
     <sub-class-of type="application/x-tika-staroffice"/>
     <glob pattern="*.vor"/>
   </mime-type>
+  <mime-type type="application/vnd.sun.xml.writer">
+    <sub-class-of type="application/zip"/>
+    <glob pattern="*.sxw"/>
+  </mime-type>
+  <!-- can't currently find any difference in contents between writer
+    and writer template; must rely on the file extension -->
+  <mime-type type="application/vnd.sun.xml.writer.template">
+    <sub-class-of type="application/vnd.sun.xml.writer"/>
+    <glob pattern="*.stw"/>
+  </mime-type>
+  <mime-type type="application/vnd.sun.xml.calc">
+    <sub-class-of type="application/zip"/>
+    <glob pattern="*.sxc"/>
+  </mime-type>
+  <mime-type type="application/vnd.sun.xml.draw">
+    <sub-class-of type="application/zip"/>
+    <glob pattern="*.sxd"/>
+  </mime-type>
+  <mime-type type="application/vnd.sun.xml.impress">
+    <sub-class-of type="application/zip"/>
+    <glob pattern="*.sxi"/>
+  </mime-type>
+
   <mime-type type="application/vnd.stardivision.writer-global">
     <glob pattern="*.sgl"/>
   </mime-type>
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java
index a090e84..07b91d2 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java
@@ -71,7 +71,7 @@ public class IWork13PackageParser extends AbstractParser {
         /**
          * @return Specific type if this identifies one, otherwise null
          */
-        protected static MediaType detectIfPossible(ZipEntry entry) {
+        public static MediaType detectIfPossible(ZipEntry entry) {
            String name = entry.getName();
            if (! name.endsWith(".iwa")) return null;
 
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork18PackageParser.java
similarity index 69%
copy from tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java
copy to tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork18PackageParser.java
index a090e84..7d58fa0 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork18PackageParser.java
@@ -17,16 +17,6 @@
 
 package org.apache.tika.parser.iwork.iwana;
 
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.Enumeration;
-import java.util.HashSet;
-import java.util.Set;
-import java.util.zip.ZipEntry;
-import java.util.zip.ZipInputStream;
-
 import org.apache.commons.compress.archivers.zip.ZipFile;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
@@ -37,17 +27,30 @@ import org.apache.tika.parser.ParseContext;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
-public class IWork13PackageParser extends AbstractParser {
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
+
+/**
+ * For now, this parser isn't even registered.  It contains
+ * code that will detect the newer 2018 .key, .numbers, .pages files.
+ */
+public class IWork18PackageParser extends AbstractParser {
 
-    public enum IWork13DocumentType {
-        KEYNOTE13(MediaType.application("vnd.apple.keynote.13")),
-        NUMBERS13(MediaType.application("vnd.apple.numbers.13")),
-        PAGES13(MediaType.application("vnd.apple.pages.13")),
-        UNKNOWN13(MediaType.application("vnd.apple.unknown.13"));
+    public enum IWork18DocumentType {
+        KEYNOTE18(MediaType.application("vnd.apple.keynote.18")),
+        NUMBERS18(MediaType.application("vnd.apple.numbers.18")),
+        PAGES18(MediaType.application("vnd.apple.pages.18"));
 
         private final MediaType mediaType;
 
-        IWork13DocumentType(MediaType mediaType) {
+        IWork18DocumentType(MediaType mediaType) {
             this.mediaType = mediaType;
         }
 
@@ -55,57 +58,45 @@ public class IWork13PackageParser extends AbstractParser {
             return mediaType;
         }
 
+        /**
+         * Checks each entry of the zip file for an iWork 2018 marker.
+         * @param zipFile zip file whose entries are checked
+         * @return mime if detected or null
+         */
         public static MediaType detect(ZipFile zipFile) {
            MediaType type = null;
            Enumeration<? extends ZipEntry> entries = zipFile.getEntries();
            while (entries.hasMoreElements()) {
               ZipEntry entry = entries.nextElement();
-              type = IWork13DocumentType.detectIfPossible(entry);
+              type = IWork18DocumentType.detectIfPossible(entry);
               if (type != null) return type;
            }
            
            // If we get here, we don't know what it is
-           return UNKNOWN13.getType();
+           return null;
         }
         
         /**
          * @return Specific type if this identifies one, otherwise null
          */
-        protected static MediaType detectIfPossible(ZipEntry entry) {
+        public static MediaType detectIfPossible(ZipEntry entry) {
            String name = entry.getName();
-           if (! name.endsWith(".iwa")) return null;
-
-           // Is it a uniquely identifying filename?
-           if (name.equals("Index/MasterSlide.iwa") ||
-               name.startsWith("Index/MasterSlide-")) {
-              return KEYNOTE13.getType();
+           if (name.endsWith(".numbers/Metadata/BuildVersionHistory.plist")) {
+               return IWork18DocumentType.NUMBERS18.getType();
+           } else if (name.endsWith(".pages/Metadata/BuildVersionHistory.plist")) {
+               return IWork18DocumentType.PAGES18.getType();
+           } else if (name.endsWith(".key/Metadata/BuildVersionHistory.plist")) {
+                return IWork18DocumentType.KEYNOTE18.getType();
            }
-           if (name.equals("Index/Slide.iwa") ||
-               name.startsWith("Index/Slide-")) {
-              return KEYNOTE13.getType();
-           }
-           
-           // Is it the main document?
-           if (name.equals("Index/Document.iwa")) {
-              // TODO Decode the snappy stream, and check for the Message Type
-              // =     2 (TN::SheetArchive), it is a numbers file; 
-              // = 10000 (TP::DocumentArchive), that's a pages file
-           }
-
-           // Unknown
+ // Unknown
            return null;
         }
     }
 
-    /**
-     * All iWork 13 files contain this, so we can detect based on it
-     */
-    public final static String IWORK13_COMMON_ENTRY = "Metadata/BuildVersionHistory.plist";
-
     private final static Set<MediaType> supportedTypes = Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
-            IWork13DocumentType.KEYNOTE13.getType(),
-            IWork13DocumentType.NUMBERS13.getType(),
-            IWork13DocumentType.PAGES13.getType()
+            IWork18DocumentType.KEYNOTE18.getType(),
+            IWork18DocumentType.NUMBERS18.getType(),
+            IWork18DocumentType.PAGES18.getType()
             )));
 
     @Override
@@ -140,14 +131,14 @@ public class IWork13PackageParser extends AbstractParser {
           while (entries.hasMoreElements()) {
              ZipEntry entry = entries.nextElement();
              if (type == null) {
-                type = IWork13DocumentType.detectIfPossible(entry);
+                type = IWork18DocumentType.detectIfPossible(entry);
              }
           }
        } else {
           ZipEntry entry = zipStream.getNextEntry();
           while (entry != null) {
              if (type == null) {
-                type = IWork13DocumentType.detectIfPossible(entry);
+                type = IWork18DocumentType.detectIfPossible(entry);
              }
              entry = zipStream.getNextEntry();
           }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
index 9da682c..d422484 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
@@ -160,6 +160,13 @@ public class PackageParser extends AbstractParser {
                 "application/x-xliff+zip",
                 "application/x-xmind",
                 "model/vnd.dwfx+xps",
+                "application/vnd.sun.xml.calc",
+                "application/vnd.sun.xml.writer",
+                "application/vnd.sun.xml.writer.template",
+                "application/vnd.sun.xml.draw",
+                "application/vnd.sun.xml.impress",
+                "application/vnd.openofficeorg.autotext",
+
 
                 "application/x-gtar" //specialization of tar
         }) {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
index ac4b6e6..b55ed1a 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
@@ -18,6 +18,7 @@ package org.apache.tika.parser.pkg;
 
 import static java.nio.charset.StandardCharsets.UTF_8;
 
+import java.io.ByteArrayOutputStream;
 import java.io.InputStream;
 import java.util.HashSet;
 import java.util.Map;
@@ -37,6 +38,8 @@ import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.iwork.IWorkPackageParser;
+import org.apache.tika.parser.iwork.iwana.IWork13PackageParser;
+import org.apache.tika.parser.iwork.iwana.IWork18PackageParser;
 import org.apache.tika.sax.OfflineContentHandler;
 import org.apache.tika.utils.XMLReaderUtils;
 import org.xml.sax.Attributes;
@@ -125,9 +128,29 @@ public class StreamingZipContainerDetector extends ZipContainerDetectorBase impl
                         return type.getType();
                     }
                 } else if (name.equals("mimetype")) {
-                    //odt -- TODO -- bound the read and check that the results are
-                    //valid
-                    return MediaType.parse(IOUtils.toString(zipArchiveInputStream, UTF_8));
+                    //can't rely on zae.getSize to determine if there is any
+                    //content here. :(
+                    ByteArrayOutputStream bos = new ByteArrayOutputStream();
+                    IOUtils.copy(zipArchiveInputStream, bos);
+
+                    if (bos.toByteArray().length > 0)  {
+                        //odt -- TODO -- bound the read and check that the results are
+                        //valid
+                        return MediaType.parse(new String(bos.toByteArray(), UTF_8));
+                    }
+                } else if (name.equals("META-INF/manifest.xml")) {
+                    MediaType mt = detectStarOfficeX(zipArchiveInputStream);
+                    if (mt != null) {
+                        return mt;
+                    }
+                }
+                MediaType mt = IWork18PackageParser.IWork18DocumentType.detectIfPossible(zae);
+                if (mt != null) {
+                    return mt;
+                }
+                mt = IWork13PackageParser.IWork13DocumentType.detectIfPossible(zae);
+                if (mt != null) {
+                    return mt;
                 }
                 zae = zipArchiveInputStream.getNextZipEntry();
             }
@@ -242,7 +265,5 @@ public class StreamingZipContainerDetector extends ZipContainerDetectorBase impl
         }
     }
 
-    private static class StoppingEarlyException extends SAXException {
 
-    }
 }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
index d35668c..84ba64d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
@@ -53,6 +53,7 @@ import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.iwork.IWorkPackageParser;
 import org.apache.tika.parser.iwork.IWorkPackageParser.IWORKDocumentType;
 import org.apache.tika.parser.iwork.iwana.IWork13PackageParser;
+import org.apache.tika.parser.iwork.iwana.IWork18PackageParser;
 
 /**
  * A detector that works on Zip documents and other archive and compression
@@ -84,7 +85,7 @@ public class ZipContainerDetector implements Detector {
     private static final String XPS_DOCUMENT =
             "http://schemas.microsoft.com/xps/2005/06/fixedrepresentation";
 
-
+    private static final String STAR_OFFICE_6_WRITER = "application/vnd.sun.xml.writer";
     /** Serial version UID */
     private static final long serialVersionUID = 2891763938430295453L;
 
@@ -209,6 +210,9 @@ public class ZipContainerDetector implements Detector {
                 type = detectOpenDocument(zip);
 
                 if (type == null) {
+                    type = detectIWork18(zip);
+                }
+                if (type == null) {
                     type = detectIWork13(zip);
                 }
                 if (type == null) {
@@ -223,6 +227,9 @@ public class ZipContainerDetector implements Detector {
                 if (type == null) {
                     type = detectIpa(zip);
                 }
+                if (type == null) {
+                    type = detectStarOfficeX(zip);
+                }
                 if (type != null) {
                     return type;
                 }
@@ -255,7 +262,7 @@ public class ZipContainerDetector implements Detector {
     private static MediaType detectOpenDocument(ZipFile zip) {
         try {
             ZipArchiveEntry mimetype = zip.getEntry("mimetype");
-            if (mimetype != null) {
+            if (mimetype != null && mimetype.getSize() > 0) {
                 try (InputStream stream = zip.getInputStream(mimetype)) {
                     return MediaType.parse(IOUtils.toString(stream, UTF_8));
                 }
@@ -384,6 +391,10 @@ public class ZipContainerDetector implements Detector {
         return null;
     }
 
+    private static MediaType detectIWork18(ZipFile zip) {
+        return IWork18PackageParser.IWork18DocumentType.detect(zip);
+    }
+
     private static MediaType detectIWork(ZipFile zip) {
         if (zip.getEntry(IWorkPackageParser.IWORK_COMMON_ENTRY) != null) {
             // Locate the appropriate index file entry, and reads from that
@@ -456,6 +467,16 @@ public class ZipContainerDetector implements Detector {
         }
     }
 
+
+    private static MediaType detectStarOfficeX(ZipFile zip) throws IOException {
+        ZipArchiveEntry zae = zip.getEntry("META-INF/manifest.xml");
+        if (zae == null) {
+            return null;
+        }
+        try (InputStream is = zip.getInputStream(zae)) {
+            return ZipContainerDetectorBase.detectStarOfficeX(is);
+        }
+    }
     /**
      * To be considered as an IPA file, it needs to match all of these
      */
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetectorBase.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetectorBase.java
index a033d33..f18fc90 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetectorBase.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetectorBase.java
@@ -16,13 +16,22 @@
  */
 package org.apache.tika.parser.pkg;
 
+import java.io.InputStream;
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
+import java.util.Map;
 import java.util.Set;
 import java.util.regex.Pattern;
 
 import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
 
 abstract class ZipContainerDetectorBase {
 
@@ -65,6 +74,9 @@ abstract class ZipContainerDetectorBase {
     static final MediaType XPS =
             MediaType.application("vnd.ms-xpsdocument");
 
+    static final MediaType BAU =
+            MediaType.application("vnd.openofficeorg.autotext");
+
     static final Set<String> OOXML_HINTS = fillSet(
             "word/document.xml",
             "_rels/.rels",
@@ -76,6 +88,20 @@ abstract class ZipContainerDetectorBase {
             "xl/worksheets/sheet1.xml"
     );
 
+    static final Map<String, MediaType> STAR_OFFICE_X = new HashMap<>();
+
+    static {
+        STAR_OFFICE_X.put("application/vnd.sun.xml.writer",
+                MediaType.application("vnd.sun.xml.writer"));
+        STAR_OFFICE_X.put("application/vnd.sun.xml.calc",
+                MediaType.application("vnd.sun.xml.calc"));
+        STAR_OFFICE_X.put("application/vnd.sun.xml.draw",
+                MediaType.application("vnd.sun.xml.draw"));
+        STAR_OFFICE_X.put("application/vnd.sun.xml.impress",
+                MediaType.application("vnd.sun.xml.impress"));
+        STAR_OFFICE_X.put("application/vnd.sun.star.configuration-data",
+                MediaType.application("vnd.openofficeorg.extension"));
+    }
     private static Set<String> fillSet(String ... args) {
         Set<String> tmp = new HashSet<>();
         for (String arg : args) {
@@ -167,4 +193,55 @@ abstract class ZipContainerDetectorBase {
         return null;
     }
 
+    //parse the META-INF/manifest.xml file
+    static MediaType detectStarOfficeX(InputStream is) {
+        StarOfficeXHandler handler = new StarOfficeXHandler();
+        try {
+            XMLReaderUtils.parseSAX(is,
+                    new OfflineContentHandler(handler),
+                    new ParseContext());
+        } catch (SecurityException e) {
+            throw e;
+        } catch (Exception e) {
+        }
+        return handler.mediaType;
+    }
+
+    private static class StarOfficeXHandler extends DefaultHandler {
+
+        private MediaType mediaType = null;
+
+        @Override
+        public void startElement(String uri, String localName,
+                                 String name, Attributes attrs) throws SAXException {
+            if (! "file-entry".equals(localName)) {
+                return;
+            }
+            String mediaTypeString = null;
+            String fullPath = null;
+            for (int i = 0; i < attrs.getLength(); i++) {
+                String attrName = attrs.getLocalName(i);
+                if (attrName.equals("media-type")) {
+                    mediaTypeString = attrs.getValue(i);
+                    if (STAR_OFFICE_X.containsKey(mediaTypeString)) {
+                        mediaType = STAR_OFFICE_X.get(mediaTypeString);
+                        throw new StoppingEarlyException();
+                    }
+                } else if (attrName.equals("full-path")) {
+                    fullPath = attrs.getValue(i);
+                }
+            }
+            if ("".equals(mediaTypeString) && "/".equals(fullPath)) {
+                mediaType = BAU;
+                throw new StoppingEarlyException();
+            }
+        }
+    }
+
+    /**
+     * sentinel exception to stop parsing xml once target is found
+     */
+    static class StoppingEarlyException extends SAXException {
+
+    }
 }
diff --git a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
index 4548cb4..b700f7c 100644
--- a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
+++ b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
@@ -39,6 +39,7 @@ import org.apache.tika.mime.MediaType;
 import org.apache.tika.mime.MediaTypeRegistry;
 import org.apache.tika.mime.MimeTypes;
 import org.apache.tika.parser.iwork.iwana.IWork13PackageParser;
+import org.apache.tika.parser.iwork.iwana.IWork18PackageParser;
 import org.apache.tika.parser.pkg.StreamingZipContainerDetector;
 import org.apache.tika.utils.XMLReaderUtils;
 import org.junit.After;
@@ -97,11 +98,10 @@ public class TestContainerAwareDetector extends MultiThreadedTikaTest {
                     expected, MediaType.APPLICATION_ZIP) &&
                 ! expected.toString().contains("tika-ooxml-protected")) {
 
-                assertEquals(
+                assertEquals("streaming zip detector failed",
                         expected,
                         streamingZipDetector.detect(stream, m));
             }
-
         }
     }
 
@@ -188,7 +188,39 @@ public class TestContainerAwareDetector extends MultiThreadedTikaTest {
         assertType("testVORWriterTemplate.vor",
                 "application/vnd.stardivision.writer",
                 "application/vnd.stardivision.writer");
-
+        //file from open office bug tracker issue #6452
+        //star office >6.0
+        assertType("testStarOffice-6.0-writer.sxw",
+                "application/vnd.sun.xml.writer",
+                "application/vnd.sun.xml.writer");
+        //ooo bug #5116
+        //can't find a difference in contents between sxw and stw...need to rely on file extension
+        assertTypeByNameAndData("testStarOffice-6.0-writer-template.stw",
+                "application/vnd.sun.xml.writer.template",
+                "application/vnd.sun.xml.writer",
+                "application/zip");
+
+        //ooo bug #1151
+        assertType("testStarOffice-6.0-calc.sxc",
+                "application/vnd.sun.xml.calc",
+                "application/vnd.sun.xml.calc");
+        //ooo bug #261
+        assertType("testStarOffice-6.0-draw.sxd",
+                "application/vnd.sun.xml.draw",
+                "application/vnd.sun.xml.draw");
+        //ooo bug #5336
+        assertType("testStarOffice-6.0-draw.sxi",
+                "application/vnd.sun.xml.impress",
+                "application/vnd.sun.xml.impress");
+
+        //ooo bug #67431 -- had to manually fix the namespacing in the manifest.xml
+        assertType("testOpenOffice-autotext.bau",
+                "application/vnd.openofficeorg.autotext",
+                "application/vnd.openofficeorg.autotext");
+        //ooo bug #110760
+        assertType("testOpenOffice-extension.oxt",
+                "application/vnd.openofficeorg.extension",
+                "application/vnd.openofficeorg.extension");
     }
 
     @Test
@@ -361,6 +393,17 @@ public class TestContainerAwareDetector extends MultiThreadedTikaTest {
     }
 
     @Test
+    public void testDetectIWork2018() throws Exception {
+        //file from libre office issue tracker, issue #123573
+        //manually removed jpegs for the sake of space
+        assertTypeByData("testKeynote2018.key",
+                IWork18PackageParser.IWork18DocumentType.KEYNOTE18.getType().toString());
+        //see https://bugs.documentfoundation.org/show_bug.cgi?id=120709 for a 2018 numbers file
+        //see https://bugs.documentfoundation.org/show_bug.cgi?id=120707 for a 2018 pages file
+    }
+
+
+    @Test
     public void testDetectKMZ() throws Exception {
        assertTypeByData("testKMZ.kmz", "application/vnd.google-earth.kmz");
     }
diff --git a/tika-parsers/src/test/resources/test-documents/testKeynote2018.key b/tika-parsers/src/test/resources/test-documents/testKeynote2018.key
new file mode 100644
index 0000000..1e13336
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testKeynote2018.key differ
diff --git a/tika-parsers/src/test/resources/test-documents/testOpenOffice-autotext.bau b/tika-parsers/src/test/resources/test-documents/testOpenOffice-autotext.bau
new file mode 100644
index 0000000..7d5f555
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testOpenOffice-autotext.bau differ
diff --git a/tika-parsers/src/test/resources/test-documents/testOpenOffice-extension.oxt b/tika-parsers/src/test/resources/test-documents/testOpenOffice-extension.oxt
new file mode 100644
index 0000000..4f40c11
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testOpenOffice-extension.oxt differ
diff --git a/tika-parsers/src/test/resources/test-documents/testStarOffice-6.0-calc.sxc b/tika-parsers/src/test/resources/test-documents/testStarOffice-6.0-calc.sxc
new file mode 100644
index 0000000..cdbfc9f
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testStarOffice-6.0-calc.sxc differ
diff --git a/tika-parsers/src/test/resources/test-documents/testStarOffice-6.0-draw.sxd b/tika-parsers/src/test/resources/test-documents/testStarOffice-6.0-draw.sxd
new file mode 100644
index 0000000..b94f6dd
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testStarOffice-6.0-draw.sxd differ
diff --git a/tika-parsers/src/test/resources/test-documents/testStarOffice-6.0-draw.sxi b/tika-parsers/src/test/resources/test-documents/testStarOffice-6.0-draw.sxi
new file mode 100644
index 0000000..8ebb441
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testStarOffice-6.0-draw.sxi differ
diff --git a/tika-parsers/src/test/resources/test-documents/testStarOffice-6.0-writer-template.stw b/tika-parsers/src/test/resources/test-documents/testStarOffice-6.0-writer-template.stw
new file mode 100644
index 0000000..be448ad
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testStarOffice-6.0-writer-template.stw differ
diff --git a/tika-parsers/src/test/resources/test-documents/testStarOffice-6.0-writer.sxw b/tika-parsers/src/test/resources/test-documents/testStarOffice-6.0-writer.sxw
new file mode 100644
index 0000000..a515c2e
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testStarOffice-6.0-writer.sxw differ