You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/01/03 20:29:23 UTC
[tika] branch master updated: TIKA-2765

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new 93a2c27  TIKA-2765
93a2c27 is described below

commit 93a2c2740edd803cba0273dec92e69c1f2a60dbe
Author: TALLISON <ta...@apache.org>
AuthorDate: Thu Jan 3 15:29:11 2019 -0500

    TIKA-2765
---
 CHANGES.txt                                        |   5 +-
 .../src/test/java/org/apache/tika/TikaTest.java    |  28 +++-
 .../parser/microsoft/POIFSContainerDetector.java   |   5 +-
 .../microsoft/ooxml/OOXMLExtractorFactory.java     |  80 ++++++++++-
 .../tika/parser/pkg/ZipContainerDetector.java      | 147 +++++++++++++++++++-
 .../parser/microsoft/ooxml/TruncatedOOXMLTest.java | 152 +++++++++++++++++++++
 6 files changed, 406 insertions(+), 11 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 8bd4fd5..376a109 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -5,7 +5,10 @@ Release 2.0.0 - ???
 
    Other changes
 
-Release 1.21 - ???
+
+Release 1.21 - ????
+
+   * Try to handle truncated OOXML files more robustly (TIKA-2765).
 
 Release 1.20 - 12/17/2018
 
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index dde60a3..0de69aa 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -211,6 +211,10 @@ public abstract class TikaTest {
       }
     }
 
+    protected List<Metadata> getRecursiveMetadata(String filePath, boolean suppressException) throws Exception {
+        return getRecursiveMetadata(filePath, new ParseContext(), new Metadata(), suppressException);
+    }
+
     protected List<Metadata> getRecursiveMetadata(String filePath) throws Exception {
         return getRecursiveMetadata(filePath, new ParseContext());
     }
@@ -220,18 +224,36 @@ public abstract class TikaTest {
     }
 
     protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context, Metadata metadata) throws Exception {
+        return getRecursiveMetadata(filePath, context, metadata, false);
+    }
+
+    protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context, Metadata metadata,
+                                                  boolean suppressException) throws Exception {
+        try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) {
+            return getRecursiveMetadata(is, context, metadata, suppressException);
+        }
+    }
+
+    protected List<Metadata> getRecursiveMetadata(InputStream is, boolean suppressException) throws Exception {
+        return getRecursiveMetadata(is, new ParseContext(), new Metadata(), suppressException);
+    }
+    protected List<Metadata> getRecursiveMetadata(InputStream is, ParseContext context, Metadata metadata,
+                                                  boolean suppressException) throws Exception {
         Parser p = new AutoDetectParser();
         RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p);
         RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
                 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
-
-        try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) {
+        try {
             wrapper.parse(is, handler, metadata, context);
+        } catch (Exception e) {
+            if (!suppressException) {
+                throw e;
+            }
         }
         return handler.getMetadataList();
     }
 
-    protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception {
+        protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception {
         Parser p = new AutoDetectParser();
         RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p);
 
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
index 1b5a0a9..6f32984 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
@@ -16,6 +16,7 @@
  */
 package org.apache.tika.parser.microsoft;
 
+import static org.apache.tika.mime.MediaType.OCTET_STREAM;
 import static org.apache.tika.mime.MediaType.application;
 
 import java.io.File;
@@ -440,7 +441,9 @@ public class POIFSContainerDetector implements Detector {
                         || input.read() != 0x1a || input.read() != 0xe1) {
                     return MediaType.OCTET_STREAM;
                 }
-            } finally {
+            } catch (IOException e) {
+                return MediaType.OCTET_STREAM;
+            } finally  {
                 input.reset();
             }
         }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index a6e111a..bcf8ea8 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -16,15 +16,22 @@
  */
 package org.apache.tika.parser.microsoft.ooxml;
 
+import java.io.EOFException;
+import java.io.File;
+import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Locale;
 
+import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
+import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream;
 import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.poi.ooxml.POIXMLDocument;
 import org.apache.poi.ooxml.extractor.ExtractorFactory;
 import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.openxml4j.exceptions.InvalidOperationException;
 import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
 import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.poi.openxml4j.opc.PackageAccess;
@@ -41,11 +48,13 @@ import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
 import org.apache.poi.xwpf.usermodel.XWPFDocument;
 import org.apache.poi.xwpf.usermodel.XWPFRelation;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.EmptyParser;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.chm.core.ChmExtractor;
 import org.apache.tika.parser.microsoft.OfficeParserConfig;
 import org.apache.tika.parser.microsoft.ooxml.xps.XPSExtractorDecorator;
 import org.apache.tika.parser.microsoft.ooxml.xps.XPSTextExtractor;
@@ -53,6 +62,8 @@ import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtra
 import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
 import org.apache.tika.parser.pkg.ZipContainerDetector;
 import org.apache.xmlbeans.XmlException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -62,6 +73,8 @@ import org.xml.sax.SAXException;
  */
 public class OOXMLExtractorFactory {
 
+    private static final Logger LOG = LoggerFactory.getLogger(OOXMLExtractorFactory.class);
+
     public static void parse(
             InputStream stream, ContentHandler baseHandler,
             Metadata metadata, ParseContext context)
@@ -69,16 +82,26 @@ public class OOXMLExtractorFactory {
         Locale locale = context.get(Locale.class, LocaleUtil.getUserLocale());
         ExtractorFactory.setThreadPrefersEventExtractors(true);
 
+        //if there's a problem opening the zip file;
+        //create a tmp file, and copy what you can read of it.
+        File tmpRepairedCopy = null;
+
+        OPCPackage pkg = null;
         try {
             OOXMLExtractor extractor = null;
-            OPCPackage pkg;
 
             // Locate or Open the OPCPackage for the file
             TikaInputStream tis = TikaInputStream.cast(stream);
             if (tis != null && tis.getOpenContainer() instanceof OPCPackage) {
                 pkg = (OPCPackage) tis.getOpenContainer();
             } else if (tis != null && tis.hasFile()) {
-                pkg = OPCPackage.open(tis.getFile().getPath(), PackageAccess.READ);
+                try {
+                    pkg = OPCPackage.open(tis.getFile().getPath(), PackageAccess.READ);
+                } catch (InvalidOperationException e) {
+                    tmpRepairedCopy = File.createTempFile("tika-ooxml-repair", "");
+                    repairCopy(tis.getFile(), tmpRepairedCopy);
+                    pkg = OPCPackage.open(tmpRepairedCopy, PackageAccess.READ);
+                }
                 tis.setOpenContainer(pkg);
             } else {
                 InputStream shield = new CloseShieldInputStream(stream);
@@ -169,7 +192,60 @@ public class OOXMLExtractorFactory {
             throw new TikaException("Error creating OOXML extractor", e);
         } catch (XmlException e) {
             throw new TikaException("Error creating OOXML extractor", e);
+        } finally {
+            if (tmpRepairedCopy != null) {
+                if (pkg != null) {
+                    try {
+                        pkg.close();
+                    } catch (IOException e) {
+                        LOG.warn("problem closing pkg file");
+                    }
+                }
+                boolean deleted = tmpRepairedCopy.delete();
+                if (! deleted) {
+                    LOG.warn("failed to delete tmp (repair) file: "+tmpRepairedCopy.getAbsolutePath());
+                }
+            }
+        }
+    }
+
+    private static void repairCopy(File brokenZip, File fixedZip) {
+        try (ZipArchiveOutputStream outputStream = new ZipArchiveOutputStream(fixedZip)) {
+            try (InputStream is = new FileInputStream(brokenZip);
+                 ZipArchiveInputStream zipArchiveInputStream = new ZipArchiveInputStream(is)) {
+                ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry();
+                while (zae != null) {
+                    try {
+                        if (!zae.isDirectory() && zipArchiveInputStream.canReadEntryData(zae)) {
+                            outputStream.putArchiveEntry(zae);
+                            //this will copy an incomplete stream...so there
+                            //could be truncation of the xml, but the zip file
+                            //should be intact.
+                            boolean successfullyCopied = false;
+                            try {
+                                IOUtils.copy(zipArchiveInputStream, outputStream);
+                                successfullyCopied = true;
+                            } catch (IOException e) {
+                                //this can hit a "truncated ZipFile" IOException
+                            }
+                            outputStream.flush();
+                            outputStream.closeArchiveEntry();
+                            if (!successfullyCopied) {
+                                break;
+                            }
+                        }
+                        zae = zipArchiveInputStream.getNextZipEntry();
+                    } catch (EOFException e) {
+                        break;
+                    }
 
+                }
+                outputStream.flush();
+                outputStream.finish();
+                //closing is left to the enclosing try-with-resources blocks
+            }
+        } catch (IOException e) {
+            LOG.warn("problem fixing zip", e);
         }
     }
 
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
index 08174d0..3f2303b 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
@@ -19,37 +19,48 @@ package org.apache.tika.parser.pkg;
 import org.apache.commons.compress.archivers.ArchiveException;
 import org.apache.commons.compress.archivers.ArchiveStreamFactory;
 import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
 import org.apache.commons.compress.archivers.zip.ZipFile;
 import org.apache.commons.compress.compressors.CompressorException;
 import org.apache.commons.compress.compressors.CompressorStreamFactory;
 import org.apache.commons.io.IOUtils;
-import org.apache.poi.UnsupportedFileFormatException;
 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
 import org.apache.poi.openxml4j.opc.OPCPackage;
-import org.apache.poi.openxml4j.opc.PackageAccess;
 import org.apache.poi.openxml4j.opc.PackagePart;
 import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
 import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
 import org.apache.poi.openxml4j.util.ZipEntrySource;
 import org.apache.poi.openxml4j.util.ZipFileZipEntrySource;
+import org.apache.poi.xslf.usermodel.XSLFRelation;
+import org.apache.poi.xssf.usermodel.XSSFRelation;
+import org.apache.poi.xwpf.usermodel.XWPFRelation;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.iwork.IWorkPackageParser;
 import org.apache.tika.parser.iwork.IWorkPackageParser.IWORKDocumentType;
 import org.apache.tika.parser.iwork.iwana.IWork13PackageParser;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
 
 import java.io.ByteArrayInputStream;
+import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.Collections;
 import java.util.Enumeration;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Locale;
+import java.util.Map;
 import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
 import java.util.regex.Pattern;
 
 import static java.nio.charset.StandardCharsets.UTF_8;
@@ -83,7 +94,43 @@ public class ZipContainerDetector implements Detector {
 
     private static final String XPS_DOCUMENT =
             "http://schemas.microsoft.com/xps/2005/06/fixedrepresentation";
-    
+
+    private static final MediaType TIKA_OOXML = MediaType.application("x-tika-ooxml");
+    private static final MediaType DOCX =
+            MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document");
+    private static final MediaType DOCM =
+            MediaType.application("vnd.ms-word.document.macroEnabled.12");
+    private static final MediaType DOTX =
+            MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.template");
+    private static final MediaType PPTX =
+            MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation");
+    private static final MediaType PPTM =
+            MediaType.application("vnd.ms-powerpoint.presentation.macroEnabled.12");
+    private static final MediaType POTX =
+            MediaType.application("vnd.openxmlformats-officedocument.presentationml.template");
+    private static final MediaType XLSX =
+            MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+    private static final MediaType XLSM =
+            MediaType.application("vnd.ms-excel.sheet.macroEnabled.12");
+
+    private static final Set<String> OOXML_HINTS = fillSet(
+            "word/document.xml",
+            "_rels/.rels",
+            "[Content_Types].xml",
+            "ppt/presentation.xml",
+            "ppt/slides/slide1.xml",
+            "xl/workbook.xml",
+            "xl/sharedStrings.xml",
+            "xl/worksheets/sheet1.xml"
+    );
+
+    static Set<String> fillSet(String ... args) {
+        //copy the values into a fresh set (duplicates collapse),
+        //then hand back a read-only view of it
+        Set<String> values = new HashSet<>();
+        Collections.addAll(values, args);
+        return Collections.unmodifiableSet(values);
+    }
     /** Serial version UID */
     private static final long serialVersionUID = 2891763938430295453L;
 
@@ -240,7 +287,7 @@ public class ZipContainerDetector implements Detector {
         try {
             zipEntrySource = new ZipFileZipEntrySource(new ZipFile(stream.getFile()));
         } catch (IOException e) {
-            return null;
+            return tryStreamingDetection(stream);
         }
 
         //if (zip.getEntry("_rels/.rels") != null
@@ -486,4 +533,96 @@ public class ZipContainerDetector implements Detector {
         // If we get here, not all required entries were found
         return null;
     }
+
+    private static MediaType tryStreamingDetection(TikaInputStream stream) {
+        Set<String> entryNames = new HashSet<>();
+        try (InputStream is = new FileInputStream(stream.getFile());
+             ZipArchiveInputStream zipArchiveInputStream = new ZipArchiveInputStream(is)) {
+            ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry();
+            while (zae != null) {
+                if (zae.isDirectory()) {
+                    zae = zipArchiveInputStream.getNextZipEntry();
+                    continue;
+                }
+                entryNames.add(zae.getName());
+                //we could also parse _rels/.rels, but if
+                //there isn't a valid [Content_Types].xml, then POI
+                //will throw an exception...Better to back off to PKG
+                //than correctly identify a truncated file
+                if (zae.getName().equals("[Content_Types].xml")) {
+                    MediaType mt = parseContentTypes(zipArchiveInputStream);
+                    if (mt != null) {
+                        return mt;
+                    }
+                    return TIKA_OOXML;
+                }
+                zae = zipArchiveInputStream.getNextZipEntry();
+            }
+        } catch (SecurityException e) {
+            throw e;
+        } catch (Exception e) {
+            //swallow
+        }
+        int hits = 0;
+        for (String s : OOXML_HINTS) {
+            if (entryNames.contains(s)) {
+                hits++;
+            }
+        }
+        if (hits > 2) {
+            return TIKA_OOXML;
+        }
+        return MediaType.APPLICATION_ZIP;
+    }
+
+    private static MediaType parseContentTypes(InputStream is) {
+        ContentTypeHandler contentTypeHandler = new ContentTypeHandler();
+        try {
+            XMLReaderUtils.parseSAX(is, contentTypeHandler, new ParseContext());
+        } catch (SecurityException e) {
+            throw e;
+        } catch (Exception e) {
+            //swallow; presumably includes the handler's StoppingEarlyException
+        }
+        return contentTypeHandler.mediaType;
+    }
+
+
+    private static class ContentTypeHandler extends DefaultHandler {
+        private static final Map<String, MediaType> CONTENT_TYPES = new ConcurrentHashMap<>();
+        static {
+            CONTENT_TYPES.put(XWPFRelation.DOCUMENT.getContentType(), DOCX);
+            CONTENT_TYPES.put(XWPFRelation.MACRO_DOCUMENT.getContentType(), DOCM);
+            CONTENT_TYPES.put(XWPFRelation.TEMPLATE.getContentType(), DOTX);
+
+            CONTENT_TYPES.put(XSSFRelation.WORKBOOK.getContentType(), XLSX);
+            CONTENT_TYPES.put(XSSFRelation.MACROS_WORKBOOK.getContentType(), XLSM);
+            CONTENT_TYPES.put(XSLFRelation.PRESENTATIONML.getContentType(), PPTX);
+            CONTENT_TYPES.put(XSLFRelation.PRESENTATION_MACRO.getContentType(), PPTM);
+            CONTENT_TYPES.put(XSLFRelation.PRESENTATIONML_TEMPLATE.getContentType(), POTX);
+        }
+
+        private MediaType mediaType = null;
+
+        @Override
+        public void startElement(String uri, String localName,
+                                 String name, Attributes attrs) throws SAXException {
+            for (int i = 0; i < attrs.getLength(); i++) {
+                String attrName = attrs.getLocalName(i);
+                if (attrName.equals("ContentType")) {
+                    String contentType = attrs.getValue(i);
+                    MediaType match = CONTENT_TYPES.get(contentType);
+                    if (match != null) {
+                        mediaType = match;
+                        throw new StoppingEarlyException();
+                    }
+                }
+            }
+        }
+    }
+
+    private static class StoppingEarlyException extends SAXException {
+
+    }
+
 }
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java
new file mode 100644
index 0000000..81c588d
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java
@@ -0,0 +1,152 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
+import org.apache.tika.TikaTest;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.junit.Ignore;
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.EOFException;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.List;
+import java.util.Random;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+
+public class TruncatedOOXMLTest extends TikaTest {
+
+    @Test
+    public void testWordTrunc14435() throws Exception {
+        //this is only very slightly truncated
+        List<Metadata> metadataList = getRecursiveMetadata(truncate(
+                "testWORD_various.docx", 14435), true);
+        assertEquals(1, metadataList.size());
+        Metadata metadata = metadataList.get(0);
+        String content = metadata.get(RecursiveParserWrapperHandler.TIKA_CONTENT);
+        assertEquals("application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                metadata.get(Metadata.CONTENT_TYPE));
+        assertContains("This is the header", content);
+        assertContains("This is the footer text", content);
+        assertContains("Suddenly some Japanese", content);
+    }
+
+    @Test
+    public void testWordTrunc13138() throws Exception {
+        //this truncates the content_types.xml
+        //this tests that there's a backoff to the pkg parser
+        List<Metadata> metadataList = getRecursiveMetadata(truncate(
+                "testWORD_various.docx", 13138), true);
+        assertEquals(19, metadataList.size());
+        Metadata m = metadataList.get(0);
+        assertEquals("application/x-tika-ooxml", m.get(Metadata.CONTENT_TYPE));
+    }
+
+    @Test
+    public void testWordTrunc774() throws Exception {
+        //this is really truncated
+        List<Metadata> metadataList = getRecursiveMetadata(truncate(
+                "testWORD_various.docx", 774), true);
+        assertEquals(4, metadataList.size());
+        Metadata m = metadataList.get(0);
+        assertEquals("application/x-tika-ooxml", m.get(Metadata.CONTENT_TYPE));
+    }
+
+    @Test
+    public void testTruncation() throws Exception {
+        //sanity-check the truncate() helper itself
+        int length = (int)getResourceAsFile("/test-documents/testWORD_various.docx").length();
+        Random r = new Random();
+        for (int i = 0; i < 50; i++) {
+            int targetLength = r.nextInt(length);
+            InputStream is = truncate("testWORD_various.docx", targetLength);
+            ByteArrayOutputStream bos = new ByteArrayOutputStream();
+            IOUtils.copy(is, bos);
+            assertEquals(targetLength, bos.toByteArray().length);
+        }
+        try {
+            truncate("testWORD_various.docx", length+1);
+            fail("should have thrown EOF");
+        } catch (EOFException e) {
+            //expected: the request is one byte longer than the file
+        }
+    }
+
+    private InputStream truncate(String fileName, int length) throws IOException {
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        int bufferSize = 4096;
+        byte[] buffer = new byte[bufferSize];
+        int bytesRead = 0;
+        int toRead = length;
+        try (InputStream is = getResourceAsStream("/test-documents/"+fileName)) {
+            while (toRead > 0) {
+                int justRead = is.read(buffer, 0, Math.min(bufferSize, toRead));
+                if (justRead == -1) {
+                    throw new EOFException("eof reached");
+                }
+                bos.write(buffer, 0, justRead);
+                toRead -= justRead;
+            }
+        }
+        return new ByteArrayInputStream(bos.toByteArray());
+    }
+
+    @Test
+    @Ignore("for dev/debugging only")
+    public void listStreams() throws Exception {
+        File tstDir = new File(TruncatedOOXMLTest.class.getResource("/test-documents").toURI());
+        for (File f : tstDir.listFiles()) {
+            if (f.isDirectory()) {
+                continue;
+            }
+            if (f.getName().endsWith(".xlsx")) {// || f.getName().endsWith(".pptx") || f.getName().endsWith(".docx")) {
+
+            } else {
+                continue;
+            }
+            try (InputStream is = new FileInputStream(f)) {
+                ZipArchiveInputStream zipArchiveInputStream = new ZipArchiveInputStream(is);
+                ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry();
+                int cnt = 0;
+                while (zae != null && ! zae.isDirectory() && ++cnt <= 10) {
+                    System.out.println(f.getName() + " : " + zae.getName());
+                    if (zae.getName().equals("_rels/.rels")) {
+                        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+                        IOUtils.copy(zipArchiveInputStream, bos);
+                        System.out.println(new String(bos.toByteArray(), StandardCharsets.UTF_8));
+                    }
+                    zae = zipArchiveInputStream.getNextZipEntry();
+                }
+            } catch (Exception e) {
+                System.out.println(f.getName() + " : "+e.getMessage());
+            }
+        }
+    }
+}