You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/03/21 21:10:21 UTC

[tika] branch branch_1x updated: TIKA-2841 - focusing on epub, but also fixing TIKA-2310, and handling embedded images/attachments

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new b437e21  TIKA-2841 - focusing on epub, but also fixing TIKA-2310, and handling embedded images/attachments
b437e21 is described below

commit b437e212afd5d04022e0f5224c7f3cfc17ed4e74
Author: TALLISON <ta...@apache.org>
AuthorDate: Thu Mar 21 16:40:47 2019 -0400

    TIKA-2841 - focusing on epub, but also fixing TIKA-2310, and handling embedded images/attachments
---
 .../java/org/apache/tika/utils/XMLReaderUtils.java |  16 +
 .../src/test/java/org/apache/tika/TikaTest.java    |  28 +-
 .../org/apache/tika/parser/epub/EpubParser.java    | 397 +++++++++++++++++++--
 .../microsoft/ooxml/OOXMLExtractorFactory.java     |  50 +--
 .../org/apache/tika/parser/utils/ZipSalvager.java  |  89 +++++
 .../org/apache/tika/parser/dbf/DBFParserTest.java  |   7 -
 .../apache/tika/parser/epub/EpubParserTest.java    |  48 +++
 .../parser/microsoft/ooxml/TruncatedOOXMLTest.java |  19 -
 .../org/apache/tika/parser/epub/tika-config.xml    |  26 ++
 .../test/resources/test-documents/testEPUB.epub    | Bin 30556 -> 30552 bytes
 10 files changed, 584 insertions(+), 96 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
index 9118542..f70f3e4 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
@@ -21,6 +21,7 @@ import org.apache.tika.exception.TikaException;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.OfflineContentHandler;
 import org.w3c.dom.Document;
+import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.DTDHandler;
 import org.xml.sax.EntityResolver;
@@ -768,6 +769,21 @@ public class XMLReaderUtils implements Serializable {
         return MAX_ENTITY_EXPANSIONS;
     }
 
+    /**
+     * Finds the value of the attribute with the given local name, ignoring namespace.
+     * @param localName local name of the attribute to look for
+     * @param atts attributes to search
+     * @return attribute value with that local name or <code>null</code> if not found
+     */
+    public static String getAttrValue(String localName, Attributes atts) {
+        for (int i = 0; i < atts.getLength(); i++) {
+            if (localName.equals(atts.getLocalName(i))) {
+                return atts.getValue(i);
+            }
+        }
+        return null;
+    }
+
     private static class PoolDOMBuilder {
         private final int poolGeneration;
         private final DocumentBuilder documentBuilder;
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index 797ae1b..cfa4d61 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -21,7 +21,9 @@ import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
+import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
+import java.io.EOFException;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
@@ -236,9 +238,18 @@ public abstract class TikaTest {
     protected List<Metadata> getRecursiveMetadata(InputStream is, boolean suppressException) throws Exception {
         return getRecursiveMetadata(is, new ParseContext(), new Metadata(), suppressException);
     }
+
+    protected List<Metadata> getRecursiveMetadata(InputStream is, Parser parser, boolean suppressException) throws Exception {
+        return getRecursiveMetadata(is, parser, new ParseContext(), new Metadata(), suppressException);
+    }
+
     protected List<Metadata> getRecursiveMetadata(InputStream is, ParseContext context, Metadata metadata,
                                                   boolean suppressException) throws Exception {
-        Parser p = new AutoDetectParser();
+        return getRecursiveMetadata(is, new AutoDetectParser(), context, metadata, suppressException);
+    }
+
+    protected List<Metadata> getRecursiveMetadata(InputStream is, Parser p, ParseContext context, Metadata metadata,
+                                                  boolean suppressException) throws Exception {
         RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p);
         RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
                 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
@@ -252,7 +263,7 @@ public abstract class TikaTest {
         return handler.getMetadataList();
     }
 
-        protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception {
+    protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception {
         Parser p = new AutoDetectParser();
         RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p);
 
@@ -371,6 +382,19 @@ public abstract class TikaTest {
         }
     }
 
+    public InputStream truncate(String testFileName, int truncatedLength) throws IOException {
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        try (InputStream is = getResourceAsStream("/test-documents/"+testFileName)) {
+            IOUtils.copy(is, bos);
+        }
+        if (truncatedLength > bos.toByteArray().length) {
+            throw new EOFException("Can't truncate beyond file length");
+        }
+        byte[] truncated = new byte[truncatedLength];
+        System.arraycopy(bos.toByteArray(), 0, truncated, 0, truncatedLength);
+        return TikaInputStream.get(truncated);
+    }
+
     public static void debug(List<Metadata> list) {
         int i = 0;
         for (Metadata m : list) {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java
index 775b319..df5b221 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java
@@ -20,24 +20,47 @@ import static java.nio.charset.StandardCharsets.UTF_8;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.net.URLDecoder;
+import java.nio.file.Path;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.Enumeration;
+import java.util.HashMap;
 import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
 import java.util.Set;
-import java.util.zip.ZipEntry;
-import java.util.zip.ZipInputStream;
+import java.util.zip.ZipException;
 
+import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
+import org.apache.commons.compress.archivers.zip.ZipFile;
 import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.tika.config.Field;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.utils.ZipSalvager;
 import org.apache.tika.parser.xml.DcXMLParser;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.OfflineContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.ParserUtils;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
@@ -80,6 +103,9 @@ public class EpubParser extends AbstractParser {
         return SUPPORTED_TYPES;
     }
 
+    @Field
+    boolean streaming = false;
+
     public void parse(
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
@@ -88,33 +114,364 @@ public class EpubParser extends AbstractParser {
         //  we need explicit control over the start and end of the document
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();
+        IOException caughtException = null;
         ContentHandler childHandler = new EmbeddedContentHandler(
-              new BodyContentHandler(xhtml));
-       
-        ZipInputStream zip = new ZipInputStream(stream);
-        ZipEntry entry = zip.getNextEntry();
+                new BodyContentHandler(xhtml));
+        if (streaming) {
+            try {
+                streamingParse(stream, childHandler, metadata, context);
+            } catch (IOException e) {
+                caughtException = e;
+            }
+        } else {
+            try {
+                bufferedParse(stream, childHandler, xhtml, metadata, context);
+            } catch (IOException e) {
+                caughtException = e;
+            }
+        }
+        // Finish everything
+        xhtml.endDocument();
+        if (caughtException != null) {
+            throw caughtException;
+        }
+    }
+
+    private void streamingParse(InputStream stream, ContentHandler bodyHandler,
+                                Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
+        ZipArchiveInputStream zip = new ZipArchiveInputStream(stream);
+
+        ZipArchiveEntry entry = zip.getNextZipEntry();
         while (entry != null) {
             if (entry.getName().equals("mimetype")) {
-                String type = IOUtils.toString(zip, UTF_8);
-                //often has trailing new lines
-                if (type != null) {
-                    type = type.trim();
-                }
-                metadata.set(Metadata.CONTENT_TYPE, type);
+                updateMimeType(zip, metadata);
             } else if (entry.getName().equals("metadata.xml")) {
                 meta.parse(zip, new DefaultHandler(), metadata, context);
             } else if (entry.getName().endsWith(".opf")) {
                 meta.parse(zip, new DefaultHandler(), metadata, context);
-            } else if (entry.getName().endsWith(".htm") || 
-                           entry.getName().endsWith(".html") || 
-            		   entry.getName().endsWith(".xhtml")) {
-                content.parse(zip, childHandler, metadata, context);
+            } else if (entry.getName().endsWith(".htm") ||
+                    entry.getName().endsWith(".html") ||
+                    entry.getName().endsWith(".xhtml")) {
+                content.parse(zip, bodyHandler, metadata, context);
             }
-            entry = zip.getNextEntry();
+            entry = zip.getNextZipEntry();
         }
-        
-        // Finish everything
-        xhtml.endDocument();
     }
 
+    private void updateMimeType(InputStream is, Metadata metadata) throws IOException {
+        String type = IOUtils.toString(is, UTF_8);
+        //often has trailing new lines
+        if (type != null) {
+            type = type.trim();
+        }
+        metadata.set(Metadata.CONTENT_TYPE, type);
+
+    }
+
+    private void bufferedParse(InputStream stream,
+                               ContentHandler bodyHandler, XHTMLContentHandler xhtml,
+                               Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
+        TikaInputStream tis;
+        TemporaryResources temporaryResources = null;
+        if (TikaInputStream.isTikaInputStream(stream)) {
+            tis = TikaInputStream.cast(stream);
+        } else {
+            temporaryResources = new TemporaryResources();
+            tis = TikaInputStream.get(new CloseShieldInputStream(stream), temporaryResources);
+        }
+        ZipFile zipFile = null;
+        try {
+            zipFile = new ZipFile(tis.getPath().toFile());
+        } catch (ZipException e) {
+            ParserUtils.recordParserFailure(this, e, metadata);
+            trySalvage(tis.getPath(), bodyHandler, xhtml, metadata, context);
+            return;
+        } finally {
+            //if we had to wrap tis
+            if (temporaryResources != null) {
+                tis.close();
+            }
+        }
+        bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata, context, true);
+    }
+
+    private void trySalvage(Path brokenZip, ContentHandler bodyHandler,
+                               XHTMLContentHandler xhtml,
+                               Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
+        TemporaryResources resources = new TemporaryResources();
+        try {
+            Path salvaged = resources.createTempFile();
+            ZipSalvager.salvageCopy(brokenZip.toFile(), salvaged.toFile());
+            boolean success = false;
+            try (ZipFile zipFile = new ZipFile(salvaged.toFile())) {
+                success = bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata, context, false);
+            }
+            if (! success) {
+                try (InputStream is = TikaInputStream.get(salvaged)) {
+                    streamingParse(is, xhtml, metadata, context);
+                }
+            }
+        } finally {
+            resources.close();
+        }
+    }
+
+    private boolean bufferedParseZipFile(ZipFile zipFile,
+                                         ContentHandler bodyHandler, XHTMLContentHandler xhtml,
+                                         Metadata metadata, ParseContext context,
+                                         boolean isStrict) throws IOException, TikaException, SAXException {
+        String rootOPF = getRoot(zipFile, context);
+        if (rootOPF == null) {
+            return false;
+        }
+        ZipArchiveEntry zae = zipFile.getEntry(rootOPF);
+        if (zae == null) {
+            return false;
+        }
+        if (!zipFile.canReadEntryData(zae)) {
+            return false;
+        }
+        meta.parse(zipFile.getInputStream(zae), new DefaultHandler(), metadata, context);
+
+        ContentOrderScraper contentOrderScraper = new ContentOrderScraper();
+        try (InputStream is = zipFile.getInputStream(zae)) {
+            XMLReaderUtils.parseSAX(is,
+                    new OfflineContentHandler(contentOrderScraper), context);
+        }
+        //if there are no content items, return false
+        if (contentOrderScraper.contentItems.size() == 0) {
+            return false;
+        }
+        String relativePath = "";
+        if (rootOPF.lastIndexOf("/") > -1) {
+            relativePath = rootOPF.substring(0, rootOPF.lastIndexOf("/") + 1);
+        }
+
+        if (isStrict) {
+            int found = 0;
+            for (String id : contentOrderScraper.contentItems) {
+                HRefMediaPair hRefMediaPair = contentOrderScraper.locationMap.get(id);
+                if (hRefMediaPair != null && hRefMediaPair.href != null) {
+                    zae = zipFile.getEntry(relativePath + hRefMediaPair.href);
+                    if (zae != null && zipFile.canReadEntryData(zae)) {
+                        found++;
+                    }
+                }
+            }
+            //if there is not a perfect match between items and readable items,
+            //return false
+            if (found != contentOrderScraper.contentItems.size()) {
+                return false;
+            }
+        }
+
+        extractMetadata(zipFile, metadata, context);
+        Set<String> processed = new HashSet<>();
+        for (String id : contentOrderScraper.contentItems) {
+            HRefMediaPair hRefMediaPair = contentOrderScraper.locationMap.get(id);
+            if (hRefMediaPair != null &&
+                    hRefMediaPair.href != null &&
+                    hRefMediaPair.href.toLowerCase(Locale.US).contains("html")) {
+                zae = zipFile.getEntry(relativePath+hRefMediaPair.href);
+                if (zae != null) {
+                    try (InputStream is = zipFile.getInputStream(zae)) {
+                        content.parse(is, bodyHandler, metadata, context);
+                        processed.add(id);
+                    }
+                }
+            }
+        }
+
+        //now handle embedded files
+        EmbeddedDocumentExtractor embeddedDocumentExtractor =
+                EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+        for (String id : contentOrderScraper.locationMap.keySet()) {
+            if (! processed.contains(id)) {
+                HRefMediaPair hRefMediaPair = contentOrderScraper.locationMap.get(id);
+                if (shouldHandleEmbedded(hRefMediaPair.media)) {
+                    handleEmbedded(zipFile, relativePath,
+                            hRefMediaPair, embeddedDocumentExtractor, xhtml, metadata);
+                }
+            }
+        }
+        return true;
+    }
+
+    private boolean shouldHandleEmbedded(String media) {
+        if (media == null) {
+            return true;
+        }
+        String lc = media.toLowerCase(Locale.US);
+        if (lc.contains("css")) {
+            return false;
+        } else if (lc.contains("svg")) {
+            return false;
+        } else if (lc.endsWith("/xml")) {
+            return false;
+        } else if (lc.contains("x-ibooks")) {
+            return false;
+        }
+        return true;
+    }
+
+    private void handleEmbedded(ZipFile zipFile, String relativePath,
+                                HRefMediaPair hRefMediaPair,
+                                EmbeddedDocumentExtractor embeddedDocumentExtractor,
+                                XHTMLContentHandler xhtml, Metadata parentMetadata) throws IOException, SAXException {
+        if (hRefMediaPair.href == null) {
+            return;
+        }
+        String fullPath = relativePath + hRefMediaPair.href;
+
+        ZipArchiveEntry ze = zipFile.getEntry(fullPath);
+        if (!zipFile.canReadEntryData(ze)) {
+            return;
+        }
+        Metadata embeddedMetadata = new Metadata();
+        if (!StringUtils.isBlank(hRefMediaPair.media)) {
+            embeddedMetadata.set(Metadata.CONTENT_TYPE, hRefMediaPair.media);
+        }
+        if (!embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
+            return;
+        }
+
+        TikaInputStream stream = null;
+        try {
+            stream = TikaInputStream.get(zipFile.getInputStream(ze));
+        } catch (IOException e) {
+            //store this exception in the parent's metadata
+            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
+            return;
+        }
+
+        xhtml.startElement("div", "class", "embedded");
+        try {
+            embeddedDocumentExtractor.parseEmbedded(
+                    stream,
+                    new EmbeddedContentHandler(xhtml),
+                    embeddedMetadata, false);
+
+        } finally {
+            IOUtils.closeQuietly(stream);
+        }
+        xhtml.endElement("div");
+    }
+
+    private void extractMetadata(ZipFile zipFile, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
+        ZipArchiveEntry zae = zipFile.getEntry("mimetype");
+        if (zae != null && zipFile.canReadEntryData(zae)) {
+            try (InputStream is = zipFile.getInputStream(zae)) {
+                updateMimeType(is, metadata);
+            }
+        }
+        zae = zipFile.getEntry("metadata.xml");
+        if (zae != null && zipFile.canReadEntryData(zae)) {
+            try (InputStream is = zipFile.getInputStream(zae)) {
+                meta.parse(is, new DefaultHandler(), metadata, context);
+            }
+        }
+    }
+
+    private String getRoot(ZipFile zipFile, ParseContext context) throws IOException, TikaException, SAXException {
+        ZipArchiveEntry container = zipFile.getEntry("META-INF/container.xml");
+        if (container != null) {
+            RootFinder rootFinder = new RootFinder();
+            try (InputStream is = zipFile.getInputStream(container)) {
+                XMLReaderUtils.parseSAX(is, new OfflineContentHandler(rootFinder), context);
+            }
+            return rootFinder.root;
+        } else {
+            Enumeration<ZipArchiveEntry> entryEnum = zipFile.getEntries();
+            while (entryEnum.hasMoreElements()) {
+                ZipArchiveEntry ze = entryEnum.nextElement();
+                if (ze.getName().toLowerCase(Locale.US).endsWith(".opf") &&
+                        zipFile.canReadEntryData(ze)) {
+                    return ze.getName();
+                }
+            }
+            return null;
+        }
+    }
+
+    private static class RootFinder extends DefaultHandler {
+        String root = null;
+        @Override
+        public void startElement(
+                String uri, String localName, String name, Attributes atts)
+                throws SAXException {
+            if ("rootfile".equalsIgnoreCase(localName)) {
+                root = XMLReaderUtils.getAttrValue("full-path", atts);
+            }
+        }
+    }
+
+    private static class ContentOrderScraper extends DefaultHandler {
+
+        Map<String, HRefMediaPair> locationMap = new HashMap<>();
+        List<String> contentItems = new ArrayList<>();
+        boolean inManifest = false;
+        boolean inSpine = false;
+
+        @Override
+        public void startElement(
+                String uri, String localName, String name, Attributes atts)
+                throws SAXException {
+            if ("manifest".equalsIgnoreCase(localName)) {
+                inManifest = true;
+            } else if ("spine".equalsIgnoreCase(localName)) {
+                inSpine = true;
+            }
+            if (inManifest) {
+                if ("item".equalsIgnoreCase(localName)) {
+                    String id = XMLReaderUtils.getAttrValue("id", atts);
+                    String href = XMLReaderUtils.getAttrValue("href", atts);
+                    String mime = XMLReaderUtils.getAttrValue("media-type", atts);
+                    if (id != null && href != null) {
+                        try {
+                            href = URLDecoder.decode(href, UTF_8.name());
+                        } catch (UnsupportedEncodingException e) {
+                        }
+                        locationMap.put(id, new HRefMediaPair(href, mime));
+                    }
+                }
+            }
+            if (inSpine) {
+                if ("itemRef".equalsIgnoreCase(localName)) {
+                    String id = XMLReaderUtils.getAttrValue("idref", atts);
+                    if (id != null) {
+                        contentItems.add(id);
+                    }
+                }
+            }
+        }
+
+
+        @Override
+        public void endElement(
+                String uri, String localName, String name)
+                throws SAXException {
+            if ("manifest".equalsIgnoreCase(localName)) {
+                inManifest = false;
+            } else if ("spine".equalsIgnoreCase(localName)) {
+                inSpine = false;
+            }
+        }
+    }
+    private static class HRefMediaPair {
+        private final String href;
+        private final String media;
+
+        HRefMediaPair(String href, String media) {
+            this.href = href;
+            this.media = media;
+        }
+
+        @Override
+        public String toString() {
+            return "HRefMediaPair{" +
+                    "href='" + href + '\'' +
+                    ", media='" + media + '\'' +
+                    '}';
+        }
+    }
 }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index bcf8ea8..017469b 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -16,16 +16,11 @@
  */
 package org.apache.tika.parser.microsoft.ooxml;
 
-import java.io.EOFException;
 import java.io.File;
-import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Locale;
 
-import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
-import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
-import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream;
 import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.poi.ooxml.POIXMLDocument;
 import org.apache.poi.ooxml.extractor.ExtractorFactory;
@@ -48,19 +43,18 @@ import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
 import org.apache.poi.xwpf.usermodel.XWPFDocument;
 import org.apache.poi.xwpf.usermodel.XWPFRelation;
 import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.IOUtils;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.EmptyParser;
 import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.chm.core.ChmExtractor;
 import org.apache.tika.parser.microsoft.OfficeParserConfig;
 import org.apache.tika.parser.microsoft.ooxml.xps.XPSExtractorDecorator;
 import org.apache.tika.parser.microsoft.ooxml.xps.XPSTextExtractor;
 import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor;
 import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
 import org.apache.tika.parser.pkg.ZipContainerDetector;
+import org.apache.tika.parser.utils.ZipSalvager;
 import org.apache.xmlbeans.XmlException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -99,7 +93,7 @@ public class OOXMLExtractorFactory {
                     pkg = OPCPackage.open(tis.getFile().getPath(), PackageAccess.READ);
                 } catch (InvalidOperationException e) {
                     tmpRepairedCopy = File.createTempFile("tika-ooxml-repair", "");
-                    repairCopy(tis.getFile(), tmpRepairedCopy);
+                    ZipSalvager.salvageCopy(tis.getFile(), tmpRepairedCopy);
                     pkg = OPCPackage.open(tmpRepairedCopy, PackageAccess.READ);
                 }
                 tis.setOpenContainer(pkg);
@@ -209,46 +203,6 @@ public class OOXMLExtractorFactory {
         }
     }
 
-    private static void repairCopy(File brokenZip, File fixedZip) {
-        try (ZipArchiveOutputStream outputStream = new ZipArchiveOutputStream(fixedZip)) {
-            try (InputStream is = new FileInputStream(brokenZip)) {
-                ZipArchiveInputStream zipArchiveInputStream = new ZipArchiveInputStream(is);
-                ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry();
-                while (zae != null) {
-                    try {
-                        if (!zae.isDirectory() && zipArchiveInputStream.canReadEntryData(zae)) {
-                            outputStream.putArchiveEntry(zae);
-                            //this will copy an incomplete stream...so there
-                            //could be truncation of the xml, but the zip file
-                            //should be intact.
-                            boolean successfullyCopied = false;
-                            try {
-                                IOUtils.copy(zipArchiveInputStream, outputStream);
-                                successfullyCopied = true;
-                            } catch (IOException e) {
-                                //this can hit a "truncated ZipFile" IOException
-                            }
-                            outputStream.flush();
-                            outputStream.closeArchiveEntry();
-                            if (!successfullyCopied) {
-                                break;
-                            }
-                        }
-                        zae = zipArchiveInputStream.getNextZipEntry();
-                    } catch (EOFException e) {
-                        break;
-                    }
-
-                }
-                outputStream.flush();
-                outputStream.finish();
-                outputStream.close();
-            }
-        } catch (IOException e) {
-            LOG.warn("problem fixing zip", e);
-        }
-    }
-
     private static POIXMLTextExtractor trySXWPF(OPCPackage pkg) throws XmlException, OpenXML4JException, IOException {
         PackageRelationshipCollection packageRelationshipCollection = pkg.getRelationshipsByType("http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument");
         if (packageRelationshipCollection.size() == 0) {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/utils/ZipSalvager.java b/tika-parsers/src/main/java/org/apache/tika/parser/utils/ZipSalvager.java
new file mode 100644
index 0000000..20ebf1b
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/utils/ZipSalvager.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.utils;
+
+import java.io.EOFException;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.zip.ZipException;
+
+import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
+import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream;
+import org.apache.tika.io.IOUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class ZipSalvager {
+
+    private static final Logger LOG = LoggerFactory.getLogger(ZipSalvager.class);
+
+    /**
+     * This streams the broken zip and rebuilds a new zip that
+     * is at least a valid zip file.  The contents of the final stream
+     * may be truncated, but the result should be a valid zip file.
+     *
+     * This does nothing fancy to fix the underlying broken zip.
+     *
+     * @param brokenZip
+     * @param salvagedZip
+     */
+    public static void salvageCopy(File brokenZip, File salvagedZip) {
+        try (ZipArchiveOutputStream outputStream = new ZipArchiveOutputStream(salvagedZip)) {
+            try (InputStream is = new FileInputStream(brokenZip)) {
+                ZipArchiveInputStream zipArchiveInputStream = new ZipArchiveInputStream(is);
+                ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry();
+                while (zae != null) {
+                    try {
+                        if (!zae.isDirectory() && zipArchiveInputStream.canReadEntryData(zae)) {
+                            //create a new ZAE and copy over only the name so that
+                            //if there is bad info (e.g. CRC) in brokenZip's zae, that
+                            //won't be propagated or cause an exception
+                            outputStream.putArchiveEntry(new ZipArchiveEntry(zae.getName()));
+                            //this will copy an incomplete stream...so there
+                            //could be truncation of the xml/contents, but the zip file
+                            //should be intact.
+                            boolean successfullyCopied = false;
+                            try {
+                                IOUtils.copy(zipArchiveInputStream, outputStream);
+                                successfullyCopied = true;
+                            } catch (IOException e) {
+                                //this can hit a "truncated ZipFile" IOException
+                            }
+                            outputStream.flush();
+                            outputStream.closeArchiveEntry();
+                            if (!successfullyCopied) {
+                                break;
+                            }
+                        }
+                        zae = zipArchiveInputStream.getNextZipEntry();
+                    } catch (ZipException|EOFException e) {
+                        break;
+                    }
+
+                }
+                outputStream.flush();
+                outputStream.finish();
+                outputStream.close();
+            }
+        } catch (IOException e) {
+            LOG.warn("problem fixing zip", e);
+        }
+    }
+}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/dbf/DBFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/dbf/DBFParserTest.java
index 3ab043b..ac33de7 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/dbf/DBFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/dbf/DBFParserTest.java
@@ -148,11 +148,4 @@ commented out until we get permission to add the test file
     }
     */
 
-    InputStream truncate(String testFileName, int length) throws IOException {
-        byte[] bytes = new byte[length];
-        try (InputStream is = getResourceAsStream("/test-documents/" + testFileName)) {
-            IOUtils.readFully(is, bytes);
-        }
-        return new ByteArrayInputStream(bytes);
-    }
 }
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java
index 71c91a1..b3d2401 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java
@@ -17,10 +17,18 @@
 package org.apache.tika.parser.epub;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+
+import java.io.InputStream;
+import java.util.List;
 
 import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.RecursiveParserWrapperHandler;
 import org.junit.Test;
 
 public class EpubParserTest extends TikaTest {
@@ -55,7 +63,47 @@ public class EpubParserTest extends TikaTest {
         assertContainsCount("<html", content, 1);
         assertContainsCount("<head", content, 1);
         assertContainsCount("<body", content, 1);
+    }
+
+    @Test
+    public void testEpubOrder() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testEPUB.epub");
 
+        //test attachments
+        assertEquals(3, metadataList.size());
+        String xml = metadataList.get(0).get(RecursiveParserWrapperHandler.TIKA_CONTENT);
+        int tocIndex = xml.indexOf("h3 class=\"toc_heading\">Table of Contents<");
+        int ch1 = xml.indexOf("<h1>Chapter 1");
+        int ch2 = xml.indexOf("<h1>Chapter 2");
+        assert(tocIndex > -1 && ch1 > -1 && ch2 > -1);
+        assert(tocIndex < ch1);
+        assert(tocIndex < ch2);
+        assert(ch1 < ch2);
+
+        InputStream is = getClass().getResourceAsStream("/org/apache/tika/parser/epub/tika-config.xml");
+        assertNotNull(is);
+        Parser p = new AutoDetectParser(new TikaConfig(is));
+        xml = getXML("testEPUB.epub", p).xml;
+        tocIndex = xml.indexOf("h3 class=\"toc_heading\">Table of Contents<");
+        ch1 = xml.indexOf("<h1>Chapter 1");
+        ch2 = xml.indexOf("<h1>Chapter 2");
+        assert(tocIndex > -1 && ch1 > -1 && ch2 > -1);
+        assert(tocIndex > ch1);
+        assert(tocIndex > ch2);
+        assert(ch1 < ch2);
     }
 
+
+    @Test
+    public void testTruncated() throws Exception {
+        Parser p = new EpubParser();
+        List<Metadata> metadataList;
+        try (InputStream is = truncate("testEPUB.epub", 10000)) {
+            metadataList = getRecursiveMetadata(is, p, true);
+        }
+        String xml = metadataList.get(0).get(RecursiveParserWrapperHandler.TIKA_CONTENT);
+        int ch1 = xml.indexOf("<h1>Chapter 1");
+        int ch2 = xml.indexOf("<h1>Chapter 2");
+        assert(ch1 < ch2);
+    }
 }
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java
index 913125e..1cf1874 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java
@@ -99,25 +99,6 @@ public class TruncatedOOXMLTest extends TikaTest {
         }
     }
 
-    private InputStream truncate(String fileName, int length) throws IOException {
-        ByteArrayOutputStream bos = new ByteArrayOutputStream();
-        int bufferSize = 4096;
-        byte[] buffer = new byte[bufferSize];
-        int bytesRead = 0;
-        int toRead = length;
-        try (InputStream is = getResourceAsStream("/test-documents/"+fileName)) {
-            while (toRead > 0) {
-                int justRead = is.read(buffer, 0, Math.min(bufferSize, toRead));
-                if (justRead == -1) {
-                    throw new EOFException("eof reached");
-                }
-                bos.write(buffer, 0, justRead);
-                toRead -= justRead;
-            }
-        }
-        return new ByteArrayInputStream(bos.toByteArray());
-    }
-
     @Test
     @Ignore("for dev/debugging only")
     public void listStreams() throws Exception {
diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/epub/tika-config.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/epub/tika-config.xml
new file mode 100644
index 0000000..5dbd625
--- /dev/null
+++ b/tika-parsers/src/test/resources/org/apache/tika/parser/epub/tika-config.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.epub.EpubParser">
+            <params>
+                <param name="streaming" type="bool">true</param>
+            </params>
+        </parser>
+    </parsers>
+</properties>
diff --git a/tika-parsers/src/test/resources/test-documents/testEPUB.epub b/tika-parsers/src/test/resources/test-documents/testEPUB.epub
index 5965601..a88df80 100644
Binary files a/tika-parsers/src/test/resources/test-documents/testEPUB.epub and b/tika-parsers/src/test/resources/test-documents/testEPUB.epub differ