You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/03/21 20:41:06 UTC
[tika] branch master updated: TIKA-2841 - focusing on epub,
but also fixing TIKA-2310, and handling embedded images/attachments
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 4131c6e TIKA-2841 - focusing on epub, but also fixing TIKA-2310, and handling embedded images/attachments
4131c6e is described below
commit 4131c6e30f2e0eb1feb85e0f7576531d4e830468
Author: TALLISON <ta...@apache.org>
AuthorDate: Thu Mar 21 16:40:47 2019 -0400
TIKA-2841 - focusing on epub, but also fixing TIKA-2310, and handling embedded images/attachments
---
.../java/org/apache/tika/utils/XMLReaderUtils.java | 16 +
.../src/test/java/org/apache/tika/TikaTest.java | 28 +-
.../org/apache/tika/parser/epub/EpubParser.java | 397 +++++++++++++++++++--
.../microsoft/ooxml/OOXMLExtractorFactory.java | 50 +--
.../org/apache/tika/parser/utils/ZipSalvager.java | 89 +++++
.../org/apache/tika/parser/dbf/DBFParserTest.java | 7 -
.../apache/tika/parser/epub/EpubParserTest.java | 48 +++
.../parser/microsoft/ooxml/TruncatedOOXMLTest.java | 19 -
.../org/apache/tika/parser/epub/tika-config.xml | 26 ++
.../test/resources/test-documents/testEPUB.epub | Bin 30556 -> 30552 bytes
10 files changed, 584 insertions(+), 96 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
index 9118542..f70f3e4 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
@@ -21,6 +21,7 @@ import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.OfflineContentHandler;
import org.w3c.dom.Document;
+import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.DTDHandler;
import org.xml.sax.EntityResolver;
@@ -768,6 +769,21 @@ public class XMLReaderUtils implements Serializable {
return MAX_ENTITY_EXPANSIONS;
}
+ /**
+ *
+ * @param localName local name of the attribute to look up
+ * @param atts attributes to search for that local name
+ * @return attribute value with that local name or <code>null</code> if not found
+ */
+ public static String getAttrValue(String localName, Attributes atts) {
+ for (int i = 0; i < atts.getLength(); i++) {
+ if (localName.equals(atts.getLocalName(i))) {
+ return atts.getValue(i);
+ }
+ }
+ return null;
+ }
+
private static class PoolDOMBuilder {
private final int poolGeneration;
private final DocumentBuilder documentBuilder;
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index 0de69aa..931266c 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -21,7 +21,9 @@ import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
+import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
+import java.io.EOFException;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
@@ -237,9 +239,18 @@ public abstract class TikaTest {
protected List<Metadata> getRecursiveMetadata(InputStream is, boolean suppressException) throws Exception {
return getRecursiveMetadata(is, new ParseContext(), new Metadata(), suppressException);
}
+
+ protected List<Metadata> getRecursiveMetadata(InputStream is, Parser parser, boolean suppressException) throws Exception {
+ return getRecursiveMetadata(is, parser, new ParseContext(), new Metadata(), suppressException);
+ }
+
protected List<Metadata> getRecursiveMetadata(InputStream is, ParseContext context, Metadata metadata,
boolean suppressException) throws Exception {
- Parser p = new AutoDetectParser();
+ return getRecursiveMetadata(is, new AutoDetectParser(), context, metadata, suppressException);
+ }
+
+ protected List<Metadata> getRecursiveMetadata(InputStream is, Parser p, ParseContext context, Metadata metadata,
+ boolean suppressException) throws Exception {
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p);
RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
@@ -253,7 +264,7 @@ public abstract class TikaTest {
return handler.getMetadataList();
}
- protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception {
+ protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception {
Parser p = new AutoDetectParser();
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p);
@@ -372,6 +383,19 @@ public abstract class TikaTest {
}
}
+ public InputStream truncate(String testFileName, int truncatedLength) throws IOException {
+ ByteArrayOutputStream bos = new ByteArrayOutputStream();
+ try (InputStream is = getResourceAsStream("/test-documents/"+testFileName)) {
+ IOUtils.copy(is, bos);
+ }
+ if (truncatedLength > bos.toByteArray().length) {
+ throw new EOFException("Can't truncate beyond file length");
+ }
+ byte[] truncated = new byte[truncatedLength];
+ System.arraycopy(bos.toByteArray(), 0, truncated, 0, truncatedLength);
+ return TikaInputStream.get(truncated);
+ }
+
public static void debug(List<Metadata> list) {
int i = 0;
for (Metadata m : list) {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java
index 775b319..df5b221 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java
@@ -20,24 +20,47 @@ import static java.nio.charset.StandardCharsets.UTF_8;
import java.io.IOException;
import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.net.URLDecoder;
+import java.nio.file.Path;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
+import java.util.Enumeration;
+import java.util.HashMap;
import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
import java.util.Set;
-import java.util.zip.ZipEntry;
-import java.util.zip.ZipInputStream;
+import java.util.zip.ZipException;
+import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
+import org.apache.commons.compress.archivers.zip.ZipFile;
import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.tika.config.Field;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.utils.ZipSalvager;
import org.apache.tika.parser.xml.DcXMLParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.ParserUtils;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
@@ -80,6 +103,9 @@ public class EpubParser extends AbstractParser {
return SUPPORTED_TYPES;
}
+ @Field
+ boolean streaming = false;
+
public void parse(
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
@@ -88,33 +114,364 @@ public class EpubParser extends AbstractParser {
// we need explicit control over the start and end of the document
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
+ IOException caughtException = null;
ContentHandler childHandler = new EmbeddedContentHandler(
- new BodyContentHandler(xhtml));
-
- ZipInputStream zip = new ZipInputStream(stream);
- ZipEntry entry = zip.getNextEntry();
+ new BodyContentHandler(xhtml));
+ if (streaming) {
+ try {
+ streamingParse(stream, childHandler, metadata, context);
+ } catch (IOException e) {
+ caughtException = e;
+ }
+ } else {
+ try {
+ bufferedParse(stream, childHandler, xhtml, metadata, context);
+ } catch (IOException e) {
+ caughtException = e;
+ }
+ }
+ // Finish everything
+ xhtml.endDocument();
+ if (caughtException != null) {
+ throw caughtException;
+ }
+ }
+
+ private void streamingParse(InputStream stream, ContentHandler bodyHandler,
+ Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
+ ZipArchiveInputStream zip = new ZipArchiveInputStream(stream);
+
+ ZipArchiveEntry entry = zip.getNextZipEntry();
while (entry != null) {
if (entry.getName().equals("mimetype")) {
- String type = IOUtils.toString(zip, UTF_8);
- //often has trailing new lines
- if (type != null) {
- type = type.trim();
- }
- metadata.set(Metadata.CONTENT_TYPE, type);
+ updateMimeType(zip, metadata);
} else if (entry.getName().equals("metadata.xml")) {
meta.parse(zip, new DefaultHandler(), metadata, context);
} else if (entry.getName().endsWith(".opf")) {
meta.parse(zip, new DefaultHandler(), metadata, context);
- } else if (entry.getName().endsWith(".htm") ||
- entry.getName().endsWith(".html") ||
- entry.getName().endsWith(".xhtml")) {
- content.parse(zip, childHandler, metadata, context);
+ } else if (entry.getName().endsWith(".htm") ||
+ entry.getName().endsWith(".html") ||
+ entry.getName().endsWith(".xhtml")) {
+ content.parse(zip, bodyHandler, metadata, context);
}
- entry = zip.getNextEntry();
+ entry = zip.getNextZipEntry();
}
-
- // Finish everything
- xhtml.endDocument();
}
+ private void updateMimeType(InputStream is, Metadata metadata) throws IOException {
+ String type = IOUtils.toString(is, UTF_8);
+ //often has trailing new lines
+ if (type != null) {
+ type = type.trim();
+ }
+ metadata.set(Metadata.CONTENT_TYPE, type);
+
+ }
+
+ private void bufferedParse(InputStream stream,
+ ContentHandler bodyHandler, XHTMLContentHandler xhtml,
+ Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
+ TikaInputStream tis;
+ TemporaryResources temporaryResources = null;
+ if (TikaInputStream.isTikaInputStream(stream)) {
+ tis = TikaInputStream.cast(stream);
+ } else {
+ temporaryResources = new TemporaryResources();
+ tis = TikaInputStream.get(new CloseShieldInputStream(stream), temporaryResources);
+ }
+ ZipFile zipFile = null;
+ try {
+ zipFile = new ZipFile(tis.getPath().toFile());
+ } catch (ZipException e) {
+ ParserUtils.recordParserFailure(this, e, metadata);
+ trySalvage(tis.getPath(), bodyHandler, xhtml, metadata, context);
+ return;
+ } finally {
+ //if we had to wrap tis
+ if (temporaryResources != null) {
+ tis.close();
+ }
+ }
+ bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata, context, true);
+ }
+
+ private void trySalvage(Path brokenZip, ContentHandler bodyHandler,
+ XHTMLContentHandler xhtml,
+ Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
+ TemporaryResources resources = new TemporaryResources();
+ try {
+ Path salvaged = resources.createTempFile();
+ ZipSalvager.salvageCopy(brokenZip.toFile(), salvaged.toFile());
+ boolean success = false;
+ try (ZipFile zipFile = new ZipFile(salvaged.toFile())) {
+ success = bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata, context, false);
+ }
+ if (! success) {
+ try (InputStream is = TikaInputStream.get(salvaged)) {
+ streamingParse(is, xhtml, metadata, context);
+ }
+ }
+ } finally {
+ resources.close();
+ }
+ }
+
+ private boolean bufferedParseZipFile(ZipFile zipFile,
+ ContentHandler bodyHandler, XHTMLContentHandler xhtml,
+ Metadata metadata, ParseContext context,
+ boolean isStrict) throws IOException, TikaException, SAXException {
+ String rootOPF = getRoot(zipFile, context);
+ if (rootOPF == null) {
+ return false;
+ }
+ ZipArchiveEntry zae = zipFile.getEntry(rootOPF);
+ if (zae == null) {
+ return false;
+ }
+ if (!zipFile.canReadEntryData(zae)) {
+ return false;
+ }
+ meta.parse(zipFile.getInputStream(zae), new DefaultHandler(), metadata, context);
+
+ ContentOrderScraper contentOrderScraper = new ContentOrderScraper();
+ try (InputStream is = zipFile.getInputStream(zae)) {
+ XMLReaderUtils.parseSAX(is,
+ new OfflineContentHandler(contentOrderScraper), context);
+ }
+ //if no content items, false
+ if (contentOrderScraper.contentItems.size() == 0) {
+ return false;
+ }
+ String relativePath = "";
+ if (rootOPF.lastIndexOf("/") > -1) {
+ relativePath = rootOPF.substring(0, rootOPF.lastIndexOf("/") + 1);
+ }
+
+ if (isStrict) {
+ int found = 0;
+ for (String id : contentOrderScraper.contentItems) {
+ HRefMediaPair hRefMediaPair = contentOrderScraper.locationMap.get(id);
+ if (hRefMediaPair != null && hRefMediaPair.href != null) {
+ zae = zipFile.getEntry(relativePath + hRefMediaPair.href);
+ if (zae != null && zipFile.canReadEntryData(zae)) {
+ found++;
+ }
+ }
+ }
+ //if there is not a perfect match between the content items
+ //and the readable zip entries, return false
+ if (found != contentOrderScraper.contentItems.size()) {
+ return false;
+ }
+ }
+
+ extractMetadata(zipFile, metadata, context);
+ Set<String> processed = new HashSet<>();
+ for (String id : contentOrderScraper.contentItems) {
+ HRefMediaPair hRefMediaPair = contentOrderScraper.locationMap.get(id);
+ if (hRefMediaPair != null &&
+ hRefMediaPair.href != null &&
+ hRefMediaPair.href.toLowerCase(Locale.US).contains("html")) {
+ zae = zipFile.getEntry(relativePath+hRefMediaPair.href);
+ if (zae != null) {
+ try (InputStream is = zipFile.getInputStream(zae)) {
+ content.parse(is, bodyHandler, metadata, context);
+ processed.add(id);
+ }
+ }
+ }
+ }
+
+ //now handle embedded files
+ EmbeddedDocumentExtractor embeddedDocumentExtractor =
+ EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+ for (String id : contentOrderScraper.locationMap.keySet()) {
+ if (! processed.contains(id)) {
+ HRefMediaPair hRefMediaPair = contentOrderScraper.locationMap.get(id);
+ if (shouldHandleEmbedded(hRefMediaPair.media)) {
+ handleEmbedded(zipFile, relativePath,
+ hRefMediaPair, embeddedDocumentExtractor, xhtml, metadata);
+ }
+ }
+ }
+ return true;
+ }
+
+ private boolean shouldHandleEmbedded(String media) {
+ if (media == null) {
+ return true;
+ }
+ String lc = media.toLowerCase(Locale.US);
+ if (lc.contains("css")) {
+ return false;
+ } else if (lc.contains("svg")) {
+ return false;
+ } else if (lc.endsWith("/xml")) {
+ return false;
+ } else if (lc.contains("x-ibooks")) {
+ return false;
+ }
+ return true;
+ }
+
+ private void handleEmbedded(ZipFile zipFile, String relativePath,
+ HRefMediaPair hRefMediaPair,
+ EmbeddedDocumentExtractor embeddedDocumentExtractor,
+ XHTMLContentHandler xhtml, Metadata parentMetadata) throws IOException, SAXException {
+ if (hRefMediaPair.href == null) {
+ return;
+ }
+ String fullPath = relativePath + hRefMediaPair.href;
+
+ ZipArchiveEntry ze = zipFile.getEntry(fullPath);
+ if (!zipFile.canReadEntryData(ze)) {
+ return;
+ }
+ Metadata embeddedMetadata = new Metadata();
+ if (!StringUtils.isBlank(hRefMediaPair.media)) {
+ embeddedMetadata.set(Metadata.CONTENT_TYPE, hRefMediaPair.media);
+ }
+ if (!embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
+ return;
+ }
+
+ TikaInputStream stream = null;
+ try {
+ stream = TikaInputStream.get(zipFile.getInputStream(ze));
+ } catch (IOException e) {
+ //store this exception in the parent's metadata
+ EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
+ return;
+ }
+
+ xhtml.startElement("div", "class", "embedded");
+ try {
+ embeddedDocumentExtractor.parseEmbedded(
+ stream,
+ new EmbeddedContentHandler(xhtml),
+ embeddedMetadata, false);
+
+ } finally {
+ IOUtils.closeQuietly(stream);
+ }
+ xhtml.endElement("div");
+ }
+
+ private void extractMetadata(ZipFile zipFile, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
+ ZipArchiveEntry zae = zipFile.getEntry("mimetype");
+ if (zae != null && zipFile.canReadEntryData(zae)) {
+ try (InputStream is = zipFile.getInputStream(zae)) {
+ updateMimeType(is, metadata);
+ }
+ }
+ zae = zipFile.getEntry("metadata.xml");
+ if (zae != null && zipFile.canReadEntryData(zae)) {
+ try (InputStream is = zipFile.getInputStream(zae)) {
+ meta.parse(is, new DefaultHandler(), metadata, context);
+ }
+ }
+ }
+
+ private String getRoot(ZipFile zipFile, ParseContext context) throws IOException, TikaException, SAXException {
+ ZipArchiveEntry container = zipFile.getEntry("META-INF/container.xml");
+ if (container != null) {
+ RootFinder rootFinder = new RootFinder();
+ try (InputStream is = zipFile.getInputStream(container)) {
+ XMLReaderUtils.parseSAX(is, new OfflineContentHandler(rootFinder), context);
+ }
+ return rootFinder.root;
+ } else {
+ Enumeration<ZipArchiveEntry> entryEnum = zipFile.getEntries();
+ while (entryEnum.hasMoreElements()) {
+ ZipArchiveEntry ze = entryEnum.nextElement();
+ if (ze.getName().toLowerCase(Locale.US).endsWith(".opf") &&
+ zipFile.canReadEntryData(ze)) {
+ return ze.getName();
+ }
+ }
+ return null;
+ }
+ }
+
+ private static class RootFinder extends DefaultHandler {
+ String root = null;
+ @Override
+ public void startElement(
+ String uri, String localName, String name, Attributes atts)
+ throws SAXException {
+ if ("rootfile".equalsIgnoreCase(localName)) {
+ root = XMLReaderUtils.getAttrValue("full-path", atts);
+ }
+ }
+ }
+
+ private static class ContentOrderScraper extends DefaultHandler {
+
+ Map<String, HRefMediaPair> locationMap = new HashMap<>();
+ List<String> contentItems = new ArrayList<>();
+ boolean inManifest = false;
+ boolean inSpine = false;
+
+ @Override
+ public void startElement(
+ String uri, String localName, String name, Attributes atts)
+ throws SAXException {
+ if ("manifest".equalsIgnoreCase(localName)) {
+ inManifest = true;
+ } else if ("spine".equalsIgnoreCase(localName)) {
+ inSpine = true;
+ }
+ if (inManifest) {
+ if ("item".equalsIgnoreCase(localName)) {
+ String id = XMLReaderUtils.getAttrValue("id", atts);
+ String href = XMLReaderUtils.getAttrValue("href", atts);
+ String mime = XMLReaderUtils.getAttrValue("media-type", atts);
+ if (id != null && href != null) {
+ try {
+ href = URLDecoder.decode(href, UTF_8.name());
+ } catch (UnsupportedEncodingException e) {
+ }
+ locationMap.put(id, new HRefMediaPair(href, mime));
+ }
+ }
+ }
+ if (inSpine) {
+ if ("itemRef".equalsIgnoreCase(localName)) {
+ String id = XMLReaderUtils.getAttrValue("idref", atts);
+ if (id != null) {
+ contentItems.add(id);
+ }
+ }
+ }
+ }
+
+
+ @Override
+ public void endElement(
+ String uri, String localName, String name)
+ throws SAXException {
+ if ("manifest".equalsIgnoreCase(localName)) {
+ inManifest = false;
+ } else if ("spine".equalsIgnoreCase(localName)) {
+ inSpine = false;
+ }
+ }
+ }
+ private static class HRefMediaPair {
+ private final String href;
+ private final String media;
+
+ HRefMediaPair(String href, String media) {
+ this.href = href;
+ this.media = media;
+ }
+
+ @Override
+ public String toString() {
+ return "HRefMediaPair{" +
+ "href='" + href + '\'' +
+ ", media='" + media + '\'' +
+ '}';
+ }
+ }
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index bcf8ea8..017469b 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -16,16 +16,11 @@
*/
package org.apache.tika.parser.microsoft.ooxml;
-import java.io.EOFException;
import java.io.File;
-import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Locale;
-import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
-import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
-import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.poi.ooxml.POIXMLDocument;
import org.apache.poi.ooxml.extractor.ExtractorFactory;
@@ -48,19 +43,18 @@ import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFRelation;
import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.IOUtils;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.chm.core.ChmExtractor;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.parser.microsoft.ooxml.xps.XPSExtractorDecorator;
import org.apache.tika.parser.microsoft.ooxml.xps.XPSTextExtractor;
import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor;
import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
import org.apache.tika.parser.pkg.ZipContainerDetector;
+import org.apache.tika.parser.utils.ZipSalvager;
import org.apache.xmlbeans.XmlException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -99,7 +93,7 @@ public class OOXMLExtractorFactory {
pkg = OPCPackage.open(tis.getFile().getPath(), PackageAccess.READ);
} catch (InvalidOperationException e) {
tmpRepairedCopy = File.createTempFile("tika-ooxml-repair", "");
- repairCopy(tis.getFile(), tmpRepairedCopy);
+ ZipSalvager.salvageCopy(tis.getFile(), tmpRepairedCopy);
pkg = OPCPackage.open(tmpRepairedCopy, PackageAccess.READ);
}
tis.setOpenContainer(pkg);
@@ -209,46 +203,6 @@ public class OOXMLExtractorFactory {
}
}
- private static void repairCopy(File brokenZip, File fixedZip) {
- try (ZipArchiveOutputStream outputStream = new ZipArchiveOutputStream(fixedZip)) {
- try (InputStream is = new FileInputStream(brokenZip)) {
- ZipArchiveInputStream zipArchiveInputStream = new ZipArchiveInputStream(is);
- ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry();
- while (zae != null) {
- try {
- if (!zae.isDirectory() && zipArchiveInputStream.canReadEntryData(zae)) {
- outputStream.putArchiveEntry(zae);
- //this will copy an incomplete stream...so there
- //could be truncation of the xml, but the zip file
- //should be intact.
- boolean successfullyCopied = false;
- try {
- IOUtils.copy(zipArchiveInputStream, outputStream);
- successfullyCopied = true;
- } catch (IOException e) {
- //this can hit a "truncated ZipFile" IOException
- }
- outputStream.flush();
- outputStream.closeArchiveEntry();
- if (!successfullyCopied) {
- break;
- }
- }
- zae = zipArchiveInputStream.getNextZipEntry();
- } catch (EOFException e) {
- break;
- }
-
- }
- outputStream.flush();
- outputStream.finish();
- outputStream.close();
- }
- } catch (IOException e) {
- LOG.warn("problem fixing zip", e);
- }
- }
-
private static POIXMLTextExtractor trySXWPF(OPCPackage pkg) throws XmlException, OpenXML4JException, IOException {
PackageRelationshipCollection packageRelationshipCollection = pkg.getRelationshipsByType("http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument");
if (packageRelationshipCollection.size() == 0) {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/utils/ZipSalvager.java b/tika-parsers/src/main/java/org/apache/tika/parser/utils/ZipSalvager.java
new file mode 100644
index 0000000..20ebf1b
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/utils/ZipSalvager.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.utils;
+
+import java.io.EOFException;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.zip.ZipException;
+
+import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
+import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream;
+import org.apache.tika.io.IOUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class ZipSalvager {
+
+ private static final Logger LOG = LoggerFactory.getLogger(ZipSalvager.class);
+
+ /**
+ * This streams the broken zip and rebuilds a new zip that
+ * is at least a valid zip file. The contents of the final stream
+ * may be truncated, but the result should be a valid zip file.
+ *
+ * This does nothing fancy to fix the underlying broken zip.
+ *
+ * @param brokenZip
+ * @param salvagedZip
+ */
+ public static void salvageCopy(File brokenZip, File salvagedZip) {
+ try (ZipArchiveOutputStream outputStream = new ZipArchiveOutputStream(salvagedZip)) {
+ try (InputStream is = new FileInputStream(brokenZip)) {
+ ZipArchiveInputStream zipArchiveInputStream = new ZipArchiveInputStream(is);
+ ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry();
+ while (zae != null) {
+ try {
+ if (!zae.isDirectory() && zipArchiveInputStream.canReadEntryData(zae)) {
+ //create a new ZAE and copy over only the name so that
+ //if there is bad info (e.g. CRC) in brokenZip's zae, that
+ //won't be propagated or cause an exception
+ outputStream.putArchiveEntry(new ZipArchiveEntry(zae.getName()));
+ //this will copy an incomplete stream...so there
+ //could be truncation of the xml/contents, but the zip file
+ //should be intact.
+ boolean successfullyCopied = false;
+ try {
+ IOUtils.copy(zipArchiveInputStream, outputStream);
+ successfullyCopied = true;
+ } catch (IOException e) {
+ //this can hit a "truncated ZipFile" IOException
+ }
+ outputStream.flush();
+ outputStream.closeArchiveEntry();
+ if (!successfullyCopied) {
+ break;
+ }
+ }
+ zae = zipArchiveInputStream.getNextZipEntry();
+ } catch (ZipException|EOFException e) {
+ break;
+ }
+
+ }
+ outputStream.flush();
+ outputStream.finish();
+ outputStream.close();
+ }
+ } catch (IOException e) {
+ LOG.warn("problem fixing zip", e);
+ }
+ }
+}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/dbf/DBFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/dbf/DBFParserTest.java
index 3ab043b..ac33de7 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/dbf/DBFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/dbf/DBFParserTest.java
@@ -148,11 +148,4 @@ commented out until we get permission to add the test file
}
*/
- InputStream truncate(String testFileName, int length) throws IOException {
- byte[] bytes = new byte[length];
- try (InputStream is = getResourceAsStream("/test-documents/" + testFileName)) {
- IOUtils.readFully(is, bytes);
- }
- return new ByteArrayInputStream(bytes);
- }
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java
index 71c91a1..b3d2401 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java
@@ -17,10 +17,18 @@
package org.apache.tika.parser.epub;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+
+import java.io.InputStream;
+import java.util.List;
import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.RecursiveParserWrapperHandler;
import org.junit.Test;
public class EpubParserTest extends TikaTest {
@@ -55,7 +63,47 @@ public class EpubParserTest extends TikaTest {
assertContainsCount("<html", content, 1);
assertContainsCount("<head", content, 1);
assertContainsCount("<body", content, 1);
+ }
+
+ @Test
+ public void testEpubOrder() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata("testEPUB.epub");
+ //test attachments
+ assertEquals(3, metadataList.size());
+ String xml = metadataList.get(0).get(RecursiveParserWrapperHandler.TIKA_CONTENT);
+ int tocIndex = xml.indexOf("h3 class=\"toc_heading\">Table of Contents<");
+ int ch1 = xml.indexOf("<h1>Chapter 1");
+ int ch2 = xml.indexOf("<h1>Chapter 2");
+ assert(tocIndex > -1 && ch1 > -1 && ch2 > -1);
+ assert(tocIndex < ch1);
+ assert(tocIndex < ch2);
+ assert(ch1 < ch2);
+
+ InputStream is = getClass().getResourceAsStream("/org/apache/tika/parser/epub/tika-config.xml");
+ assertNotNull(is);
+ Parser p = new AutoDetectParser(new TikaConfig(is));
+ xml = getXML("testEPUB.epub", p).xml;
+ tocIndex = xml.indexOf("h3 class=\"toc_heading\">Table of Contents<");
+ ch1 = xml.indexOf("<h1>Chapter 1");
+ ch2 = xml.indexOf("<h1>Chapter 2");
+ assert(tocIndex > -1 && ch1 > -1 && ch2 > -1);
+ assert(tocIndex > ch1);
+ assert(tocIndex > ch2);
+ assert(ch1 < ch2);
}
+
+ @Test
+ public void testTruncated() throws Exception {
+ Parser p = new EpubParser();
+ List<Metadata> metadataList;
+ try (InputStream is = truncate("testEPUB.epub", 10000)) {
+ metadataList = getRecursiveMetadata(is, p, true);
+ }
+ String xml = metadataList.get(0).get(RecursiveParserWrapperHandler.TIKA_CONTENT);
+ int ch1 = xml.indexOf("<h1>Chapter 1");
+ int ch2 = xml.indexOf("<h1>Chapter 2");
+ assert(ch1 < ch2);
+ }
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java
index 913125e..1cf1874 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java
@@ -99,25 +99,6 @@ public class TruncatedOOXMLTest extends TikaTest {
}
}
- private InputStream truncate(String fileName, int length) throws IOException {
- ByteArrayOutputStream bos = new ByteArrayOutputStream();
- int bufferSize = 4096;
- byte[] buffer = new byte[bufferSize];
- int bytesRead = 0;
- int toRead = length;
- try (InputStream is = getResourceAsStream("/test-documents/"+fileName)) {
- while (toRead > 0) {
- int justRead = is.read(buffer, 0, Math.min(bufferSize, toRead));
- if (justRead == -1) {
- throw new EOFException("eof reached");
- }
- bos.write(buffer, 0, justRead);
- toRead -= justRead;
- }
- }
- return new ByteArrayInputStream(bos.toByteArray());
- }
-
@Test
@Ignore("for dev/debugging only")
public void listStreams() throws Exception {
diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/epub/tika-config.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/epub/tika-config.xml
new file mode 100644
index 0000000..5dbd625
--- /dev/null
+++ b/tika-parsers/src/test/resources/org/apache/tika/parser/epub/tika-config.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.epub.EpubParser">
+ <params>
+ <param name="streaming" type="bool">true</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>
diff --git a/tika-parsers/src/test/resources/test-documents/testEPUB.epub b/tika-parsers/src/test/resources/test-documents/testEPUB.epub
index 5965601..a88df80 100644
Binary files a/tika-parsers/src/test/resources/test-documents/testEPUB.epub and b/tika-parsers/src/test/resources/test-documents/testEPUB.epub differ