You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/03/25 16:24:54 UTC
(tika) branch branch_2x updated: TIKA-4219 -- improve epub handling of encrypted non-text-containing items (#1684)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_2x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_2x by this push:
new fa58cd418 TIKA-4219 -- improve epub handling of encrypted non-text-containing items (#1684)
fa58cd418 is described below
commit fa58cd418eb922fefa5931d0cf803e551baba31e
Author: Tim Allison <ta...@apache.org>
AuthorDate: Mon Mar 25 12:24:49 2024 -0400
TIKA-4219 -- improve epub handling of encrypted non-text-containing items (#1684)
* TIKA-4219 -- improve epub handling of encrypted non-text-containing items
---
.../apache/tika/parser/epub/EncryptionParser.java | 88 ----------
.../org/apache/tika/parser/epub/EpubParser.java | 187 ++++++++++++++++-----
2 files changed, 147 insertions(+), 128 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EncryptionParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EncryptionParser.java
deleted file mode 100644
index 26aae7574..000000000
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EncryptionParser.java
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.epub;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Set;
-
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
-import org.apache.tika.exception.EncryptedDocumentException;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.utils.XMLReaderUtils;
-
-public class EncryptionParser implements Parser {
-
- @Override
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return Collections.EMPTY_SET;
- }
-
- @Override
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
- ParseContext context) throws IOException, SAXException, TikaException {
-
- try {
- XMLReaderUtils.parseSAX(stream, new EncryptionHandler(), context);
- } catch (SAXException e) {
- if (e.getCause() instanceof EncryptedDocumentException) {
- throw (EncryptedDocumentException)e.getCause();
- }
- }
- }
-
- private class EncryptionHandler extends DefaultHandler {
- Set<String> encryptedItems = new HashSet<>();
- @Override
- public void startElement(String uri, String localName, String qName, Attributes attributes) {
- if ("CipherReference".equals(localName)) {
- String encryptedUri = XMLReaderUtils.getAttrValue("URI", attributes);
- encryptedItems.add(encryptedUri);
- }
- }
-
- @Override
- public void endDocument() throws SAXException {
- if (encryptedItems.size() > 0) {
- StringBuilder sb = new StringBuilder();
- sb.append("EPUB contains encrypted items: ");
- int added = 0;
- for (String u : encryptedItems) {
- if (sb.length() > 500) {
- sb.append(" and others...");
- break;
- }
- if (added++ > 0) {
- sb.append(", ");
- }
- sb.append(u);
- }
- throw new SAXException(new EncryptedDocumentException(sb.toString()));
- }
- }
- }
-}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
index 1bdd95750..a572ad2cc 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
@@ -43,6 +43,7 @@ import org.apache.commons.lang3.StringUtils;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.config.Field;
@@ -55,12 +56,14 @@ import org.apache.tika.io.FilenameUtils;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.xml.DcXMLParser;
import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ContentHandlerDecorator;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.ParserUtils;
@@ -121,7 +124,9 @@ public class EpubParser extends AbstractParser {
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
IOException caughtException = null;
- ContentHandler childHandler = new EmbeddedContentHandler(new BodyContentHandler(xhtml));
+ ContentHandler childHandler = new EmbeddedContentHandler(
+ new EpubNormalizingHandler(new BodyContentHandler(xhtml)));
+ Set<String> encryptedItems = Collections.EMPTY_SET;
if (streaming) {
try {
streamingParse(stream, childHandler, metadata, context);
@@ -130,7 +135,7 @@ public class EpubParser extends AbstractParser {
}
} else {
try {
- bufferedParse(stream, childHandler, xhtml, metadata, context);
+ encryptedItems = bufferedParse(stream, childHandler, xhtml, metadata, context);
} catch (IOException e) {
caughtException = e;
}
@@ -140,9 +145,11 @@ public class EpubParser extends AbstractParser {
if (caughtException != null) {
throw caughtException;
}
+ maybeThrowEncryptedException(encryptedItems);
}
- private void streamingParse(InputStream stream, ContentHandler bodyHandler, Metadata metadata,
+ private Set<String> streamingParse(InputStream stream, ContentHandler bodyHandler,
+ Metadata metadata,
ParseContext context)
throws IOException, TikaException, SAXException {
ZipArchiveInputStream zip = new ZipArchiveInputStream(stream, "UTF-8", false, true, false);
@@ -153,7 +160,8 @@ public class EpubParser extends AbstractParser {
if (entry.getName().equals("mimetype")) {
updateMimeType(zip, metadata);
} else if (entry.getName().equals(META_INF_ENCRYPTION)) {
- checkForDRM(zip);
+ //when streaming, throw an encryption exception if anything is encrypted
+ checkForDRM(zip, context);
} else if (entry.getName().equals("metadata.xml")) {
meta.parse(zip, new DefaultHandler(), metadata, context);
} else if (entry.getName().endsWith(".opf")) {
@@ -176,6 +184,9 @@ public class EpubParser extends AbstractParser {
if (sax != null) {
throw sax;
}
+ //always empty -- we throw an encryption exception
+ //as soon as checkForDRM hits an encrypted item
+ return Collections.EMPTY_SET;
}
private void updateMimeType(InputStream is, Metadata metadata) throws IOException {
@@ -188,7 +199,7 @@ public class EpubParser extends AbstractParser {
}
- private void bufferedParse(InputStream stream, ContentHandler bodyHandler,
+ private Set<String> bufferedParse(InputStream stream, ContentHandler bodyHandler,
XHTMLContentHandler xhtml, Metadata metadata, ParseContext context)
throws IOException, TikaException, SAXException {
TikaInputStream tis;
@@ -196,9 +207,8 @@ public class EpubParser extends AbstractParser {
if (TikaInputStream.isTikaInputStream(stream)) {
tis = TikaInputStream.cast(stream);
if (tis.getOpenContainer() instanceof ZipFile) {
- bufferedParseZipFile((ZipFile) tis.getOpenContainer(), bodyHandler, xhtml, metadata,
- context, true);
- return;
+ return bufferedParseZipFile((ZipFile) tis.getOpenContainer(), bodyHandler, xhtml,
+ metadata, context, true);
}
} else {
temporaryResources = new TemporaryResources();
@@ -209,8 +219,7 @@ public class EpubParser extends AbstractParser {
zipFile = new ZipFile(tis.getPath().toFile());
} catch (IOException e) {
ParserUtils.recordParserFailure(this, e, metadata);
- trySalvage(tis.getPath(), bodyHandler, xhtml, metadata, context);
- return;
+ return trySalvage(tis.getPath(), bodyHandler, xhtml, metadata, context);
} finally {
//if we had to wrap tis
if (temporaryResources != null) {
@@ -218,44 +227,42 @@ public class EpubParser extends AbstractParser {
}
}
try {
- bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata, context, true);
+ return bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata, context, true);
} finally {
zipFile.close();
}
}
- private void trySalvage(Path brokenZip, ContentHandler bodyHandler, XHTMLContentHandler xhtml,
+ private Set<String> trySalvage(Path brokenZip, ContentHandler bodyHandler,
+ XHTMLContentHandler xhtml,
Metadata metadata, ParseContext context)
throws IOException, TikaException, SAXException {
try (TemporaryResources resources = new TemporaryResources()) {
Path salvaged =
resources.createTempFile(FilenameUtils.getSuffixFromPath(brokenZip.getFileName().toString()));
ZipSalvager.salvageCopy(brokenZip.toFile(), salvaged.toFile());
- boolean success = false;
try (ZipFile zipFile = new ZipFile(salvaged.toFile())) {
- success =
- bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata, context, false);
- }
- if (!success) {
+ return bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata, context, false);
+ } catch (EpubZipException e) {
try (InputStream is = TikaInputStream.get(salvaged)) {
- streamingParse(is, xhtml, metadata, context);
+ return streamingParse(is, xhtml, metadata, context);
}
}
}
}
- private boolean bufferedParseZipFile(ZipFile zipFile, ContentHandler bodyHandler,
+ private Set<String> bufferedParseZipFile(ZipFile zipFile, ContentHandler bodyHandler,
XHTMLContentHandler xhtml, Metadata metadata,
ParseContext context, boolean isStrict)
- throws IOException, TikaException, SAXException {
+ throws IOException, TikaException, SAXException, EpubZipException {
String rootOPF = getRoot(zipFile, context);
if (rootOPF == null) {
- return false;
+ throw new EpubZipException();
}
ZipArchiveEntry zae = zipFile.getEntry(rootOPF);
if (zae == null || !zipFile.canReadEntryData(zae)) {
- return false;
+ throw new EpubZipException();
}
opf.parse(zipFile.getInputStream(zae), new DefaultHandler(), metadata, context);
@@ -265,7 +272,7 @@ public class EpubParser extends AbstractParser {
}
//if no content items, false
if (contentOrderScraper.contentItems.size() == 0) {
- return false;
+ throw new EpubZipException();
}
String relativePath = "";
if (rootOPF.lastIndexOf("/") > -1) {
@@ -286,13 +293,14 @@ public class EpubParser extends AbstractParser {
//if not perfect match btwn items and readable items
//return false
if (found != contentOrderScraper.contentItems.size()) {
- return false;
+ throw new EpubZipException();
}
}
extractMetadata(zipFile, metadata, context);
- checkForDRM(zipFile);
+ Set<String> encryptedItems = checkForDRM(zipFile);
Set<String> processed = new HashSet<>();
+ Set<SAXException> saxExceptions = new HashSet<>();
for (String id : contentOrderScraper.contentItems) {
HRefMediaPair hRefMediaPair = contentOrderScraper.locationMap.get(id);
if (hRefMediaPair != null && hRefMediaPair.href != null) {
@@ -309,10 +317,21 @@ public class EpubParser extends AbstractParser {
shouldParse = true;
}
if (shouldParse) {
+ String path = relativePath + hRefMediaPair.href;
+ //if content is encrypted, do not parse it, throw an exception now
+ if (encryptedItems.contains(path)) {
+ maybeThrowEncryptedException(encryptedItems);
+ }
zae = zipFile.getEntry(relativePath + hRefMediaPair.href);
if (zae != null) {
try (InputStream is = zipFile.getInputStream(zae)) {
content.parse(is, bodyHandler, metadata, context);
+ } catch (SAXException e) {
+ if (WriteLimitReachedException.isWriteLimitReached(e)) {
+ throw e;
+ }
+ saxExceptions.add(e);
+ } finally {
processed.add(id);
}
}
@@ -326,37 +345,59 @@ public class EpubParser extends AbstractParser {
for (String id : contentOrderScraper.locationMap.keySet()) {
if (!processed.contains(id)) {
HRefMediaPair hRefMediaPair = contentOrderScraper.locationMap.get(id);
+ String fullPath = relativePath + hRefMediaPair.href;
+ if (encryptedItems.contains(fullPath)) {
+ continue;
+ }
if (shouldHandleEmbedded(hRefMediaPair.media)) {
handleEmbedded(zipFile, relativePath, hRefMediaPair, embeddedDocumentExtractor,
xhtml, metadata);
}
}
}
- return true;
+ //rethrow a SAXException, if any were caught while parsing the body contents
+ for (SAXException e : saxExceptions) {
+ throw e;
+ }
+ return encryptedItems;
}
- private void checkForDRM(ZipFile zipFile) throws IOException, EncryptedDocumentException {
+ private Set<String> checkForDRM(ZipFile zipFile) throws IOException, TikaException,
+ SAXException {
ZipArchiveEntry zae = zipFile.getEntry(META_INF_ENCRYPTION);
if (zae == null) {
- return;
+ return Collections.EMPTY_SET;
}
try (InputStream is = zipFile.getInputStream(zae)) {
- new EncryptionParser().parse(is, new DefaultHandler(), new Metadata(), new ParseContext());
- } catch (EncryptedDocumentException e) {
- throw e;
- } catch (TikaException | SAXException e) {
- //swallow ?!
+ return EncryptionHandler.parse(is, new ParseContext());
}
}
- private void checkForDRM(InputStream is) throws IOException, EncryptedDocumentException {
- try {
- new EncryptionParser().parse(is, new DefaultHandler(), new Metadata(), new ParseContext());
- } catch (EncryptedDocumentException e) {
- throw e;
- } catch (TikaException | SAXException e) {
- //swallow ?!
+ private void checkForDRM(InputStream is, ParseContext parseContext)
+ throws IOException, TikaException, SAXException {
+ Set<String> encryptedItems = EncryptionHandler.parse(is, parseContext);
+ maybeThrowEncryptedException(encryptedItems);
+ }
+
+ private void maybeThrowEncryptedException(Set<String> encryptedItems)
+ throws EncryptedDocumentException {
+ if (encryptedItems.size() == 0) {
+ return;
}
+ StringBuilder sb = new StringBuilder();
+ sb.append("EPUB contains encrypted items: ");
+ int added = 0;
+ for (String u : encryptedItems) {
+ if (sb.length() > 500) {
+ sb.append(" and others...");
+ break;
+ }
+ if (added++ > 0) {
+ sb.append(", ");
+ }
+ sb.append(u);
+ }
+ throw new EncryptedDocumentException(sb.toString());
}
private boolean shouldHandleEmbedded(String media) {
@@ -395,6 +436,7 @@ public class EpubParser extends AbstractParser {
if (!StringUtils.isBlank(hRefMediaPair.media)) {
embeddedMetadata.set(Metadata.CONTENT_TYPE, hRefMediaPair.media);
}
+ embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, fullPath);
if (!embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
return;
}
@@ -535,4 +577,69 @@ public class EpubParser extends AbstractParser {
return "HRefMediaPair{" + "href='" + href + '\'' + ", media='" + media + '\'' + '}';
}
}
+
+
+ private static class EncryptionHandler extends DefaultHandler {
+ private static Set<String> parse(InputStream is, ParseContext parseContext)
+ throws TikaException, IOException, SAXException {
+ EncryptionHandler handler = new EncryptionHandler();
+ XMLReaderUtils.parseSAX(is, handler, parseContext);
+ return handler.getEncryptedItems();
+ }
+
+ Set<String> encryptedItems = new HashSet<>();
+ @Override
+ public void startElement(String uri, String localName, String qName, Attributes attributes) {
+ if ("CipherReference".equals(localName)) {
+ String encryptedUri = XMLReaderUtils.getAttrValue("URI", attributes);
+ encryptedItems.add(encryptedUri);
+ }
+ }
+ public Set<String> getEncryptedItems() {
+ return encryptedItems;
+ }
+ }
+
+ //thrown when an epub cannot be handled by the ZipFile-based parse
+ //(e.g. missing root OPF or content items); trySalvage then falls back to streaming
+ private static class EpubZipException extends IOException {
+
+ }
+
+ //for now, this simply converts all names to local names to avoid
+ //namespace conflicts in the content handler. This also removes namespaces
+ //from attributes
+ private class EpubNormalizingHandler extends ContentHandlerDecorator {
+ public EpubNormalizingHandler(ContentHandler contentHandler) {
+ super(contentHandler);
+ }
+
+ @Override
+ public void startElement(String uri, String localName, String name, Attributes atts)
+ throws SAXException {
+ //some atts may have namespaces that were not included in the header
+ boolean needToRewrite = false;
+ for (int i = 0; i < atts.getLength(); i++) {
+ if (atts.getQName(i) != null && ! atts.getQName(i).equals(atts.getLocalName(i))) {
+ needToRewrite = true;
+ break;
+ }
+ }
+ if (needToRewrite) {
+ AttributesImpl simplifiedAtts = new AttributesImpl();
+ for (int i = 0; i < atts.getLength(); i++) {
+ simplifiedAtts.addAttribute("", atts.getLocalName(i), atts.getLocalName(i),
+ atts.getType(i), atts.getValue(i));
+ }
+ super.startElement(uri, localName, localName, simplifiedAtts);
+ } else {
+ super.startElement(uri, localName, localName, atts);
+ }
+ }
+
+ @Override
+ public void endElement(String uri, String localName, String name) throws SAXException {
+ super.endElement(uri, localName, localName);
+ }
+ }
}