You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/01/04 18:19:53 UTC
(tika) 02/02: TIKA-4176 -- throw EncryptedDocumentException for DRM protected epubs (#1530)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_2x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit bd9719e210bb10814f95997d5a626130f107509f
Author: Tim Allison <ta...@apache.org>
AuthorDate: Tue Jan 2 16:37:23 2024 -0500
TIKA-4176 -- throw EncryptedDocumentException for DRM protected epubs (#1530)
(cherry picked from commit d2530b8f9ea62e4cb24e5ac226732c21431fd5c4)
---
.../apache/tika/parser/epub/EncryptionParser.java | 88 ++++++++++++++++++++++
.../org/apache/tika/parser/epub/EpubParser.java | 54 ++++++++++++-
2 files changed, 140 insertions(+), 2 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EncryptionParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EncryptionParser.java
new file mode 100644
index 000000000..26aae7574
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EncryptionParser.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.epub;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.utils.XMLReaderUtils;
+
+/**
+ * Scans an XML stream for {@code CipherReference} elements and reports any that
+ * are found by throwing an {@link EncryptedDocumentException}.
+ * <p>
+ * This parser advertises no supported media types and writes nothing to the
+ * supplied {@link ContentHandler} or {@link Metadata}; callers invoke it
+ * directly on the stream they want checked.
+ */
+public class EncryptionParser implements Parser {
+
+    /**
+     * @param context parse context (unused)
+     * @return an empty set
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        // emptySet() is the type-safe form of the raw EMPTY_SET constant
+        return Collections.emptySet();
+    }
+
+    /**
+     * Parses the stream with a SAX handler that collects the {@code URI}
+     * attribute of every {@code CipherReference} element.
+     *
+     * @param stream   XML to scan
+     * @param handler  unused
+     * @param metadata unused
+     * @param context  passed through to {@link XMLReaderUtils#parseSAX}
+     * @throws EncryptedDocumentException if at least one {@code CipherReference}
+     *                                    element was seen
+     * @throws IOException                if thrown by the underlying SAX parse
+     * @throws TikaException              if thrown by the underlying SAX parse
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+                      ParseContext context) throws IOException, SAXException, TikaException {
+        try {
+            XMLReaderUtils.parseSAX(stream, new EncryptionHandler(), context);
+        } catch (SAXException e) {
+            // The handler tunnels its finding through a SAXException; unwrap it here.
+            if (e.getCause() instanceof EncryptedDocumentException) {
+                throw (EncryptedDocumentException) e.getCause();
+            }
+            // Best effort: any other SAX failure is deliberately not propagated,
+            // so XML that cannot be parsed is treated the same as "nothing found".
+        }
+    }
+
+    /**
+     * Collects the cipher reference URIs and raises the finding at end of document.
+     * Declared static because it never touches the enclosing parser instance.
+     */
+    private static class EncryptionHandler extends DefaultHandler {
+
+        /** Soft cap on the message length before the list of items is cut short. */
+        private static final int MAX_MESSAGE_LENGTH = 500;
+
+        private final Set<String> encryptedItems = new HashSet<>();
+
+        @Override
+        public void startElement(String uri, String localName, String qName,
+                                 Attributes attributes) {
+            if ("CipherReference".equals(localName)) {
+                // NOTE(review): if the element has no URI attribute this may record
+                // null -- confirm against XMLReaderUtils.getAttrValue's contract.
+                String encryptedUri = XMLReaderUtils.getAttrValue("URI", attributes);
+                encryptedItems.add(encryptedUri);
+            }
+        }
+
+        /**
+         * @throws SAXException wrapping an {@link EncryptedDocumentException} when
+         *                      at least one item was collected
+         */
+        @Override
+        public void endDocument() throws SAXException {
+            if (encryptedItems.isEmpty()) {
+                return;
+            }
+            StringBuilder sb = new StringBuilder();
+            sb.append("EPUB contains encrypted items: ");
+            int added = 0;
+            for (String u : encryptedItems) {
+                // stop listing once the message has grown past the cap
+                if (sb.length() > MAX_MESSAGE_LENGTH) {
+                    sb.append(" and others...");
+                    break;
+                }
+                if (added++ > 0) {
+                    sb.append(", ");
+                }
+                sb.append(u);
+            }
+            // endDocument may only throw SAXException, so wrap the real finding
+            throw new SAXException(new EncryptedDocumentException(sb.toString()));
+        }
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
index 75e5eaf69..1bdd95750 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
@@ -46,7 +46,9 @@ import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.config.Field;
+import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.FilenameUtils;
@@ -78,6 +80,8 @@ public class EpubParser extends AbstractParser {
private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
new HashSet<>(Arrays.asList(MediaType.application("epub+zip"),
MediaType.application("x-ibooks+zip"))));
+
+ private static final String META_INF_ENCRYPTION = "META-INF/encryption.xml";
@Field
boolean streaming = false;
private Parser meta = new DcXMLParser();
@@ -101,6 +105,11 @@ public class EpubParser extends AbstractParser {
this.content = content;
}
+ /**
+ * Sets the {@code streaming} flag of this parser.
+ * NOTE(review): presumably this selects the streaming parse path over the
+ * ZipFile-based one; the code that reads the flag is not visible here -- confirm.
+ *
+ * @param streaming new value for the flag
+ */
+ @Field
+ public void setStreaming(boolean streaming) {
+ this.streaming = streaming;
+ }
+
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
@@ -136,22 +145,37 @@ public class EpubParser extends AbstractParser {
+ /**
+ * Walks the zip entries of the stream in order and dispatches each one by name:
+ * "mimetype" updates the mime type, META-INF/encryption.xml is checked for DRM,
+ * "metadata.xml" and "*.opf" go to the metadata parsers, and html/xhtml/xml
+ * entries go to the content parser.
+ * A SAXException from the content parser does not stop the loop: the first one
+ * is remembered and rethrown after all entries were visited, except when it
+ * signals that the write limit was reached, which is rethrown immediately.
+ */
private void streamingParse(InputStream stream, ContentHandler bodyHandler, Metadata metadata,
ParseContext context)
throws IOException, TikaException, SAXException {
- ZipArchiveInputStream zip = new ZipArchiveInputStream(stream);
+ // NOTE(review): the positional flags are commons-compress constructor options -- confirm their meaning there
+ ZipArchiveInputStream zip = new ZipArchiveInputStream(stream, "UTF-8", false, true, false);
ZipArchiveEntry entry = zip.getNextZipEntry();
+ // first non-write-limit SAXException seen while parsing content entries
+ SAXException sax = null;
while (entry != null) {
if (entry.getName().equals("mimetype")) {
updateMimeType(zip, metadata);
+ } else if (entry.getName().equals(META_INF_ENCRYPTION)) {
+ checkForDRM(zip);
} else if (entry.getName().equals("metadata.xml")) {
meta.parse(zip, new DefaultHandler(), metadata, context);
} else if (entry.getName().endsWith(".opf")) {
opf.parse(zip, new DefaultHandler(), metadata, context);
} else if (entry.getName().endsWith(".htm") || entry.getName().endsWith(".html") ||
entry.getName().endsWith(".xhtml") || entry.getName().endsWith(".xml")) {
- content.parse(zip, bodyHandler, metadata, context);
+ try {
+ content.parse(zip, bodyHandler, metadata, context);
+ } catch (SAXException e) {
+ // a reached write limit must end the parse right away
+ if (WriteLimitReachedException.isWriteLimitReached(e)) {
+ throw e;
+ }
+ // otherwise keep going with the remaining entries and report the first failure later
+ if (sax == null) {
+ sax = e;
+ }
+ }
}
entry = zip.getNextZipEntry();
}
+ // surface the deferred content failure, if any, once every entry was processed
+ if (sax != null) {
+ throw sax;
+ }
}
private void updateMimeType(InputStream is, Metadata metadata) throws IOException {
@@ -224,6 +248,7 @@ public class EpubParser extends AbstractParser {
XHTMLContentHandler xhtml, Metadata metadata,
ParseContext context, boolean isStrict)
throws IOException, TikaException, SAXException {
+
String rootOPF = getRoot(zipFile, context);
if (rootOPF == null) {
return false;
@@ -266,6 +291,7 @@ public class EpubParser extends AbstractParser {
}
extractMetadata(zipFile, metadata, context);
+ checkForDRM(zipFile);
Set<String> processed = new HashSet<>();
for (String id : contentOrderScraper.contentItems) {
HRefMediaPair hRefMediaPair = contentOrderScraper.locationMap.get(id);
@@ -309,6 +335,30 @@ public class EpubParser extends AbstractParser {
return true;
}
+ /**
+  * Looks up the {@code META-INF/encryption.xml} entry of the archive and, when it
+  * exists, scans it for encrypted items.
+  * <p>
+  * The scan itself is delegated to {@link #checkForDRM(InputStream)} so that the
+  * exception handling lives in a single place.
+  *
+  * @param zipFile archive to inspect
+  * @throws IOException                if reading or closing the entry stream fails
+  * @throws EncryptedDocumentException if the scan reports encrypted items
+  */
+ private void checkForDRM(ZipFile zipFile) throws IOException, EncryptedDocumentException {
+     ZipArchiveEntry zae = zipFile.getEntry(META_INF_ENCRYPTION);
+     if (zae == null) {
+         // no encryption manifest in this archive, nothing to check
+         return;
+     }
+     try (InputStream is = zipFile.getInputStream(zae)) {
+         checkForDRM(is);
+     }
+ }
+
+ /**
+  * Runs an {@link EncryptionParser} over the given stream on a best-effort basis.
+  * A detected encryption is reported to the caller; any other parse failure is
+  * ignored.
+  *
+  * @param is stream holding the encryption manifest; not closed by this method itself
+  * @throws IOException                if the parser fails reading the stream
+  * @throws EncryptedDocumentException if the parser reports encrypted items
+  */
+ private void checkForDRM(InputStream is) throws IOException, EncryptedDocumentException {
+     Parser drmScanner = new EncryptionParser();
+     try {
+         drmScanner.parse(is, new DefaultHandler(), new Metadata(), new ParseContext());
+     } catch (EncryptedDocumentException encrypted) {
+         // the one outcome this check exists for: hand it to the caller unchanged
+         throw encrypted;
+     } catch (TikaException | SAXException ignored) {
+         // best effort: failing to read the manifest is not treated as an error
+     }
+ }
+
private boolean shouldHandleEmbedded(String media) {
if (media == null) {
return true;