You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/01/04 18:19:53 UTC

(tika) 02/02: TIKA-4176 -- throw EncryptedDocumentException for DRM protected epubs (#1530)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_2x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit bd9719e210bb10814f95997d5a626130f107509f
Author: Tim Allison <ta...@apache.org>
AuthorDate: Tue Jan 2 16:37:23 2024 -0500

    TIKA-4176 -- throw EncryptedDocumentException for DRM protected epubs (#1530)
    
    (cherry picked from commit d2530b8f9ea62e4cb24e5ac226732c21431fd5c4)
---
 .../apache/tika/parser/epub/EncryptionParser.java  | 88 ++++++++++++++++++++++
 .../org/apache/tika/parser/epub/EpubParser.java    | 54 ++++++++++++-
 2 files changed, 140 insertions(+), 2 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EncryptionParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EncryptionParser.java
new file mode 100644
index 000000000..26aae7574
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EncryptionParser.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.epub;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.utils.XMLReaderUtils;
+
+/**
+ * Minimal parser for the {@code encryption.xml} file of an EPUB container.
+ * <p>
+ * It extracts no text and sets no metadata. It only collects the {@code URI} attribute of
+ * every {@code CipherReference} element and, if at least one such element was seen, throws
+ * an {@link EncryptedDocumentException} whose message lists the collected values.
+ */
+public class EncryptionParser implements Parser {
+
+    /** Message length (in characters) beyond which no further items are appended. */
+    private static final int MAX_MESSAGE_LENGTH = 500;
+
+    /**
+     * @return an empty set: this parser declares no media types of its own
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        // emptySet() is the type-safe form of the raw EMPTY_SET constant
+        return Collections.emptySet();
+    }
+
+    /**
+     * Scans {@code stream} for {@code CipherReference} elements.
+     *
+     * @param stream   XML to scan
+     * @param handler  not used by this parser
+     * @param metadata not used by this parser
+     * @param context  handed to {@link XMLReaderUtils#parseSAX}
+     * @throws EncryptedDocumentException if at least one {@code CipherReference} was found
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+                      ParseContext context) throws IOException, SAXException, TikaException {
+        try {
+            XMLReaderUtils.parseSAX(stream, new EncryptionHandler(), context);
+        } catch (SAXException e) {
+            // The SAX callback can only throw SAXException, so the handler tunnels the
+            // EncryptedDocumentException as the cause; unwrap it here.
+            if (e.getCause() instanceof EncryptedDocumentException) {
+                throw (EncryptedDocumentException) e.getCause();
+            }
+            // Any other SAXException is intentionally not propagated: a failure to read
+            // this file is treated the same as "no encrypted items found".
+        }
+    }
+
+    /**
+     * Collects the {@code URI} of each {@code CipherReference} and reports them at the end of
+     * the document. Declared static because it uses no instance state of the enclosing parser.
+     */
+    private static class EncryptionHandler extends DefaultHandler {
+        private final Set<String> encryptedItems = new HashSet<>();
+
+        @Override
+        public void startElement(String uri, String localName, String qName,
+                                 Attributes attributes) {
+            if ("CipherReference".equals(localName)) {
+                String encryptedUri = XMLReaderUtils.getAttrValue("URI", attributes);
+                encryptedItems.add(encryptedUri);
+            }
+        }
+
+        /**
+         * @throws SAXException wrapping an {@link EncryptedDocumentException} when at least
+         *                      one item was collected
+         */
+        @Override
+        public void endDocument() throws SAXException {
+            if (encryptedItems.isEmpty()) {
+                return;
+            }
+            StringBuilder sb = new StringBuilder();
+            sb.append("EPUB contains encrypted items: ");
+            int added = 0;
+            for (String u : encryptedItems) {
+                // The length is checked before appending, so the message may run past the
+                // limit by one item plus the suffix.
+                if (sb.length() > MAX_MESSAGE_LENGTH) {
+                    sb.append(" and others...");
+                    break;
+                }
+                if (added++ > 0) {
+                    sb.append(", ");
+                }
+                sb.append(u);
+            }
+            throw new SAXException(new EncryptedDocumentException(sb.toString()));
+        }
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
index 75e5eaf69..1bdd95750 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
@@ -46,7 +46,9 @@ import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
 import org.apache.tika.config.Field;
+import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.WriteLimitReachedException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.io.FilenameUtils;
@@ -78,6 +80,8 @@ public class EpubParser extends AbstractParser {
     private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
             new HashSet<>(Arrays.asList(MediaType.application("epub+zip"),
                     MediaType.application("x-ibooks+zip"))));
+
+    private static final String META_INF_ENCRYPTION = "META-INF/encryption.xml";
     @Field
     boolean streaming = false;
     private Parser meta = new DcXMLParser();
@@ -101,6 +105,11 @@ public class EpubParser extends AbstractParser {
         this.content = content;
     }
 
+    @Field // NOTE(review): the streaming field declared above already carries @Field — confirm both annotations are intended
+    public void setStreaming(boolean streaming) {
+        this.streaming = streaming;
+    }
+
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         return SUPPORTED_TYPES;
     }
@@ -136,22 +145,37 @@ public class EpubParser extends AbstractParser {
     private void streamingParse(InputStream stream, ContentHandler bodyHandler, Metadata metadata,
                                 ParseContext context)
             throws IOException, TikaException, SAXException { // single forward pass: entries are handled in the order the stream yields them
-        ZipArchiveInputStream zip = new ZipArchiveInputStream(stream);
+        ZipArchiveInputStream zip = new ZipArchiveInputStream(stream, "UTF-8", false, true, false); // NOTE(review): presumably the `true` is allowStoredEntriesWithDataDescriptor — verify against commons-compress
 
         ZipArchiveEntry entry = zip.getNextZipEntry();
+        SAXException sax = null; // first content-parse failure; rethrown after the loop
         while (entry != null) {
             if (entry.getName().equals("mimetype")) {
                 updateMimeType(zip, metadata);
+            } else if (entry.getName().equals(META_INF_ENCRYPTION)) {
+                checkForDRM(zip); // throws EncryptedDocumentException when encryption.xml lists CipherReference items
             } else if (entry.getName().equals("metadata.xml")) {
                 meta.parse(zip, new DefaultHandler(), metadata, context);
             } else if (entry.getName().endsWith(".opf")) {
                 opf.parse(zip, new DefaultHandler(), metadata, context);
             } else if (entry.getName().endsWith(".htm") || entry.getName().endsWith(".html") ||
                     entry.getName().endsWith(".xhtml") || entry.getName().endsWith(".xml")) {
-                content.parse(zip, bodyHandler, metadata, context);
+                try {
+                    content.parse(zip, bodyHandler, metadata, context);
+                } catch (SAXException e) {
+                    if (WriteLimitReachedException.isWriteLimitReached(e)) { // a write-limit hit aborts immediately instead of being deferred
+                        throw e;
+                    }
+                    if (sax == null) { // remember only the first failure and keep reading the remaining entries
+                        sax = e;
+                    }
+                }
             }
             entry = zip.getNextZipEntry();
         }
+        if (sax != null) { // surface the deferred failure once every entry has been visited
+            throw sax;
+        }
     }
 
     private void updateMimeType(InputStream is, Metadata metadata) throws IOException {
@@ -224,6 +248,7 @@ public class EpubParser extends AbstractParser {
                                          XHTMLContentHandler xhtml, Metadata metadata,
                                          ParseContext context, boolean isStrict)
             throws IOException, TikaException, SAXException {
+
         String rootOPF = getRoot(zipFile, context);
         if (rootOPF == null) {
             return false;
@@ -266,6 +291,7 @@ public class EpubParser extends AbstractParser {
         }
 
         extractMetadata(zipFile, metadata, context);
+        checkForDRM(zipFile);
         Set<String> processed = new HashSet<>();
         for (String id : contentOrderScraper.contentItems) {
             HRefMediaPair hRefMediaPair = contentOrderScraper.locationMap.get(id);
@@ -309,6 +335,30 @@ public class EpubParser extends AbstractParser {
         return true;
     }
 
+    /**
+     * Random-access variant of the DRM check. Looks up {@code META-INF/encryption.xml} in
+     * {@code zipFile}; when the entry exists, its contents are run through
+     * {@link EncryptionParser}. A missing entry means there is nothing to check.
+     *
+     * @throws EncryptedDocumentException if the parser reports encrypted items
+     * @throws IOException                if the entry cannot be opened
+     */
+    private void checkForDRM(ZipFile zipFile) throws IOException, EncryptedDocumentException {
+        ZipArchiveEntry encryptionEntry = zipFile.getEntry(META_INF_ENCRYPTION);
+        if (encryptionEntry != null) {
+            try (InputStream encryptionXml = zipFile.getInputStream(encryptionEntry)) {
+                Parser drmProbe = new EncryptionParser();
+                drmProbe.parse(encryptionXml, new DefaultHandler(), new Metadata(),
+                        new ParseContext());
+            } catch (EncryptedDocumentException drm) {
+                // rethrown explicitly so the broader catch below cannot absorb it
+                throw drm;
+            } catch (TikaException | SAXException ignored) {
+                // best effort: any other failure while reading encryption.xml is ignored
+            }
+        }
+    }
+
+    /**
+     * Streaming variant of the DRM check: runs {@link EncryptionParser} over {@code is}.
+     * The streaming caller invokes this while positioned on the
+     * {@code META-INF/encryption.xml} entry.
+     *
+     * @throws EncryptedDocumentException if the parser reports encrypted items
+     */
+    private void checkForDRM(InputStream is) throws IOException, EncryptedDocumentException {
+        // NOTE(review): the stream is passed without a close shield — confirm the XML parse
+        // does not close the caller's zip stream.
+        Parser drmProbe = new EncryptionParser();
+        try {
+            drmProbe.parse(is, new DefaultHandler(), new Metadata(), new ParseContext());
+        } catch (EncryptedDocumentException drm) {
+            // rethrown explicitly so the broader catch below cannot absorb it
+            throw drm;
+        } catch (TikaException | SAXException ignored) {
+            // best effort: any other failure while reading encryption.xml is ignored
+        }
+    }
+
     private boolean shouldHandleEmbedded(String media) {
         if (media == null) {
             return true;