You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/01/02 15:40:16 UTC

(tika) branch TIKA-4176 created (now f083609b8)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-4176
in repository https://gitbox.apache.org/repos/asf/tika.git


      at f083609b8 TIKA-4176 -- throw EncryptedDocumentException for DRM protected epubs

This branch includes the following new commits:

     new f083609b8 TIKA-4176 -- throw EncryptedDocumentException for DRM protected epubs

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  Revisions
listed as "add" were already present in the repository and have only
been added to this reference.



(tika) 01/01: TIKA-4176 -- throw EncryptedDocumentException for DRM protected epubs

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4176
in repository https://gitbox.apache.org/repos/asf/tika.git

commit f083609b8c09c7a4e0ee67fdc1468bf73578eb1b
Author: tallison <ta...@apache.org>
AuthorDate: Tue Jan 2 10:36:36 2024 -0500

    TIKA-4176 -- throw EncryptedDocumentException for DRM protected epubs
---
 .../apache/tika/parser/epub/EncryptionParser.java  | 88 ++++++++++++++++++++++
 .../org/apache/tika/parser/epub/EpubParser.java    | 54 ++++++++++++-
 2 files changed, 140 insertions(+), 2 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EncryptionParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EncryptionParser.java
new file mode 100644
index 000000000..26aae7574
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EncryptionParser.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.epub;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.utils.XMLReaderUtils;
+
+/**
+ * Scans an EPUB encryption descriptor (the XML that {@code EpubParser} reads from
+ * {@code META-INF/encryption.xml}) for {@code CipherReference} elements.
+ * <p>
+ * If at least one such element is found, {@link #parse} throws an
+ * {@link EncryptedDocumentException} whose message lists the referenced URIs.
+ * No content or metadata is extracted.
+ */
+public class EncryptionParser implements Parser {
+
+    /** Once the message has grown past this many characters, remaining URIs are elided. */
+    private static final int MAX_MESSAGE_LENGTH = 500;
+
+    /**
+     * @return an empty set; this parser declares no supported media types
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        // emptySet() is the type-safe equivalent of the raw Collections.EMPTY_SET.
+        return Collections.emptySet();
+    }
+
+    /**
+     * Parses {@code stream} as XML and fails if it references any encrypted item.
+     *
+     * @param stream   XML to scan
+     * @param handler  not used; no content events are emitted
+     * @param metadata not used
+     * @param context  passed through to {@code XMLReaderUtils.parseSAX}
+     * @throws EncryptedDocumentException if at least one {@code CipherReference} element
+     *                                    was seen by the time the document ended
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+                      ParseContext context) throws IOException, SAXException, TikaException {
+
+        try {
+            XMLReaderUtils.parseSAX(stream, new EncryptionHandler(), context);
+        } catch (SAXException e) {
+            // The handler can only throw SAXException, so the DRM signal arrives wrapped.
+            Throwable cause = e.getCause();
+            if (cause instanceof EncryptedDocumentException) {
+                throw (EncryptedDocumentException) cause;
+            }
+            // NOTE(review): every other SAXException is dropped here, so XML that fails
+            // to parse is reported as "nothing encrypted" -- confirm this is intended.
+        }
+    }
+
+    /**
+     * Collects the {@code URI} attribute of every {@code CipherReference} element and
+     * reports them when the document ends.
+     */
+    private static class EncryptionHandler extends DefaultHandler {
+        private final Set<String> encryptedItems = new HashSet<>();
+
+        @Override
+        public void startElement(String uri, String localName, String qName,
+                                 Attributes attributes) {
+            if ("CipherReference".equals(localName)) {
+                encryptedItems.add(XMLReaderUtils.getAttrValue("URI", attributes));
+            }
+        }
+
+        /**
+         * @throws SAXException wrapping an {@link EncryptedDocumentException} when at
+         *                      least one {@code CipherReference} was recorded
+         */
+        @Override
+        public void endDocument() throws SAXException {
+            if (encryptedItems.isEmpty()) {
+                return;
+            }
+            StringBuilder sb = new StringBuilder();
+            sb.append("EPUB contains encrypted items: ");
+            int added = 0;
+            for (String u : encryptedItems) {
+                // Checked before appending, so the message may overshoot the limit by
+                // one URI but never grows without bound.
+                if (sb.length() > MAX_MESSAGE_LENGTH) {
+                    sb.append(" and others...");
+                    break;
+                }
+                if (added++ > 0) {
+                    sb.append(", ");
+                }
+                sb.append(u);
+            }
+            throw new SAXException(new EncryptedDocumentException(sb.toString()));
+        }
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
index 00b42d77f..97b27f27f 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
@@ -46,7 +46,9 @@ import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
 import org.apache.tika.config.Field;
+import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.WriteLimitReachedException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.io.FilenameUtils;
@@ -77,6 +79,8 @@ public class EpubParser implements Parser {
     private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
             new HashSet<>(Arrays.asList(MediaType.application("epub+zip"),
                     MediaType.application("x-ibooks+zip"))));
+
+    private static final String META_INF_ENCRYPTION = "META-INF/encryption.xml";
     @Field
     boolean streaming = false;
     private Parser meta = new DcXMLParser();
@@ -100,6 +104,11 @@ public class EpubParser implements Parser {
         this.content = content;
     }
 
+    /**
+     * Sets the {@code streaming} flag of this parser.
+     * <p>
+     * NOTE(review): presumably this switches parsing to the streaming code path
+     * ({@code streamingParse}); the code that reads the flag is not visible here, so
+     * confirm before relying on it.
+     *
+     * @param streaming new value of the flag
+     */
+    @Field
+    public void setStreaming(boolean streaming) {
+        this.streaming = streaming;
+    }
+
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         return SUPPORTED_TYPES;
     }
@@ -135,22 +144,37 @@ public class EpubParser implements Parser {
     private void streamingParse(InputStream stream, ContentHandler bodyHandler, Metadata metadata,
                                 ParseContext context)
             throws IOException, TikaException, SAXException {
-        ZipArchiveInputStream zip = new ZipArchiveInputStream(stream);
+        ZipArchiveInputStream zip = new ZipArchiveInputStream(stream, "UTF-8", false, true, false);
 
         ZipArchiveEntry entry = zip.getNextZipEntry();
+        SAXException sax = null;
         while (entry != null) {
             if (entry.getName().equals("mimetype")) {
                 updateMimeType(zip, metadata);
+            } else if (entry.getName().equals(META_INF_ENCRYPTION)) {
+                checkForDRM(zip);
             } else if (entry.getName().equals("metadata.xml")) {
                 meta.parse(zip, new DefaultHandler(), metadata, context);
             } else if (entry.getName().endsWith(".opf")) {
                 opf.parse(zip, new DefaultHandler(), metadata, context);
             } else if (entry.getName().endsWith(".htm") || entry.getName().endsWith(".html") ||
                     entry.getName().endsWith(".xhtml") || entry.getName().endsWith(".xml")) {
-                content.parse(zip, bodyHandler, metadata, context);
+                try {
+                    content.parse(zip, bodyHandler, metadata, context);
+                } catch (SAXException e) {
+                    if (WriteLimitReachedException.isWriteLimitReached(e)) {
+                        throw e;
+                    }
+                    if (sax == null) {
+                        sax = e;
+                    }
+                }
             }
             entry = zip.getNextZipEntry();
         }
+        if (sax != null) {
+            throw sax;
+        }
     }
 
     private void updateMimeType(InputStream is, Metadata metadata) throws IOException {
@@ -223,6 +247,7 @@ public class EpubParser implements Parser {
                                          XHTMLContentHandler xhtml, Metadata metadata,
                                          ParseContext context, boolean isStrict)
             throws IOException, TikaException, SAXException {
+
         String rootOPF = getRoot(zipFile, context);
         if (rootOPF == null) {
             return false;
@@ -265,6 +290,7 @@ public class EpubParser implements Parser {
         }
 
         extractMetadata(zipFile, metadata, context);
+        checkForDRM(zipFile);
         Set<String> processed = new HashSet<>();
         for (String id : contentOrderScraper.contentItems) {
             HRefMediaPair hRefMediaPair = contentOrderScraper.locationMap.get(id);
@@ -308,6 +334,30 @@ public class EpubParser implements Parser {
         return true;
     }
 
+    /**
+     * Looks up {@code META-INF/encryption.xml} in the zip and, when the entry exists,
+     * runs the DRM check on its content. A missing entry means there is nothing to check.
+     *
+     * @param zipFile the EPUB container
+     * @throws IOException propagated from opening, parsing or closing the entry
+     * @throws EncryptedDocumentException if the entry references encrypted items
+     */
+    private void checkForDRM(ZipFile zipFile) throws IOException, EncryptedDocumentException {
+        ZipArchiveEntry encryptionEntry = zipFile.getEntry(META_INF_ENCRYPTION);
+        if (encryptionEntry != null) {
+            try (InputStream entryStream = zipFile.getInputStream(encryptionEntry)) {
+                // The stream overload does the parsing and decides which exceptions
+                // are rethrown and which are ignored.
+                checkForDRM(entryStream);
+            }
+        }
+    }
+
+    /**
+     * Runs {@link EncryptionParser} over the given stream, using a fresh handler,
+     * metadata object and parse context.
+     *
+     * @param is content of the encryption descriptor; not closed by this method
+     * @throws IOException propagated from the parser
+     * @throws EncryptedDocumentException if the parser reports encrypted items
+     */
+    private void checkForDRM(InputStream is) throws IOException, EncryptedDocumentException {
+        Parser encryptionParser = new EncryptionParser();
+        try {
+            encryptionParser.parse(is, new DefaultHandler(), new Metadata(), new ParseContext());
+        } catch (EncryptedDocumentException drm) {
+            // Rethrown explicitly so the broader catch below cannot swallow it.
+            throw drm;
+        } catch (TikaException | SAXException ignored) {
+            // Any other parse failure is ignored, so an unreadable descriptor is not
+            // treated as evidence of DRM. NOTE(review): the original author marked this
+            // swallow as questionable ("swallow ?!") -- confirm it is the desired policy.
+        }
+    }
+
     private boolean shouldHandleEmbedded(String media) {
         if (media == null) {
             return true;