You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/03/18 19:13:58 UTC

[tika] branch main updated: TIKA-3331 -- throw a more informative exception for an encrypted odt file

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 20eae4f  TIKA-3331 -- throw a more informative exception for an encrypted odt file
20eae4f is described below

commit 20eae4f243581f919fc4814660dec07d9b330a33
Author: tallison <ta...@apache.org>
AuthorDate: Thu Mar 18 15:13:27 2021 -0400

    TIKA-3331 -- throw a more informative exception for an encrypted odt file
---
 .../parser/odf/OpenDocumentManifestHandler.java    |  45 +++++++++++++
 .../apache/tika/parser/odf/OpenDocumentParser.java |  73 ++++++++++++++++-----
 .../org/apache/tika/parser/odf/ODFParserTest.java  |  25 +++++++
 .../resources/test-documents/testODTEncrypted.odt  | Bin 0 -> 12714 bytes
 4 files changed, 125 insertions(+), 18 deletions(-)

diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentManifestHandler.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentManifestHandler.java
new file mode 100644
index 0000000..6ad64cc
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentManifestHandler.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.sax.ContentHandlerDecorator;
+
+/**
+ * Handler for the OpenDocument manifest.  For now, this only looks
+ * for encryption-data elements (matched by local name).  If one is
+ * found, this throws an EncryptedDocumentException wrapped in a
+ * SAXException; start-element events are never passed to super.
+ *
+ * If desired, we can add to this to actually extract information
+ * necessary for decryption.  Please open an issue or pull
+ * request for this added functionality.
+ */
+class OpenDocumentManifestHandler extends ContentHandlerDecorator {
+
+    @Override
+    public void startElement(
+            String namespaceURI, String localName, String qName,
+            Attributes attrs) throws SAXException {
+        if (localName.equals("encryption-data")) {
+            throw new SAXException(new EncryptedDocumentException());
+        }
+    }
+}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
index 9a8ed33..54f831c 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
@@ -20,10 +20,12 @@ import static java.nio.charset.StandardCharsets.UTF_8;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.Enumeration;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Set;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipFile;
@@ -36,6 +38,7 @@ import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
 import org.apache.tika.config.Field;
+import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.io.TikaInputStream;
@@ -97,6 +100,7 @@ public class OpenDocumentParser extends AbstractParser {
                     MediaType.application("x-vnd.oasis.opendocument.formula-template"))));
 
     private static final String META_NAME = "meta.xml";
+    private static final String MANIFEST_NAME = "META-INF/manifest.xml";
 
     private EmbeddedDocumentUtil embeddedDocumentUtil;
 
@@ -155,20 +159,27 @@ public class OpenDocumentParser extends AbstractParser {
         //  we'll hit first, catch the endDocument call initially
         EndDocumentShieldingContentHandler handler = new EndDocumentShieldingContentHandler(xhtml);
 
-        if (zipFile != null) {
-            try {
-                handleZipFile(zipFile, metadata, context, handler);
-            } finally {
-                //Do we want to close silently == catch an exception here?
-                zipFile.close();
+        try {
+            if (zipFile != null) {
+                try {
+                    handleZipFile(zipFile, metadata, context, handler);
+                } finally {
+                    //Do we want to close silently == catch an exception here?
+                    zipFile.close();
+                }
+            } else {
+                try {
+                    handleZipStream(zipStream, metadata, context, handler);
+                } finally {
+                    //Do we want to close silently == catch an exception here?
+                    zipStream.close();
+                }
             }
-        } else {
-            try {
-                handleZipStream(zipStream, metadata, context, handler);
-            } finally {
-                //Do we want to close silently == catch an exception here?
-                zipStream.close();
+        } catch (SAXException e) {
+            if (e.getCause() instanceof EncryptedDocumentException) {
+                throw (EncryptedDocumentException)e.getCause();
             }
+            throw e;
         }
 
         // Only now call the end document
@@ -189,10 +200,23 @@ public class OpenDocumentParser extends AbstractParser {
         if (entry == null) {
             throw new IOException("No entries found in ZipInputStream");
         }
+        List<SAXException> exceptions = new ArrayList<>();
         do {
-            handleZipEntry(entry, zipStream, metadata, context, handler);
+            try {
+                handleZipEntry(entry, zipStream, metadata, context, handler);
+            } catch (SAXException e) {
+                if (e.getCause() instanceof EncryptedDocumentException) {
+                    throw (EncryptedDocumentException)e.getCause();
+                } else {
+                    exceptions.add(e);
+                }
+            }
             entry = zipStream.getNextEntry();
         } while (entry != null);
+
+        if (exceptions.size() > 0) {
+            throw exceptions.get(0);
+        }
     }
 
     private void handleZipFile(ZipFile zipFile, Metadata metadata, ParseContext context,
@@ -202,7 +226,12 @@ public class OpenDocumentParser extends AbstractParser {
         //  rest of the file afterwards (TIKA-1353)
         // Only possible to guarantee that when opened from a file not a stream
 
-        ZipEntry entry = zipFile.getEntry(META_NAME);
+        ZipEntry entry = zipFile.getEntry(MANIFEST_NAME);
+        if (entry != null) {
+            handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
+        }
+
+        entry = zipFile.getEntry(META_NAME);
         if (entry != null) {
             handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
         }
@@ -219,10 +248,11 @@ public class OpenDocumentParser extends AbstractParser {
     private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata,
                                 ParseContext context, ContentHandler handler)
             throws IOException, SAXException, TikaException {
-        if (entry == null) {
-            return;
-        }
-        if (entry.getName().equals("mimetype")) {
+
+
+        if (entry.getName().contains("manifest.xml")) {
+            checkForEncryption(zip, context);
+        } else if (entry.getName().equals("mimetype")) {
             String type = IOUtils.toString(zip, UTF_8);
             metadata.set(Metadata.CONTENT_TYPE, type);
         } else if (entry.getName().equals(META_NAME)) {
@@ -299,6 +329,13 @@ public class OpenDocumentParser extends AbstractParser {
                 new OfflineContentHandler(new EmbeddedContentHandler(handler)), context);
     }
 
+    private void checkForEncryption(InputStream stream, ParseContext context)
+            throws SAXException, TikaException, IOException {
+        XMLReaderUtils.parseSAX( // parsed only so the manifest handler can see each element
+                new CloseShieldInputStream(stream), // presumably keeps the zip stream open -- confirm
+                new OfflineContentHandler(new EmbeddedContentHandler(
+                        new OpenDocumentManifestHandler())), context);
+    }
 
     private boolean ignoreScriptFile(String embeddedName) {
         if (embeddedName.contains("Basic/")) {
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
index a4861d6..b4eaf76 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
@@ -21,6 +21,8 @@ import static org.junit.Assert.assertTrue;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.util.Arrays;
 import java.util.List;
 
@@ -28,6 +30,8 @@ import org.junit.Test;
 import org.xml.sax.ContentHandler;
 
 import org.apache.tika.TikaTest;
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
@@ -380,6 +384,27 @@ public class ODFParserTest extends TikaTest {
         }
     }
 
+    @Test(expected = EncryptedDocumentException.class)
+    public void testEncryptedODTFile() throws Exception {
+        //the password to this file is "tika"; it is not passed to getRecursiveMetadata
+        Path p =
+                Paths.get(
+                        ODFParserTest.class.getResource(
+                                "/test-documents/testODTEncrypted.odt").toURI());
+        getRecursiveMetadata(p, false); // parsing from a Path is expected to throw
+    }
+
+    //Ideally, this would throw an EncryptedDocumentException, but the
+    //file can't be read by Java's ZipInputStream or by commons
+    //compress unless you enable descriptors; expect TikaException for now.
+    //https://issues.apache.org/jira/browse/ODFTOOLKIT-402
+    @Test(expected = TikaException.class)
+    public void testEncryptedODTStream() throws Exception {
+        try (InputStream is = ODFParserTest.class.getResourceAsStream(
+                "/test-documents/testODTEncrypted.odt")) {
+            getRecursiveMetadata(is, false);
+        }
+    }
 
     private ParseContext getNonRecursingParseContext() {
         ParseContext parseContext = new ParseContext();
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/testODTEncrypted.odt b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/testODTEncrypted.odt
new file mode 100644
index 0000000..55785dd
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/testODTEncrypted.odt differ