You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/03/18 18:57:53 UTC

[tika] branch branch_1x updated: TIKA-3331 -- return a more informative exception for encrypted ODT

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new 88166ce  TIKA-3331 -- return a more informative exception for encrypted ODT
88166ce is described below

commit 88166ce9ec9bc3c50126ecd9f0d8f0eeaadaf610
Author: tallison <ta...@apache.org>
AuthorDate: Thu Mar 18 14:57:22 2021 -0400

    TIKA-3331 -- return a more informative exception for encrypted ODT
---
 .../parser/odf/OpenDocumentManifestHandler.java    |  46 ++++++++++++
 .../apache/tika/parser/odf/OpenDocumentParser.java |  82 ++++++++++++++++-----
 .../org/apache/tika/parser/odf/ODFParserTest.java  |  29 ++++++++
 .../resources/test-documents/testODTEncrypted.odt  | Bin 0 -> 12714 bytes
 4 files changed, 138 insertions(+), 19 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentManifestHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentManifestHandler.java
new file mode 100644
index 0000000..65dcaf5
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentManifestHandler.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.sax.ContentHandlerDecorator;
+
+/**
+ * For now, this only looks for encryption-data elements.
+ * If one is found, this throws an EncryptedDocumentException wrapped
+ * in a SAXException.
+ *
+ * If desired, we can add to this to actually extract information
+ * necessary for decryption.  Please open an issue or pull
+ * request for this added functionality.
+ *
+ */
+class OpenDocumentManifestHandler extends ContentHandlerDecorator {
+
+    @Override
+    public void startElement(
+            String namespaceURI, String localName, String qName,
+            Attributes attrs) throws SAXException {
+        if (localName.equals("encryption-data")) {
+            throw new SAXException(new EncryptedDocumentException());
+        }
+    }
+
+}
\ No newline at end of file
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
index 851d3b6..f609e89 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
@@ -20,10 +20,12 @@ import static java.nio.charset.StandardCharsets.UTF_8;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.Enumeration;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Set;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipFile;
@@ -32,6 +34,7 @@ import java.util.zip.ZipInputStream;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.tika.config.Field;
+import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.io.TikaInputStream;
@@ -98,6 +101,7 @@ public class OpenDocumentParser extends AbstractParser {
                     MediaType.application("x-vnd.oasis.opendocument.formula-template"))));
 
     private static final String META_NAME = "meta.xml";
+    private static final String MANIFEST_NAME = "META-INF/manifest.xml";
 
     private EmbeddedDocumentUtil embeddedDocumentUtil;
 
@@ -160,20 +164,27 @@ public class OpenDocumentParser extends AbstractParser {
         EndDocumentShieldingContentHandler handler =
                 new EndDocumentShieldingContentHandler(xhtml);
 
-        if (zipFile != null) {
-            try {
-                handleZipFile(zipFile, metadata, context, handler);
-            } finally {
-                //Do we want to close silently == catch an exception here?
-                zipFile.close();
+        try {
+            if (zipFile != null) {
+                try {
+                    handleZipFile(zipFile, metadata, context, handler);
+                } finally {
+                    //Do we want to close silently == catch an exception here?
+                    zipFile.close();
+                }
+            } else {
+                try {
+                    handleZipStream(zipStream, metadata, context, handler);
+                } finally {
+                    //Do we want to close silently == catch an exception here?
+                    zipStream.close();
+                }
             }
-        } else {
-            try {
-                handleZipStream(zipStream, metadata, context, handler);
-            } finally {
-                //Do we want to close silently == catch an exception here?
-                zipStream.close();
+        } catch (SAXException e) {
+            if (e.getCause() != null && e.getCause() instanceof EncryptedDocumentException) {
+                throw (EncryptedDocumentException)e.getCause();
             }
+            throw e;
         }
 
         // Only now call the end document
@@ -187,27 +198,47 @@ public class OpenDocumentParser extends AbstractParser {
         this.extractMacros = extractMacros;
     }
 
-    private void handleZipStream(ZipInputStream zipStream, Metadata metadata, ParseContext context, EndDocumentShieldingContentHandler handler) throws IOException, TikaException, SAXException {
+    private void handleZipStream(ZipInputStream zipStream, Metadata metadata,
+                                 ParseContext context,
+                                 EndDocumentShieldingContentHandler handler) throws IOException, TikaException, SAXException {
         ZipEntry entry = zipStream.getNextEntry();
 		if (entry == null) {
 			throw new IOException("No entries found in ZipInputStream");
 		}
+		List<SAXException> saxExceptions = new ArrayList<>();
         do {
-            handleZipEntry(entry, zipStream, metadata, context, handler);
+            try {
+                handleZipEntry(entry, zipStream, metadata, context, handler);
+            } catch (SAXException e) {
+                if (e.getCause() instanceof EncryptedDocumentException) {
+                    throw (EncryptedDocumentException)e.getCause();
+                } else {
+                    saxExceptions.add(e);
+                }
+            }
             entry = zipStream.getNextEntry();
         } while (entry != null);
+        //no encryption was detected; rethrow the first deferred SAXException
+        if (saxExceptions.size() > 0) {
+            throw saxExceptions.get(0);
+        }
     }
 
     private void handleZipFile(ZipFile zipFile, Metadata metadata,
                                ParseContext context, EndDocumentShieldingContentHandler handler)
             throws IOException, TikaException, SAXException {
+
+        ZipEntry entry = zipFile.getEntry(MANIFEST_NAME);
+        if (entry != null) {
+            handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
+        }
         // If we can, process the metadata first, then the
         //  rest of the file afterwards (TIKA-1353)
         // Only possible to guarantee that when opened from a file not a stream
-
-        ZipEntry entry = zipFile.getEntry(META_NAME);
+        entry = zipFile.getEntry(META_NAME);
         if (entry != null) {
-            handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
+            handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context,
+                    handler);
         }
 
         Enumeration<? extends ZipEntry> entries = zipFile.entries();
@@ -221,9 +252,14 @@ public class OpenDocumentParser extends AbstractParser {
     private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata,
                                 ParseContext context, ContentHandler handler)
             throws IOException, SAXException, TikaException {
-        if (entry == null) return;
+        if (entry == null) {
+            return;
+        }
+        if (entry.getName().contains("manifest.xml")) {
+            checkForEncryption(zip, context);
+        }
         if (entry.getName().equals("mimetype")) {
-            String type = IOUtils.toString(zip, UTF_8);
+            String type = IOUtils.toString(zip, UTF_8).trim();
             metadata.set(Metadata.CONTENT_TYPE, type);
         } else if (entry.getName().equals(META_NAME)) {
             meta.parse(zip, new DefaultHandler(), metadata, context);
@@ -280,6 +316,14 @@ public class OpenDocumentParser extends AbstractParser {
         }
     }
 
+    private void checkForEncryption(InputStream stream, ParseContext context)
+            throws SAXException, TikaException, IOException {
+        XMLReaderUtils.parseSAX(
+                new CloseShieldInputStream(stream),
+                new OfflineContentHandler(new EmbeddedContentHandler(
+                        new OpenDocumentManifestHandler())), context);
+    }
+
     private void maybeHandleMacro(InputStream is, String embeddedName,
                                   ContentHandler handler, ParseContext context)
             throws TikaException, IOException, SAXException {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
index 0b0e2ad..0affa14 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
@@ -21,10 +21,15 @@ import static org.junit.Assert.assertTrue;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.util.List;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
 
 import org.apache.tika.TikaTest;
 import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
@@ -39,6 +44,9 @@ import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.opendocument.OpenOfficeParser;
 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
 import org.apache.tika.sax.BodyContentHandler;
+
+import org.apache.commons.compress.archivers.ArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
 import org.junit.BeforeClass;
 import org.junit.Test;
 import org.xml.sax.ContentHandler;
@@ -618,6 +626,27 @@ public class ODFParserTest extends TikaTest {
 
     }
 
+    @Test(expected = EncryptedDocumentException.class)
+    public void testEncryptedODTFile() throws Exception {
+        Path p =
+                Paths.get(
+                        ODFParserTest.class.getResource(
+                                "/test-documents/testODTEncrypted.odt").toURI());
+        getRecursiveMetadata(p, false);
+    }
+
+    //this, of course, should throw an EncryptedDocumentException
+    //but the file can't be read by Java's ZipInputStream or
+    //by commons compress, unless you enable descriptors.
+    //https://issues.apache.org/jira/browse/ODFTOOLKIT-402
+    @Test(expected = TikaException.class)
+    public void testEncryptedODTStream() throws Exception {
+        try (InputStream is = ODFParserTest.class.getResourceAsStream(
+                                "/test-documents/testODTEncrypted.odt")) {
+            getRecursiveMetadata(is, false);
+        }
+    }
+
     private ParseContext getNonRecursingParseContext() {
         ParseContext parseContext = new ParseContext();
         parseContext.set(Parser.class, new EmptyParser());
diff --git a/tika-parsers/src/test/resources/test-documents/testODTEncrypted.odt b/tika-parsers/src/test/resources/test-documents/testODTEncrypted.odt
new file mode 100644
index 0000000..55785dd
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testODTEncrypted.odt differ