You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/03/18 19:13:58 UTC
[tika] branch main updated: TIKA-3331 -- throw a more informative
exception for an encrypted odt file
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 20eae4f TIKA-3331 -- throw a more informative exception for an encrypted odt file
20eae4f is described below
commit 20eae4f243581f919fc4814660dec07d9b330a33
Author: tallison <ta...@apache.org>
AuthorDate: Thu Mar 18 15:13:27 2021 -0400
TIKA-3331 -- throw a more informative exception for an encrypted odt file
---
.../parser/odf/OpenDocumentManifestHandler.java | 45 +++++++++++++
.../apache/tika/parser/odf/OpenDocumentParser.java | 73 ++++++++++++++++-----
.../org/apache/tika/parser/odf/ODFParserTest.java | 25 +++++++
.../resources/test-documents/testODTEncrypted.odt | Bin 0 -> 12714 bytes
4 files changed, 125 insertions(+), 18 deletions(-)
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentManifestHandler.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentManifestHandler.java
new file mode 100644
index 0000000..6ad64cc
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentManifestHandler.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.sax.ContentHandlerDecorator;
+
+/**
+ * For now, this only looks for <code>encryption-data</code> elements.
+ * If one is found, this throws an EncryptedDocumentException wrapped
+ * in a SAXException.
+ *
+ * If desired, this handler can be extended to extract the information
+ * necessary for decryption. Please open an issue or pull
+ * request for this added functionality.
+ *
+ */
+class OpenDocumentManifestHandler extends ContentHandlerDecorator {
+
+ @Override
+ public void startElement(
+ String namespaceURI, String localName, String qName,
+ Attributes attrs) throws SAXException {
+ if (localName.equals("encryption-data")) { // only localName is tested; namespaceURI, qName and attrs are ignored
+ throw new SAXException(new EncryptedDocumentException());
+ } // any other element: nothing is done and super.startElement is not called
+ }
+}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
index 9a8ed33..54f831c 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
@@ -20,10 +20,12 @@ import static java.nio.charset.StandardCharsets.UTF_8;
import java.io.IOException;
import java.io.InputStream;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashSet;
+import java.util.List;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
@@ -36,6 +38,7 @@ import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.config.Field;
+import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
@@ -97,6 +100,7 @@ public class OpenDocumentParser extends AbstractParser {
MediaType.application("x-vnd.oasis.opendocument.formula-template"))));
private static final String META_NAME = "meta.xml";
+ private static final String MANIFEST_NAME = "META-INF/manifest.xml";
private EmbeddedDocumentUtil embeddedDocumentUtil;
@@ -155,20 +159,27 @@ public class OpenDocumentParser extends AbstractParser {
// we'll hit first, catch the endDocument call initially
EndDocumentShieldingContentHandler handler = new EndDocumentShieldingContentHandler(xhtml);
- if (zipFile != null) {
- try {
- handleZipFile(zipFile, metadata, context, handler);
- } finally {
- //Do we want to close silently == catch an exception here?
- zipFile.close();
+ try {
+ if (zipFile != null) {
+ try {
+ handleZipFile(zipFile, metadata, context, handler);
+ } finally {
+ //Do we want to close silently == catch an exception here?
+ zipFile.close();
+ }
+ } else {
+ try {
+ handleZipStream(zipStream, metadata, context, handler);
+ } finally {
+ //Do we want to close silently == catch an exception here?
+ zipStream.close();
+ }
}
- } else {
- try {
- handleZipStream(zipStream, metadata, context, handler);
- } finally {
- //Do we want to close silently == catch an exception here?
- zipStream.close();
+ } catch (SAXException e) {
+ if (e.getCause() instanceof EncryptedDocumentException) {
+ throw (EncryptedDocumentException)e.getCause();
}
+ throw e;
}
// Only now call the end document
@@ -189,10 +200,23 @@ public class OpenDocumentParser extends AbstractParser {
if (entry == null) {
throw new IOException("No entries found in ZipInputStream");
}
+ List<SAXException> exceptions = new ArrayList<>();
do {
- handleZipEntry(entry, zipStream, metadata, context, handler);
+ try {
+ handleZipEntry(entry, zipStream, metadata, context, handler);
+ } catch (SAXException e) {
+ if (e.getCause() instanceof EncryptedDocumentException) {
+ throw (EncryptedDocumentException)e.getCause();
+ } else {
+ exceptions.add(e);
+ }
+ }
entry = zipStream.getNextEntry();
} while (entry != null);
+
+ if (exceptions.size() > 0) {
+ throw exceptions.get(0);
+ }
}
private void handleZipFile(ZipFile zipFile, Metadata metadata, ParseContext context,
@@ -202,7 +226,12 @@ public class OpenDocumentParser extends AbstractParser {
// rest of the file afterwards (TIKA-1353)
// Only possible to guarantee that when opened from a file not a stream
- ZipEntry entry = zipFile.getEntry(META_NAME);
+ ZipEntry entry = zipFile.getEntry(MANIFEST_NAME);
+ if (entry != null) {
+ handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
+ }
+
+ entry = zipFile.getEntry(META_NAME);
if (entry != null) {
handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
}
@@ -219,10 +248,11 @@ public class OpenDocumentParser extends AbstractParser {
private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata,
ParseContext context, ContentHandler handler)
throws IOException, SAXException, TikaException {
- if (entry == null) {
- return;
- }
- if (entry.getName().equals("mimetype")) {
+
+
+ if (entry.getName().contains("manifest.xml")) {
+ checkForEncryption(zip, context);
+ } else if (entry.getName().equals("mimetype")) {
String type = IOUtils.toString(zip, UTF_8);
metadata.set(Metadata.CONTENT_TYPE, type);
} else if (entry.getName().equals(META_NAME)) {
@@ -299,6 +329,13 @@ public class OpenDocumentParser extends AbstractParser {
new OfflineContentHandler(new EmbeddedContentHandler(handler)), context);
}
+ private void checkForEncryption(InputStream stream, ParseContext context) // SAX-parses the manifest with an OpenDocumentManifestHandler
+ throws SAXException, TikaException, IOException {
+ XMLReaderUtils.parseSAX(
+ new CloseShieldInputStream(stream), // presumably keeps the parse from closing the caller's zip stream -- TODO confirm
+ new OfflineContentHandler(new EmbeddedContentHandler(
+ new OpenDocumentManifestHandler())), context);
+ }
private boolean ignoreScriptFile(String embeddedName) {
if (embeddedName.contains("Basic/")) {
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
index a4861d6..b4eaf76 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
@@ -21,6 +21,8 @@ import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.io.InputStream;
+import java.nio.file.Path;
+import java.nio.file.Paths;
import java.util.Arrays;
import java.util.List;
@@ -28,6 +30,8 @@ import org.junit.Test;
import org.xml.sax.ContentHandler;
import org.apache.tika.TikaTest;
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
@@ -380,6 +384,27 @@ public class ODFParserTest extends TikaTest {
}
}
+ @Test(expected = EncryptedDocumentException.class)
+ public void testEncryptedODTFile() throws Exception {
+ //the password to this file is "tika"; this test does not supply it
+ Path p =
+ Paths.get(
+ ODFParserTest.class.getResource(
+ "/test-documents/testODTEncrypted.odt").toURI());
+ getRecursiveMetadata(p, false); // parsed from a Path here; testEncryptedODTStream covers the InputStream route
+ }
+
+ //This, of course, should throw an EncryptedDocumentException,
+ //but the file can't be read by Java's ZipInputStream or
+ //by commons compress unless you enable descriptors.
+ //See https://issues.apache.org/jira/browse/ODFTOOLKIT-402
+ @Test(expected = TikaException.class)
+ public void testEncryptedODTStream() throws Exception {
+ try (InputStream is = ODFParserTest.class.getResourceAsStream(
+ "/test-documents/testODTEncrypted.odt")) {
+ getRecursiveMetadata(is, false);
+ }
+ }
private ParseContext getNonRecursingParseContext() {
ParseContext parseContext = new ParseContext();
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/testODTEncrypted.odt b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/testODTEncrypted.odt
new file mode 100644
index 0000000..55785dd
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/testODTEncrypted.odt differ