You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/03/18 18:57:53 UTC
[tika] branch branch_1x updated: TIKA-3331 -- return a more
informative exception for encrypted ODT
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new 88166ce TIKA-3331 -- return a more informative exception for encrypted ODT
88166ce is described below
commit 88166ce9ec9bc3c50126ecd9f0d8f0eeaadaf610
Author: tallison <ta...@apache.org>
AuthorDate: Thu Mar 18 14:57:22 2021 -0400
TIKA-3331 -- return a more informative exception for encrypted ODT
---
.../parser/odf/OpenDocumentManifestHandler.java | 46 ++++++++++++
.../apache/tika/parser/odf/OpenDocumentParser.java | 82 ++++++++++++++++-----
.../org/apache/tika/parser/odf/ODFParserTest.java | 29 ++++++++
.../resources/test-documents/testODTEncrypted.odt | Bin 0 -> 12714 bytes
4 files changed, 138 insertions(+), 19 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentManifestHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentManifestHandler.java
new file mode 100644
index 0000000..65dcaf5
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentManifestHandler.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.sax.ContentHandlerDecorator;
+
+/**
+ * SAX handler for an ODF {@code META-INF/manifest.xml} that, for now,
+ * only looks for encryption: the first {@code encryption-data} element
+ * seen makes it throw an {@link EncryptedDocumentException} wrapped
+ * in a {@link SAXException}.
+ *
+ * If desired, we can add to this to actually extract information
+ * necessary for decryption. Please open an issue or pull
+ * request for this added functionality.
+ */
+class OpenDocumentManifestHandler extends ContentHandlerDecorator {
+
+ @Override
+ public void startElement(String namespaceURI, String localName, String qName,
+ Attributes attrs) throws SAXException {
+ // constant-first comparison: a null localName is simply "no match", never an NPE
+ if ("encryption-data".equals(localName)) {
+ throw new SAXException(new EncryptedDocumentException());
+ }
+ }
+
+}
\ No newline at end of file
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
index 851d3b6..f609e89 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
@@ -20,10 +20,12 @@ import static java.nio.charset.StandardCharsets.UTF_8;
import java.io.IOException;
import java.io.InputStream;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashSet;
+import java.util.List;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
@@ -32,6 +34,7 @@ import java.util.zip.ZipInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.config.Field;
+import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
@@ -98,6 +101,7 @@ public class OpenDocumentParser extends AbstractParser {
MediaType.application("x-vnd.oasis.opendocument.formula-template"))));
private static final String META_NAME = "meta.xml";
+ private static final String MANIFEST_NAME = "META-INF/manifest.xml";
private EmbeddedDocumentUtil embeddedDocumentUtil;
@@ -160,20 +164,27 @@ public class OpenDocumentParser extends AbstractParser {
EndDocumentShieldingContentHandler handler =
new EndDocumentShieldingContentHandler(xhtml);
- if (zipFile != null) {
- try {
- handleZipFile(zipFile, metadata, context, handler);
- } finally {
- //Do we want to close silently == catch an exception here?
- zipFile.close();
+ try {
+ if (zipFile != null) {
+ try {
+ handleZipFile(zipFile, metadata, context, handler);
+ } finally {
+ //Do we want to close silently == catch an exception here?
+ zipFile.close();
+ }
+ } else {
+ try {
+ handleZipStream(zipStream, metadata, context, handler);
+ } finally {
+ //Do we want to close silently == catch an exception here?
+ zipStream.close();
+ }
}
- } else {
- try {
- handleZipStream(zipStream, metadata, context, handler);
- } finally {
- //Do we want to close silently == catch an exception here?
- zipStream.close();
+ } catch (SAXException e) {
+ if (e.getCause() != null && e.getCause() instanceof EncryptedDocumentException) {
+ throw (EncryptedDocumentException)e.getCause();
}
+ throw e;
}
// Only now call the end document
@@ -187,27 +198,47 @@ public class OpenDocumentParser extends AbstractParser {
this.extractMacros = extractMacros;
}
- private void handleZipStream(ZipInputStream zipStream, Metadata metadata, ParseContext context, EndDocumentShieldingContentHandler handler) throws IOException, TikaException, SAXException {
+ private void handleZipStream(ZipInputStream zipStream, Metadata metadata,
+ ParseContext context,
+ EndDocumentShieldingContentHandler handler) throws IOException, TikaException, SAXException {
ZipEntry entry = zipStream.getNextEntry();
if (entry == null) {
throw new IOException("No entries found in ZipInputStream");
}
+ List<SAXException> saxExceptions = new ArrayList<>(); //non-encryption SAX failures, in entry order
do {
- handleZipEntry(entry, zipStream, metadata, context, handler);
+ try {
+ handleZipEntry(entry, zipStream, metadata, context, handler);
+ } catch (SAXException e) {
+ if (e.getCause() instanceof EncryptedDocumentException) {
+ throw (EncryptedDocumentException)e.getCause(); //encryption aborts the whole loop immediately
+ } else {
+ saxExceptions.add(e); //defer, so the remaining entries are still processed
+ }
+ }
entry = zipStream.getNextEntry();
} while (entry != null);
+ //throw only the first deferred SAXException; any later ones are dropped
+ if (saxExceptions.size() > 0) {
+ throw saxExceptions.get(0);
+ }
}
private void handleZipFile(ZipFile zipFile, Metadata metadata,
ParseContext context, EndDocumentShieldingContentHandler handler)
throws IOException, TikaException, SAXException {
+
+ ZipEntry entry = zipFile.getEntry(MANIFEST_NAME);
+ if (entry != null) {
+ handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
+ }
// If we can, process the metadata first, then the
// rest of the file afterwards (TIKA-1353)
// Only possible to guarantee that when opened from a file not a stream
-
- ZipEntry entry = zipFile.getEntry(META_NAME);
+ entry = zipFile.getEntry(META_NAME);
if (entry != null) {
- handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
+ handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context,
+ handler);
}
Enumeration<? extends ZipEntry> entries = zipFile.entries();
@@ -221,9 +252,14 @@ public class OpenDocumentParser extends AbstractParser {
private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata,
ParseContext context, ContentHandler handler)
throws IOException, SAXException, TikaException {
- if (entry == null) return;
+ if (entry == null) {
+ return;
+ }
+ if (entry.getName().contains("manifest.xml")) {
+ checkForEncryption(zip, context);
+ }
if (entry.getName().equals("mimetype")) {
- String type = IOUtils.toString(zip, UTF_8);
+ String type = IOUtils.toString(zip, UTF_8).trim();
metadata.set(Metadata.CONTENT_TYPE, type);
} else if (entry.getName().equals(META_NAME)) {
meta.parse(zip, new DefaultHandler(), metadata, context);
@@ -280,6 +316,14 @@ public class OpenDocumentParser extends AbstractParser {
}
}
+ private void checkForEncryption(InputStream stream, ParseContext context)
+ throws SAXException, TikaException, IOException {
+ // the manifest handler throws (wrapped in a SAXException) on encryption-data
+ ContentHandler manifestHandler = new OfflineContentHandler(
+ new EmbeddedContentHandler(new OpenDocumentManifestHandler()));
+ XMLReaderUtils.parseSAX(new CloseShieldInputStream(stream), manifestHandler, context);
+ }
+
private void maybeHandleMacro(InputStream is, String embeddedName,
ContentHandler handler, ParseContext context)
throws TikaException, IOException, SAXException {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
index 0b0e2ad..0affa14 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
@@ -21,10 +21,15 @@ import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.io.InputStream;
+import java.nio.file.Path;
+import java.nio.file.Paths;
import java.util.List;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -39,6 +44,9 @@ import org.apache.tika.parser.Parser;
import org.apache.tika.parser.opendocument.OpenOfficeParser;
import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
import org.apache.tika.sax.BodyContentHandler;
+
+import org.apache.commons.compress.archivers.ArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.junit.BeforeClass;
import org.junit.Test;
import org.xml.sax.ContentHandler;
@@ -618,6 +626,27 @@ public class ODFParserTest extends TikaTest {
}
+ @Test(expected = EncryptedDocumentException.class)
+ public void testEncryptedODTFile() throws Exception {
+ // parsing from a Path is expected to surface the encryption explicitly
+ Path encryptedOdt = Paths.get(ODFParserTest.class
+ .getResource("/test-documents/testODTEncrypted.odt")
+ .toURI());
+ getRecursiveMetadata(encryptedOdt, false);
+ }
+
+ //Ideally this would throw an EncryptedDocumentException too, but the
+ //file can't be read by Java's ZipInputStream or by commons compress
+ //unless descriptors are enabled, so a TikaException is expected for now.
+ //https://issues.apache.org/jira/browse/ODFTOOLKIT-402
+ @Test(expected = TikaException.class)
+ public void testEncryptedODTStream() throws Exception {
+ final String resource = "/test-documents/testODTEncrypted.odt";
+ try (InputStream odtStream = ODFParserTest.class.getResourceAsStream(resource)) {
+ getRecursiveMetadata(odtStream, false);
+ }
+ }
+
private ParseContext getNonRecursingParseContext() {
ParseContext parseContext = new ParseContext();
parseContext.set(Parser.class, new EmptyParser());
diff --git a/tika-parsers/src/test/resources/test-documents/testODTEncrypted.odt b/tika-parsers/src/test/resources/test-documents/testODTEncrypted.odt
new file mode 100644
index 0000000..55785dd
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testODTEncrypted.odt differ