You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/03/23 21:43:21 UTC

[tika] branch branch_1x updated (b29cce5 -> da05576)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from b29cce5  TIKA-3244 -- general upgrades for 1.26
     new b1e8641  TIKA-3335 -- handle bad xml more robustly when checking for encryption
     new 2b8c9a3  TIKA-3336 -- new zip bombs detect in 1.26-SNAPSHOT compared with 1.25 -- bug, don't advance twice per call to chars/whitespace
     new da05576  TIKA-3334 -- fix thread safety bug in handling embedded docs in open office parser

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 CHANGES.txt                                        |  2 ++
 .../apache/tika/parser/RecursiveParserWrapper.java |  4 ---
 .../parser/odf/OpenDocumentManifestHandler.java    |  8 +++++
 .../apache/tika/parser/odf/OpenDocumentParser.java | 39 ++++++++++++++--------
 .../org/apache/tika/parser/odf/ODFParserTest.java  | 39 ++++++++++++++++++++++
 5 files changed, 74 insertions(+), 18 deletions(-)

[tika] 03/03: TIKA-3334 -- fix thread safety bug in handling embedded docs in open office parser

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit da055766a892254199ff8253834b871548a9fbef
Author: tallison <ta...@apache.org>
AuthorDate: Tue Mar 23 17:43:00 2021 -0400

    TIKA-3334 -- fix thread safety bug in handling embedded docs in open office parser
---
 CHANGES.txt                                        |  2 ++
 .../apache/tika/parser/odf/OpenDocumentParser.java | 30 +++++++++--------
 .../org/apache/tika/parser/odf/ODFParserTest.java  | 39 ++++++++++++++++++++++
 3 files changed, 58 insertions(+), 13 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 37dcabe..c16f37b 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 1.26 - ??/??/????
 
+   * Fix thread safety bug in OpenOffice parser (TIKA-3334).
+
    * The "writeLimit" header now pertains to the combined characters
      written per container document (and embedded documents) in the /rmeta
      endpoint in tika-server (TIKA-3325); it no longer functions only
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
index babaac2..fafefd6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
@@ -65,7 +65,7 @@ public class OpenDocumentParser extends AbstractParser {
     private static final long serialVersionUID = -6410276875438618287L;
 
     private static final Set<MediaType> SUPPORTED_TYPES =
-            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+            Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
                     MediaType.application("vnd.sun.xml.writer"),
                     MediaType.application("vnd.oasis.opendocument.text"),
                     MediaType.application("vnd.oasis.opendocument.graphics"),
@@ -103,8 +103,6 @@ public class OpenDocumentParser extends AbstractParser {
     private static final String META_NAME = "meta.xml";
     private static final String MANIFEST_NAME = "META-INF/manifest.xml";
 
-    private EmbeddedDocumentUtil embeddedDocumentUtil;
-
     private Parser meta = new OpenDocumentMetaParser();
 
     private Parser content = new OpenDocumentContentParser();
@@ -136,7 +134,7 @@ public class OpenDocumentParser extends AbstractParser {
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
 
-        embeddedDocumentUtil = new EmbeddedDocumentUtil(context);
+        EmbeddedDocumentUtil embeddedDocumentUtil = new EmbeddedDocumentUtil(context);
 
         // Open the Zip stream
         // Use a File if we can, and an already open zip is even better
@@ -167,14 +165,14 @@ public class OpenDocumentParser extends AbstractParser {
         try {
             if (zipFile != null) {
                 try {
-                    handleZipFile(zipFile, metadata, context, handler);
+                    handleZipFile(zipFile, metadata, context, handler, embeddedDocumentUtil);
                 } finally {
                     //Do we want to close silently == catch an exception here?
                     zipFile.close();
                 }
             } else {
                 try {
-                    handleZipStream(zipStream, metadata, context, handler);
+                    handleZipStream(zipStream, metadata, context, handler, embeddedDocumentUtil);
                 } finally {
                     //Do we want to close silently == catch an exception here?
                     zipStream.close();
@@ -200,7 +198,9 @@ public class OpenDocumentParser extends AbstractParser {
 
     private void handleZipStream(ZipInputStream zipStream, Metadata metadata,
                                  ParseContext context,
-                                 EndDocumentShieldingContentHandler handler) throws IOException, TikaException, SAXException {
+                                 EndDocumentShieldingContentHandler handler,
+                                 EmbeddedDocumentUtil embeddedDocumentUtil) throws IOException,
+            TikaException, SAXException {
         ZipEntry entry = zipStream.getNextEntry();
 		if (entry == null) {
 			throw new IOException("No entries found in ZipInputStream");
@@ -208,7 +208,7 @@ public class OpenDocumentParser extends AbstractParser {
 		List<SAXException> saxExceptions = new ArrayList<>();
         do {
             try {
-                handleZipEntry(entry, zipStream, metadata, context, handler);
+                handleZipEntry(entry, zipStream, metadata, context, handler, embeddedDocumentUtil);
             } catch (SAXException e) {
                 if (e.getCause() instanceof EncryptedDocumentException) {
                     throw (EncryptedDocumentException)e.getCause();
@@ -225,12 +225,14 @@ public class OpenDocumentParser extends AbstractParser {
     }
 
     private void handleZipFile(ZipFile zipFile, Metadata metadata,
-                               ParseContext context, EndDocumentShieldingContentHandler handler)
+                               ParseContext context, EndDocumentShieldingContentHandler handler,
+                               EmbeddedDocumentUtil embeddedDocumentUtil)
             throws IOException, TikaException, SAXException {
 
         ZipEntry entry = zipFile.getEntry(MANIFEST_NAME);
         if (entry != null) {
-            handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
+            handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler,
+                    embeddedDocumentUtil);
         }
         // If we can, process the metadata first, then the
         //  rest of the file afterwards (TIKA-1353)
@@ -238,19 +240,21 @@ public class OpenDocumentParser extends AbstractParser {
         entry = zipFile.getEntry(META_NAME);
         if (entry != null) {
             handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context,
-                    handler);
+                    handler, embeddedDocumentUtil);
         }
 
         Enumeration<? extends ZipEntry> entries = zipFile.entries();
         while (entries.hasMoreElements()) {
             entry = entries.nextElement();
             if (!META_NAME.equals(entry.getName())) {
-                handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
+                handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler,
+                        embeddedDocumentUtil);
             }
         }
     }
     private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata,
-                                ParseContext context, ContentHandler handler)
+                                ParseContext context, ContentHandler handler,
+                                EmbeddedDocumentUtil embeddedDocumentUtil)
             throws IOException, SAXException, TikaException {
         if (entry == null) {
             return;
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
index 0affa14..6006548 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
@@ -24,6 +24,11 @@ import java.io.InputStream;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.List;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutorCompletionService;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipInputStream;
 
@@ -652,4 +657,38 @@ public class ODFParserTest extends TikaTest {
         parseContext.set(Parser.class, new EmptyParser());
         return parseContext;
     }
+
+    @Test
+    public void testMultiThreaded() throws Exception {
+        int numThreads = 10;
+        ExecutorService executorService = Executors.newFixedThreadPool(numThreads);
+        ExecutorCompletionService<Integer> executorCompletionService =
+                new ExecutorCompletionService<>(executorService);
+
+        for (int i = 0; i < numThreads; i++) {
+            executorCompletionService.submit(new Callable<Integer>() {
+                @Override
+                public Integer call() throws Exception {
+                    for (int i = 0; i < 10; i++) {
+                        List<Metadata> metadataList = getRecursiveMetadata("testODTEmbedded.odt");
+                        assertEquals(3, metadataList.size());
+                        assertEquals("THUMBNAIL",
+                                metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+                    }
+                    return 1;
+                }
+            });
+        }
+
+        try {
+            int finished = 0;
+            while (finished < numThreads) {
+                Future<Integer> future = executorCompletionService.take();
+                future.get();
+                finished++;
+            }
+        } finally {
+            executorService.shutdownNow();
+        }
+    }
 }

[tika] 02/03: TIKA-3336 -- new zip bombs detect in 1.26-SNAPSHOT compared with 1.25 -- bug, don't advance twice per call to chars/whitespace

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 2b8c9a3eba7007998cca6df6db973c2c4547d55e
Author: tallison <ta...@apache.org>
AuthorDate: Tue Mar 23 14:41:27 2021 -0400

    TIKA-3336 -- new zip bombs detect in 1.26-SNAPSHOT compared with 1.25 -- bug, don't advance twice per call to chars/whitespace
---
 .../src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java  | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index e7dee24..021fca3 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -488,12 +488,10 @@ public class RecursiveParserWrapper extends ParserDecorator {
         @Override
         public void characters(char[] ch, int start, int length) throws SAXException {
             if (totalWriteLimit < 0) {
-                advance(length);
                 super.characters(ch, start, length);
                 return;
             }
             int availableLength = Math.min(totalWriteLimit - totalChars, length);
-            advance(availableLength);
             super.characters(ch, start, availableLength);
             if (availableLength < length) {
                 throw new WriteLimitReached();
@@ -503,12 +501,10 @@ public class RecursiveParserWrapper extends ParserDecorator {
         @Override
         public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
             if (totalWriteLimit < 0) {
-                advance(length);
                 super.ignorableWhitespace(ch, start, length);
                 return;
             }
             int availableLength = Math.min(totalWriteLimit - totalChars, length);
-            advance(availableLength);
             super.ignorableWhitespace(ch, start, availableLength);
             if (availableLength < length) {
                 throw new WriteLimitReached();

[tika] 01/03: TIKA-3335 -- handle bad xml more robustly when checking for encryption

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit b1e86412b04d563d6f557d686602738957990c5d
Author: tallison <ta...@apache.org>
AuthorDate: Tue Mar 23 14:32:05 2021 -0400

    TIKA-3335 -- handle bad xml more robustly when checking for encryption
---
 .../org/apache/tika/parser/odf/OpenDocumentManifestHandler.java  | 8 ++++++++
 .../main/java/org/apache/tika/parser/odf/OpenDocumentParser.java | 9 ++++++++-
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentManifestHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentManifestHandler.java
index 65dcaf5..ca9a9ec 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentManifestHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentManifestHandler.java
@@ -34,6 +34,14 @@ import org.apache.tika.sax.ContentHandlerDecorator;
  */
 class OpenDocumentManifestHandler extends ContentHandlerDecorator {
 
+    /**
+     *
+     * @param namespaceURI
+     * @param localName
+     * @param qName
+     * @param attrs
+     * @throws SAXException wrapping a {@link EncryptedDocumentException} if the file is encrypted
+     */
     @Override
     public void startElement(
             String namespaceURI, String localName, String qName,
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
index f609e89..babaac2 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
@@ -256,7 +256,14 @@ public class OpenDocumentParser extends AbstractParser {
             return;
         }
         if (entry.getName().contains("manifest.xml")) {
-            checkForEncryption(zip, context);
+            try {
+                checkForEncryption(zip, context);
+            } catch (SAXException e) {
+                if (e.getCause() != null && e.getCause() instanceof EncryptedDocumentException) {
+                    throw e;
+                }
+                //else, swallow for now
+            }
         }
         if (entry.getName().equals("mimetype")) {
             String type = IOUtils.toString(zip, UTF_8).trim();