You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/12/20 14:35:02 UTC

[tika] branch main updated: TIKA-3627: add unit test for multithreading in msoffice docs

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 365388d  TIKA-3627: add unit test for multithreading in msoffice docs
365388d is described below

commit 365388d43551cad8adebd142b65a988a1611d9d7
Author: tallison <ta...@apache.org>
AuthorDate: Mon Dec 20 09:34:43 2021 -0500

    TIKA-3627: add unit test for multithreading in msoffice docs
---
 CHANGES.txt                                        |  2 ++
 .../org/apache/tika/MultiThreadedTikaTest.java     |  2 +-
 .../parser/microsoft/ooxml/OOXMLParserTest.java    | 38 ++++++++++++++++++++--
 3 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index dcd513b..6bcf561 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 2.2.1 - 12/19/2021
 
+   * Fix multithreading bug for ooxml files (TIKA-3627).
+
    * Upgrade log4j to 2.17.0 (TIKA-3625).
 
    * Upgrade to PDFBox 2.0.25 (TIKA-3622)
diff --git a/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java b/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java
index bf27240..fd3f381 100644
--- a/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java
@@ -354,7 +354,7 @@ public class MultiThreadedTikaTest extends TikaTest {
                 Path testFile = files[randIndex];
                 List<Metadata> metadataList = null;
                 boolean success = false;
-                try (InputStream is = Files.newInputStream(testFile)) {
+                try (InputStream is = TikaInputStream.get(testFile)) {
                     metadataList = getRecursiveMetadata(is, parser, new ParseContext());
                     success = true;
                 } catch (Exception e) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 1916d30..738d9a0 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -52,7 +52,7 @@ import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 import org.xml.sax.ContentHandler;
 
-import org.apache.tika.TikaTest;
+import org.apache.tika.MultiThreadedTikaTest;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.detect.DefaultDetector;
 import org.apache.tika.detect.Detector;
@@ -70,11 +70,12 @@ import org.apache.tika.parser.EmptyParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.parser.microsoft.OfficeParser;
 import org.apache.tika.parser.microsoft.OfficeParserConfig;
 import org.apache.tika.sax.BodyContentHandler;
 
-public class OOXMLParserTest extends TikaTest {
+public class OOXMLParserTest extends MultiThreadedTikaTest {
 
     private static Locale USER_LOCALE = null;
 
@@ -1741,6 +1742,39 @@ public class OOXMLParserTest extends TikaTest {
                 getRecursiveMetadata("testWORD_docSecurity.docx").get(0)
                         .get(OfficeOpenXMLExtended.DOC_SECURITY_STRING));
     }
+
+    @Test
+    public void testMultiThreaded() throws Exception {
+        //TIKA-3627
+        int numThreads = 5;
+        int numIterations = 5;
+        ParseContext[] parseContexts = new ParseContext[numThreads];
+        for (int i = 0; i < parseContexts.length; i++) {
+            parseContexts[i] = new ParseContext();
+        }
+        Set<String> extensions = new HashSet<>();
+        extensions.add(".pptx");
+        extensions.add(".docx");
+        extensions.add(".xlsx");
+        extensions.add(".ppt");
+        extensions.add(".doc");
+        extensions.add(".xls");
+        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER);
+        testMultiThreaded(wrapper, parseContexts, numThreads, numIterations, path -> {
+            String pathName = path.getName().toLowerCase(Locale.ENGLISH);
+            int i = pathName.lastIndexOf(".");
+            String ext = "";
+            if (i > -1) {
+                ext = pathName.substring(i);
+            }
+            if (extensions.contains(ext)) {
+                return true;
+            } else {
+                return false;
+            }
+        });
+
+    }
 }