You are viewing a plain text version of this content. The canonical version is in the commits@tika.apache.org mailing-list archive.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/12/20 14:35:02 UTC
[tika] branch main updated: TIKA-3627: add unit test for multithreading in msoffice docs
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 365388d TIKA-3627: add unit test for multithreading in msoffice docs
365388d is described below
commit 365388d43551cad8adebd142b65a988a1611d9d7
Author: tallison <ta...@apache.org>
AuthorDate: Mon Dec 20 09:34:43 2021 -0500
TIKA-3627: add unit test for multithreading in msoffice docs
---
CHANGES.txt | 2 ++
.../org/apache/tika/MultiThreadedTikaTest.java | 2 +-
.../parser/microsoft/ooxml/OOXMLParserTest.java | 38 ++++++++++++++++++++--
3 files changed, 39 insertions(+), 3 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index dcd513b..6bcf561 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
Release 2.2.1 - 12/19/2021
+ * Fix multithreading bug for ooxml files (TIKA-3627).
+
* Upgrade log4j to 2.17.0 (TIKA-3625).
* Upgrade to PDFBox 2.0.25 (TIKA-3622)
diff --git a/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java b/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java
index bf27240..fd3f381 100644
--- a/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java
@@ -354,7 +354,7 @@ public class MultiThreadedTikaTest extends TikaTest {
Path testFile = files[randIndex];
List<Metadata> metadataList = null;
boolean success = false;
- try (InputStream is = Files.newInputStream(testFile)) {
+ try (InputStream is = TikaInputStream.get(testFile)) {
metadataList = getRecursiveMetadata(is, parser, new ParseContext());
success = true;
} catch (Exception e) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 1916d30..738d9a0 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -52,7 +52,7 @@ import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.xml.sax.ContentHandler;
-import org.apache.tika.TikaTest;
+import org.apache.tika.MultiThreadedTikaTest;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
@@ -70,11 +70,12 @@ import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.sax.BodyContentHandler;
-public class OOXMLParserTest extends TikaTest {
+public class OOXMLParserTest extends MultiThreadedTikaTest {
private static Locale USER_LOCALE = null;
@@ -1741,6 +1742,39 @@ public class OOXMLParserTest extends TikaTest {
getRecursiveMetadata("testWORD_docSecurity.docx").get(0)
.get(OfficeOpenXMLExtended.DOC_SECURITY_STRING));
}
+
+ @Test
+ public void testMultiThreaded() throws Exception {
+ //TIKA-3627
+ int numThreads = 5;
+ int numIterations = 5;
+ ParseContext[] parseContexts = new ParseContext[numThreads];
+ for (int i = 0; i < parseContexts.length; i++) {
+ parseContexts[i] = new ParseContext();
+ }
+ Set<String> extensions = new HashSet<>();
+ extensions.add(".pptx");
+ extensions.add(".docx");
+ extensions.add(".xlsx");
+ extensions.add(".ppt");
+ extensions.add(".doc");
+ extensions.add(".xls");
+ RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER);
+ testMultiThreaded(wrapper, parseContexts, numThreads, numIterations, path -> {
+ String pathName = path.getName().toLowerCase(Locale.ENGLISH);
+ int i = pathName.lastIndexOf(".");
+ String ext = "";
+ if (i > -1) {
+ ext = pathName.substring(i);
+ }
+ if (extensions.contains(ext)) {
+ return true;
+ } else {
+ return false;
+ }
+ });
+
+ }
}