You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/07/24 12:14:16 UTC
[tika] branch master updated: prep for 1.22 rc2
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 22ff756 prep for 1.22 rc2
22ff756 is described below
commit 22ff7564f2641ba195f192d7c59e9df4a3a10747
Author: TALLISON <ta...@apache.org>
AuthorDate: Wed Jul 24 08:14:01 2019 -0400
prep for 1.22 rc2
---
.../java/org/apache/tika/utils/XMLReaderUtils.java | 5 ++++
.../ooxml/xwpf/ml2006/Word2006MLParser.java | 10 ++++---
.../microsoft/xml/AbstractXML2003Parser.java | 8 +++--
.../parser/pkg/StreamingZipContainerDetector.java | 5 +++-
.../ooxml/xwpf/ml2006/Word2006MLParserTest.java | 35 +++++++++++++++++++++-
.../parser/microsoft/xml/XML2003ParserTest.java | 35 +++++++++++++++++++++-
6 files changed, 88 insertions(+), 10 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
index c711e86..f7d100f 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
@@ -520,7 +520,10 @@ public class XMLReaderUtils implements Serializable {
if (builder != null) {
return builder;
}
+ LOG.warn("Contention waiting for a DOMParser. "+
+ "Consider increasing the XMLReaderUtils.POOL_SIZE");
waiting++;
+
if (waiting > 3000) {
//freshen the pool. Something went very wrong...
setPoolSize(POOL_SIZE);
@@ -586,6 +589,8 @@ public class XMLReaderUtils implements Serializable {
if (parser != null) {
return parser;
}
+ LOG.warn("Contention waiting for a SAXParser. "+
+ "Consider increasing the XMLReaderUtils.POOL_SIZE");
waiting++;
if (waiting > 3000) {
//freshen the pool. Something went very wrong...
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParser.java
index 3ef99ca..36c82bc 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParser.java
@@ -59,14 +59,16 @@ public class Word2006MLParser extends AbstractOfficeParser {
xhtml.startDocument();
try {
- XMLReaderUtils.parseSAX(
+ //need to get new SAXParser because
+ //an attachment might require another SAXParser
+ //mid-parse
+ XMLReaderUtils.getSAXParser().parse(
new CloseShieldInputStream(stream),
new OfflineContentHandler(new EmbeddedContentHandler(
- new Word2006MLDocHandler(xhtml, metadata, context))),
- context);
+ new Word2006MLDocHandler(xhtml, metadata, context))));
} catch (SAXException e) {
throw new TikaException("XML parse error", e);
}
- xhtml.endDocument();
+ xhtml.endDocument();
}
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java
index 7210d8c..53bf7ed 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java
@@ -98,11 +98,13 @@ public abstract class AbstractXML2003Parser extends AbstractParser {
TaggedContentHandler tagged = new TaggedContentHandler(xhtml);
try {
- XMLReaderUtils.parseSAX(
+ //need to get new SAXParser because
+ //an attachment might require another SAXParser
+ //mid-parse
+ XMLReaderUtils.getSAXParser().parse(
new CloseShieldInputStream(stream),
new OfflineContentHandler(new EmbeddedContentHandler(
- getContentHandler(tagged, metadata, context))),
- context);
+ getContentHandler(tagged, metadata, context))));
} catch (SAXException e) {
tagged.throwIfCauseOf(e);
throw new TikaException("XML parse error", e);
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
index a9d135c..8de17de 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
@@ -37,6 +37,7 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.iwork.IWorkPackageParser;
+import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.utils.XMLReaderUtils;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
@@ -207,7 +208,9 @@ public class StreamingZipContainerDetector extends ZipContainerDetectorBase impl
public static MediaType parseOOXMLContentTypes(InputStream is) {
ContentTypeHandler contentTypeHandler = new ContentTypeHandler();
try {
- XMLReaderUtils.parseSAX(is, contentTypeHandler, new ParseContext());
+ XMLReaderUtils.parseSAX(is,
+ new OfflineContentHandler(contentTypeHandler),
+ new ParseContext());
} catch (SecurityException e) {
throw e;
} catch (Exception e) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParserTest.java
index 4e4bf7b..c21f287 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParserTest.java
@@ -19,21 +19,33 @@ package org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006;
import static org.junit.Assert.assertEquals;
+import java.io.File;
+import java.io.FileFilter;
import java.util.List;
+import org.apache.tika.MultiThreadedTikaTest;
import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.junit.AfterClass;
import org.junit.Test;
-public class Word2006MLParserTest extends TikaTest {
+public class Word2006MLParserTest extends MultiThreadedTikaTest {
+
+ @AfterClass
+ public static void tearDown() throws TikaException {
+ XMLReaderUtils.setPoolSize(XMLReaderUtils.DEFAULT_POOL_SIZE);
+ }
@Test
public void basicTest() throws Exception {
@@ -167,5 +179,26 @@ public class Word2006MLParserTest extends TikaTest {
}
+ @Test(timeout = 60000)
+ public void testMultiThreaded() throws Exception {
+ XMLReaderUtils.setPoolSize(4);
+ int numThreads = XMLReaderUtils.getPoolSize()*2;
+ ParseContext[] contexts = new ParseContext[numThreads];
+ for (int i = 0; i < contexts.length; i++) {
+ contexts[i] = new ParseContext();
+ }
+
+ testMultiThreaded(new AutoDetectParser(), contexts, numThreads, 2,
+ new FileFilter() {
+ @Override
+ public boolean accept(File pathname) {
+ if (pathname.getName().equals("testWORD_2006ml.xml")) {
+ return true;
+ }
+ return false;
+ }
+ });
+
+ }
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java
index 5cb0de9..a94055d 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java
@@ -18,19 +18,31 @@ package org.apache.tika.parser.microsoft.xml;
import static org.junit.Assert.assertEquals;
+import java.io.File;
+import java.io.FileFilter;
import java.util.Arrays;
import java.util.List;
+import org.apache.tika.MultiThreadedTikaTest;
import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.junit.AfterClass;
import org.junit.Test;
-public class XML2003ParserTest extends TikaTest {
+public class XML2003ParserTest extends MultiThreadedTikaTest {
+
+ @AfterClass
+ public static void tearDown() throws TikaException {
+ XMLReaderUtils.setPoolSize(XMLReaderUtils.DEFAULT_POOL_SIZE);
+ }
@Test
public void testBasicWord() throws Exception {
@@ -107,4 +119,25 @@ public class XML2003ParserTest extends TikaTest {
}
+ @Test(timeout = 60000)
+ public void testMultiThreaded() throws Exception {
+ XMLReaderUtils.setPoolSize(4);
+ int numThreads = XMLReaderUtils.getPoolSize()*2;
+ ParseContext[] contexts = new ParseContext[numThreads];
+ for (int i = 0; i < contexts.length; i++) {
+ contexts[i] = new ParseContext();
+ }
+
+ testMultiThreaded(new AutoDetectParser(), contexts, numThreads, 2,
+ new FileFilter() {
+ @Override
+ public boolean accept(File pathname) {
+ if (pathname.getName().equals("testWORD2003.xml")) {
+ return true;
+ }
+ return false;
+ }
+ });
+
+ }
}