You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/07/24 12:14:16 UTC

[tika] branch master updated: prep for 1.22 rc2

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new 22ff756  prep for 1.22 rc2
22ff756 is described below

commit 22ff7564f2641ba195f192d7c59e9df4a3a10747
Author: TALLISON <ta...@apache.org>
AuthorDate: Wed Jul 24 08:14:01 2019 -0400

    prep for 1.22 rc2
---
 .../java/org/apache/tika/utils/XMLReaderUtils.java |  5 ++++
 .../ooxml/xwpf/ml2006/Word2006MLParser.java        | 10 ++++---
 .../microsoft/xml/AbstractXML2003Parser.java       |  8 +++--
 .../parser/pkg/StreamingZipContainerDetector.java  |  5 +++-
 .../ooxml/xwpf/ml2006/Word2006MLParserTest.java    | 35 +++++++++++++++++++++-
 .../parser/microsoft/xml/XML2003ParserTest.java    | 35 +++++++++++++++++++++-
 6 files changed, 88 insertions(+), 10 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
index c711e86..f7d100f 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
@@ -520,7 +520,10 @@ public class XMLReaderUtils implements Serializable {
             if (builder != null) {
                 return builder;
             }
+            LOG.warn("Contention waiting for a DOMParser. "+
+                    "Consider increasing the XMLReaderUtils.POOL_SIZE");
             waiting++;
+
             if (waiting > 3000) {
                 //freshen the pool.  Something went very wrong...
                 setPoolSize(POOL_SIZE);
@@ -586,6 +589,8 @@ public class XMLReaderUtils implements Serializable {
             if (parser != null) {
                 return parser;
             }
+            LOG.warn("Contention waiting for a SAXParser. "+
+                    "Consider increasing the XMLReaderUtils.POOL_SIZE");
             waiting++;
             if (waiting > 3000) {
                 //freshen the pool.  Something went very wrong...
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParser.java
index 3ef99ca..36c82bc 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParser.java
@@ -59,14 +59,16 @@ public class Word2006MLParser extends AbstractOfficeParser {
 
         xhtml.startDocument();
         try {
-            XMLReaderUtils.parseSAX(
+            //need to get new SAXParser because
+            //an attachment might require another SAXParser
+            //mid-parse
+            XMLReaderUtils.getSAXParser().parse(
                     new CloseShieldInputStream(stream),
                     new OfflineContentHandler(new EmbeddedContentHandler(
-                            new Word2006MLDocHandler(xhtml, metadata, context))),
-                    context);
+                            new Word2006MLDocHandler(xhtml, metadata, context))));
         } catch (SAXException e) {
             throw new TikaException("XML parse error", e);
         }
-            xhtml.endDocument();
+        xhtml.endDocument();
     }
 }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java
index 7210d8c..53bf7ed 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java
@@ -98,11 +98,13 @@ public abstract class AbstractXML2003Parser extends AbstractParser {
 
         TaggedContentHandler tagged = new TaggedContentHandler(xhtml);
         try {
-            XMLReaderUtils.parseSAX(
+            //need to get new SAXParser because
+            //an attachment might require another SAXParser
+            //mid-parse
+            XMLReaderUtils.getSAXParser().parse(
                     new CloseShieldInputStream(stream),
                     new OfflineContentHandler(new EmbeddedContentHandler(
-                            getContentHandler(tagged, metadata, context))),
-                    context);
+                            getContentHandler(tagged, metadata, context))));
         } catch (SAXException e) {
             tagged.throwIfCauseOf(e);
             throw new TikaException("XML parse error", e);
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
index a9d135c..8de17de 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
@@ -37,6 +37,7 @@ import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.iwork.IWorkPackageParser;
+import org.apache.tika.sax.OfflineContentHandler;
 import org.apache.tika.utils.XMLReaderUtils;
 import org.xml.sax.Attributes;
 import org.xml.sax.SAXException;
@@ -207,7 +208,9 @@ public class StreamingZipContainerDetector extends ZipContainerDetectorBase impl
     public static MediaType parseOOXMLContentTypes(InputStream is) {
         ContentTypeHandler contentTypeHandler = new ContentTypeHandler();
         try {
-            XMLReaderUtils.parseSAX(is, contentTypeHandler, new ParseContext());
+            XMLReaderUtils.parseSAX(is,
+                    new OfflineContentHandler(contentTypeHandler),
+                    new ParseContext());
         } catch (SecurityException e) {
             throw e;
         } catch (Exception e) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParserTest.java
index 4e4bf7b..c21f287 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParserTest.java
@@ -19,21 +19,33 @@ package org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006;
 
 import static org.junit.Assert.assertEquals;
 
+import java.io.File;
+import java.io.FileFilter;
 import java.util.List;
 
+import org.apache.tika.MultiThreadedTikaTest;
 import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
 import org.apache.tika.metadata.OfficeOpenXMLCore;
 import org.apache.tika.metadata.OfficeOpenXMLExtended;
 import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.parser.microsoft.OfficeParserConfig;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.junit.AfterClass;
 import org.junit.Test;
 
 
-public class Word2006MLParserTest extends TikaTest {
+public class Word2006MLParserTest extends MultiThreadedTikaTest {
+
+    @AfterClass
+    public static void tearDown() throws TikaException {
+        XMLReaderUtils.setPoolSize(XMLReaderUtils.DEFAULT_POOL_SIZE);
+    }
 
     @Test
     public void basicTest() throws Exception {
@@ -167,5 +179,26 @@ public class Word2006MLParserTest extends TikaTest {
 
     }
 
+    @Test(timeout = 60000)
+    public void testMultiThreaded() throws Exception {
+        XMLReaderUtils.setPoolSize(4);
+        int numThreads = XMLReaderUtils.getPoolSize()*2;
+        ParseContext[] contexts = new ParseContext[numThreads];
+        for (int i = 0; i < contexts.length; i++) {
+            contexts[i] = new ParseContext();
+        }
+
+        testMultiThreaded(new AutoDetectParser(), contexts, numThreads, 2,
+                new FileFilter() {
+                    @Override
+                    public boolean accept(File pathname) {
+                        if (pathname.getName().equals("testWORD_2006ml.xml")) {
+                            return true;
+                        }
+                        return false;
+                    }
+                });
+
+    }
 
 }
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java
index 5cb0de9..a94055d 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java
@@ -18,19 +18,31 @@ package org.apache.tika.parser.microsoft.xml;
 
 import static org.junit.Assert.assertEquals;
 
+import java.io.File;
+import java.io.FileFilter;
 import java.util.Arrays;
 import java.util.List;
 
+import org.apache.tika.MultiThreadedTikaTest;
 import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
 import org.apache.tika.metadata.OfficeOpenXMLCore;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.junit.AfterClass;
 import org.junit.Test;
 
-public class XML2003ParserTest extends TikaTest {
+public class XML2003ParserTest extends MultiThreadedTikaTest {
+
+    @AfterClass
+    public static void tearDown() throws TikaException {
+        XMLReaderUtils.setPoolSize(XMLReaderUtils.DEFAULT_POOL_SIZE);
+    }
 
     @Test
     public void testBasicWord() throws Exception {
@@ -107,4 +119,25 @@ public class XML2003ParserTest extends TikaTest {
 
     }
 
+    @Test(timeout = 60000)
+    public void testMultiThreaded() throws Exception {
+        XMLReaderUtils.setPoolSize(4);
+        int numThreads = XMLReaderUtils.getPoolSize()*2;
+        ParseContext[] contexts = new ParseContext[numThreads];
+        for (int i = 0; i < contexts.length; i++) {
+            contexts[i] = new ParseContext();
+        }
+
+        testMultiThreaded(new AutoDetectParser(), contexts, numThreads, 2,
+                new FileFilter() {
+                    @Override
+                    public boolean accept(File pathname) {
+                        if (pathname.getName().equals("testWORD2003.xml")) {
+                            return true;
+                        }
+                        return false;
+                    }
+                });
+
+    }
 }