You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2020/11/18 11:27:25 UTC

[nutch] branch master updated: NUTCH-2582 Set pool size of XML SAX parsers used for MIME detection in Tika - add method in MimeUtil to set MimeTypesReader pool size - actually adjust pool size to number of Fetcher threads / 2 (minimum pool size is 10 in case there are less than 20 Fetcher threads) - double pool size (10 -> 20) of Tika XMLReaderUtils in tika-config.xml

This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new 975452f  NUTCH-2582 Set pool size of XML SAX parsers used for MIME detection in Tika - add method in MimeUtil to set MimeTypesReader pool size - actually adjust pool size to number of Fetcher threads / 2   (minimum pool size is 10 in case there are less than 20 Fetcher threads) - double pool size (10 -> 20) of Tika XMLReaderUtils in tika-config.xml
     new c1cf6bb  Merge pull request #554 from sebastian-nagel/NUTCH-2582-set-mime-types-reader-pool-size
975452f is described below

commit 975452f7ac0b60f04d79b10477a4744dfe1aa673
Author: Sebastian Nagel <se...@commoncrawl.org>
AuthorDate: Fri Oct 16 23:10:03 2020 +0200

    NUTCH-2582 Set pool size of XML SAX parsers used for MIME detection in Tika
    - add method in MimeUtil to set MimeTypesReader pool size
    - actually adjust pool size to number of Fetcher threads / 2
      (minimum pool size is 10 in case there are less than 20 Fetcher threads)
    - double pool size (10 -> 20) of Tika XMLReaderUtils in tika-config.xml
---
 conf/tika-config.xml.template                  |  7 +++++++
 src/java/org/apache/nutch/fetcher/Fetcher.java |  4 ++++
 src/java/org/apache/nutch/util/MimeUtil.java   | 28 ++++++++++++++------------
 3 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/conf/tika-config.xml.template b/conf/tika-config.xml.template
index 571a606..35f635e 100644
--- a/conf/tika-config.xml.template
+++ b/conf/tika-config.xml.template
@@ -17,4 +17,11 @@
 -->
 <properties>
     <service-loader initializableProblemHandler="ignore" loadErrorHandler="warn" />
+    <!--
+        Set the pool size of SAX parsers to a higher value if the fetcher
+        is parsing with many threads and Tika complains about "Consider
+        increasing the XMLReaderUtils.POOL_SIZE".  Tika's default pool
+        size is 10.  Cf. NUTCH-2578, TIKA-2645, NUTCH-2582.
+    -->
+    <xml-reader-utils poolSize="20" />
 </properties>
diff --git a/src/java/org/apache/nutch/fetcher/Fetcher.java b/src/java/org/apache/nutch/fetcher/Fetcher.java
index 687411e..6d4c195 100644
--- a/src/java/org/apache/nutch/fetcher/Fetcher.java
+++ b/src/java/org/apache/nutch/fetcher/Fetcher.java
@@ -46,6 +46,7 @@ import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.NutchWritable;
 import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.util.MimeUtil;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
 import org.apache.nutch.util.NutchTool;
@@ -201,6 +202,9 @@ public class Fetcher extends NutchTool implements Tool {
       int threadCount = conf.getInt("fetcher.threads.fetch", 10);
       LOG.info("Fetcher: threads: {}", threadCount);
 
+      // NUTCH-2582: size Tika's MIME detector pool to half the fetcher threads (min. 10)
+      MimeUtil.setPoolSize(Math.max(10, threadCount / 2));
+
       int timeoutDivisor = conf.getInt("fetcher.threads.timeout.divisor", 2);
       LOG.info("Fetcher: time-out divisor: {}", timeoutDivisor);
 
diff --git a/src/java/org/apache/nutch/util/MimeUtil.java b/src/java/org/apache/nutch/util/MimeUtil.java
index 17bb380..2cc0d14 100644
--- a/src/java/org/apache/nutch/util/MimeUtil.java
+++ b/src/java/org/apache/nutch/util/MimeUtil.java
@@ -22,30 +22,24 @@ import java.io.InputStream;
 import java.lang.invoke.MethodHandles;
 
 import org.apache.hadoop.conf.Configuration;
-
+import org.apache.nutch.protocol.ProtocolOutput;
 import org.apache.tika.Tika;
+import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MimeType;
 import org.apache.tika.mime.MimeTypeException;
 import org.apache.tika.mime.MimeTypes;
 import org.apache.tika.mime.MimeTypesFactory;
-
+import org.apache.tika.mime.MimeTypesReader;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.nutch.protocol.ProtocolOutput;
-
 /**
- * @author mattmann
- * @since NUTCH-608
- * 
- *        <p>
- *        This is a facade class to insulate Nutch from its underlying Mime Type
- *        substrate library, <a href="http://incubator.apache.org/tika/">Apache
- *        Tika</a>. Any mime handling code should be placed in this utility
- *        class, and hidden from the Nutch classes that rely on it.
- *        </p>
+ * This is a facade class to insulate Nutch from its underlying MIME type
+ * substrate library, <a href="https://tika.apache.org/">Apache Tika</a>. Any
+ * MIME handling code should be placed in this utility class, and hidden from
+ * the Nutch classes that rely on it.
  */
 public final class MimeUtil {
 
@@ -64,6 +58,14 @@ public final class MimeUtil {
   private static final Logger LOG = LoggerFactory
       .getLogger(MethodHandles.lookup().lookupClass());
 
+  public static void setPoolSize(int poolSize) {
+    try {
+      MimeTypesReader.setPoolSize(poolSize);
+    } catch (TikaException e) {
+      LOG.error("Failed to set pool size", e);
+    }
+  }
+
   public MimeUtil(Configuration conf) {
     ObjectCache objectCache = ObjectCache.get(conf);
     tika = (Tika) objectCache.getObject(Tika.class.getName());