You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2020/11/18 11:27:25 UTC
[nutch] branch master updated: NUTCH-2582 Set pool size of XML SAX
parsers used for MIME detection in Tika - add method in MimeUtil to set
MimeTypesReader pool size - actually adjust pool size to number of Fetcher
threads / 2 (minimum pool size is 10 in case there are fewer than 20 Fetcher
threads) - double pool size (10 -> 20) of Tika XMLReaderUtils in
tika-config.xml
This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 975452f NUTCH-2582 Set pool size of XML SAX parsers used for MIME detection in Tika - add method in MimeUtil to set MimeTypesReader pool size - actually adjust pool size to number of Fetcher threads / 2 (minimum pool size is 10 in case there are fewer than 20 Fetcher threads) - double pool size (10 -> 20) of Tika XMLReaderUtils in tika-config.xml
new c1cf6bb Merge pull request #554 from sebastian-nagel/NUTCH-2582-set-mime-types-reader-pool-size
975452f is described below
commit 975452f7ac0b60f04d79b10477a4744dfe1aa673
Author: Sebastian Nagel <se...@commoncrawl.org>
AuthorDate: Fri Oct 16 23:10:03 2020 +0200
NUTCH-2582 Set pool size of XML SAX parsers used for MIME detection in Tika
- add method in MimeUtil to set MimeTypesReader pool size
- actually adjust pool size to number of Fetcher threads / 2
(minimum pool size is 10 in case there are fewer than 20 Fetcher threads)
- double pool size (10 -> 20) of Tika XMLReaderUtils in tika-config.xml
---
conf/tika-config.xml.template | 7 +++++++
src/java/org/apache/nutch/fetcher/Fetcher.java | 4 ++++
src/java/org/apache/nutch/util/MimeUtil.java | 28 ++++++++++++++------------
3 files changed, 26 insertions(+), 13 deletions(-)
diff --git a/conf/tika-config.xml.template b/conf/tika-config.xml.template
index 571a606..35f635e 100644
--- a/conf/tika-config.xml.template
+++ b/conf/tika-config.xml.template
@@ -17,4 +17,11 @@
-->
<properties>
<service-loader initializableProblemHandler="ignore" loadErrorHandler="warn" />
+ <!--
+ Set pool size of SAX parsers to a higher value if fetcher is
+ parsing with many threads and Tika complains about "Consider
+ increasing the XMLReaderUtils.POOL_SIZE". Tika's default pool
+ size is 10. Cf. NUTCH-2578, TIKA-2645, NUTCH-2582.
+ -->
+ <xml-reader-utils poolSize="20" />
</properties>
diff --git a/src/java/org/apache/nutch/fetcher/Fetcher.java b/src/java/org/apache/nutch/fetcher/Fetcher.java
index 687411e..6d4c195 100644
--- a/src/java/org/apache/nutch/fetcher/Fetcher.java
+++ b/src/java/org/apache/nutch/fetcher/Fetcher.java
@@ -46,6 +46,7 @@ import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.util.MimeUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
@@ -201,6 +202,9 @@ public class Fetcher extends NutchTool implements Tool {
int threadCount = conf.getInt("fetcher.threads.fetch", 10);
LOG.info("Fetcher: threads: {}", threadCount);
+ // NUTCH-2582: adapt Tika MIME detector pool size to thread count
+ MimeUtil.setPoolSize(Math.max(10, threadCount / 2));
+
int timeoutDivisor = conf.getInt("fetcher.threads.timeout.divisor", 2);
LOG.info("Fetcher: time-out divisor: {}", timeoutDivisor);
diff --git a/src/java/org/apache/nutch/util/MimeUtil.java b/src/java/org/apache/nutch/util/MimeUtil.java
index 17bb380..2cc0d14 100644
--- a/src/java/org/apache/nutch/util/MimeUtil.java
+++ b/src/java/org/apache/nutch/util/MimeUtil.java
@@ -22,30 +22,24 @@ import java.io.InputStream;
import java.lang.invoke.MethodHandles;
import org.apache.hadoop.conf.Configuration;
-
+import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.tika.Tika;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.mime.MimeTypesFactory;
-
+import org.apache.tika.mime.MimeTypesReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import org.apache.nutch.protocol.ProtocolOutput;
-
/**
- * @author mattmann
- * @since NUTCH-608
- *
- * <p>
- * This is a facade class to insulate Nutch from its underlying Mime Type
- * substrate library, <a href="http://incubator.apache.org/tika/">Apache
- * Tika</a>. Any mime handling code should be placed in this utility
- * class, and hidden from the Nutch classes that rely on it.
- * </p>
+ * This is a facade class to insulate Nutch from its underlying Mime Type
+ * substrate library, <a href="https://tika.apache.org/">Apache Tika</a>. Any
+ * Mime handling code should be placed in this utility class, and hidden from
+ * the Nutch classes that rely on it.
*/
public final class MimeUtil {
@@ -64,6 +58,14 @@ public final class MimeUtil {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
+ public static void setPoolSize(int poolSize) {
+ try {
+ MimeTypesReader.setPoolSize(poolSize);
+ } catch (TikaException e) {
+ LOG.error("Failed to set pool size", e);
+ }
+ }
+
public MimeUtil(Configuration conf) {
ObjectCache objectCache = ObjectCache.get(conf);
tika = (Tika) objectCache.getObject(Tika.class.getName());