You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2019/08/05 13:44:51 UTC
[nutch] branch master updated: NUTCH-2702 Fetcher: suppress stack
for frequent exceptions - exceptions listed in
http.log.exceptions.suppress.stack are logged without stack trace -
exclusions are checked when errors are logged in FetcherThread and
HttpBase.getResponse(...)
This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new f02c41a NUTCH-2702 Fetcher: suppress stack for frequent exceptions - exceptions listed in http.log.exceptions.suppress.stack are logged without stack trace - exclusions are checked when errors are logged in FetcherThread and HttpBase.getResponse(...)
f02c41a is described below
commit f02c41a35f5bb6c13d3019e7f8adac177c29319b
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Tue Mar 26 17:21:28 2019 +0100
NUTCH-2702 Fetcher: suppress stack for frequent exceptions
- exceptions listed in http.log.exceptions.suppress.stack
are logged without stack trace
- exclusions are checked when errors are logged
in FetcherThread and HttpBase.getResponse(...)
---
conf/nutch-default.xml | 11 +++
.../org/apache/nutch/fetcher/FetcherThread.java | 15 +++-
.../nutch/net/protocols/ProtocolLogUtil.java | 83 ++++++++++++++++++++++
.../apache/nutch/net/protocols/package-info.java | 2 +-
.../apache/nutch/protocol/http/api/HttpBase.java | 16 ++++-
5 files changed, 124 insertions(+), 3 deletions(-)
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index e88991c..41a337a 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -997,6 +997,17 @@
</property>
<property>
+ <name>http.log.exceptions.suppress.stack</name>
+ <value>java.net.UnknownHostException,java.net.NoRouteToHostException</value>
+ <description>Comma-separated list of exceptions not shown with full
+ stack trace in logs of fetcher and HTTP protocol implementations.
+ The logs may shrink in size significantly, e.g., when for a large
+ unrestricted web crawl unknown hosts are logged briefly without full
+ stack trace. The full class name of the exception class (extending
+ Throwable) including the package path must be specified.</description>
+</property>
+
+<property>
<name>fetcher.parse</name>
<value>false</value>
<description>If true, fetcher will parse content. Default is false, which means
diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index 395a141..e52b9ea 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -43,6 +43,7 @@ import org.apache.nutch.net.URLExemptionFilters;
import org.apache.nutch.net.URLFilterException;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.net.protocols.ProtocolLogUtil;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
@@ -145,6 +146,8 @@ public class FetcherThread extends Thread {
private FetcherThreadPublisher publisher;
private boolean activatePublisher;
+ private ProtocolLogUtil logUtil = new ProtocolLogUtil();
+
public FetcherThread(Configuration conf, AtomicInteger activeThreads, FetchItemQueues fetchQueues,
QueueFeeder feeder, AtomicInteger spinWaiting, AtomicLong lastRequestStart, FetcherRun.Context context,
AtomicInteger errors, String segmentName, boolean parsing, boolean storingContent,
@@ -174,6 +177,8 @@ public class FetcherThread extends Thread {
this.pages = pages;
this.bytes = bytes;
+ this.logUtil.setConf(conf);
+
// NUTCH-2413 Apply filters and normalizers on outlinks
// when parsing only if configured
if (parsing) {
@@ -457,7 +462,15 @@ public class FetcherThread extends Thread {
} catch (Throwable t) { // unexpected exception
// unblock
((FetchItemQueues) fetchQueues).finishFetchItem(fit);
- logError(fit.url, StringUtils.stringifyException(t));
+ String message;
+ if (LOG.isDebugEnabled()) {
+ message = StringUtils.stringifyException(t);
+ } else if (logUtil.logShort(t)) {
+ message = t.getClass().getName();
+ } else {
+ message = StringUtils.stringifyException(t);
+ }
+ logError(fit.url, message);
output(fit.url, fit.datum, null, ProtocolStatus.STATUS_FAILED,
CrawlDatum.STATUS_FETCH_RETRY);
}
diff --git a/src/java/org/apache/nutch/net/protocols/ProtocolLogUtil.java b/src/java/org/apache/nutch/net/protocols/ProtocolLogUtil.java
new file mode 100644
index 0000000..28d8894
--- /dev/null
+++ b/src/java/org/apache/nutch/net/protocols/ProtocolLogUtil.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net.protocols;
+
+import java.lang.invoke.MethodHandles;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.conf.Configuration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Helper deciding whether an exception is logged in short form, i.e. without
+ * its Java stack trace. The exception classes logged in short form are read
+ * from the configuration property
+ * <code>http.log.exceptions.suppress.stack</code> when {@link #setConf} is
+ * called.
+ */
+public class ProtocolLogUtil implements Configurable {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
+
+  /**
+   * Name of the property holding the list of exception classes which are
+   * logged without stack trace.
+   */
+  public static final String HTTP_LOG_SUPPRESSION = "http.log.exceptions.suppress.stack";
+
+  private Configuration config;
+
+  /**
+   * Set of exceptions logged in short form without full Java stack trace, see
+   * property <code>http.log.exceptions.suppress.stack</code>.
+   */
+  private final Set<Class<? extends Throwable>> exceptionsLogShort = new HashSet<>();
+
+  /**
+   * @return the configuration passed to the most recent call of
+   *         {@link #setConf}, or null if it was never called
+   */
+  @Override
+  public Configuration getConf() {
+    return config;
+  }
+
+  /**
+   * Store the configuration and (re)build the set of exception classes logged
+   * without stack trace from the property
+   * <code>http.log.exceptions.suppress.stack</code>. If the property is not
+   * set, <code>java.net.UnknownHostException</code> and
+   * <code>java.net.NoRouteToHostException</code> are used. Class names which
+   * cannot be resolved or which do not extend {@link Throwable} are skipped
+   * with a warning.
+   *
+   * @param conf
+   *          configuration to read the property from; if null, no exception
+   *          is logged in short form
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    config = conf;
+    // forget classes registered by an earlier call so that the set always
+    // reflects the current configuration only
+    exceptionsLogShort.clear();
+    if (conf == null) {
+      return;
+    }
+    for (String exceptClassName : conf.getTrimmedStrings(HTTP_LOG_SUPPRESSION,
+        "java.net.UnknownHostException", "java.net.NoRouteToHostException")) {
+      Class<?> clazz = conf.getClassByNameOrNull(exceptClassName);
+      if (clazz == null) {
+        LOG.warn("Class {} configured for log stack suppression not found.",
+            exceptClassName);
+        continue;
+      }
+      if (!Throwable.class.isAssignableFrom(clazz)) {
+        LOG.warn(
+            "Class {} configured for log stack suppression does not extend Throwable.",
+            exceptClassName);
+        continue;
+      }
+      exceptionsLogShort.add(clazz.asSubclass(Throwable.class));
+    }
+  }
+
+  /**
+   * Return true if exception is configured to be logged as short message
+   * without stack trace, usually done for frequent exceptions with obvious
+   * reasons (e.g., UnknownHostException), configurable by
+   * <code>http.log.exceptions.suppress.stack</code>. Only the exact class of
+   * the exception is looked up, subclasses of a configured class are not
+   * matched.
+   *
+   * @param t
+   *          exception to be logged, may be null
+   * @return true if the class of <code>t</code> is one of the configured
+   *         classes, false otherwise (including when <code>t</code> is null)
+   */
+  public boolean logShort(Throwable t) {
+    return t != null && exceptionsLogShort.contains(t.getClass());
+  }
+
+}
diff --git a/src/java/org/apache/nutch/net/protocols/package-info.java b/src/java/org/apache/nutch/net/protocols/package-info.java
index 8823f5b..199e1e4 100644
--- a/src/java/org/apache/nutch/net/protocols/package-info.java
+++ b/src/java/org/apache/nutch/net/protocols/package-info.java
@@ -17,7 +17,7 @@
/**
* Helper classes related to the {@link org.apache.nutch.protocol.Protocol Protocol}
- * interface, sea also {@link org.apache.nutch.protocol}.
+ * interface, see also {@link org.apache.nutch.protocol}.
*/
package org.apache.nutch.net.protocols;
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
index 4b91f9c..d5bc0b7 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -37,6 +37,7 @@ import org.slf4j.LoggerFactory;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.protocols.ProtocolLogUtil;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
@@ -123,6 +124,12 @@ public abstract class HttpBase implements Protocol {
private Configuration conf = null;
/**
+ * Logging utility, used to suppress stack traces for common exceptions in a
+ * configurable way.
+ */
+ private ProtocolLogUtil logUtil = new ProtocolLogUtil();
+
+ /**
* MimeUtil for MIME type detection. Note (see NUTCH-2578): MimeUtil object is
* used concurrently by parallel fetcher threads, methods to detect MIME type
* must be thread-safe.
@@ -224,6 +231,8 @@ public abstract class HttpBase implements Protocol {
this.enableCookieHeader = conf.getBoolean("http.enable.cookie.header", true);
this.robots.setConf(conf);
+ this.logUtil.setConf(conf);
+
// NUTCH-1941: read list of alternating agent names
if (conf.getBoolean("http.agent.rotate", false)) {
String agentsFile = conf.get("http.agent.rotate.file", "agents.txt");
@@ -436,7 +445,12 @@ public abstract class HttpBase implements Protocol {
ProtocolStatus.EXCEPTION, "Http code=" + code + ", url=" + u));
}
} catch (Throwable e) {
- logger.error("Failed to get protocol output", e);
+ if (logger.isDebugEnabled() || !logUtil.logShort(e)) {
+ logger.error("Failed to get protocol output", e);
+ } else {
+ logger.error("Failed to get protocol output: {}",
+ e.getClass().getName());
+ }
return new ProtocolOutput(null, new ProtocolStatus(e));
}
}