You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2019/08/05 13:44:51 UTC

[nutch] branch master updated: NUTCH-2702 Fetcher: suppress stack for frequent exceptions - exceptions listed in http.log.exceptions.suppress.stack are logged without stack trace - exclusions are checked when errors are logged in FetcherThread and HttpBase.getResponse(...)

This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new f02c41a  NUTCH-2702 Fetcher: suppress stack for frequent exceptions - exceptions listed in http.log.exceptions.suppress.stack   are logged without stack trace - exclusions are checked when errors are logged   in FetcherThread and HttpBase.getResponse(...)
f02c41a is described below

commit f02c41a35f5bb6c13d3019e7f8adac177c29319b
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Tue Mar 26 17:21:28 2019 +0100

    NUTCH-2702 Fetcher: suppress stack for frequent exceptions
    - exceptions listed in http.log.exceptions.suppress.stack
      are logged without stack trace
    - exclusions are checked when errors are logged
      in FetcherThread and HttpBase.getResponse(...)
---
 conf/nutch-default.xml                             | 11 +++
 .../org/apache/nutch/fetcher/FetcherThread.java    | 15 +++-
 .../nutch/net/protocols/ProtocolLogUtil.java       | 83 ++++++++++++++++++++++
 .../apache/nutch/net/protocols/package-info.java   |  2 +-
 .../apache/nutch/protocol/http/api/HttpBase.java   | 16 ++++-
 5 files changed, 124 insertions(+), 3 deletions(-)

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index e88991c..41a337a 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -997,6 +997,17 @@
 </property>
 
 <property>
+  <name>http.log.exceptions.suppress.stack</name>
+  <value>java.net.UnknownHostException,java.net.NoRouteToHostException</value>
+  <description>Comma-separated list of exceptions not shown with full
+  stack trace in logs of fetcher and HTTP protocol implementations.
+  The logs may shrink in size significantly, e.g., when for a large
+  unrestricted web crawl unknown hosts are logged briefly without full
+  stack trace.  The full class name of the exception class (extending
+  Throwable) including the package path must be specified.</description>
+</property>
+
+<property>
   <name>fetcher.parse</name>
   <value>false</value>
   <description>If true, fetcher will parse content. Default is false, which means
diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index 395a141..e52b9ea 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -43,6 +43,7 @@ import org.apache.nutch.net.URLExemptionFilters;
 import org.apache.nutch.net.URLFilterException;
 import org.apache.nutch.net.URLFilters;
 import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.net.protocols.ProtocolLogUtil;
 import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseData;
@@ -145,6 +146,8 @@ public class FetcherThread extends Thread {
   private FetcherThreadPublisher publisher;
   private boolean activatePublisher;
 
+  private ProtocolLogUtil logUtil = new ProtocolLogUtil();
+
   public FetcherThread(Configuration conf, AtomicInteger activeThreads, FetchItemQueues fetchQueues, 
       QueueFeeder feeder, AtomicInteger spinWaiting, AtomicLong lastRequestStart, FetcherRun.Context context,
       AtomicInteger errors, String segmentName, boolean parsing, boolean storingContent, 
@@ -174,6 +177,8 @@ public class FetcherThread extends Thread {
     this.pages = pages;
     this.bytes = bytes;
 
+    this.logUtil.setConf(conf);
+
     // NUTCH-2413 Apply filters and normalizers on outlinks
     // when parsing only if configured
     if (parsing) {
@@ -457,7 +462,15 @@ public class FetcherThread extends Thread {
         } catch (Throwable t) { // unexpected exception
           // unblock
           ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
-          logError(fit.url, StringUtils.stringifyException(t));
+          String message;
+          if (LOG.isDebugEnabled()) {
+            message = StringUtils.stringifyException(t);
+          } else if (logUtil.logShort(t)) {
+            message = t.getClass().getName();
+          } else {
+            message = StringUtils.stringifyException(t);
+          }
+          logError(fit.url, message);
           output(fit.url, fit.datum, null, ProtocolStatus.STATUS_FAILED,
               CrawlDatum.STATUS_FETCH_RETRY);
         }
diff --git a/src/java/org/apache/nutch/net/protocols/ProtocolLogUtil.java b/src/java/org/apache/nutch/net/protocols/ProtocolLogUtil.java
new file mode 100644
index 0000000..28d8894
--- /dev/null
+++ b/src/java/org/apache/nutch/net/protocols/ProtocolLogUtil.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net.protocols;
+
+import java.lang.invoke.MethodHandles;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.conf.Configuration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class ProtocolLogUtil implements Configurable {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
+
+  public static final String HTTP_LOG_SUPPRESSION = "http.log.exceptions.suppress.stack";
+
+  private Configuration config;
+
+  /**
+   * Set of exceptions logged briefly without full Java stack trace, see
+   * property <code>http.log.exceptions.suppress.stack</code>.
+   */
+  private Set<Class<? extends Throwable>> exceptionsLogShort = new HashSet<>();
+
+  @Override
+  public Configuration getConf() {
+    return config;
+  }
+
+  @Override
+  public void setConf(Configuration conf) {
+    config = conf;
+    for (String exceptClassName : conf.getTrimmedStrings(HTTP_LOG_SUPPRESSION,
+        "java.net.UnknownHostException", "java.net.NoRouteToHostException")) {
+      Class<?> clazz = conf.getClassByNameOrNull(exceptClassName);
+      if (clazz == null) {
+        LOG.warn("Class {} configured for log stack suppression not found.",
+            exceptClassName);
+        continue;
+      }
+      if (!Throwable.class.isAssignableFrom(clazz)) {
+        LOG.warn(
+            "Class {} configured for log stack suppression does not extend Throwable.",
+            exceptClassName);
+        continue;
+      }
+      exceptionsLogShort.add(clazz.asSubclass(Throwable.class));
+    }
+  }
+
+  /**
+   * Return true if the exception is configured to be logged as a short
+   * message without stack trace. This is usually done for frequent exceptions
+   * with obvious reasons (e.g., UnknownHostException); configurable by
+   * <code>http.log.exceptions.suppress.stack</code>.
+   */
+  public boolean logShort(Throwable t) {
+    if (exceptionsLogShort.contains(t.getClass())) {
+      return true;
+    }
+    return false;
+  }
+
+}
diff --git a/src/java/org/apache/nutch/net/protocols/package-info.java b/src/java/org/apache/nutch/net/protocols/package-info.java
index 8823f5b..199e1e4 100644
--- a/src/java/org/apache/nutch/net/protocols/package-info.java
+++ b/src/java/org/apache/nutch/net/protocols/package-info.java
@@ -17,7 +17,7 @@
 
 /**
  * Helper classes related to the {@link org.apache.nutch.protocol.Protocol Protocol}
- * interface, sea also {@link org.apache.nutch.protocol}.
+ * interface, see also {@link org.apache.nutch.protocol}.
  */
 package org.apache.nutch.net.protocols;
 
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
index 4b91f9c..d5bc0b7 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -37,6 +37,7 @@ import org.slf4j.LoggerFactory;
 
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.protocols.ProtocolLogUtil;
 import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
@@ -123,6 +124,12 @@ public abstract class HttpBase implements Protocol {
   private Configuration conf = null;
 
   /**
+   * Logging utility, used to suppress stack traces for common exceptions in a
+   * configurable way.
+   */
+  private ProtocolLogUtil logUtil = new ProtocolLogUtil();
+
+  /**
    * MimeUtil for MIME type detection. Note (see NUTCH-2578): MimeUtil object is
    * used concurrently by parallel fetcher threads, methods to detect MIME type
    * must be thread-safe.
@@ -224,6 +231,8 @@ public abstract class HttpBase implements Protocol {
     this.enableCookieHeader = conf.getBoolean("http.enable.cookie.header", true);
     this.robots.setConf(conf);
 
+    this.logUtil.setConf(conf);
+
     // NUTCH-1941: read list of alternating agent names
     if (conf.getBoolean("http.agent.rotate", false)) {
       String agentsFile = conf.get("http.agent.rotate.file", "agents.txt");
@@ -436,7 +445,12 @@ public abstract class HttpBase implements Protocol {
             ProtocolStatus.EXCEPTION, "Http code=" + code + ", url=" + u));
       }
     } catch (Throwable e) {
-      logger.error("Failed to get protocol output", e);
+      if (logger.isDebugEnabled() || !logUtil.logShort(e)) {
+        logger.error("Failed to get protocol output", e);
+      } else {
+        logger.error("Failed to get protocol output: {}",
+            e.getClass().getName());
+      }
       return new ProtocolOutput(null, new ProtocolStatus(e));
     }
   }