You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@accumulo.apache.org by bu...@apache.org on 2014/01/23 08:36:29 UTC

[11/23] git commit: ACCUMULO-2225 handle IllegalArgumentExceptions from Hadoop on host lookup failure.

ACCUMULO-2225 handle IllegalArgumentExceptions from Hadoop on host lookup failure.

Finds the places where we handle IOExceptions from Hadoop specially, and applies equivalent handling to UnknownHostExceptions that Hadoop has wrapped in IllegalArgumentExceptions.


Project: http://git-wip-us.apache.org/repos/asf/accumulo/repo
Commit: http://git-wip-us.apache.org/repos/asf/accumulo/commit/f778dcf5
Tree: http://git-wip-us.apache.org/repos/asf/accumulo/tree/f778dcf5
Diff: http://git-wip-us.apache.org/repos/asf/accumulo/diff/f778dcf5

Branch: refs/heads/1.6.0-SNAPSHOT
Commit: f778dcf50cd7163722f63471d1585333704e639c
Parents: f42ead0
Author: Sean Busbey <bu...@cloudera.com>
Authored: Tue Jan 21 08:26:16 2014 -0600
Committer: Sean Busbey <bu...@cloudera.com>
Committed: Wed Jan 22 23:12:29 2014 -0600

----------------------------------------------------------------------
 .../accumulo/core/client/ZooKeeperInstance.java | 10 ++++++-
 .../org/apache/accumulo/server/Accumulo.java    | 28 +++++++++++++++++---
 .../server/master/tableOps/DeleteTable.java     |  8 ++++++
 .../accumulo/server/tabletserver/Compactor.java |  2 ++
 .../randomwalk/security/SecurityHelper.java     | 11 ++++++--
 .../accumulo/server/util/TabletOperations.java  | 10 ++++++-
 6 files changed, 62 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/accumulo/blob/f778dcf5/src/core/src/main/java/org/apache/accumulo/core/client/ZooKeeperInstance.java
----------------------------------------------------------------------
diff --git a/src/core/src/main/java/org/apache/accumulo/core/client/ZooKeeperInstance.java b/src/core/src/main/java/org/apache/accumulo/core/client/ZooKeeperInstance.java
index e02c197..05982e4 100644
--- a/src/core/src/main/java/org/apache/accumulo/core/client/ZooKeeperInstance.java
+++ b/src/core/src/main/java/org/apache/accumulo/core/client/ZooKeeperInstance.java
@@ -18,6 +18,7 @@ package org.apache.accumulo.core.client;
 
 import java.io.FileNotFoundException;
 import java.io.IOException;
+import java.net.UnknownHostException;
 import java.nio.ByteBuffer;
 import java.util.Collections;
 import java.util.List;
@@ -276,7 +277,14 @@ public class ZooKeeperInstance implements Instance {
         return result;
       }
     } catch (IOException e) {
-      throw new RuntimeException("Accumulo not initialized, there is no instance id at " + instanceDirectory, e);
+      log.error("Problem reading instance id out of hdfs at " + instanceDirectory, e);
+      throw new RuntimeException("Can't tell if Accumulo is initialized; can't read instance id at " + instanceDirectory, e);
+    } catch (IllegalArgumentException exception) {
+      /* HDFS throws this when there's an UnknownHostException due to DNS troubles. */
+      if (exception.getCause() instanceof UnknownHostException) {
+        log.error("Problem reading instance id out of hdfs at " + instanceDirectory, exception);
+      }
+      throw exception;
     }
   }
 

http://git-wip-us.apache.org/repos/asf/accumulo/blob/f778dcf5/src/server/src/main/java/org/apache/accumulo/server/Accumulo.java
----------------------------------------------------------------------
diff --git a/src/server/src/main/java/org/apache/accumulo/server/Accumulo.java b/src/server/src/main/java/org/apache/accumulo/server/Accumulo.java
index 253962b..184692c 100644
--- a/src/server/src/main/java/org/apache/accumulo/server/Accumulo.java
+++ b/src/server/src/main/java/org/apache/accumulo/server/Accumulo.java
@@ -30,6 +30,7 @@ import java.util.TreeMap;
 import org.apache.accumulo.core.Constants;
 import org.apache.accumulo.core.file.FileUtil;
 import org.apache.accumulo.core.trace.DistributedTrace;
+import org.apache.accumulo.core.util.AddressUtil;
 import org.apache.accumulo.core.util.CachedConfiguration;
 import org.apache.accumulo.core.util.UtilWaitThread;
 import org.apache.accumulo.core.util.Version;
@@ -209,6 +210,7 @@ public class Accumulo {
     }
     log.info("Zookeeper connected and initialized, attemping to talk to HDFS");
     long sleep = 1000;
+    int unknownHostTries = 3;
     while (true) {
       try {
         FileSystem fs = FileSystem.get(CachedConfiguration.getInstance());
@@ -216,10 +218,26 @@ public class Accumulo {
           break;
         log.warn("Waiting for the NameNode to leave safemode");
       } catch (IOException ex) {
-        log.warn("Unable to connect to HDFS");
+        log.warn("Unable to connect to HDFS", ex);
+      } catch (IllegalArgumentException exception) {
+        /* Unwrap the UnknownHostException so we can deal with it directly */
+        if (exception.getCause() instanceof UnknownHostException) {
+          if (unknownHostTries > 0) {
+            log.warn("Unable to connect to HDFS, will retry. cause: " + exception.getCause());
+            /* We need to make sure our sleep period is long enough to avoid getting a cached failure of the host lookup. */
+            sleep = Math.max(sleep, (AddressUtil.getAddressCacheNegativeTtl((UnknownHostException)(exception.getCause()))+1)*1000);
+          } else {
+            log.error("Unable to connect to HDFS and have exceeded max number of retries.", exception);
+            throw exception;
+          }
+          unknownHostTries--;
+        } else {
+          throw exception;
+        }
       }
-      log.info("Sleeping " + sleep / 1000. + " seconds");
+      log.info("Backing off due to failure; current sleep period is " + sleep / 1000. + " seconds");
       UtilWaitThread.sleep(sleep);
+      /* Back off to give transient failures more time to clear. */
       sleep = Math.min(60 * 1000, sleep * 2);
     }
     log.info("Connected to HDFS");
@@ -228,6 +246,7 @@ public class Accumulo {
   private static boolean isInSafeMode(FileSystem fs) throws IOException {
     if (!(fs instanceof DistributedFileSystem))
       return false;
+    /* Might throw an IllegalArgumentException wrapping UnknownHostException, dealt with above. */
     DistributedFileSystem dfs = (DistributedFileSystem) FileSystem.get(CachedConfiguration.getInstance());
     // So this:  if (!dfs.setSafeMode(SafeModeAction.SAFEMODE_GET))
     // Becomes this:
@@ -265,8 +284,11 @@ public class Accumulo {
     try {
       Method setSafeMode = dfs.getClass().getMethod("setSafeMode", safeModeAction);
       return (Boolean)setSafeMode.invoke(dfs, get);
+    } catch (IllegalArgumentException exception) {
+      /* Send IAEs back as-is, so that those that wrap UnknownHostException can be handled in the same place as similar sources of failure. */
+      throw exception;
     } catch (Exception ex) {
-      throw new RuntimeException("cannot find method setSafeMode");
+      throw new RuntimeException("cannot find method setSafeMode", ex);
     }
   }
 }

http://git-wip-us.apache.org/repos/asf/accumulo/blob/f778dcf5/src/server/src/main/java/org/apache/accumulo/server/master/tableOps/DeleteTable.java
----------------------------------------------------------------------
diff --git a/src/server/src/main/java/org/apache/accumulo/server/master/tableOps/DeleteTable.java b/src/server/src/main/java/org/apache/accumulo/server/master/tableOps/DeleteTable.java
index 1c4d4ad..5ada93a 100644
--- a/src/server/src/main/java/org/apache/accumulo/server/master/tableOps/DeleteTable.java
+++ b/src/server/src/main/java/org/apache/accumulo/server/master/tableOps/DeleteTable.java
@@ -17,6 +17,7 @@
 package org.apache.accumulo.server.master.tableOps;
 
 import java.io.IOException;
+import java.net.UnknownHostException;
 import java.util.Collections;
 import java.util.Map.Entry;
 
@@ -177,6 +178,13 @@ class CleanUp extends MasterRepo {
         fs.delete(new Path(ServerConstants.getTablesDir(), tableId), true);
       } catch (IOException e) {
         log.error("Unable to remove deleted table directory", e);
+      } catch (IllegalArgumentException exception) {
+        if (exception.getCause() instanceof UnknownHostException) {
+          /* Thrown if HDFS encounters a DNS problem in some edge cases */
+          log.error("Unable to remove deleted table directory", exception);
+        } else {
+          throw exception;
+        }
       }
     }
     

http://git-wip-us.apache.org/repos/asf/accumulo/blob/f778dcf5/src/server/src/main/java/org/apache/accumulo/server/tabletserver/Compactor.java
----------------------------------------------------------------------
diff --git a/src/server/src/main/java/org/apache/accumulo/server/tabletserver/Compactor.java b/src/server/src/main/java/org/apache/accumulo/server/tabletserver/Compactor.java
index 29b8455..24254c8 100644
--- a/src/server/src/main/java/org/apache/accumulo/server/tabletserver/Compactor.java
+++ b/src/server/src/main/java/org/apache/accumulo/server/tabletserver/Compactor.java
@@ -180,6 +180,8 @@ public class Compactor implements Callable<CompactionStats> {
         }
       } catch (IOException e) {
         log.warn(e, e);
+      } catch (RuntimeException exception) {
+        log.warn(exception, exception);
       }
     }
   }

http://git-wip-us.apache.org/repos/asf/accumulo/blob/f778dcf5/src/server/src/main/java/org/apache/accumulo/server/test/randomwalk/security/SecurityHelper.java
----------------------------------------------------------------------
diff --git a/src/server/src/main/java/org/apache/accumulo/server/test/randomwalk/security/SecurityHelper.java b/src/server/src/main/java/org/apache/accumulo/server/test/randomwalk/security/SecurityHelper.java
index c8d1ea0..a4f715d 100644
--- a/src/server/src/main/java/org/apache/accumulo/server/test/randomwalk/security/SecurityHelper.java
+++ b/src/server/src/main/java/org/apache/accumulo/server/test/randomwalk/security/SecurityHelper.java
@@ -20,6 +20,7 @@
 package org.apache.accumulo.server.test.randomwalk.security;
 
 import java.io.IOException;
+import java.net.UnknownHostException;
 import java.util.Map;
 
 import org.apache.accumulo.core.client.Connector;
@@ -192,8 +193,14 @@ public class SecurityHelper {
       try {
         fs = FileSystem.get(CachedConfiguration.getInstance());
       } catch (IOException e) {
-        // TODO Auto-generated catch block
-        e.printStackTrace();
+        log.error("problem getting default file system.", e);
+      } catch (IllegalArgumentException exception) {
+        /* Hadoop throws a wrapped UnknownHostException in some edge cases of DNS trouble */
+        if (exception.getCause() instanceof UnknownHostException) {
+          log.error("problem getting default file system.", exception);
+        } else {
+          throw exception;
+        }
       }
       state.set(filesystem, fs);
     }

http://git-wip-us.apache.org/repos/asf/accumulo/blob/f778dcf5/src/server/src/main/java/org/apache/accumulo/server/util/TabletOperations.java
----------------------------------------------------------------------
diff --git a/src/server/src/main/java/org/apache/accumulo/server/util/TabletOperations.java b/src/server/src/main/java/org/apache/accumulo/server/util/TabletOperations.java
index 4a39050..d1c8425 100644
--- a/src/server/src/main/java/org/apache/accumulo/server/util/TabletOperations.java
+++ b/src/server/src/main/java/org/apache/accumulo/server/util/TabletOperations.java
@@ -17,6 +17,7 @@
 package org.apache.accumulo.server.util;
 
 import java.io.IOException;
+import java.net.UnknownHostException;
 
 import org.apache.accumulo.core.Constants;
 import org.apache.accumulo.core.util.CachedConfiguration;
@@ -68,7 +69,14 @@ public class TabletOperations {
         FileSystem fs = FileSystem.get(CachedConfiguration.getInstance());
         return createTabletDirectory(fs, tableDir, endRow);
       } catch (IOException e) {
-        log.warn(e);
+        log.warn("problem creating tablet directory", e);
+      } catch (IllegalArgumentException exception) {
+        /* thrown in some edge cases of DNS failure by Hadoop instead of UnknownHostException */
+        if (exception.getCause() instanceof UnknownHostException) {
+          log.warn("problem creating tablet directory", exception);
+        } else {
+          throw exception;
+        }
       }
       UtilWaitThread.sleep(3000);
     }