You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by te...@apache.org on 2013/01/16 00:18:56 UTC
svn commit: r1433733 - in
/hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase:
master/SplitLogManager.java zookeeper/RecoverableZooKeeper.java
Author: tedyu
Date: Tue Jan 15 23:18:56 2013
New Revision: 1433733
URL: http://svn.apache.org/viewvc?rev=1433733&view=rev
Log:
HBASE-6748 Endless recursive of deleteNode happened in SplitLogManager#DeleteAsyncCallback (Jeffrey Zhong)
Modified:
hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java
hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java
Modified: hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java?rev=1433733&r1=1433732&r2=1433733&view=diff
==============================================================================
--- hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java (original)
+++ hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java Tue Jan 15 23:18:56 2013
@@ -26,6 +26,7 @@ import static org.apache.hadoop.hbase.ma
import java.io.IOException;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
@@ -42,9 +43,9 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hbase.Chore;
-import org.apache.hadoop.hbase.SplitLogCounters;
import org.apache.hadoop.hbase.DeserializationException;
import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.SplitLogCounters;
import org.apache.hadoop.hbase.SplitLogTask;
import org.apache.hadoop.hbase.Stoppable;
import org.apache.hadoop.hbase.master.SplitLogManager.TaskFinisher.Status;
@@ -125,6 +126,8 @@ public class SplitLogManager extends Zoo
private volatile Set<ServerName> deadWorkers = null;
private final Object deadWorkersLock = new Object();
+ private Set<String> failedDeletions = null;
+
/**
* Wrapper around {@link #SplitLogManager(ZooKeeperWatcher zkw, Configuration conf,
* Stoppable stopper, MasterServices master, ServerName serverName, TaskFinisher tf)}
@@ -182,6 +185,8 @@ public class SplitLogManager extends Zoo
this.serverName = serverName;
this.timeoutMonitor =
new TimeoutMonitor(conf.getInt("hbase.splitlog.manager.timeoutmonitor.period", 1000), stopper);
+
+ this.failedDeletions = Collections.synchronizedSet(new HashSet<String>());
}
public void finishInitialization(boolean masterRecovery) {
@@ -436,11 +441,12 @@ public class SplitLogManager extends Zoo
}
}
}
- // delete the task node in zk. Keep trying indefinitely - its an async
+ // delete the task node in zk. It's an async
// call and no one is blocked waiting for this node to be deleted. All
// task names are unique (log.<timestamp>) there is no risk of deleting
// a future task.
- deleteNode(path, Long.MAX_VALUE);
+ // if a deletion fails, TimeoutMonitor will retry the same deletion later
+ deleteNode(path, zkretries);
return;
}
@@ -549,6 +555,21 @@ public class SplitLogManager extends Zoo
}
}
+ /**
+ * Checks whether a ZooKeeper AsyncCallback should abandon its retries; logs an error if so.
+ * @param statusCode integer value of a ZooKeeper exception code
+ * @param action description of the retried action; included in the logged error message
+ * @return true only when statusCode is SESSIONEXPIRED (abandon retries), otherwise false
+ */
+ private boolean needAbandonRetries(int statusCode, String action) {
+ if (statusCode == KeeperException.Code.SESSIONEXPIRED.intValue()) {
+ LOG.error("ZK session expired. Master is expected to shut down. Abandoning retries for "
+ + "action=" + action);
+ return true;
+ }
+ return false;
+ }
+
private void heartbeat(String path, int new_version, ServerName workerName) {
Task task = findOrCreateOrphanTask(path);
if (new_version != task.last_version) {
@@ -680,8 +701,7 @@ public class SplitLogManager extends Zoo
}
private void deleteNodeFailure(String path) {
- LOG.fatal("logic failure, failing to delete a node should never happen " +
- "because delete has infinite retries");
+ LOG.info("Failed to delete node " + path + " and will retry soon.");
return;
}
@@ -1023,6 +1043,16 @@ public class SplitLogManager extends Zoo
SplitLogCounters.tot_mgr_resubmit_unassigned.incrementAndGet();
LOG.debug("resubmitting unassigned task(s) after timeout");
}
+
+ // Retry previously failed deletes
+ if (failedDeletions.size() > 0) {
+ List<String> tmpPaths = new ArrayList<String>(failedDeletions);
+ for (String tmpPath : tmpPaths) {
+ // deleteNode is an async call
+ deleteNode(tmpPath, zkretries);
+ }
+ failedDeletions.removeAll(tmpPaths);
+ }
}
}
@@ -1037,6 +1067,10 @@ public class SplitLogManager extends Zoo
public void processResult(int rc, String path, Object ctx, String name) {
SplitLogCounters.tot_mgr_node_create_result.incrementAndGet();
if (rc != 0) {
+ if (needAbandonRetries(rc, "Create znode " + path)) {
+ createNodeFailure(path);
+ return;
+ }
if (rc == KeeperException.Code.NODEEXISTS.intValue()) {
// What if there is a delete pending against this pre-existing
// znode? Then this soon-to-be-deleted task znode must be in TASK_DONE
@@ -1076,8 +1110,7 @@ public class SplitLogManager extends Zoo
Stat stat) {
SplitLogCounters.tot_mgr_get_data_result.incrementAndGet();
if (rc != 0) {
- if (rc == KeeperException.Code.SESSIONEXPIRED.intValue()) {
- LOG.error("ZK session expired. Master is expected to shut down. Abandoning retries.");
+ if (needAbandonRetries(rc, "GetData from znode " + path)) {
return;
}
if (rc == KeeperException.Code.NONODE.intValue()) {
@@ -1131,6 +1164,10 @@ public class SplitLogManager extends Zoo
public void processResult(int rc, String path, Object ctx) {
SplitLogCounters.tot_mgr_node_delete_result.incrementAndGet();
if (rc != 0) {
+ if (needAbandonRetries(rc, "Delete znode " + path)) {
+ failedDeletions.add(path);
+ return;
+ }
if (rc != KeeperException.Code.NONODE.intValue()) {
SplitLogCounters.tot_mgr_node_delete_err.incrementAndGet();
Long retry_count = (Long) ctx;
@@ -1138,6 +1175,7 @@ public class SplitLogManager extends Zoo
path + " remaining retries=" + retry_count);
if (retry_count == 0) {
LOG.warn("delete failed " + path);
+ failedDeletions.add(path);
deleteNodeFailure(path);
} else {
deleteNode(path, retry_count - 1);
@@ -1169,8 +1207,7 @@ public class SplitLogManager extends Zoo
@Override
public void processResult(int rc, String path, Object ctx, String name) {
if (rc != 0) {
- if (rc == KeeperException.Code.SESSIONEXPIRED.intValue()) {
- LOG.error("ZK session expired. Master is expected to shut down. Abandoning retries.");
+ if (needAbandonRetries(rc, "CreateRescan znode " + path)) {
return;
}
Long retry_count = (Long)ctx;
Modified: hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java?rev=1433733&r1=1433732&r2=1433733&view=diff
==============================================================================
--- hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java (original)
+++ hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java Tue Jan 15 23:18:56 2013
@@ -69,7 +69,7 @@ import org.apache.zookeeper.data.Stat;
public class RecoverableZooKeeper {
private static final Log LOG = LogFactory.getLog(RecoverableZooKeeper.class);
// the actual ZooKeeper client instance
- private ZooKeeper zk;
+ volatile private ZooKeeper zk;
private final RetryCounterFactory retryCounterFactory;
// An identifier of this process in the cluster
private final String identifier;