You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ek...@apache.org on 2015/10/01 01:05:45 UTC
hive git commit: HIVE-11934 Transaction lock retry logic results in infinite loop(Eugene Koifman, reviewed by Ashutosh Chauhan)

Repository: hive
Updated Branches:
  refs/heads/master edd630043 -> 0d43e876b


HIVE-11934 Transaction lock retry logic results in infinite loop(Eugene Koifman, reviewed by Ashutosh Chauhan)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/0d43e876
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/0d43e876
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/0d43e876

Branch: refs/heads/master
Commit: 0d43e876be9c36156a28bd2c2b9493f986841dd7
Parents: edd6300
Author: Eugene Koifman <ek...@hortonworks.com>
Authored: Wed Sep 30 16:05:34 2015 -0700
Committer: Eugene Koifman <ek...@hortonworks.com>
Committed: Wed Sep 30 16:05:34 2015 -0700

----------------------------------------------------------------------
 .../hadoop/hive/metastore/txn/TxnHandler.java   | 117 +++++++++----------
 1 file changed, 57 insertions(+), 60 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/0d43e876/metastore/src/java/org/apache/hadoop/hive/metastore/txn/TxnHandler.java
----------------------------------------------------------------------
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/txn/TxnHandler.java b/metastore/src/java/org/apache/hadoop/hive/metastore/txn/TxnHandler.java
index 0b19368..cc7e2c6 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/txn/TxnHandler.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/txn/TxnHandler.java
@@ -91,8 +91,8 @@ public class TxnHandler {
   /**
    * Number of consecutive deadlocks we have seen
    */
-  protected int deadlockCnt;
-  private long deadlockRetryInterval;
+  private int deadlockCnt;
+  private final long deadlockRetryInterval;
   protected HiveConf conf;
   protected DatabaseProduct dbProduct;
 
@@ -115,10 +115,8 @@ public class TxnHandler {
   //
   // All public methods that write to the database have to check for deadlocks when a SQLException
   // comes back and handle it if they see one.  This has to be done with the connection pooling
-  // in mind.  To do this they should call detectDeadlock AFTER rolling back the db transaction,
-  // and then in an outer loop they should catch DeadlockException.  In the catch for this they
-  // should increment the deadlock counter and recall themselves.  See commitTxn for an example.
-  // the connection has been closed and returned to the pool.
+  // in mind.  To do this they should call checkRetryable() AFTER rolling back the db transaction,
+  // and then they should catch RetryException and call themselves recursively. See commitTxn for an example.
 
   public TxnHandler(HiveConf conf) {
     this.conf = conf;
@@ -135,7 +133,6 @@ public class TxnHandler {
     }
 
     timeout = HiveConf.getTimeVar(conf, HiveConf.ConfVars.HIVE_TXN_TIMEOUT, TimeUnit.MILLISECONDS);
-    deadlockCnt = 0;
     buildJumpTable();
     retryInterval = HiveConf.getTimeVar(conf, HiveConf.ConfVars.HMSHANDLERINTERVAL,
         TimeUnit.MILLISECONDS);
@@ -280,7 +277,6 @@ public class TxnHandler {
   }
 
   public OpenTxnsResponse openTxns(OpenTxnRequest rqst) throws MetaException {
-    deadlockCnt = 0;  // Reset deadlock count since this is a new transaction
     int numTxns = rqst.getNum_txns();
     try {
       Connection dbConn = null;
@@ -420,7 +416,6 @@ public class TxnHandler {
 
   public LockResponse lock(LockRequest rqst)
     throws NoSuchTxnException, TxnAbortedException, MetaException {
-    deadlockCnt = 0;
     try {
       Connection dbConn = null;
       try {
@@ -636,8 +631,6 @@ public class TxnHandler {
       }
     } catch (RetryException e) {
       heartbeat(ids);
-    } finally {
-      deadlockCnt = 0;
     }
   }
 
@@ -903,14 +896,14 @@ public class TxnHandler {
 
   void rollbackDBConn(Connection dbConn) {
     try {
-      if (dbConn != null) dbConn.rollback();
+      if (dbConn != null && !dbConn.isClosed()) dbConn.rollback();
     } catch (SQLException e) {
       LOG.warn("Failed to rollback db connection " + getMessage(e));
     }
   }
   protected void closeDbConn(Connection dbConn) {
     try {
-      if (dbConn != null) dbConn.close();
+      if (dbConn != null && !dbConn.isClosed()) dbConn.close();
     } catch (SQLException e) {
       LOG.warn("Failed to close db connection " + getMessage(e));
     }
@@ -922,7 +915,7 @@ public class TxnHandler {
    */
   protected void closeStmt(Statement stmt) {
     try {
-      if (stmt != null) stmt.close();
+      if (stmt != null && !stmt.isClosed()) stmt.close();
     } catch (SQLException e) {
       LOG.warn("Failed to close statement " + getMessage(e));
     }
@@ -952,15 +945,14 @@ public class TxnHandler {
     closeDbConn(dbConn);
   }
   /**
-   * Determine if an exception was such that it makse sense to retry.  Unfortunately there is no standard way to do
+   * Determine if an exception was such that it makes sense to retry.  Unfortunately there is no standard way to do
    * this, so we have to inspect the error messages and catch the telltale signs for each
-   * different database.
+   * different database.  This method will throw {@code RetryException}
+   * if the error is retry-able.
    * @param conn database connection
    * @param e exception that was thrown.
-   * @param caller name of the method calling this
-   * @throws org.apache.hadoop.hive.metastore.txn.TxnHandler.RetryException when deadlock
-   * detected and retry count has not been exceeded.
-   * TODO: make "caller" more elaborate like include lockId for example
+   * @param caller name of the method calling this (and other info useful to log)
+   * @throws org.apache.hadoop.hive.metastore.txn.TxnHandler.RetryException when the operation should be retried
    */
   protected void checkRetryable(Connection conn,
                                 SQLException e,
@@ -973,53 +965,57 @@ public class TxnHandler {
     // so I've tried to capture the different error messages (there appear to be fewer different
     // error messages than SQL states).
     // Derby and newer MySQL driver use the new SQLTransactionRollbackException
-    if (dbProduct == null && conn != null) {
-      determineDatabaseProduct(conn);
-    }
-    if (e instanceof SQLTransactionRollbackException ||
-      ((dbProduct == DatabaseProduct.MYSQL || dbProduct == DatabaseProduct.POSTGRES ||
-        dbProduct == DatabaseProduct.SQLSERVER) && e.getSQLState().equals("40001")) ||
-      (dbProduct == DatabaseProduct.POSTGRES && e.getSQLState().equals("40P01")) ||
-      (dbProduct == DatabaseProduct.ORACLE && (e.getMessage().contains("deadlock detected")
-        || e.getMessage().contains("can't serialize access for this transaction")))) {
-      if (deadlockCnt++ < ALLOWED_REPEATED_DEADLOCKS) {
-        long waitInterval = deadlockRetryInterval * deadlockCnt;
-        LOG.warn("Deadlock detected in " + caller + ". Will wait " + waitInterval +
-          "ms try again up to " + (ALLOWED_REPEATED_DEADLOCKS - deadlockCnt + 1) + " times.");
-        // Pause for a just a bit for retrying to avoid immediately jumping back into the deadlock.
-        try {
-          Thread.sleep(waitInterval);
-        } catch (InterruptedException ie) {
-          // NOP
-        }
-        throw new RetryException();
-      } else {
-        LOG.error("Too many repeated deadlocks in " + caller + ", giving up.");
-        deadlockCnt = 0;
+    boolean sendRetrySignal = false;
+    try {
+      if (dbProduct == null && conn != null) {
+        determineDatabaseProduct(conn);
       }
-    }
-    else if(isRetryable(e)) {
-      //in MSSQL this means Communication Link Failure
-      if(retryNum++ < retryLimit) {
-        LOG.warn("Retryable error detected in " + caller + ".  Will wait " + retryInterval +
-          "ms and retry up to " + (retryLimit - retryNum + 1) + " times.  Error: " + getMessage(e));
-        try {
-          Thread.sleep(retryInterval);
+      if (e instanceof SQLTransactionRollbackException ||
+        ((dbProduct == DatabaseProduct.MYSQL || dbProduct == DatabaseProduct.POSTGRES ||
+          dbProduct == DatabaseProduct.SQLSERVER) && e.getSQLState().equals("40001")) ||
+        (dbProduct == DatabaseProduct.POSTGRES && e.getSQLState().equals("40P01")) ||
+        (dbProduct == DatabaseProduct.ORACLE && (e.getMessage().contains("deadlock detected")
+          || e.getMessage().contains("can't serialize access for this transaction")))) {
+        if (deadlockCnt++ < ALLOWED_REPEATED_DEADLOCKS) {
+          long waitInterval = deadlockRetryInterval * deadlockCnt;
+          LOG.warn("Deadlock detected in " + caller + ". Will wait " + waitInterval +
+            "ms try again up to " + (ALLOWED_REPEATED_DEADLOCKS - deadlockCnt + 1) + " times.");
+          // Pause for a just a bit for retrying to avoid immediately jumping back into the deadlock.
+          try {
+            Thread.sleep(waitInterval);
+          } catch (InterruptedException ie) {
+            // NOP
+          }
+          sendRetrySignal = true;
+        } else {
+          LOG.error("Too many repeated deadlocks in " + caller + ", giving up.");
         }
-        catch(InterruptedException ex) {
-          //
+      } else if (isRetryable(e)) {
+        //in MSSQL this means Communication Link Failure
+        if (retryNum++ < retryLimit) {
+          LOG.warn("Retryable error detected in " + caller + ".  Will wait " + retryInterval +
+            "ms and retry up to " + (retryLimit - retryNum + 1) + " times.  Error: " + getMessage(e));
+          try {
+            Thread.sleep(retryInterval);
+          } catch (InterruptedException ex) {
+            //
+          }
+          sendRetrySignal = true;
+        } else {
+          LOG.error("Fatal error. Retry limit (" + retryLimit + ") reached. Last error: " + getMessage(e));
         }
-        throw new RetryException();
       }
-      else {
-        LOG.error("Fatal error. Retry limit (" + retryLimit + ") reached. Last error: " + getMessage(e));
+    }
+    finally {
+      /*if this method ends with anything except a retry signal, the caller should fail the operation
+      and propagate the error up to the its caller (Metastore client); thus must reset retry counters*/
+      if(!sendRetrySignal) {
+        deadlockCnt = 0;
         retryNum = 0;
       }
     }
-    else {
-      //if here, we got something that will propagate the error (rather than retry), so reset counters
-      deadlockCnt = 0;
-      retryNum = 0;
+    if(sendRetrySignal) {
+      throw new RetryException();
     }
   }
 
@@ -2100,6 +2096,7 @@ public class TxnHandler {
         //in MSSQL this means Communication Link Failure
         return true;
       }
+      //see https://issues.apache.org/jira/browse/HIVE-9938
     }
     return false;
   }