You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ek...@apache.org on 2015/10/01 01:05:45 UTC
hive git commit: HIVE-11934 Transaction lock retry logic results in
infinite loop(Eugene Koifman, reviewed by Ashutosh Chauhan)
Repository: hive
Updated Branches:
refs/heads/master edd630043 -> 0d43e876b
HIVE-11934 Transaction lock retry logic results in infinite loop(Eugene Koifman, reviewed by Ashutosh Chauhan)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/0d43e876
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/0d43e876
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/0d43e876
Branch: refs/heads/master
Commit: 0d43e876be9c36156a28bd2c2b9493f986841dd7
Parents: edd6300
Author: Eugene Koifman <ek...@hortonworks.com>
Authored: Wed Sep 30 16:05:34 2015 -0700
Committer: Eugene Koifman <ek...@hortonworks.com>
Committed: Wed Sep 30 16:05:34 2015 -0700
----------------------------------------------------------------------
.../hadoop/hive/metastore/txn/TxnHandler.java | 117 +++++++++----------
1 file changed, 57 insertions(+), 60 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/0d43e876/metastore/src/java/org/apache/hadoop/hive/metastore/txn/TxnHandler.java
----------------------------------------------------------------------
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/txn/TxnHandler.java b/metastore/src/java/org/apache/hadoop/hive/metastore/txn/TxnHandler.java
index 0b19368..cc7e2c6 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/txn/TxnHandler.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/txn/TxnHandler.java
@@ -91,8 +91,8 @@ public class TxnHandler {
/**
* Number of consecutive deadlocks we have seen
*/
- protected int deadlockCnt;
- private long deadlockRetryInterval;
+ private int deadlockCnt;
+ private final long deadlockRetryInterval;
protected HiveConf conf;
protected DatabaseProduct dbProduct;
@@ -115,10 +115,8 @@ public class TxnHandler {
//
// All public methods that write to the database have to check for deadlocks when a SQLException
// comes back and handle it if they see one. This has to be done with the connection pooling
- // in mind. To do this they should call detectDeadlock AFTER rolling back the db transaction,
- // and then in an outer loop they should catch DeadlockException. In the catch for this they
- // should increment the deadlock counter and recall themselves. See commitTxn for an example.
- // the connection has been closed and returned to the pool.
+ // in mind. To do this they should call checkRetryable() AFTER rolling back the db transaction,
+ // and then they should catch RetryException and call themselves recursively. See commitTxn for an example.
public TxnHandler(HiveConf conf) {
this.conf = conf;
@@ -135,7 +133,6 @@ public class TxnHandler {
}
timeout = HiveConf.getTimeVar(conf, HiveConf.ConfVars.HIVE_TXN_TIMEOUT, TimeUnit.MILLISECONDS);
- deadlockCnt = 0;
buildJumpTable();
retryInterval = HiveConf.getTimeVar(conf, HiveConf.ConfVars.HMSHANDLERINTERVAL,
TimeUnit.MILLISECONDS);
@@ -280,7 +277,6 @@ public class TxnHandler {
}
public OpenTxnsResponse openTxns(OpenTxnRequest rqst) throws MetaException {
- deadlockCnt = 0; // Reset deadlock count since this is a new transaction
int numTxns = rqst.getNum_txns();
try {
Connection dbConn = null;
@@ -420,7 +416,6 @@ public class TxnHandler {
public LockResponse lock(LockRequest rqst)
throws NoSuchTxnException, TxnAbortedException, MetaException {
- deadlockCnt = 0;
try {
Connection dbConn = null;
try {
@@ -636,8 +631,6 @@ public class TxnHandler {
}
} catch (RetryException e) {
heartbeat(ids);
- } finally {
- deadlockCnt = 0;
}
}
@@ -903,14 +896,14 @@ public class TxnHandler {
void rollbackDBConn(Connection dbConn) {
try {
- if (dbConn != null) dbConn.rollback();
+ if (dbConn != null && !dbConn.isClosed()) dbConn.rollback();
} catch (SQLException e) {
LOG.warn("Failed to rollback db connection " + getMessage(e));
}
}
protected void closeDbConn(Connection dbConn) {
try {
- if (dbConn != null) dbConn.close();
+ if (dbConn != null && !dbConn.isClosed()) dbConn.close();
} catch (SQLException e) {
LOG.warn("Failed to close db connection " + getMessage(e));
}
@@ -922,7 +915,7 @@ public class TxnHandler {
*/
protected void closeStmt(Statement stmt) {
try {
- if (stmt != null) stmt.close();
+ if (stmt != null && !stmt.isClosed()) stmt.close();
} catch (SQLException e) {
LOG.warn("Failed to close statement " + getMessage(e));
}
@@ -952,15 +945,14 @@ public class TxnHandler {
closeDbConn(dbConn);
}
/**
- * Determine if an exception was such that it makse sense to retry. Unfortunately there is no standard way to do
+ * Determine if an exception was such that it makes sense to retry. Unfortunately there is no standard way to do
* this, so we have to inspect the error messages and catch the telltale signs for each
- * different database.
+ * different database. This method will throw {@code RetryException}
+ * if the error is retry-able.
* @param conn database connection
* @param e exception that was thrown.
- * @param caller name of the method calling this
- * @throws org.apache.hadoop.hive.metastore.txn.TxnHandler.RetryException when deadlock
- * detected and retry count has not been exceeded.
- * TODO: make "caller" more elaborate like include lockId for example
+ * @param caller name of the method calling this (and other info useful to log)
+ * @throws org.apache.hadoop.hive.metastore.txn.TxnHandler.RetryException when the operation should be retried
*/
protected void checkRetryable(Connection conn,
SQLException e,
@@ -973,53 +965,57 @@ public class TxnHandler {
// so I've tried to capture the different error messages (there appear to be fewer different
// error messages than SQL states).
// Derby and newer MySQL driver use the new SQLTransactionRollbackException
- if (dbProduct == null && conn != null) {
- determineDatabaseProduct(conn);
- }
- if (e instanceof SQLTransactionRollbackException ||
- ((dbProduct == DatabaseProduct.MYSQL || dbProduct == DatabaseProduct.POSTGRES ||
- dbProduct == DatabaseProduct.SQLSERVER) && e.getSQLState().equals("40001")) ||
- (dbProduct == DatabaseProduct.POSTGRES && e.getSQLState().equals("40P01")) ||
- (dbProduct == DatabaseProduct.ORACLE && (e.getMessage().contains("deadlock detected")
- || e.getMessage().contains("can't serialize access for this transaction")))) {
- if (deadlockCnt++ < ALLOWED_REPEATED_DEADLOCKS) {
- long waitInterval = deadlockRetryInterval * deadlockCnt;
- LOG.warn("Deadlock detected in " + caller + ". Will wait " + waitInterval +
- "ms try again up to " + (ALLOWED_REPEATED_DEADLOCKS - deadlockCnt + 1) + " times.");
- // Pause for a just a bit for retrying to avoid immediately jumping back into the deadlock.
- try {
- Thread.sleep(waitInterval);
- } catch (InterruptedException ie) {
- // NOP
- }
- throw new RetryException();
- } else {
- LOG.error("Too many repeated deadlocks in " + caller + ", giving up.");
- deadlockCnt = 0;
+ boolean sendRetrySignal = false;
+ try {
+ if (dbProduct == null && conn != null) {
+ determineDatabaseProduct(conn);
}
- }
- else if(isRetryable(e)) {
- //in MSSQL this means Communication Link Failure
- if(retryNum++ < retryLimit) {
- LOG.warn("Retryable error detected in " + caller + ". Will wait " + retryInterval +
- "ms and retry up to " + (retryLimit - retryNum + 1) + " times. Error: " + getMessage(e));
- try {
- Thread.sleep(retryInterval);
+ if (e instanceof SQLTransactionRollbackException ||
+ ((dbProduct == DatabaseProduct.MYSQL || dbProduct == DatabaseProduct.POSTGRES ||
+ dbProduct == DatabaseProduct.SQLSERVER) && e.getSQLState().equals("40001")) ||
+ (dbProduct == DatabaseProduct.POSTGRES && e.getSQLState().equals("40P01")) ||
+ (dbProduct == DatabaseProduct.ORACLE && (e.getMessage().contains("deadlock detected")
+ || e.getMessage().contains("can't serialize access for this transaction")))) {
+ if (deadlockCnt++ < ALLOWED_REPEATED_DEADLOCKS) {
+ long waitInterval = deadlockRetryInterval * deadlockCnt;
+ LOG.warn("Deadlock detected in " + caller + ". Will wait " + waitInterval +
+ "ms try again up to " + (ALLOWED_REPEATED_DEADLOCKS - deadlockCnt + 1) + " times.");
+ // Pause for a just a bit for retrying to avoid immediately jumping back into the deadlock.
+ try {
+ Thread.sleep(waitInterval);
+ } catch (InterruptedException ie) {
+ // NOP
+ }
+ sendRetrySignal = true;
+ } else {
+ LOG.error("Too many repeated deadlocks in " + caller + ", giving up.");
}
- catch(InterruptedException ex) {
- //
+ } else if (isRetryable(e)) {
+ //in MSSQL this means Communication Link Failure
+ if (retryNum++ < retryLimit) {
+ LOG.warn("Retryable error detected in " + caller + ". Will wait " + retryInterval +
+ "ms and retry up to " + (retryLimit - retryNum + 1) + " times. Error: " + getMessage(e));
+ try {
+ Thread.sleep(retryInterval);
+ } catch (InterruptedException ex) {
+ //
+ }
+ sendRetrySignal = true;
+ } else {
+ LOG.error("Fatal error. Retry limit (" + retryLimit + ") reached. Last error: " + getMessage(e));
}
- throw new RetryException();
}
- else {
- LOG.error("Fatal error. Retry limit (" + retryLimit + ") reached. Last error: " + getMessage(e));
+ }
+ finally {
+ /*if this method ends with anything except a retry signal, the caller should fail the operation
+ and propagate the error up to the its caller (Metastore client); thus must reset retry counters*/
+ if(!sendRetrySignal) {
+ deadlockCnt = 0;
retryNum = 0;
}
}
- else {
- //if here, we got something that will propagate the error (rather than retry), so reset counters
- deadlockCnt = 0;
- retryNum = 0;
+ if(sendRetrySignal) {
+ throw new RetryException();
}
}
@@ -2100,6 +2096,7 @@ public class TxnHandler {
//in MSSQL this means Communication Link Failure
return true;
}
+ //see https://issues.apache.org/jira/browse/HIVE-9938
}
return false;
}