You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by se...@apache.org on 2018/07/25 18:27:25 UTC
[16/50] [abbrv] hive git commit: HIVE-19416 : merge master into
branch (Sergey Shelukhin) 0719
http://git-wip-us.apache.org/repos/asf/hive/blob/651e7950/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/txn/TxnHandler.java
----------------------------------------------------------------------
diff --cc standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/txn/TxnHandler.java
index 0000000,9dd3787..7fd0642
mode 000000,100644..100644
--- a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/txn/TxnHandler.java
+++ b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/txn/TxnHandler.java
@@@ -1,0 -1,5051 +1,5094 @@@
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ package org.apache.hadoop.hive.metastore.txn;
+
+ import java.io.PrintWriter;
+ import java.nio.ByteBuffer;
+ import java.sql.Connection;
+ import java.sql.Driver;
+ import java.sql.ResultSet;
+ import java.sql.SQLException;
+ import java.sql.SQLFeatureNotSupportedException;
+ import java.sql.Savepoint;
+ import java.sql.Statement;
+ import java.time.Instant;
+ import java.util.ArrayList;
+ import java.util.Arrays;
+ import java.util.BitSet;
+ import java.util.Collections;
+ import java.util.Comparator;
+ import java.util.HashMap;
+ import java.util.HashSet;
+ import java.util.Iterator;
+ import java.util.List;
+ import java.util.Map;
+ import java.util.Properties;
+ import java.util.Set;
+ import java.util.SortedSet;
+ import java.util.TreeSet;
+ import java.util.concurrent.ConcurrentHashMap;
+ import java.util.concurrent.Semaphore;
+ import java.util.concurrent.TimeUnit;
+ import java.util.concurrent.atomic.AtomicInteger;
+ import java.util.concurrent.locks.ReentrantLock;
+ import java.util.regex.Pattern;
+
+ import javax.sql.DataSource;
+
+ import org.apache.commons.lang.ArrayUtils;
+ import org.apache.commons.lang.NotImplementedException;
+ import org.apache.hadoop.classification.InterfaceAudience;
+ import org.apache.hadoop.classification.InterfaceStability;
+ import org.apache.hadoop.conf.Configuration;
+ import org.apache.hadoop.hive.common.ValidReadTxnList;
+ import org.apache.hadoop.hive.common.ValidReaderWriteIdList;
+ import org.apache.hadoop.hive.common.ValidTxnList;
+ import org.apache.hadoop.hive.common.ValidTxnWriteIdList;
+ import org.apache.hadoop.hive.common.ValidWriteIdList;
+ import org.apache.hadoop.hive.common.classification.RetrySemantics;
+ import org.apache.hadoop.hive.metastore.DatabaseProduct;
+ import org.apache.hadoop.hive.metastore.Warehouse;
+ import org.apache.hadoop.hive.metastore.MetaStoreListenerNotifier;
+ import org.apache.hadoop.hive.metastore.TransactionalMetaStoreEventListener;
+ import org.apache.hadoop.hive.metastore.api.*;
+ import org.apache.hadoop.hive.metastore.conf.MetastoreConf;
+ import org.apache.hadoop.hive.metastore.conf.MetastoreConf.ConfVars;
+ import org.apache.hadoop.hive.metastore.datasource.DataSourceProvider;
+ import org.apache.hadoop.hive.metastore.datasource.DataSourceProviderFactory;
+ import org.apache.hadoop.hive.metastore.events.AbortTxnEvent;
+ import org.apache.hadoop.hive.metastore.events.AllocWriteIdEvent;
+ import org.apache.hadoop.hive.metastore.events.CommitTxnEvent;
+ import org.apache.hadoop.hive.metastore.events.OpenTxnEvent;
+ import org.apache.hadoop.hive.metastore.events.AcidWriteEvent;
+ import org.apache.hadoop.hive.metastore.messaging.EventMessage;
+ import org.apache.hadoop.hive.metastore.metrics.Metrics;
+ import org.apache.hadoop.hive.metastore.metrics.MetricsConstants;
+ import org.apache.hadoop.hive.metastore.tools.SQLGenerator;
+ import org.apache.hadoop.hive.metastore.utils.JavaUtils;
+ import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils;
+ import org.apache.hadoop.hive.metastore.utils.StringableMap;
+ import org.apache.hadoop.util.StringUtils;
+ import org.slf4j.Logger;
+ import org.slf4j.LoggerFactory;
++
+ import com.google.common.annotations.VisibleForTesting;
+
+ /**
+ * A handler to answer transaction related calls that come into the metastore
+ * server.
+ *
+ * Note on log messages: Please include txnid:X and lockid info using
+ * {@link JavaUtils#txnIdToString(long)}
+ * and {@link JavaUtils#lockIdToString(long)} in all messages.
+ * The txnid:X and lockid:Y matches how Thrift object toString() methods are generated,
+ * so keeping the format consistent makes grep'ing the logs much easier.
+ *
+ * Note on HIVE_LOCKS.hl_last_heartbeat.
+ * For locks that are part of transaction, we set this 0 (would rather set it to NULL but
+ * Currently the DB schema has this NOT NULL) and only update/read heartbeat from corresponding
+ * transaction in TXNS.
+ *
+ * In general there can be multiple metastores where this logic can execute, thus the DB is
+ * used to ensure proper mutexing of operations.
+ * Select ... For Update (or equivalent: either MsSql with(updlock) or actual Update stmt) is
+ * used to properly sequence operations. Most notably:
+ * 1. various sequence IDs are generated with aid of this mutex
+ * 2. ensuring that each (Hive) Transaction state is transitioned atomically. Transaction state
+ * includes its actual state (Open, Aborted) as well as its lock list/component list. Thus all
+ * per transaction ops, either start by update/delete of the relevant TXNS row or do S4U on that row.
+ * This allows almost all operations to run at READ_COMMITTED and minimizes DB deadlocks.
+ * 3. checkLock() - this is mutexed entirely since we must ensure that while we check if some lock
+ * can be granted, no other (strictly speaking "earlier") lock can change state.
+ *
+ * The exception to this is Derby which doesn't support proper S4U. Derby is always running embedded
+ * (this is the only supported configuration for Derby)
+ * in the same JVM as HiveMetaStoreHandler thus we use JVM wide lock to properly sequence the operations.
+ *
+ * {@link #derbyLock}
+
+ * If we ever decide to run remote Derby server, according to
+ * https://db.apache.org/derby/docs/10.0/manuals/develop/develop78.html all transactions will be
+ * serialized, so that would also work though has not been tested.
+ *
+ * General design note:
+ * It's imperative that any operation on a txn (e.g. commit), ensure (atomically) that this txn is
+ * still valid and active. In the code this is usually achieved at the same time the txn record
+ * is locked for some operation.
+ *
+ * Note on retry logic:
+ * Metastore has retry logic in both {@link org.apache.hadoop.hive.metastore.RetryingMetaStoreClient}
+ * and {@link org.apache.hadoop.hive.metastore.RetryingHMSHandler}. The retry logic there is very
+ * generic and is not aware whether the operations are idempotent or not. (This is separate from
+ * retry logic here in TxnHandler which can/does retry DB errors intelligently). The worst case is
+ * when an op here issues a successful commit against the RDBMS but the calling stack doesn't
+ * receive the ack and retries. (If an op fails before commit, it's trivially idempotent)
+ * Thus the ops here need to be made idempotent as much as possible or
+ * the metastore call stack should have logic not to retry. There are {@link RetrySemantics}
+ * annotations to document the behavior.
+ */
+ @InterfaceAudience.Private
+ @InterfaceStability.Evolving
+ abstract class TxnHandler implements TxnStore, TxnStore.MutexAPI {
+
+ // Single-character state codes.
+ // NOTE(review): the names suggest these are compaction states stored as one-char column
+ // values; their readers/writers are outside this part of the file - confirm before relying on it.
+ static final protected char INITIATED_STATE = 'i';
+ static final protected char WORKING_STATE = 'w';
+ static final protected char READY_FOR_CLEANING = 'r';
+ static final char FAILED_STATE = 'f';
+ static final char SUCCEEDED_STATE = 's';
+ static final char ATTEMPTED_STATE = 'a';
+
+ // Compactor types
+ static final protected char MAJOR_TYPE = 'a';
+ static final protected char MINOR_TYPE = 'i';
+
+ // Transaction states: one-character codes written to and compared against TXNS.txn_state
+ static final protected char TXN_ABORTED = 'a';
+ static final protected char TXN_OPEN = 'o';
+ //todo: make these like OperationType and remove above char constants
+ // Status of a transaction as seen by this handler (e.g. the result of findTxnState in abortTxn).
+ enum TxnStatus {OPEN, ABORTED, COMMITTED, UNKNOWN}
+
+ /**
+  * Kind of transaction being opened. The numeric value of each constant is what gets
+  * written into the {@code txn_type} column when rows are inserted into TXNS.
+  */
+ public enum TxnType {
+   DEFAULT(0),
+   REPL_CREATED(1),
+   READ_ONLY(2);
+
+   /** Numeric code persisted for this transaction type. */
+   private final int value;
+
+   TxnType(int code) {
+     value = code;
+   }
+
+   /**
+    * @return the numeric code associated with this transaction type
+    */
+   public int getValue() {
+     return this.value;
+   }
+ }
+
+ // Lock states (single-character codes)
+ static final protected char LOCK_ACQUIRED = 'a';
+ static final protected char LOCK_WAITING = 'w';
+
+ // Lock types (single-character codes)
+ static final protected char LOCK_EXCLUSIVE = 'e';
+ static final protected char LOCK_SHARED = 'r';
+ static final protected char LOCK_SEMI_SHARED = 'w';
+
+ // NOTE(review): presumably the cap on back-to-back deadlock retries; its reader is outside this view.
+ static final private int ALLOWED_REPEATED_DEADLOCKS = 10;
+ static final private Logger LOG = LoggerFactory.getLogger(TxnHandler.class.getName());
+
+ // Primary JDBC connection pool. Created once, lazily, by setConf() while holding the
+ // TxnHandler.class monitor; shared by all instances because it is static.
+ static private DataSource connPool;
+ // Second pool created alongside connPool in setConf(); it is sized larger than connPool by
+ // MUTEX_KEY.values().length (see the sizing rationale in setConf()).
+ private static DataSource connPoolMutex;
+ static private boolean doRetryOnConnPool = false;
+
+ // Listeners loaded in setConf() from ConfVars.TRANSACTIONAL_EVENT_LISTENERS; they are notified
+ // with the same DB connection as the operation that triggers the event.
+ private List<TransactionalMetaStoreEventListener> transactionalListeners;
+
+ /**
+  * SQL-level operation kinds, each carrying the single character used to represent it as a
+  * SQL constant. {@link #toString()} yields that character as a one-character string.
+  */
+ private enum OpertaionType {
+   SELECT('s'),
+   INSERT('i'),
+   UPDATE('u'),
+   DELETE('d');
+
+   /** Character used for this operation in SQL text. */
+   private final char sqlConst;
+
+   OpertaionType(char sqlConst) {
+     this.sqlConst = sqlConst;
+   }
+
+   /**
+    * @return the one-character SQL constant of this operation
+    */
+   @Override
+   public String toString() {
+     return String.valueOf(sqlConst);
+   }
+
+   /**
+    * Maps a SQL character constant back to its operation.
+    *
+    * @param sqlConst one of 's', 'i', 'u', 'd'
+    * @return the matching operation
+    * @throws IllegalArgumentException if the character matches no operation; the message is the
+    *         quoted character
+    */
+   public static OpertaionType fromString(char sqlConst) {
+     for (OpertaionType candidate : values()) {
+       if (candidate.sqlConst == sqlConst) {
+         return candidate;
+       }
+     }
+     throw new IllegalArgumentException(quoteChar(sqlConst));
+   }
+
+   /**
+    * Converts the Thrift-level {@link DataOperationType} into the equivalent operation.
+    *
+    * @param dop operation type taken from a request
+    * @return the matching operation
+    * @throws IllegalArgumentException for any value other than SELECT, INSERT, UPDATE or DELETE
+    */
+   public static OpertaionType fromDataOperationType(DataOperationType dop) {
+     switch (dop) {
+       case SELECT:
+         return SELECT;
+       case INSERT:
+         return INSERT;
+       case UPDATE:
+         return UPDATE;
+       case DELETE:
+         return DELETE;
+       default:
+         throw new IllegalArgumentException("Unexpected value: " + dop);
+     }
+   }
+ }
+
+ // Maximum number of open transactions that's allowed (set from ConfVars.MAX_OPEN_TXNS in setConf())
+ private static volatile int maxOpenTxns = 0;
+ // Whether number of open transactions reaches the threshold; openTxns() clears it again once
+ // the open count drops below 90% of maxOpenTxns
+ private static volatile boolean tooManyOpenTxns = false;
+
+ /**
+ * Number of consecutive deadlocks we have seen
+ */
+ private int deadlockCnt;
+ // Set in setConf() to one tenth of retryInterval.
+ private long deadlockRetryInterval;
+ protected Configuration conf;
+ // Both are initialized once, together with the connection pools, in setConf().
+ private static DatabaseProduct dbProduct;
+ private static SQLGenerator sqlGenerator;
+
+ // (End user) Transaction timeout, in milliseconds.
+ private long timeout;
+
+ private String identifierQuoteString; // quotes to use for quoting tables, where necessary
+ // Read from ConfVars.HMS_HANDLER_INTERVAL, in milliseconds.
+ private long retryInterval;
+ // Read from ConfVars.HMS_HANDLER_ATTEMPTS.
+ private int retryLimit;
+ private int retryNum;
+ // Current number of open txns (the MetricsConstants.NUM_OPEN_TXNS gauge obtained in setConf())
+ private AtomicInteger numOpenTxns;
+
+ /**
+ * Derby specific concurrency control. Constructed with fairness enabled.
+ */
+ private static final ReentrantLock derbyLock = new ReentrantLock(true);
+ /**
+ * must be static since even in UT there may be > 1 instance of TxnHandler
+ * (e.g. via Compactor services)
+ */
+ private final static ConcurrentHashMap<String, Semaphore> derbyKey2Lock = new ConcurrentHashMap<>();
+ // Host name resolved once at class-initialization time via JavaUtils.hostname().
+ private static final String hostname = JavaUtils.hostname();
+
+ // Private methods should never catch SQLException and then throw MetaException. The public
+ // methods depend on SQLException coming back so they can detect and handle deadlocks. Private
+ // methods should only throw MetaException when they explicitly know there's a logic error and
+ // they want to throw past the public methods.
+ //
+ // All public methods that write to the database have to check for deadlocks when a SQLException
+ // comes back and handle it if they see one. This has to be done with the connection pooling
+ // in mind. To do this they should call checkRetryable() AFTER rolling back the db transaction,
+ // and then they should catch RetryException and call themselves recursively. See commitTxn for an example.
+
+ /**
+  * Creates an unconfigured handler. No state is set up here; {@link #setConf(Configuration)}
+  * has to be called before any other method.
+  */
+ public TxnHandler() {
+   super();
+ }
+
+ /**
+ * This is logically part of c'tor and must be called prior to any other method.
+ * Not physically part of c'tor due to use of reflection
+ */
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+
+ checkQFileTestHack();
+
+ synchronized (TxnHandler.class) {
+ if (connPool == null) {
+ Connection dbConn = null;
+ // Set up the JDBC connection pool
+ try {
+ int maxPoolSize = MetastoreConf.getIntVar(conf, ConfVars.CONNECTION_POOLING_MAX_CONNECTIONS);
+ long getConnectionTimeoutMs = 30000;
+ connPool = setupJdbcConnectionPool(conf, maxPoolSize, getConnectionTimeoutMs);
+ /*the mutex pools should ideally be somewhat larger since some operations require 1
+ connection from each pool and we want to avoid taking a connection from primary pool
+ and then blocking because mutex pool is empty. There is only 1 thread in any HMS trying
+ to mutex on each MUTEX_KEY except MUTEX_KEY.CheckLock. The CheckLock operation gets a
+ connection from connPool first, then connPoolMutex. All others, go in the opposite
+ order (not very elegant...). So number of connection requests for connPoolMutex cannot
+ exceed (size of connPool + MUTEX_KEY.values().length - 1).*/
+ connPoolMutex = setupJdbcConnectionPool(conf, maxPoolSize + MUTEX_KEY.values().length, getConnectionTimeoutMs);
+ dbConn = getDbConn(Connection.TRANSACTION_READ_COMMITTED);
+ determineDatabaseProduct(dbConn);
+ sqlGenerator = new SQLGenerator(dbProduct, conf);
+ } catch (SQLException e) {
+ String msg = "Unable to instantiate JDBC connection pooling, " + e.getMessage();
+ LOG.error(msg);
+ throw new RuntimeException(e);
+ } finally {
+ closeDbConn(dbConn);
+ }
+ }
+ }
+
+ numOpenTxns = Metrics.getOrCreateGauge(MetricsConstants.NUM_OPEN_TXNS);
+
+ timeout = MetastoreConf.getTimeVar(conf, ConfVars.TXN_TIMEOUT, TimeUnit.MILLISECONDS);
+ buildJumpTable();
+ retryInterval = MetastoreConf.getTimeVar(conf, ConfVars.HMS_HANDLER_INTERVAL,
+ TimeUnit.MILLISECONDS);
+ retryLimit = MetastoreConf.getIntVar(conf, ConfVars.HMS_HANDLER_ATTEMPTS);
+ deadlockRetryInterval = retryInterval / 10;
+ maxOpenTxns = MetastoreConf.getIntVar(conf, ConfVars.MAX_OPEN_TXNS);
+
+ try {
+ transactionalListeners = MetaStoreUtils.getMetaStoreListeners(
+ TransactionalMetaStoreEventListener.class,
+ conf, MetastoreConf.getVar(conf, ConfVars.TRANSACTIONAL_EVENT_LISTENERS));
+ } catch(MetaException e) {
+ String msg = "Unable to get transaction listeners, " + e.getMessage();
+ LOG.error(msg);
+ throw new RuntimeException(e);
+ }
+ }
+
+ /**
+  * @return the configuration last passed to {@link #setConf(Configuration)}, or {@code null}
+  *         if it has not been called yet
+  */
+ @Override
+ public Configuration getConf() {
+   return this.conf;
+ }
+
+ /**
+  * Returns the transaction high-water mark together with one {@link TxnInfo} per row of TXNS
+  * at or below that mark (i.e. every open or aborted transaction).
+  *
+  * @return the high-water mark and the per-transaction details
+  * @throws MetaException if NEXT_TXN_ID is empty or null, if a row carries an unknown state
+  *         code, or if the database access fails in a non-retryable way
+  */
+ @Override
+ @RetrySemantics.ReadOnly
+ public GetOpenTxnsInfoResponse getOpenTxnsInfo() throws MetaException {
+   try {
+     // The high-water mark is read first and the TXNS scan is bounded by it, so no DB
+     // transaction spanning both reads is needed. A txn that shows up in the open list after
+     // the mark was read is harmless.
+     Connection dbConn = null;
+     Statement stmt = null;
+     ResultSet rs = null;
+     try {
+       /*
+        * READ_COMMITTED is sufficient as long as openTxns(OpenTxnRequest) is atomic, i.e. as
+        * long as advancing NEXT_TXN_ID is atomic with inserting the matching rows into TXNS.
+        * Any txnid below the HWM is then either in TXNS (Open/Aborted) or it is Committed.
+        */
+       dbConn = getDbConn(Connection.TRANSACTION_READ_COMMITTED);
+       stmt = dbConn.createStatement();
+
+       String hwmQuery = "select ntxn_next - 1 from NEXT_TXN_ID";
+       LOG.debug("Going to execute query <" + hwmQuery + ">");
+       rs = stmt.executeQuery(hwmQuery);
+       if (!rs.next()) {
+         throw new MetaException(
+             "Transaction tables not properly initialized, no record found in next_txn_id");
+       }
+       final long hwm = rs.getLong(1);
+       if (rs.wasNull()) {
+         throw new MetaException(
+             "Transaction tables not properly initialized, null record found in next_txn_id");
+       }
+       close(rs);
+
+       List<TxnInfo> txnInfos = new ArrayList<>();
+       // The WHERE clause is required to get consistent results under READ_COMMITTED.
+       String txnQuery =
+           "select txn_id, txn_state, txn_user, txn_host, txn_started, txn_last_heartbeat from "
+               + "TXNS where txn_id <= " + hwm;
+       LOG.debug("Going to execute query<" + txnQuery + ">");
+       rs = stmt.executeQuery(txnQuery);
+       while (rs.next()) {
+         final char stateCode = rs.getString(2).charAt(0);
+         final TxnState state;
+         if (stateCode == TXN_ABORTED) {
+           state = TxnState.ABORTED;
+         } else if (stateCode == TXN_OPEN) {
+           state = TxnState.OPEN;
+         } else {
+           throw new MetaException(
+               "Unexpected transaction state " + stateCode + " found in txns table");
+         }
+         TxnInfo info = new TxnInfo(rs.getLong(1), state, rs.getString(3), rs.getString(4));
+         info.setStartedTime(rs.getLong(5));
+         info.setLastHeartbeatTime(rs.getLong(6));
+         txnInfos.add(info);
+       }
+       // Nothing was modified; end the DB transaction without committing.
+       LOG.debug("Going to rollback");
+       dbConn.rollback();
+       return new GetOpenTxnsInfoResponse(hwm, txnInfos);
+     } catch (SQLException e) {
+       LOG.debug("Going to rollback");
+       rollbackDBConn(dbConn);
+       // Throws RetryException when the failure is worth retrying.
+       checkRetryable(dbConn, e, "getOpenTxnsInfo");
+       throw new MetaException("Unable to select from transaction database: " + getMessage(e)
+           + StringUtils.stringifyException(e));
+     } finally {
+       close(rs, stmt, dbConn);
+     }
+   } catch (RetryException e) {
+     return getOpenTxnsInfo();
+   }
+ }
++
+ /**
+  * Returns the transaction high-water mark, the ids of all transactions in TXNS at or below
+  * it (ordered by id), and a bitset marking which positions of that list are aborted. The
+  * smallest open transaction id is set on the response when at least one open txn exists.
+  *
+  * @return snapshot of open/aborted transaction ids
+  * @throws MetaException if NEXT_TXN_ID is empty or null, or if the database access fails in a
+  *         non-retryable way
+  */
+ @Override
+ @RetrySemantics.ReadOnly
+ public GetOpenTxnsResponse getOpenTxns() throws MetaException {
+   try {
+     // The high-water mark is read first and the TXNS scan is bounded by it, so no DB
+     // transaction spanning both reads is needed. A txn that shows up in the open list after
+     // the mark was read is harmless.
+     Connection dbConn = null;
+     Statement stmt = null;
+     ResultSet rs = null;
+     try {
+       // Runs at READ_COMMITTED for exactly the same reason as getOpenTxnsInfo().
+       dbConn = getDbConn(Connection.TRANSACTION_READ_COMMITTED);
+       stmt = dbConn.createStatement();
+
+       String hwmQuery = "select ntxn_next - 1 from NEXT_TXN_ID";
+       LOG.debug("Going to execute query <" + hwmQuery + ">");
+       rs = stmt.executeQuery(hwmQuery);
+       if (!rs.next()) {
+         throw new MetaException(
+             "Transaction tables not properly initialized, no record found in next_txn_id");
+       }
+       final long hwm = rs.getLong(1);
+       if (rs.wasNull()) {
+         throw new MetaException(
+             "Transaction tables not properly initialized, null record found in next_txn_id");
+       }
+       close(rs);
+
+       List<Long> openList = new ArrayList<>();
+       // The WHERE clause is required to get consistent results under READ_COMMITTED.
+       String txnQuery =
+           "select txn_id, txn_state from TXNS where txn_id <= " + hwm + " order by txn_id";
+       LOG.debug("Going to execute query<" + txnQuery + ">");
+       rs = stmt.executeQuery(txnQuery);
+
+       long minOpenTxn = Long.MAX_VALUE;
+       BitSet abortedBits = new BitSet();
+       while (rs.next()) {
+         final long txnId = rs.getLong(1);
+         final int position = openList.size();
+         openList.add(txnId);
+         final char stateCode = rs.getString(2).charAt(0);
+         if (stateCode == TXN_OPEN) {
+           if (txnId < minOpenTxn) {
+             minOpenTxn = txnId;
+           }
+         } else if (stateCode == TXN_ABORTED) {
+           // Bit index mirrors the txn's index in openList.
+           abortedBits.set(position);
+         }
+       }
+       // Nothing was modified; end the DB transaction without committing.
+       LOG.debug("Going to rollback");
+       dbConn.rollback();
+
+       ByteBuffer abortedBuffer = ByteBuffer.wrap(abortedBits.toByteArray());
+       GetOpenTxnsResponse response = new GetOpenTxnsResponse(hwm, openList, abortedBuffer);
+       if (minOpenTxn < Long.MAX_VALUE) {
+         response.setMin_open_txn(minOpenTxn);
+       }
+       return response;
+     } catch (SQLException e) {
+       LOG.debug("Going to rollback");
+       rollbackDBConn(dbConn);
+       // Throws RetryException when the failure is worth retrying.
+       checkRetryable(dbConn, e, "getOpenTxns");
+       throw new MetaException("Unable to select from transaction database, "
+           + StringUtils.stringifyException(e));
+     } finally {
+       close(rs, stmt, dbConn);
+     }
+   } catch (RetryException e) {
+     return getOpenTxns();
+   }
+ }
+
+ /**
+  * Opens a batch of transactions and returns their ids.
+  *
+  * <p>The call is refused while the number of open transactions is at or above the configured
+  * maximum; once tripped, the refusal is only lifted after the open count has dropped below
+  * 90% of that maximum. The number of transactions opened by one call is capped at
+  * {@code ConfVars.TXN_MAX_OPEN_BATCH}; when the request asks for more, the capped count is
+  * written back to {@code rqst} so that the helper doing the actual work (which reads the
+  * count from the request) honours the cap.
+  *
+  * <p>Retry-by-caller note:
+  * Worst case, it will leave an open txn which will timeout.
+  *
+  * @param rqst request carrying the number of transactions, user, host and optional
+  *        replication information; its {@code num_txns} may be lowered to the batch cap
+  * @return the ids of the opened transactions
+  * @throws MetaException if too many transactions are open, the requested count is not
+  *         positive, or the database access fails in a non-retryable way
+  */
+ @Override
+ @RetrySemantics.Idempotent
+ public OpenTxnsResponse openTxns(OpenTxnRequest rqst) throws MetaException {
+   if (!tooManyOpenTxns && numOpenTxns.get() >= maxOpenTxns) {
+     tooManyOpenTxns = true;
+   }
+   if (tooManyOpenTxns) {
+     if (numOpenTxns.get() < maxOpenTxns * 0.9) {
+       tooManyOpenTxns = false;
+     } else {
+       LOG.warn("Maximum allowed number of open transactions (" + maxOpenTxns + ") has been " +
+           "reached. Current number of open transactions: " + numOpenTxns);
+       throw new MetaException("Maximum allowed number of open transactions has been reached. " +
+           "See hive.max.open.txns.");
+     }
+   }
+
+   int numTxns = rqst.getNum_txns();
+   if (numTxns <= 0) {
+     throw new MetaException("Invalid input for number of txns: " + numTxns);
+   }
+
+   try {
+     Connection dbConn = null;
+     Statement stmt = null;
+     try {
+       lockInternal();
+       /*
+        * To make getOpenTxns()/getOpenTxnsInfo() work correctly, this operation must ensure
+        * that advancing the counter in NEXT_TXN_ID and adding the corresponding entries to TXNS
+        * is atomic. Advancing the counter must also work when multiple metastores are running.
+        * SELECT ... FOR UPDATE is used to prevent concurrent DB transactions being rolled back
+        * due to a Write-Write conflict on NEXT_TXN_ID.
+        *
+        * In the current design, there can be several metastore instances running in a given
+        * Warehouse. This makes ideas like reserving a range of IDs to save trips to the DB
+        * impossible. For example, a client may go to MS1 and start a transaction with ID 500 to
+        * update a particular row. Now the same client will start another transaction, except it
+        * ends up on MS2 and may get transaction ID 400 and update the same row. Now the merge
+        * that happens to materialize the snapshot on read will think the version of the row
+        * from transaction ID 500 is the latest one.
+        *
+        * Longer term we can consider running Active-Passive MS (at least wrt to ACID
+        * operations). This set could support a write-through cache for added performance.
+        */
+       dbConn = getDbConn(Connection.TRANSACTION_READ_COMMITTED);
+       // Make sure the user has not requested an insane amount of txns. The helper below reads
+       // the count from the request, so the cap has to be written back to it; capping only the
+       // local variable would have no effect.
+       int maxTxns = MetastoreConf.getIntVar(conf, ConfVars.TXN_MAX_OPEN_BATCH);
+       if (numTxns > maxTxns) {
+         LOG.debug("Requested " + numTxns + " txns, capping the batch at " + maxTxns);
+         numTxns = maxTxns;
+         rqst.setNum_txns(numTxns);
+       }
+
+       stmt = dbConn.createStatement();
+       List<Long> txnIds = openTxns(dbConn, stmt, rqst);
+
+       LOG.debug("Going to commit");
+       dbConn.commit();
+       return new OpenTxnsResponse(txnIds);
+     } catch (SQLException e) {
+       LOG.debug("Going to rollback");
+       rollbackDBConn(dbConn);
+       // Throws RetryException when the failure is worth retrying.
+       checkRetryable(dbConn, e, "openTxns(" + rqst + ")");
+       throw new MetaException("Unable to select from transaction database "
+           + StringUtils.stringifyException(e));
+     } finally {
+       close(null, stmt, dbConn);
+       unlockInternal();
+     }
+   } catch (RetryException e) {
+     return openTxns(rqst);
+   }
+ }
+
+ private List<Long> openTxns(Connection dbConn, Statement stmt, OpenTxnRequest rqst)
+ throws SQLException, MetaException {
+ int numTxns = rqst.getNum_txns();
+ ResultSet rs = null;
+ TxnType txnType = TxnType.DEFAULT;
+ try {
+ if (rqst.isSetReplPolicy()) {
+ List<Long> targetTxnIdList = getTargetTxnIdList(rqst.getReplPolicy(), rqst.getReplSrcTxnIds(), stmt);
+
+ if (!targetTxnIdList.isEmpty()) {
+ if (targetTxnIdList.size() != rqst.getReplSrcTxnIds().size()) {
+ LOG.warn("target txn id number " + targetTxnIdList.toString() +
+ " is not matching with source txn id number " + rqst.getReplSrcTxnIds().toString());
+ }
+ LOG.info("Target transactions " + targetTxnIdList.toString() + " are present for repl policy :" +
+ rqst.getReplPolicy() + " and Source transaction id : " + rqst.getReplSrcTxnIds().toString());
+ return targetTxnIdList;
+ }
+ txnType = TxnType.REPL_CREATED;
+ }
+
+ String s = sqlGenerator.addForUpdateClause("select ntxn_next from NEXT_TXN_ID");
+ LOG.debug("Going to execute query <" + s + ">");
+ rs = stmt.executeQuery(s);
+ if (!rs.next()) {
+ throw new MetaException("Transaction database not properly " +
+ "configured, can't find next transaction id.");
+ }
+ long first = rs.getLong(1);
+ s = "update NEXT_TXN_ID set ntxn_next = " + (first + numTxns);
+ LOG.debug("Going to execute update <" + s + ">");
+ stmt.executeUpdate(s);
+
+ long now = getDbTime(dbConn);
+ List<Long> txnIds = new ArrayList<>(numTxns);
+
+ List<String> rows = new ArrayList<>();
+ for (long i = first; i < first + numTxns; i++) {
+ txnIds.add(i);
+ rows.add(i + "," + quoteChar(TXN_OPEN) + "," + now + "," + now + ","
+ + quoteString(rqst.getUser()) + "," + quoteString(rqst.getHostname()) + "," + txnType.getValue());
+ }
+ List<String> queries = sqlGenerator.createInsertValuesStmt(
+ "TXNS (txn_id, txn_state, txn_started, txn_last_heartbeat, txn_user, txn_host, txn_type)", rows);
+ for (String q : queries) {
+ LOG.debug("Going to execute update <" + q + ">");
+ stmt.execute(q);
+ }
+
+ // Need to register minimum open txnid for current transactions into MIN_HISTORY table.
+ s = "select min(txn_id) from TXNS where txn_state = " + quoteChar(TXN_OPEN);
+ LOG.debug("Going to execute query <" + s + ">");
+ rs = stmt.executeQuery(s);
+ if (!rs.next()) {
+ throw new IllegalStateException("Scalar query returned no rows?!?!!");
+ }
+
+ // TXNS table should have atleast one entry because we just inserted the newly opened txns.
+ // So, min(txn_id) would be a non-zero txnid.
+ long minOpenTxnId = rs.getLong(1);
+ assert (minOpenTxnId > 0);
+ rows.clear();
+ for (long txnId = first; txnId < first + numTxns; txnId++) {
+ rows.add(txnId + ", " + minOpenTxnId);
+ }
+
+ // Insert transaction entries into MIN_HISTORY_LEVEL.
+ List<String> inserts = sqlGenerator.createInsertValuesStmt(
+ "MIN_HISTORY_LEVEL (mhl_txnid, mhl_min_open_txnid)", rows);
+ for (String insert : inserts) {
+ LOG.debug("Going to execute insert <" + insert + ">");
+ stmt.execute(insert);
+ }
+ LOG.info("Added entries to MIN_HISTORY_LEVEL for current txns: (" + txnIds
+ + ") with min_open_txn: " + minOpenTxnId);
+
+ if (rqst.isSetReplPolicy()) {
+ List<String> rowsRepl = new ArrayList<>();
+
+ for (int i = 0; i < numTxns; i++) {
+ rowsRepl.add(
+ quoteString(rqst.getReplPolicy()) + "," + rqst.getReplSrcTxnIds().get(i) + "," + txnIds.get(i));
+ }
+
+ List<String> queriesRepl = sqlGenerator.createInsertValuesStmt(
+ "REPL_TXN_MAP (RTM_REPL_POLICY, RTM_SRC_TXN_ID, RTM_TARGET_TXN_ID)", rowsRepl);
+
+ for (String query : queriesRepl) {
+ LOG.info("Going to execute insert <" + query + ">");
+ stmt.execute(query);
+ }
+ }
+
+ if (transactionalListeners != null) {
+ MetaStoreListenerNotifier.notifyEventWithDirectSql(transactionalListeners,
+ EventMessage.EventType.OPEN_TXN, new OpenTxnEvent(txnIds, null), dbConn, sqlGenerator);
+ }
+ return txnIds;
+ } finally {
+ close(rs);
+ }
+ }
+
+ /**
+  * Looks up, in REPL_TXN_MAP, the target txn ids recorded for the given source txn ids under
+  * one replication policy. The lookup may be split into several IN-clause queries.
+  *
+  * @param replPolicy replication policy the mapping belongs to
+  * @param sourceTxnIdList source txn ids to look up
+  * @param stmt statement to run the queries on
+  * @return the target txn ids found (empty if none are mapped)
+  * @throws SQLException if a query fails; it is logged at WARN and rethrown unchanged
+  */
+ private List<Long> getTargetTxnIdList(String replPolicy, List<Long> sourceTxnIdList, Statement stmt)
+     throws SQLException {
+   ResultSet rs = null;
+   try {
+     StringBuilder prefix =
+         new StringBuilder("select RTM_TARGET_TXN_ID from REPL_TXN_MAP where ");
+     StringBuilder suffix =
+         new StringBuilder(" and RTM_REPL_POLICY = " + quoteString(replPolicy));
+     List<String> inQueries = new ArrayList<>();
+     TxnUtils.buildQueryWithINClause(conf, inQueries, prefix, suffix, sourceTxnIdList,
+         "RTM_SRC_TXN_ID", false, false);
+
+     List<Long> targetTxnIdList = new ArrayList<>();
+     for (String inQuery : inQueries) {
+       LOG.debug("Going to execute select <" + inQuery + ">");
+       rs = stmt.executeQuery(inQuery);
+       while (rs.next()) {
+         targetTxnIdList.add(rs.getLong(1));
+       }
+     }
+     LOG.debug("targetTxnid for srcTxnId " + sourceTxnIdList.toString() + " is " + targetTxnIdList.toString());
+     return targetTxnIdList;
+   } catch (SQLException e) {
+     LOG.warn("failed to get target txn ids " + e.getMessage());
+     throw e;
+   } finally {
+     close(rs);
+   }
+ }
+
+ /**
+  * Resolves the target txn id that a source txn id was replayed as under a replication policy.
+  *
+  * @param replPolicy replication policy the mapping belongs to
+  * @param sourceTxnId txn id on the source side
+  * @return the mapped target txn id, or -1 if no mapping exists
+  * @throws MetaException if the database access fails in a non-retryable way
+  */
+ @Override
+ @RetrySemantics.Idempotent
+ public long getTargetTxnId(String replPolicy, long sourceTxnId) throws MetaException {
+   try {
+     Connection dbConn = null;
+     Statement stmt = null;
+     try {
+       lockInternal();
+       dbConn = getDbConn(Connection.TRANSACTION_READ_COMMITTED);
+       stmt = dbConn.createStatement();
+       List<Long> mapped =
+           getTargetTxnIdList(replPolicy, Collections.singletonList(sourceTxnId), stmt);
+       if (!mapped.isEmpty()) {
+         // One source txn maps to exactly one target txn.
+         assert (mapped.size() == 1);
+         return mapped.get(0);
+       }
+       LOG.info("Txn {} not present for repl policy {}", sourceTxnId, replPolicy);
+       return -1;
+     } catch (SQLException e) {
+       LOG.debug("Going to rollback");
+       rollbackDBConn(dbConn);
+       // Throws RetryException when the failure is worth retrying.
+       checkRetryable(dbConn, e, "getTargetTxnId(" + replPolicy + sourceTxnId + ")");
+       throw new MetaException("Unable to get target transaction id "
+           + StringUtils.stringifyException(e));
+     } finally {
+       close(null, stmt, dbConn);
+       unlockInternal();
+     }
+   } catch (RetryException e) {
+     return getTargetTxnId(replPolicy, sourceTxnId);
+   }
+ }
+
+ /**
+  * Aborts a single transaction.
+  *
+  * <p>For replication requests the txn id in the request is a source txn id: it is first
+  * translated to the target txn id through REPL_TXN_MAP (a missing mapping is logged and
+  * treated as nothing to do), and the mapping row is removed once the target txn is aborted.
+  * If the target txn turns out to be aborted already, the stale mapping row is still removed
+  * and that removal is committed before returning, so the cleanup does not depend on what the
+  * connection does with an open transaction when it is closed.
+  *
+  * @param rqst the abort request; carries the txn id and optionally a replication policy
+  * @throws NoSuchTxnException declared for callers; raised outside the code visible here
+  * @throws TxnAbortedException declared for callers; raised outside the code visible here
+  * @throws MetaException if the database access fails in a non-retryable way
+  */
+ @Override
+ @RetrySemantics.Idempotent
+ public void abortTxn(AbortTxnRequest rqst) throws NoSuchTxnException, MetaException, TxnAbortedException {
+   long txnid = rqst.getTxnid();
+   long sourceTxnId = -1;
+   try {
+     Connection dbConn = null;
+     Statement stmt = null;
+     try {
+       lockInternal();
+       dbConn = getDbConn(Connection.TRANSACTION_READ_COMMITTED);
+       stmt = dbConn.createStatement();
+
+       // Statement removing the source->target mapping; only built for replication requests.
+       String deleteReplMapping = null;
+       if (rqst.isSetReplPolicy()) {
+         sourceTxnId = rqst.getTxnid();
+         List<Long> targetTxnIds = getTargetTxnIdList(rqst.getReplPolicy(),
+             Collections.singletonList(sourceTxnId), stmt);
+         if (targetTxnIds.isEmpty()) {
+           LOG.info("Target txn id is missing for source txn id : " + sourceTxnId +
+               " and repl policy " + rqst.getReplPolicy());
+           return;
+         }
+         assert targetTxnIds.size() == 1;
+         txnid = targetTxnIds.get(0);
+         deleteReplMapping = "delete from REPL_TXN_MAP where RTM_SRC_TXN_ID = " + sourceTxnId +
+             " and RTM_REPL_POLICY = " + quoteString(rqst.getReplPolicy());
+       }
+
+       if (abortTxns(dbConn, Collections.singletonList(txnid), true) != 1) {
+         TxnStatus status = findTxnState(txnid, stmt);
+         if (status == TxnStatus.ABORTED) {
+           if (deleteReplMapping != null) {
+             // in case of replication, idempotent is taken care by getTargetTxnId
+             LOG.warn("Invalid state ABORTED for transactions started using replication replay task");
+             LOG.info("Going to execute <" + deleteReplMapping + ">");
+             stmt.executeUpdate(deleteReplMapping);
+             // Persist the cleanup explicitly: returning without a commit would leave the
+             // outcome of the delete to the connection's close() behaviour.
+             LOG.debug("Going to commit");
+             dbConn.commit();
+           }
+           LOG.info("abortTxn(" + JavaUtils.txnIdToString(txnid) +
+               ") requested by it is already " + TxnStatus.ABORTED);
+           return;
+         }
+         raiseTxnUnexpectedState(status, txnid);
+       }
+
+       if (deleteReplMapping != null) {
+         LOG.info("Going to execute <" + deleteReplMapping + ">");
+         stmt.executeUpdate(deleteReplMapping);
+       }
+
+       if (transactionalListeners != null) {
+         MetaStoreListenerNotifier.notifyEventWithDirectSql(transactionalListeners,
+             EventMessage.EventType.ABORT_TXN, new AbortTxnEvent(txnid, null), dbConn, sqlGenerator);
+       }
+
+       LOG.debug("Going to commit");
+       dbConn.commit();
+     } catch (SQLException e) {
+       LOG.debug("Going to rollback");
+       rollbackDBConn(dbConn);
+       // Throws RetryException when the failure is worth retrying.
+       checkRetryable(dbConn, e, "abortTxn(" + rqst + ")");
+       throw new MetaException("Unable to update transaction database "
+           + StringUtils.stringifyException(e));
+     } finally {
+       close(null, stmt, dbConn);
+       unlockInternal();
+     }
+   } catch (RetryException e) {
+     abortTxn(rqst);
+   }
+ }
+
+ /**
+  * Aborts a batch of transactions in one database transaction. Ids that could not be aborted
+  * are not an error; a warning reports how many were affected. Transactional listeners are
+  * notified with an ABORT_TXN event for every id in the request.
+  *
+  * @param rqst request carrying the txn ids to abort
+  * @throws NoSuchTxnException declared for callers; not raised by the code visible here
+  * @throws MetaException if the database access fails in a non-retryable way
+  */
+ @Override
+ @RetrySemantics.Idempotent
+ public void abortTxns(AbortTxnsRequest rqst) throws NoSuchTxnException, MetaException {
+   List<Long> txnids = rqst.getTxn_ids();
+   try {
+     Connection dbConn = null;
+     try {
+       dbConn = getDbConn(Connection.TRANSACTION_READ_COMMITTED);
+       final int numAborted = abortTxns(dbConn, txnids, false);
+       final int numRequested = txnids.size();
+       if (numAborted != numRequested) {
+         LOG.warn("Abort Transactions command only aborted " + numAborted + " out of " +
+             numRequested + " transactions. It's possible that the other " +
+             (numRequested - numAborted) +
+             " transactions have been aborted or committed, or the transaction ids are invalid.");
+       }
+
+       if (transactionalListeners != null) {
+         for (Long txnId : txnids) {
+           MetaStoreListenerNotifier.notifyEventWithDirectSql(transactionalListeners,
+               EventMessage.EventType.ABORT_TXN, new AbortTxnEvent(txnId, null), dbConn, sqlGenerator);
+         }
+       }
+       LOG.debug("Going to commit");
+       dbConn.commit();
+     } catch (SQLException e) {
+       LOG.debug("Going to rollback");
+       rollbackDBConn(dbConn);
+       // Throws RetryException when the failure is worth retrying.
+       checkRetryable(dbConn, e, "abortTxns(" + rqst + ")");
+       throw new MetaException("Unable to update transaction database "
+           + StringUtils.stringifyException(e));
+     } finally {
+       closeDbConn(dbConn);
+     }
+   } catch (RetryException e) {
+     abortTxns(rqst);
+   }
+ }
+
+ /**
+ * Commits the transaction named in the request: detects write-write conflicts for
+ * update/delete transactions, moves TXN_COMPONENTS rows to COMPLETED_TXN_COMPONENTS and
+ * removes the per-transaction metadata rows.
+ *
+ * Concurrency/isolation notes:
+ * This is mutexed with {@link #openTxns(OpenTxnRequest)} and other {@link #commitTxn(CommitTxnRequest)}
+ * operations using select4update on NEXT_TXN_ID. Also, mutexes on TXNS table for specific txnid:X
+ * see more notes below.
+ * In order to prevent lost updates, we need to determine if any 2 transactions overlap. Each txn
+ * is viewed as an interval [M,N]. M is the txnid and N is taken from the same NEXT_TXN_ID sequence
+ * so that we can compare commit time of txn T with start time of txn S. This sequence can be thought of
+ * as a logical time counter. If S.commitTime < T.startTime, T and S do NOT overlap.
+ *
+ * Motivating example:
+ * Suppose we have multi-statement transactions T and S both of which are attempting x = x + 1
+ * In order to prevent lost update problem, the non-overlapping txns must lock in the snapshot
+ * that they read appropriately. In particular, if txns do not overlap, then one follows the other
+ * (assuming they write the same entity), and thus the 2nd must see changes of the 1st. We ensure
+ * this by locking in snapshot after
+ * {@link #openTxns(OpenTxnRequest)} call is made (see org.apache.hadoop.hive.ql.Driver.acquireLocksAndOpenTxn)
+ * and mutexing openTxn() with commit(). In other words, once a S.commit() starts we must ensure
+ * that txn T which will be considered a later txn, locks in a snapshot that includes the result
+ * of S's commit (assuming no other txns).
+ * As a counter example, suppose we have S[3,3] and T[4,4] (commitId=txnid means no other transactions
+ * were running in parallel). If T and S both locked in the same snapshot (for example commit of
+ * txnid:2, which is possible if commitTxn() and openTxns() is not mutexed)
+ * 'x' would be updated to the same value by both, i.e. lost update.
+ *
+ * @param rqst commit request; when a replication policy is set, its txn id is treated as the
+ *             source txn id and is mapped to the target txn id first
+ * @throws TxnAbortedException if a write-write conflict forces this transaction to abort
+ * @throws MetaException if the transaction database cannot be updated
+ */
+ @Override
+ @RetrySemantics.Idempotent("No-op if already committed")
+ public void commitTxn(CommitTxnRequest rqst)
+ throws NoSuchTxnException, TxnAbortedException, MetaException {
+ char isUpdateDelete = 'N';
+ long txnid = rqst.getTxnid();
+ long sourceTxnId = -1;
+
+ try {
+ Connection dbConn = null;
+ Statement stmt = null;
+ ResultSet lockHandle = null;
+ ResultSet commitIdRs = null, rs;
+ try {
+ lockInternal();
+ dbConn = getDbConn(Connection.TRANSACTION_READ_COMMITTED);
+ stmt = dbConn.createStatement();
+
+ // Replication flow: translate the source txn id into the txn id allocated on this side.
+ if (rqst.isSetReplPolicy()) {
+ sourceTxnId = rqst.getTxnid();
+ List<Long> targetTxnIds = getTargetTxnIdList(rqst.getReplPolicy(),
+ Collections.singletonList(sourceTxnId), stmt);
+ if (targetTxnIds.isEmpty()) {
+ // No mapping found: nothing is committed and the call returns silently.
+ LOG.info("Target txn id is missing for source txn id : " + sourceTxnId +
+ " and repl policy " + rqst.getReplPolicy());
+ return;
+ }
+ // NOTE(review): only enforced when the JVM runs with assertions enabled (-ea).
+ assert targetTxnIds.size() == 1;
+ txnid = targetTxnIds.get(0);
+ }
+
+ /**
+ * Runs at READ_COMMITTED with S4U on TXNS row for "txnid". S4U ensures that no other
+ * operation can change this txn (such as acquiring locks). While lock() and commitTxn()
+ * should not normally run concurrently (for same txn) but could due to bugs in the client
+ * which could then corrupt internal transaction manager state. Also competes with abortTxn().
+ */
+ lockHandle = lockTransactionRecord(stmt, txnid, TXN_OPEN);
+ if (lockHandle == null) {
+ //if here, txn was not found (in expected state)
+ TxnStatus actualTxnStatus = findTxnState(txnid, stmt);
+ if(actualTxnStatus == TxnStatus.COMMITTED) {
+ if (rqst.isSetReplPolicy()) {
+ // in case of replication, idempotent is taken care by getTargetTxnId
+ LOG.warn("Invalid state COMMITTED for transactions started using replication replay task");
+ }
+ /**
+ * This makes the operation idempotent
+ * (assume that this is most likely due to retry logic)
+ */
+ LOG.info("Nth commitTxn(" + JavaUtils.txnIdToString(txnid) + ") msg");
+ return;
+ }
+ // presumably raiseTxnUnexpectedState always throws, making the next call a safety net — verify
+ raiseTxnUnexpectedState(actualTxnStatus, txnid);
+ shouldNeverHappen(txnid);
+ //dbConn is rolled back in finally{}
+ }
+
+ // Conflict detection is skipped in the replication flow (rs stays null).
+ String conflictSQLSuffix = null;
+ if (rqst.isSetReplPolicy()) {
+ rs = null;
+ } else {
+ conflictSQLSuffix = "from TXN_COMPONENTS where tc_txnid=" + txnid + " and tc_operation_type IN(" +
+ quoteChar(OpertaionType.UPDATE.sqlConst) + "," + quoteChar(OpertaionType.DELETE.sqlConst) + ")";
+ rs = stmt.executeQuery(sqlGenerator.addLimitClause(1,
+ "tc_operation_type " + conflictSQLSuffix));
+ }
+ if (rs != null && rs.next()) {
+ isUpdateDelete = 'Y';
+ close(rs);
+ //if here it means currently committing txn performed update/delete and we should check WW conflict
+ /**
+ * This S4U will mutex with other commitTxn() and openTxns().
+ * -1 below makes txn intervals look like [3,3] [4,4] if all txns are serial
+ * Note: it's possible to have several txns have the same commit id. Suppose 3 txns start
+ * at the same time and no new txns start until all 3 commit.
+ * We could've incremented the sequence for commitId as well but it doesn't add anything functionally.
+ */
+ commitIdRs = stmt.executeQuery(sqlGenerator.addForUpdateClause("select ntxn_next - 1 from NEXT_TXN_ID"));
+ if (!commitIdRs.next()) {
+ throw new IllegalStateException("No rows found in NEXT_TXN_ID");
+ }
+ long commitId = commitIdRs.getLong(1);
+ // Savepoint lets the WRITE_SET rows inserted below be undone if a conflict is found.
+ Savepoint undoWriteSetForCurrentTxn = dbConn.setSavepoint();
+ /**
+ * "select distinct" is used below because
+ * 1. once we get to multi-statement txns, we only care to record that something was updated once
+ * 2. if {@link #addDynamicPartitions(AddDynamicPartitions)} is retried by caller it may create
+ * duplicate entries in TXN_COMPONENTS
+ * but we want to add a PK on WRITE_SET which won't have unique rows w/o this distinct
+ * even if it includes all of its columns
+ */
+ // NOTE(review): numCompsWritten is never read afterwards.
+ int numCompsWritten = stmt.executeUpdate(
+ "insert into WRITE_SET (ws_database, ws_table, ws_partition, ws_txnid, ws_commit_id, ws_operation_type)" +
+ " select distinct tc_database, tc_table, tc_partition, tc_txnid, " + commitId + ", tc_operation_type " + conflictSQLSuffix);
+ /**
+ * see if there are any overlapping txns wrote the same element, i.e. have a conflict
+ * Since entire commit operation is mutexed wrt other start/commit ops,
+ * committed.ws_commit_id <= current.ws_commit_id for all txns
+ * thus if committed.ws_commit_id < current.ws_txnid, transactions do NOT overlap
+ * For example, [17,20] is committed, [6,80] is being committed right now - these overlap
+ * [17,20] committed and [21,21] committing now - these do not overlap.
+ * [17,18] committed and [18,19] committing now - these overlap (here 18 started while 17 was still running)
+ */
+ rs = stmt.executeQuery
+ (sqlGenerator.addLimitClause(1, "committed.ws_txnid, committed.ws_commit_id, committed.ws_database," +
+ "committed.ws_table, committed.ws_partition, cur.ws_commit_id cur_ws_commit_id, " +
+ "cur.ws_operation_type cur_op, committed.ws_operation_type committed_op " +
+ "from WRITE_SET committed INNER JOIN WRITE_SET cur " +
+ "ON committed.ws_database=cur.ws_database and committed.ws_table=cur.ws_table " +
+ //For partitioned table we always track writes at partition level (never at table)
+ //and for non partitioned - always at table level, thus the same table should never
+ //have entries with partition key and w/o
+ "and (committed.ws_partition=cur.ws_partition or (committed.ws_partition is null and cur.ws_partition is null)) " +
+ "where cur.ws_txnid <= committed.ws_commit_id" + //txns overlap; could replace ws_txnid
+ // with txnid, though any decent DB should infer this
+ " and cur.ws_txnid=" + txnid + //make sure RHS of join only has rows we just inserted as
+ // part of this commitTxn() op
+ " and committed.ws_txnid <> " + txnid + //and LHS only has committed txns
+ //U+U and U+D is a conflict but D+D is not and we don't currently track I in WRITE_SET at all
+ " and (committed.ws_operation_type=" + quoteChar(OpertaionType.UPDATE.sqlConst) +
+ " OR cur.ws_operation_type=" + quoteChar(OpertaionType.UPDATE.sqlConst) + ")"));
+ if (rs.next()) {
+ //found a conflict
+ String committedTxn = "[" + JavaUtils.txnIdToString(rs.getLong(1)) + "," + rs.getLong(2) + "]";
+ StringBuilder resource = new StringBuilder(rs.getString(3)).append("/").append(rs.getString(4));
+ String partitionName = rs.getString(5);
+ if (partitionName != null) {
+ resource.append('/').append(partitionName);
+ }
+ String msg = "Aborting [" + JavaUtils.txnIdToString(txnid) + "," + rs.getLong(6) + "]" + " due to a write conflict on " + resource +
+ " committed by " + committedTxn + " " + rs.getString(7) + "/" + rs.getString(8);
+ close(rs);
+ //remove WRITE_SET info for current txn since it's about to abort
+ dbConn.rollback(undoWriteSetForCurrentTxn);
+ LOG.info(msg);
+ //todo: should make abortTxns() write something into TXNS.TXN_META_INFO about this
+ if (abortTxns(dbConn, Collections.singletonList(txnid), true) != 1) {
+ throw new IllegalStateException(msg + " FAILED!");
+ }
+ // The abort itself is committed before the conflict is reported to the caller.
+ dbConn.commit();
+ // Closed here and again by the finally block below.
+ close(null, stmt, dbConn);
+ throw new TxnAbortedException(msg);
+ } else {
+ //no conflicting operations, proceed with the rest of commit sequence
+ }
+ }
+ else {
+ /**
+ * current txn didn't update/delete anything (may have inserted), so just proceed with commit
+ *
+ * We only care about commit id for write txns, so for RO (when supported) txns we don't
+ * have to mutex on NEXT_TXN_ID.
+ * Consider: if RO txn is after a W txn, then RO's openTxns() will be mutexed with W's
+ * commitTxn() because both do S4U on NEXT_TXN_ID and thus RO will see result of W txn.
+ * If RO < W, then there is no reads-from relationship.
+ * In replication flow we don't expect any write write conflict as it should have been handled at source.
+ */
+ }
+
+ String s;
+ if (!rqst.isSetReplPolicy()) {
+ // Move the record from txn_components into completed_txn_components so that the compactor
+ // knows where to look to compact.
+ s = "insert into COMPLETED_TXN_COMPONENTS (ctc_txnid, ctc_database, " +
+ "ctc_table, ctc_partition, ctc_writeid, ctc_update_delete) select tc_txnid, tc_database, tc_table, " +
+ "tc_partition, tc_writeid, '" + isUpdateDelete + "' from TXN_COMPONENTS where tc_txnid = " + txnid;
+ LOG.debug("Going to execute insert <" + s + ">");
+
+ if ((stmt.executeUpdate(s)) < 1) {
+ //this can be reasonable for an empty txn START/COMMIT or read-only txn
+ //also an IUD with DP that didn't match any rows.
+ LOG.info("Expected to move at least one record from txn_components to " +
+ "completed_txn_components when committing txn! " + JavaUtils.txnIdToString(txnid));
+ }
+ } else {
+ // Replication flow: completed components come from the request's write event infos.
+ if (rqst.isSetWriteEventInfos()) {
+ List<String> rows = new ArrayList<>();
+ for (WriteEventInfo writeEventInfo : rqst.getWriteEventInfos()) {
+ // NOTE(review): getPartition() may be null for unpartitioned tables — confirm how
+ // quoteString() renders null before relying on ctc_partition here.
+ rows.add(txnid + "," + quoteString(writeEventInfo.getDatabase()) + "," +
+ quoteString(writeEventInfo.getTable()) + "," +
+ quoteString(writeEventInfo.getPartition()) + "," +
+ writeEventInfo.getWriteId() + "," +
+ "'" + isUpdateDelete + "'");
+ }
+ List<String> queries = sqlGenerator.createInsertValuesStmt("COMPLETED_TXN_COMPONENTS " +
+ "(ctc_txnid," + " ctc_database, ctc_table, ctc_partition, ctc_writeid, ctc_update_delete)", rows);
+ for (String q : queries) {
+ LOG.debug("Going to execute insert <" + q + "> ");
+ stmt.execute(q);
+ }
+ }
+
+ // The source-to-target txn mapping is no longer needed once the txn is committed.
+ s = "delete from REPL_TXN_MAP where RTM_SRC_TXN_ID = " + sourceTxnId +
+ " and RTM_REPL_POLICY = " + quoteString(rqst.getReplPolicy());
+ LOG.info("Repl going to execute <" + s + ">");
+ stmt.executeUpdate(s);
+ }
+
+ // cleanup all txn related metadata
+ s = "delete from TXN_COMPONENTS where tc_txnid = " + txnid;
+ LOG.debug("Going to execute update <" + s + ">");
+ stmt.executeUpdate(s);
+ s = "delete from HIVE_LOCKS where hl_txnid = " + txnid;
+ LOG.debug("Going to execute update <" + s + ">");
+ stmt.executeUpdate(s);
+ s = "delete from TXNS where txn_id = " + txnid;
+ LOG.debug("Going to execute update <" + s + ">");
+ stmt.executeUpdate(s);
+ s = "delete from MIN_HISTORY_LEVEL where mhl_txnid = " + txnid;
+ LOG.debug("Going to execute update <" + s + ">");
+ stmt.executeUpdate(s);
+ LOG.info("Removed committed transaction: (" + txnid + ") from MIN_HISTORY_LEVEL");
+
+ s = "delete from MATERIALIZATION_REBUILD_LOCKS where mrl_txn_id = " + txnid;
+ LOG.debug("Going to execute update <" + s + ">");
+ stmt.executeUpdate(s);
+
+ // The COMMIT_TXN event is written on the same connection, i.e. in the same DB transaction.
+ if (transactionalListeners != null) {
+ MetaStoreListenerNotifier.notifyEventWithDirectSql(transactionalListeners,
+ EventMessage.EventType.COMMIT_TXN, new CommitTxnEvent(txnid, null), dbConn, sqlGenerator);
+ }
+
+ LOG.debug("Going to commit");
+ close(rs);
+ dbConn.commit();
+ } catch (SQLException e) {
+ LOG.debug("Going to rollback");
+ rollbackDBConn(dbConn);
+ checkRetryable(dbConn, e, "commitTxn(" + rqst + ")");
+ throw new MetaException("Unable to update transaction database "
+ + StringUtils.stringifyException(e));
+ } finally {
+ close(commitIdRs);
+ close(lockHandle, stmt, dbConn);
+ unlockInternal();
+ }
+ } catch (RetryException e) {
+ // Retry the whole operation from scratch.
+ commitTxn(rqst);
+ }
+ }
+
+ /**
+  * Applies a replicated write id snapshot to a table: every aborted write id of the snapshot is
+  * mapped to a freshly opened and immediately aborted txn, and NEXT_WRITE_ID is seeded with the
+  * snapshot's high water mark + 1. If NEXT_WRITE_ID already has a row for the table, the call is
+  * treated as a repeat and returns without changes. When aborted write ids exist, a MAJOR
+  * compaction is requested afterwards (per partition if partition names are set).
+  *
+  * @param rqst info on table/partitions and writeid snapshot to replicate.
+  * @throws MetaException if the transaction database could not be updated
+  */
+ @Override
+ @RetrySemantics.Idempotent("No-op if already replicated the writeid state")
+ public void replTableWriteIdState(ReplTblWriteIdStateRequest rqst) throws MetaException {
+   final String dbName = rqst.getDbName().toLowerCase();
+   final String tblName = rqst.getTableName().toLowerCase();
+   final ValidWriteIdList writeIdSnapshot = new ValidReaderWriteIdList(rqst.getValidWriteIdlist());
+
+   // Aborted write ids arrive already sorted in ascending order.
+   final List<Long> abortedWriteIds = getAbortedWriteIds(writeIdSnapshot);
+   final int numAbortedWrites = abortedWriteIds.size();
+   try {
+     Connection dbConn = null;
+     Statement stmt = null;
+     ResultSet rs = null;
+     TxnStore.MutexAPI.LockHandle handle = null;
+     try {
+       lockInternal();
+       dbConn = getDbConn(Connection.TRANSACTION_READ_COMMITTED);
+       stmt = dbConn.createStatement();
+
+       // An existing NEXT_WRITE_ID row means the state was replicated before: nothing to do.
+       String sql = "select nwi_next from NEXT_WRITE_ID where nwi_database = " + quoteString(dbName)
+           + " and nwi_table = " + quoteString(tblName);
+       LOG.debug("Going to execute query <" + sql + ">");
+
+       rs = stmt.executeQuery(sql);
+       if (rs.next()) {
+         LOG.info("Idempotent flow: WriteId state <" + writeIdSnapshot + "> is already applied for the table: "
+             + dbName + "." + tblName);
+         rollbackDBConn(dbConn);
+         return;
+       }
+
+       if (numAbortedWrites > 0) {
+         // One txn is opened per aborted write id; aborting it later marks the write id aborted.
+         List<Long> txnIds = openTxns(dbConn, stmt,
+             new OpenTxnRequest(numAbortedWrites, rqst.getUser(), rqst.getHostName()));
+         assert(numAbortedWrites == txnIds.size());
+
+         // Pair the i-th opened txn with the i-th aborted write id.
+         List<String> rows = new ArrayList<>();
+         for (int idx = 0; idx < txnIds.size(); idx++) {
+           long txn = txnIds.get(idx);
+           long writeId = abortedWriteIds.get(idx);
+           rows.add(txn + ", " + quoteString(dbName) + ", " + quoteString(tblName) + ", " + writeId);
+           LOG.info("Allocated writeID: " + writeId + " for txnId: " + txn);
+         }
+
+         // Persist the txn -> write id pairs.
+         List<String> insertStatements = sqlGenerator.createInsertValuesStmt(
+             "TXN_TO_WRITE_ID (t2w_txnid, t2w_database, t2w_table, t2w_writeid)", rows);
+         for (String insertStatement : insertStatements) {
+           LOG.debug("Going to execute insert <" + insertStatement + ">");
+           stmt.execute(insertStatement);
+         }
+
+         // Abort the helper txns so their write ids are seen as aborted.
+         int numAborts = abortTxns(dbConn, txnIds, true);
+         assert(numAborts == numAbortedWrites);
+       }
+       handle = getMutexAPI().acquireLock(MUTEX_KEY.WriteIdAllocator.name());
+
+       // NEXT_WRITE_ID is expected to have no row for this table yet, so insert directly.
+       // The next write id to hand out is the snapshot's high water mark + 1.
+       long nextWriteId = writeIdSnapshot.getHighWatermark() + 1;
+
+       sql = "insert into NEXT_WRITE_ID (nwi_database, nwi_table, nwi_next) values ("
+           + quoteString(dbName) + "," + quoteString(tblName) + ","
+           + Long.toString(nextWriteId) + ")";
+       LOG.debug("Going to execute insert <" + sql + ">");
+       stmt.execute(sql);
+
+       LOG.info("WriteId state <" + writeIdSnapshot + "> is applied for the table: " + dbName + "." + tblName);
+       LOG.debug("Going to commit");
+       dbConn.commit();
+     } catch (SQLException sqlFailure) {
+       LOG.debug("Going to rollback");
+       rollbackDBConn(dbConn);
+       checkRetryable(dbConn, sqlFailure, "replTableWriteIdState(" + rqst + ")");
+       throw new MetaException("Unable to update transaction database "
+           + StringUtils.stringifyException(sqlFailure));
+     } finally {
+       close(rs, stmt, dbConn);
+       if (handle != null) {
+         handle.releaseLocks();
+       }
+       unlockInternal();
+     }
+   } catch (RetryException retry) {
+     replTableWriteIdState(rqst);
+   }
+
+   // Without aborted write ids there is no aborted data to clean up.
+   if (numAbortedWrites <= 0) {
+     return;
+   }
+
+   // Schedule Major compaction on all the partitions/table to clean aborted data
+   CompactionRequest compactRqst = new CompactionRequest(rqst.getDbName(), rqst.getTableName(),
+       CompactionType.MAJOR);
+   if (!rqst.isSetPartNames()) {
+     compact(compactRqst);
+   } else {
+     for (String partName : rqst.getPartNames()) {
+       compactRqst.setPartitionname(partName);
+       compact(compactRqst);
+     }
+   }
+ }
+
+ private List<Long> getAbortedWriteIds(ValidWriteIdList validWriteIdList) {
+ List<Long> abortedWriteIds = new ArrayList<>();
+ for (long writeId : validWriteIdList.getInvalidWriteIds()) {
+ if (validWriteIdList.isWriteIdAborted(writeId)) {
+ abortedWriteIds.add(writeId);
+ }
+ }
+ return abortedWriteIds;
+ }
+
+ /**
+  * Computes the valid write id list for each table named in the request. The result is based
+  * on the txn snapshot supplied by the caller or, if none is set, on the currently open txns.
+  * Nothing is written: the connection is rolled back before returning.
+  *
+  * @param rqst request holding the full table names and, optionally, a valid txn list
+  * @return one {@code TableValidWriteIds} entry per requested table, in request order
+  * @throws MetaException if the transaction database cannot be read
+  */
+ @Override
+ @RetrySemantics.ReadOnly
+ public GetValidWriteIdsResponse getValidWriteIds(GetValidWriteIdsRequest rqst)
+     throws NoSuchTxnException, MetaException {
+   try {
+     Connection dbConn = null;
+     Statement stmt = null;
+     ValidTxnList txnSnapshot;
+
+     // The write id lists are derived from the caller's txn snapshot. A caller that runs
+     // outside of any txn passes none, so the snapshot is built from the current open txns.
+     if (!rqst.isSetValidTxnList()) {
+       // Passing 0 for currentTxn means, this validTxnList is not wrt to any txn
+       txnSnapshot = TxnUtils.createValidReadTxnList(getOpenTxns(), 0);
+     } else {
+       txnSnapshot = new ValidReadTxnList(rqst.getValidTxnList());
+     }
+     try {
+       /**
+        * This runs at READ_COMMITTED for exactly the same reason as {@link #getOpenTxnsInfo()}
+        */
+       dbConn = getDbConn(Connection.TRANSACTION_READ_COMMITTED);
+       stmt = dbConn.createStatement();
+
+       // One entry per table read by the current txn.
+       List<TableValidWriteIds> perTableWriteIds = new ArrayList<>();
+       for (String tableName : rqst.getFullTableNames()) {
+         TableValidWriteIds tableWriteIds = getValidWriteIdsForTable(stmt, tableName, txnSnapshot);
+         perTableWriteIds.add(tableWriteIds);
+       }
+
+       LOG.debug("Going to rollback");
+       dbConn.rollback();
+       return new GetValidWriteIdsResponse(perTableWriteIds);
+     } catch (SQLException sqlFailure) {
+       LOG.debug("Going to rollback");
+       rollbackDBConn(dbConn);
+       checkRetryable(dbConn, sqlFailure, "getValidWriteIds");
+       throw new MetaException("Unable to select from transaction database, "
+           + StringUtils.stringifyException(sqlFailure));
+     } finally {
+       close(null, stmt, dbConn);
+     }
+   } catch (RetryException retry) {
+     return getValidWriteIds(rqst);
+   }
+ }
+
+ /**
+  * Builds the valid write id list for a single table as seen through the given txn snapshot.
+  *
+  * The write id high water mark is the highest write id allocated by a txn at or below the
+  * snapshot's txn high water mark; if there is none, NEXT_WRITE_ID.nwi_next - 1 is used, and
+  * 0 if the table has no NEXT_WRITE_ID row either. Every write id at or below that mark whose
+  * txn is not valid in the snapshot is reported as invalid, flagged as aborted when the txn is
+  * aborted, and otherwise contributes to the minimum open write id.
+  *
+  * @param stmt statement used to run the queries; the caller owns and closes it
+  * @param fullTableName table name, expected to be of format <db_name>.<table_name>
+  * @param validTxnList txn snapshot the write ids are evaluated against
+  * @return the write id state of the table
+  * @throws SQLException if any of the queries fails
+  */
+ private TableValidWriteIds getValidWriteIdsForTable(Statement stmt, String fullTableName,
+     ValidTxnList validTxnList) throws SQLException {
+   ResultSet rs = null;
+   String[] names = TxnUtils.getDbTableName(fullTableName);
+   try {
+     // Need to initialize to 0 to make sure if nobody modified this table, then current txn
+     // shouldn't read any data.
+     // If there is a conversion from non-acid to acid table, then by default 0 would be assigned as
+     // writeId for data from non-acid table and so writeIdHwm=0 would ensure those data are readable by any txns.
+     long writeIdHwm = 0;
+     List<Long> invalidWriteIdList = new ArrayList<>();
+     long minOpenWriteId = Long.MAX_VALUE;
+     BitSet abortedBits = new BitSet();
+     long txnHwm = validTxnList.getHighWatermark();
+
+     // Find the writeId high water mark based upon txnId high water mark. If found, then, need to
+     // traverse through all write Ids less than writeId HWM to make exceptions list.
+     // writeHWM = max(TXN_TO_WRITE_ID.t2w_writeid under txnHwm), falling back to
+     // NEXT_WRITE_ID.nwi_next-1 when no such write id exists.
+     String s = "select max(t2w_writeid) from TXN_TO_WRITE_ID where t2w_txnid <= " + txnHwm
+         + " and t2w_database = " + quoteString(names[0])
+         + " and t2w_table = " + quoteString(names[1]);
+     LOG.debug("Going to execute query<" + s + ">");
+     rs = stmt.executeQuery(s);
+     if (rs.next()) {
+       // max() over no rows is SQL NULL, which getLong() reports as 0.
+       writeIdHwm = rs.getLong(1);
+     }
+
+     // If no writeIds allocated by txns under txnHwm, then find writeHwm from NEXT_WRITE_ID.
+     if (writeIdHwm <= 0) {
+       // Need to subtract 1 as nwi_next would be the next write id to be allocated but we need highest
+       // allocated write id.
+       s = "select nwi_next-1 from NEXT_WRITE_ID where nwi_database = " + quoteString(names[0])
+           + " and nwi_table = " + quoteString(names[1]);
+       LOG.debug("Going to execute query<" + s + ">");
+       rs = stmt.executeQuery(s);
+       if (rs.next()) {
+         long maxWriteId = rs.getLong(1);
+         if (maxWriteId > 0) {
+           // writeIdHwm is known to be <= 0 here, so the fallback value is taken as is.
+           writeIdHwm = maxWriteId;
+         }
+       }
+     }
+
+     // As writeIdHwm is known, query all writeIds under the writeId HWM.
+     // If any writeId under HWM is allocated by txn > txnId HWM or belongs to open/aborted txns,
+     // then will be added to invalid list. The results should be sorted in ascending order based
+     // on write id. The sorting is needed as exceptions list in ValidWriteIdList would be looked-up
+     // using binary search.
+     s = "select t2w_txnid, t2w_writeid from TXN_TO_WRITE_ID where t2w_writeid <= " + writeIdHwm
+         + " and t2w_database = " + quoteString(names[0])
+         + " and t2w_table = " + quoteString(names[1])
+         + " order by t2w_writeid asc";
+
+     LOG.debug("Going to execute query<" + s + ">");
+     rs = stmt.executeQuery(s);
+     while (rs.next()) {
+       long txnId = rs.getLong(1);
+       long writeId = rs.getLong(2);
+       if (validTxnList.isTxnValid(txnId)) {
+         // Skip if the transaction under evaluation is already committed.
+         continue;
+       }
+
+       // The current txn is either in open or aborted state.
+       // Mark the write ids state as per the txn state.
+       invalidWriteIdList.add(writeId);
+       if (validTxnList.isTxnAborted(txnId)) {
+         // Bit position == index of the write id inside invalidWriteIdList.
+         abortedBits.set(invalidWriteIdList.size() - 1);
+       } else {
+         minOpenWriteId = Math.min(minOpenWriteId, writeId);
+       }
+     }
+
+     ByteBuffer byteBuffer = ByteBuffer.wrap(abortedBits.toByteArray());
+     TableValidWriteIds owi = new TableValidWriteIds(fullTableName, writeIdHwm, invalidWriteIdList, byteBuffer);
+     // minOpenWriteId is only set when at least one open (non-aborted) write id was seen.
+     if (minOpenWriteId < Long.MAX_VALUE) {
+       owi.setMinOpenWriteId(minOpenWriteId);
+     }
+     return owi;
+   } finally {
+     close(rs);
+   }
+ }
+
+ /**
+ * Allocates a table write id for each transaction of the request, or returns the write ids
+ * that were already allocated to them for the same db.table.
+ *
+ * In the replication flow (repl policy set) the request carries source txn ids that are first
+ * mapped to target txn ids, and the allocated write id range is checked against the source
+ * range at its first and last element.
+ *
+ * @param rqst request naming the table and either the txn ids or, for replication, the
+ *             source txn to write id list
+ * @return the txn id to write id mapping for all requested transactions
+ * @throws MetaException if the transaction database cannot be updated
+ */
+ @Override
+ @RetrySemantics.Idempotent
+ public AllocateTableWriteIdsResponse allocateTableWriteIds(AllocateTableWriteIdsRequest rqst)
+ throws NoSuchTxnException, TxnAbortedException, MetaException {
+ List<Long> txnIds;
+ // NEXT_WRITE_ID / TXN_TO_WRITE_ID are keyed by the lower-cased names.
+ String dbName = rqst.getDbName().toLowerCase();
+ String tblName = rqst.getTableName().toLowerCase();
+ try {
+ Connection dbConn = null;
+ Statement stmt = null;
+ ResultSet rs = null;
+ TxnStore.MutexAPI.LockHandle handle = null;
+ List<TxnToWriteId> txnToWriteIds = new ArrayList<>();
+ List<TxnToWriteId> srcTxnToWriteIds = null;
+ try {
+ lockInternal();
+ dbConn = getDbConn(Connection.TRANSACTION_READ_COMMITTED);
+ stmt = dbConn.createStatement();
+
+ if (rqst.isSetReplPolicy()) {
+ srcTxnToWriteIds = rqst.getSrcTxnToWriteIdList();
+ List<Long> srcTxnIds = new ArrayList<>();
+ // NOTE(review): these request-shape checks only run when assertions are enabled (-ea).
+ assert (rqst.isSetSrcTxnToWriteIdList());
+ assert (!rqst.isSetTxnIds());
+ assert (!srcTxnToWriteIds.isEmpty());
+
+ for (TxnToWriteId txnToWriteId : srcTxnToWriteIds) {
+ srcTxnIds.add(txnToWriteId.getTxnId());
+ }
+ // Translate source txn ids into the txn ids opened on this side.
+ txnIds = getTargetTxnIdList(rqst.getReplPolicy(), srcTxnIds, stmt);
+ if (srcTxnIds.size() != txnIds.size()) {
+ LOG.warn("Target txn id is missing for source txn id : " + srcTxnIds.toString() +
+ " and repl policy " + rqst.getReplPolicy());
+ throw new RuntimeException("This should never happen for txnIds: " + txnIds);
+ }
+ } else {
+ assert (!rqst.isSetSrcTxnToWriteIdList());
+ assert (rqst.isSetTxnIds());
+ txnIds = rqst.getTxnIds();
+ }
+
+ // NOTE(review): in the non-replication flow this sorts the list obtained from the request in place.
+ Collections.sort(txnIds); //easier to read logs and for assumption done in replication flow
+
+ // Check if all the input txns are in open state. Write ID should be allocated only for open transactions.
+ if (!isTxnsInOpenState(txnIds, stmt)) {
+ // presumably ensureAllTxnsValid throws the specific exception; the RuntimeException is a fallback — verify
+ ensureAllTxnsValid(dbName, tblName, txnIds, stmt);
+ throw new RuntimeException("This should never happen for txnIds: " + txnIds);
+ }
+
+ long writeId;
+ String s;
+ long allocatedTxnsCount = 0;
+ long txnId;
+ List<String> queries = new ArrayList<>();
+ StringBuilder prefix = new StringBuilder();
+ StringBuilder suffix = new StringBuilder();
+
+ // Traverse the TXN_TO_WRITE_ID to see if any of the input txns already have allocated a
+ // write id for the same db.table. If yes, then need to reuse it else have to allocate new one
+ // The write id would have been already allocated in case of multi-statement txns where
+ // first write on a table will allocate write id and rest of the writes should re-use it.
+ prefix.append("select t2w_txnid, t2w_writeid from TXN_TO_WRITE_ID where"
+ + " t2w_database = " + quoteString(dbName)
+ + " and t2w_table = " + quoteString(tblName) + " and ");
+ // No suffix is needed; the append of an empty string leaves it empty.
+ suffix.append("");
+ TxnUtils.buildQueryWithINClause(conf, queries, prefix, suffix,
+ txnIds, "t2w_txnid", false, false);
+ for (String query : queries) {
+ LOG.debug("Going to execute query <" + query + ">");
+ rs = stmt.executeQuery(query);
+ while (rs.next()) {
+ // If table write ID is already allocated for the given transaction, then just use it
+ txnId = rs.getLong(1);
+ writeId = rs.getLong(2);
+ txnToWriteIds.add(new TxnToWriteId(txnId, writeId));
+ allocatedTxnsCount++;
+ LOG.info("Reused already allocated writeID: " + writeId + " for txnId: " + txnId);
+ }
+ }
+
+ // Batch allocation should always happen atomically. Either write ids for all txns is allocated or none.
+ long numOfWriteIds = txnIds.size();
+ // NOTE(review): the all-or-nothing invariant is only checked with assertions enabled; with a
+ // partial pre-allocation the code below would allocate new ids for every txn in the list.
+ assert ((allocatedTxnsCount == 0) || (numOfWriteIds == allocatedTxnsCount));
+ if (allocatedTxnsCount == numOfWriteIds) {
+ // If all the txns in the list have pre-allocated write ids for the given table, then just return.
+ // This is for idempotent case.
+ return new AllocateTableWriteIdsResponse(txnToWriteIds);
+ }
+
+ handle = getMutexAPI().acquireLock(MUTEX_KEY.WriteIdAllocator.name());
+
+ // There are some txns in the list which does not have write id allocated and hence go ahead and do it.
+ // Get the next write id for the given table and update it with new next write id.
+ // This is select for update query which takes a lock if the table entry is already there in NEXT_WRITE_ID
+ s = sqlGenerator.addForUpdateClause(
+ "select nwi_next from NEXT_WRITE_ID where nwi_database = " + quoteString(dbName)
+ + " and nwi_table = " + quoteString(tblName));
+ LOG.debug("Going to execute query <" + s + ">");
+ rs = stmt.executeQuery(s);
+ if (!rs.next()) {
+ // First allocation of write id should add the table to the next_write_id meta table
+ // The initial value for write id should be 1 and hence we add 1 with number of write ids allocated here
+ writeId = 1;
+ s = "insert into NEXT_WRITE_ID (nwi_database, nwi_table, nwi_next) values ("
+ + quoteString(dbName) + "," + quoteString(tblName) + "," + Long.toString(numOfWriteIds + 1) + ")";
+ LOG.debug("Going to execute insert <" + s + ">");
+ stmt.execute(s);
+ } else {
+ writeId = rs.getLong(1);
+ // Update the NEXT_WRITE_ID for the given table after incrementing by number of write ids allocated
+ s = "update NEXT_WRITE_ID set nwi_next = " + (writeId + numOfWriteIds)
+ + " where nwi_database = " + quoteString(dbName)
+ + " and nwi_table = " + quoteString(tblName);
+ LOG.debug("Going to execute update <" + s + ">");
+ stmt.executeUpdate(s);
+ }
+
+ // Map the newly allocated write ids against the list of txns which doesn't have pre-allocated
+ // write ids
+ // txnIds is sorted, so consecutive write ids are handed out in ascending txn id order.
+ List<String> rows = new ArrayList<>();
+ for (long txn : txnIds) {
+ rows.add(txn + ", " + quoteString(dbName) + ", " + quoteString(tblName) + ", " + writeId);
+ txnToWriteIds.add(new TxnToWriteId(txn, writeId));
+ LOG.info("Allocated writeID: " + writeId + " for txnId: " + txn);
+ writeId++;
+ }
+
+ if (rqst.isSetReplPolicy()) {
+ // Only the first and last write ids are compared against the source list.
+ int lastIdx = txnToWriteIds.size()-1;
+ if ((txnToWriteIds.get(0).getWriteId() != srcTxnToWriteIds.get(0).getWriteId()) ||
+ (txnToWriteIds.get(lastIdx).getWriteId() != srcTxnToWriteIds.get(lastIdx).getWriteId())) {
+ LOG.error("Allocated write id range {} is not matching with the input write id range {}.",
+ txnToWriteIds, srcTxnToWriteIds);
+ throw new IllegalStateException("Write id allocation failed for: " + srcTxnToWriteIds);
+ }
+ }
+
+ // Insert entries to TXN_TO_WRITE_ID for newly allocated write ids
+ List<String> inserts = sqlGenerator.createInsertValuesStmt(
+ "TXN_TO_WRITE_ID (t2w_txnid, t2w_database, t2w_table, t2w_writeid)", rows);
+ for (String insert : inserts) {
+ LOG.debug("Going to execute insert <" + insert + ">");
+ stmt.execute(insert);
+ }
+
+ // The ALLOC_WRITE_ID event is written on the same connection, before the commit below.
+ if (transactionalListeners != null) {
+ MetaStoreListenerNotifier.notifyEventWithDirectSql(transactionalListeners,
+ EventMessage.EventType.ALLOC_WRITE_ID,
+ new AllocWriteIdEvent(txnToWriteIds, rqst.getDbName(), rqst.getTableName(), null),
+ dbConn, sqlGenerator);
+ }
+
+ LOG.debug("Going to commit");
+ dbConn.commit();
+ return new AllocateTableWriteIdsResponse(txnToWriteIds);
+ } catch (SQLException e) {
+ LOG.debug("Going to rollback");
+ rollbackDBConn(dbConn);
+ checkRetryable(dbConn, e, "allocateTableWriteIds(" + rqst + ")");
+ throw new MetaException("Unable to update transaction database "
+ + StringUtils.stringifyException(e));
+ } finally {
+ close(rs, stmt, dbConn);
+ if(handle != null) {
+ handle.releaseLocks();
+ }
+ unlockInternal();
+ }
+ } catch (RetryException e) {
+ // Retry the whole allocation from scratch.
+ return allocateTableWriteIds(rqst);
+ }
+ }
+ /**
+  * Seeds NEXT_WRITE_ID for a table that is being converted from non-acid to acid, so that the
+  * next write id handed out for it is the request's seed write id + 1.
+  *
+  * The database and table names are lower-cased before they are stored, matching the keys that
+  * {@code allocateTableWriteIds} and {@code replTableWriteIdState} use for the same table;
+  * otherwise a mixed-case seed row would never be found by the allocator.
+  *
+  * @param rqst request naming the table and carrying the seed write id
+  * @throws MetaException if the transaction database could not be updated
+  */
+ @Override
+ public void seedWriteIdOnAcidConversion(InitializeTableWriteIdsRequest rqst)
+     throws MetaException {
+   String dbName = rqst.getDbName().toLowerCase();
+   String tblName = rqst.getTblName().toLowerCase();
+   try {
+     Connection dbConn = null;
+     Statement stmt = null;
+     TxnStore.MutexAPI.LockHandle handle = null;
+     try {
+       lockInternal();
+       dbConn = getDbConn(Connection.TRANSACTION_READ_COMMITTED);
+       stmt = dbConn.createStatement();
+
+       handle = getMutexAPI().acquireLock(MUTEX_KEY.WriteIdAllocator.name());
+       //since this is on conversion from non-acid to acid, NEXT_WRITE_ID should not have an entry
+       //for this table. It also has a unique index in case 'should not' is violated
+
+       // First allocation of write id should add the table to the next_write_id meta table.
+       // nwi_next holds the next write id to be allocated, hence seed write id + 1.
+       String s = "insert into NEXT_WRITE_ID (nwi_database, nwi_table, nwi_next) values ("
+           + quoteString(dbName) + "," + quoteString(tblName) + "," +
+           Long.toString(rqst.getSeeWriteId() + 1) + ")";
+       LOG.debug("Going to execute insert <" + s + ">");
+       stmt.execute(s);
+       LOG.debug("Going to commit");
+       dbConn.commit();
+     } catch (SQLException e) {
+       LOG.debug("Going to rollback");
+       rollbackDBConn(dbConn);
+       checkRetryable(dbConn, e, "seedWriteIdOnAcidConversion(" + rqst + ")");
+       throw new MetaException("Unable to update transaction database "
+           + StringUtils.stringifyException(e));
+     } finally {
+       close(null, stmt, dbConn);
+       if (handle != null) {
+         handle.releaseLocks();
+       }
+       unlockInternal();
+     }
+   } catch (RetryException e) {
+     // Retry the whole operation from scratch.
+     seedWriteIdOnAcidConversion(rqst);
+   }
+
+ }
+ /**
+  * Publishes an ACID_WRITE event to the transactional listeners through direct SQL
+  * on a READ_COMMITTED connection and commits it.
+  * On a duplicate-key SQLException the call waits via {@code waitForRetry} and is re-run
+  * if that returns true; otherwise the retry counter is reset and a MetaException is thrown.
+  *
+  * @param acidWriteEvent the write event handed to the listeners
+  * @throws MetaException if the event cannot be written and no retry is triggered
+  */
+ @Override
+ @RetrySemantics.Idempotent
+ public void addWriteNotificationLog(AcidWriteEvent acidWriteEvent)
+     throws MetaException {
+   Connection dbConn = null;
+   try {
+     try {
+       // Idempotent case is handled by notify Event
+       lockInternal();
+       dbConn = getDbConn(Connection.TRANSACTION_READ_COMMITTED);
+       MetaStoreListenerNotifier.notifyEventWithDirectSql(transactionalListeners,
+           EventMessage.EventType.ACID_WRITE, acidWriteEvent, dbConn, sqlGenerator);
+       LOG.debug("Going to commit");
+       dbConn.commit();
+     } catch (SQLException e) {
+       LOG.debug("Going to rollback");
+       rollbackDBConn(dbConn);
+       final String caller = "addWriteNotificationLog(" + acidWriteEvent + ")";
+       if (!isDuplicateKeyError(e)) {
+         checkRetryable(dbConn, e, caller);
+         throw new MetaException("Unable to add write notification event " + StringUtils.stringifyException(e));
+       }
+       // A duplicate key may be the result of a race condition, so try again first.
+       if (waitForRetry(caller, e.getMessage())) {
+         throw new RetryException();
+       }
+       retryNum = 0;
+       throw new MetaException(e.getMessage());
+     } finally {
+       closeDbConn(dbConn);
+       unlockInternal();
+     }
+   } catch (RetryException e) {
+     addWriteNotificationLog(acidWriteEvent);
+   }
+ }
+
+ @Override
+ @RetrySemantics.SafeToRetry
+ public void performWriteSetGC() {
+ Connection dbConn = null;
+ Statement stmt = null;
+ ResultSet rs = null;
+ try {
+ dbConn = getDbConn(Connection.TRANSACTION_READ_COMMITTED);
+ stmt = dbConn.createStatement();
+ rs = stmt.executeQuery("select ntxn_next - 1 from NEXT_TXN_ID");
+ if(!rs.next()) {
+ throw new IllegalStateException("NEXT_TXN_ID is empty: DB is corrupted");
+ }
+ long highestAllocatedTxnId = rs.getLong(1);
+ close(rs);
+ rs = stmt.executeQuery("select min(txn_id) from TXNS where txn_state=" + quoteChar(TXN_OPEN));
+ if(!rs.next()) {
+ throw new IllegalStateException("Scalar query returned no rows?!?!!");
+ }
+ long commitHighWaterMark;//all currently open txns (if any) have txnid >= than commitHighWaterMark
+ long lowestOpenTxnId = rs.getLong(1);
+ if(rs.wasNull()) {
+ //if here then there are no Open txns and highestAllocatedTxnId must be
+ //resolved (i.e. committed or aborted), either way
+ //there are no open txns with id <= highestAllocatedTxnId
+ //the +1 is there because "delete ..." below has < (which is correct for the case when
+ //there is an open txn
+ //Concurrency: even if new txn starts (or starts + commits) it is still true that
+ //there are no currently open txns that overlap with any committed txn with
+ //commitId <= commitHighWaterMark (as set on next line). So plain READ_COMMITTED is enough.
+ commitHighWaterMark = highestAllocatedTxnId + 1;
+ }
+ else {
+ commitHighWaterMark = lowestOpenTxnId;
+ }
+ int delCnt = stmt.executeUpdate("delete from WRITE_SET where ws_commit_id < " + commitHighWaterMark);
+ LOG.info("Deleted " + delCnt + " obsolete rows from WRTIE_SET");
+ dbConn.commit();
+ } catch (SQLException ex) {
+ LOG.warn("WriteSet GC failed due to " + getMessage(ex), ex);
+ }
+ finally {
+ close(rs, stmt, dbConn);
+ }
+ }
+
+ /**
+  * Get invalidation info for the materialization. Currently, the materialization information
+  * only contains information about whether there was update/delete operations on the source
+  * tables used by the materialization since it was created.
+  *
+  * @param creationMetadata metadata of the materialization; supplies the source tables and the
+  *                         write id list captured at creation time
+  * @param validTxnListStr  serialized valid txn list describing the current snapshot
+  * @return a Materialization built from whether the composed query returned a row, or
+  *         {@code null} if the creation metadata has no tables or lacks a write id list
+  *         for one of them
+  * @throws MetaException if the query fails with a SQLException
+  */
+ @Override
+ @RetrySemantics.ReadOnly
+ public Materialization getMaterializationInvalidationInfo(
+     CreationMetadata creationMetadata, String validTxnListStr) throws MetaException {
+   if (creationMetadata.getTablesUsed().isEmpty()) {
+     // Bail out
+     LOG.warn("Materialization creation metadata does not contain any table");
+     return null;
+   }
+
+   // Parse validTxnList
+   final ValidReadTxnList validTxnList =
+       new ValidReadTxnList(validTxnListStr);
+
+   // Parse validReaderWriteIdList from creation metadata
+   final ValidTxnWriteIdList validReaderWriteIdList =
+       new ValidTxnWriteIdList(creationMetadata.getValidTxnList());
+
+   // We are composing a query that returns a single row if an update happened after
+   // the materialization was created. Otherwise, query returns 0 rows.
+   Connection dbConn = null;
+   Statement stmt = null;
+   ResultSet rs = null;
+   try {
+     dbConn = getDbConn(Connection.TRANSACTION_READ_COMMITTED);
+     stmt = dbConn.createStatement();
+     stmt.setMaxRows(1);
+     StringBuilder query = new StringBuilder();
+     // compose a query that select transactions containing an update...
+     query.append("select ctc_update_delete from COMPLETED_TXN_COMPONENTS where ctc_update_delete='Y' AND (");
+     int i = 0;
+     for (String fullyQualifiedName : creationMetadata.getTablesUsed()) {
+       // ...for each of the tables that are part of the materialized view,
+       // where the transaction had to be committed after the materialization was created...
+       if (i != 0) {
+         query.append("OR");
+       }
+       String[] names = TxnUtils.getDbTableName(fullyQualifiedName);
+       query.append(" (ctc_database=" + quoteString(names[0]) + " AND ctc_table=" + quoteString(names[1]));
+       ValidWriteIdList tblValidWriteIdList =
+           validReaderWriteIdList.getTableValidWriteIdList(fullyQualifiedName);
+       if (tblValidWriteIdList == null) {
+         // Pass the table name so the {} placeholder is actually filled in.
+         LOG.warn("ValidWriteIdList for table {} not present in creation metadata, this should not happen",
+             fullyQualifiedName);
+         return null;
+       }
+       query.append(" AND (ctc_writeid > " + tblValidWriteIdList.getHighWatermark());
+       query.append(tblValidWriteIdList.getInvalidWriteIds().length == 0 ? ") " :
+           " OR ctc_writeid IN(" + StringUtils.join(",",
+               Arrays.asList(ArrayUtils.toObject(tblValidWriteIdList.getInvalidWriteIds()))) + ") ");
+       query.append(") ");
+       i++;
+     }
+     // ... and where the transaction has already been committed as per snapshot taken
+     // when we are running current query
+     query.append(") AND ctc_txnid <= " + validTxnList.getHighWatermark());
+     query.append(validTxnList.getInvalidTransactions().length == 0 ? " " :
+         " AND ctc_txnid NOT IN(" + StringUtils.join(",",
+             Arrays.asList(ArrayUtils.toObject(validTxnList.getInvalidTransactions()))) + ") ");
+
+     // Execute query
+     String s = query.toString();
+     if (LOG.isDebugEnabled()) {
+       LOG.debug("Going to execute query <" + s + ">");
+     }
+     rs = stmt.executeQuery(s);
+
+     // A returned row means an update/delete matched the predicates above.
+     return new Materialization(rs.next());
+   } catch (SQLException ex) {
+     LOG.warn("getMaterializationInvalidationInfo failed due to " + getMessage(ex), ex);
+     throw new MetaException("Unable to retrieve materialization invalidation information due to " +
+         StringUtils.stringifyException(ex));
+   } finally {
+     close(rs, stmt, dbConn);
+   }
+ }
+
+ /**
+  * Tries to record a rebuild lock for the given materialization in
+  * MATERIALIZATION_REBUILD_LOCKS. If a row for the database/table already exists the
+  * request is ignored; otherwise a row with the txn id and the current time as last
+  * heartbeat is inserted and committed.
+  *
+  * @param dbName    database of the materialization
+  * @param tableName table name of the materialization
+  * @param txnId     transaction performing the rebuild
+  * @return a LockResponse for {@code txnId} with state ACQUIRED if the row was inserted,
+  *         NOT_ACQUIRED if a row already existed
+  * @throws MetaException if a SQLException occurs while checking or inserting the lock row
+  */
+ @Override
+ public LockResponse lockMaterializationRebuild(String dbName, String tableName, long txnId)
+     throws MetaException {
+
+   if (LOG.isDebugEnabled()) {
+     LOG.debug("Acquiring lock for materialization rebuild with txnId={} for {}", txnId, Warehouse.getQualifiedName(dbName,tableName));
+   }
+
+   TxnStore.MutexAPI.LockHandle handle = null;
+   Connection dbConn = null;
+   Statement stmt = null;
+   ResultSet rs = null;
+   try {
+     lockInternal();
+     /*
+      * MUTEX_KEY.MaterializationRebuild lock ensures that there is only 1 entry in
+      * Initiated/Working state for any resource. This ensures we do not run concurrent
+      * rebuild operations on any materialization.
+      */
+     handle = getMutexAPI().acquireLock(MUTEX_KEY.MaterializationRebuild.name());
+     dbConn = getDbConn(Connection.TRANSACTION_READ_COMMITTED);
+     stmt = dbConn.createStatement();
+
+     String selectQ = "select mrl_txn_id from MATERIALIZATION_REBUILD_LOCKS where" +
+         " mrl_db_name =" + quoteString(dbName) +
+         " AND mrl_tbl_name=" + quoteString(tableName);
+     LOG.debug("Going to execute query <" + selectQ + ">");
+     rs = stmt.executeQuery(selectQ);
+     if (rs.next()) {
+       LOG.info("Ignoring request to rebuild " + dbName + "/" + tableName +
+           " since it is already being rebuilt");
+       return new LockResponse(txnId, LockState.NOT_ACQUIRED);
+     }
+     // Quote the names through quoteString, same as the select above, instead of
+     // hand-written single quotes.
+     String insertQ = "insert into MATERIALIZATION_REBUILD_LOCKS " +
+         "(mrl_txn_id, mrl_db_name, mrl_tbl_name, mrl_last_heartbeat) values (" + txnId +
+         ", " + quoteString(dbName) + ", " + quoteString(tableName) + ", " +
+         Instant.now().toEpochMilli() + ")";
+     LOG.debug("Going to execute update <" + insertQ + ">");
+     stmt.executeUpdate(insertQ);
+     LOG.debug("Going to commit");
+     dbConn.commit();
+     return new LockResponse(txnId, LockState.ACQUIRED);
+   } catch (SQLException ex) {
+     LOG.warn("lockMaterializationRebuild failed due to " + getMessage(ex), ex);
+     // Undo any partial work before the connection is closed in the finally block.
+     LOG.debug("Going to rollback");
+     rollbackDBConn(dbConn);
+     throw new MetaException("Unable to acquire materialization rebuild lock due to " +
+         StringUtils.stringifyException(ex));
+   } finally {
+     close(rs, stmt, dbConn);
+     if (handle != null) {
+       handle.releaseLocks();
+     }
+     unlockInternal();
+   }
+ }
+
+ /**
+  * Refreshes mrl_last_heartbeat to the current time for the rebuild lock row matching
+  * the given txn id, database and table. The call is re-run when a
+  * {@code RetryException} is raised.
+  *
+  * @param dbName    database of the materialization
+  * @param tableName table name of the materialization
+  * @param txnId     transaction that holds the rebuild lock
+  * @return {@code true} if at least one row was updated and committed, {@code false} if
+  *         no matching row was found (the transaction is rolled back in that case)
+  * @throws MetaException if the update fails with a SQLException and no retry is triggered
+  */
+ @Override
+ public boolean heartbeatLockMaterializationRebuild(String dbName, String tableName, long txnId)
+     throws MetaException {
+   try {
+     Connection dbConn = null;
+     Statement stmt = null;
+     ResultSet rs = null; // never assigned; only handed to close() below
+     try {
+       lockInternal();
+       dbConn = getDbConn(Connection.TRANSACTION_READ_COMMITTED);
+       stmt = dbConn.createStatement();
+       String heartbeatSql = "update MATERIALIZATION_REBUILD_LOCKS" +
+           " set mrl_last_heartbeat = " + Instant.now().toEpochMilli() +
+           " where mrl_txn_id = " + txnId +
+           " AND mrl_db_name =" + quoteString(dbName) +
+           " AND mrl_tbl_name=" + quoteString(tableName);
+       LOG.debug("Going to execute update <" + heartbeatSql + ">");
+       int updatedRows = stmt.executeUpdate(heartbeatSql);
+       if (updatedRows >= 1) {
+         // It could be renewed, return that information
+         LOG.debug("Going to commit");
+         dbConn.commit();
+         return true;
+       }
+       // It could not be renewed, return that information
+       LOG.debug("Going to rollback");
+       dbConn.rollback();
+       LOG.info("No lock found for rebuild of " + Warehouse.getQualifiedName(dbName, tableName) +
+           " when trying to heartbeat");
+       return false;
+     } catch (SQLException e) {
+       LOG.debug("Going to rollback");
+       rollbackDBConn(dbConn);
+       String caller = "heartbeatLockMaterializationRebuild("
+           + Warehouse.getQualifiedName(dbName, tableName) + ", " + txnId + ")";
+       checkRetryable(dbConn, e, caller);
+       throw new MetaException("Unable to heartbeat rebuild lock due to " +
+           StringUtils.stringifyException(e));
+     } finally {
+       close(rs, stmt, dbConn);
+       unlockInternal();
+     }
+   } catch (RetryException e) {
+     return heartbeatLockMaterializationRebuild(dbName, tableName, txnId);
+   }
+ }
+
+ @Override
+ public long cleanupMaterializationRebuildLocks(ValidTxnList validTxnList, long timeout) throws MetaException {
+ try {
+ // Aux values
+ long cnt = 0L;
+ List<Long> txnIds = new ArrayList<>();
+ long timeoutTime = Instant.now().toEpochMilli() - timeout;
+
+ Connection dbConn = null;
+ Statement stmt = null;
+ ResultSet rs = null;
+ try {
+ lockInternal();
+ dbConn = getDbConn(Connection.TRANSACTION_READ_COMMITTED);
+ stmt = dbConn.createStatement();
+
+ String selectQ = "select mrl_txn_id, mrl_last_heartbeat from MATERIALIZATION_REBUILD_LOCKS";
+ LOG.debug("Going to execute query <" + selectQ + ">");
+ rs = stmt.executeQuery(selectQ);
+ while(rs.next()) {
+ long lastHeartbeat = rs.getLong(2);
+ if (lastHeartbeat < timeoutTime) {
+ // The heartbeat has timeout, double check whether we can remove it
+ long txnId = rs.getLong(1);
+ if (validTxnList.isTxnValid(txnId) || validTxnList.isTxnAborted(txnId)) {
+ // Txn was committed (but notification was not received) or it was aborted.
+ // Either case, we can clean it up
+ txnIds.add(txnId);
+ }
+ }
+ }
+ if (!txnIds.isEmpty()) {
+ String deleteQ = "delete from MATERIALIZATION_REBUILD_LOCKS where" +
+ " mrl_txn_id IN(" + StringUtils.join(",", txnIds) + ") ";
+ LOG.debug("Going to execute update <" + deleteQ + ">");
+ cnt = stmt.executeUpdate(deleteQ);
+ }
+ LOG.debug("Going to commit");
+ dbConn.commit();
+ return cnt;
+ } catch (SQLException e) {
+ LOG.debug("Going to rollback");
+ rollbackDBConn(dbConn);
+ checkRetryable(dbConn, e, "cleanupMaterializationRebuildLocks");
+
<TRUNCATED>