You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ha...@apache.org on 2020/05/25 23:14:16 UTC
[hive] branch master updated: HIVE-23536 : Provide an option to
skip stats generation for major compaction (Peter Vary via Ashutosh
Chauhan)
This is an automated email from the ASF dual-hosted git repository.
hashutosh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 947b7a4 HIVE-23536 : Provide an option to skip stats generation for major compaction (Peter Vary via Ashutosh Chauhan)
947b7a4 is described below
commit 947b7a44896fa57bc4e2ddaa6014cc4cb2c7002e
Author: Peter Vary <pv...@cloudera.com>
AuthorDate: Mon May 25 16:13:32 2020 -0700
HIVE-23536 : Provide an option to skip stats generation for major compaction (Peter Vary via Ashutosh Chauhan)
Signed-off-by: Ashutosh Chauhan <ha...@apache.org>
---
.../java/org/apache/hadoop/hive/conf/HiveConf.java | 6 +++
.../hive/ql/txn/compactor/TestCompactor.java | 52 ++++++++++++++++++++++
.../hadoop/hive/ql/txn/compactor/CompactorMR.java | 5 ++-
.../hadoop/hive/ql/txn/compactor/Worker.java | 8 ++--
4 files changed, 67 insertions(+), 4 deletions(-)
diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index a00d907..8094d28 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -2908,6 +2908,12 @@ public class HiveConf extends Configuration {
HIVE_COMPACTOR_WAIT_TIMEOUT("hive.compactor.wait.timeout", 300000L, "Time out in "
+ "milliseconds for blocking compaction. It's value has to be higher than 2000 milliseconds. "),
+
+ HIVE_MR_COMPACTOR_GATHER_STATS("hive.mr.compactor.gather.stats", true, "If set to true MAJOR compaction " +
+ "will gather stats if there are stats already associated with the table/partition.\n" +
+ "Turn this off to save some resources and the stats are not used anyway.\n" +
+ "Works only for MR based compaction, CRUD based compaction uses hive.stats.autogather."),
+
/**
* @deprecated Use MetastoreConf.COMPACTOR_INITIATOR_FAILED_THRESHOLD
*/
diff --git a/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/txn/compactor/TestCompactor.java b/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/txn/compactor/TestCompactor.java
index c687f14..32fe535 100644
--- a/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/txn/compactor/TestCompactor.java
+++ b/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/txn/compactor/TestCompactor.java
@@ -129,6 +129,7 @@ public class TestCompactor {
hiveConf.setVar(HiveConf.ConfVars.POSTEXECHOOKS, "");
hiveConf.setVar(HiveConf.ConfVars.METASTOREWAREHOUSE, TEST_WAREHOUSE_DIR);
hiveConf.setVar(HiveConf.ConfVars.HIVEINPUTFORMAT, HiveInputFormat.class.getName());
+ hiveConf.setBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER, false);
TxnDbUtil.setConfValues(hiveConf);
TxnDbUtil.cleanDb(hiveConf);
@@ -1468,6 +1469,57 @@ public class TestCompactor {
}
}
+ @Test
+ public void testCompactorGatherStats() throws Exception {
+ String dbName = "default";
+ String tableName = "stats_comp_test";
+ List<String> colNames = Arrays.asList("a");
+ executeStatementOnDriver("drop table if exists " + dbName + "." + tableName, driver);
+ executeStatementOnDriver("create table " + dbName + "." + tableName +
+ " (a INT) STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver);
+ executeStatementOnDriver("insert into " + dbName + "." + tableName + " values(1)", driver);
+ executeStatementOnDriver("insert into " + dbName + "." + tableName + " values(1)", driver);
+
+ TxnStore txnHandler = TxnUtils.getTxnStore(conf);
+ txnHandler.compact(new CompactionRequest(dbName, tableName, CompactionType.MAJOR));
+ runWorker(conf);
+
+ // Make sure we do not have column statistics for this table yet.
+ // Major compaction only refreshes stats that already exist, so none should have appeared.
+ List<ColumnStatisticsObj> colStats = msClient.getTableColumnStatistics(dbName,
+ tableName, colNames, Constants.HIVE_ENGINE);
+ assertEquals("No stats should be there for the table", 0, colStats.size());
+
+ executeStatementOnDriver("analyze table " + dbName + "." + tableName + " compute statistics for columns", driver);
+ executeStatementOnDriver("insert into " + dbName + "." + tableName + " values(2)", driver);
+
+ // Make sure we still have the old statistics: analyze ran before values(2) was inserted
+ colStats = msClient.getTableColumnStatistics(dbName, tableName, colNames, Constants.HIVE_ENGINE);
+ assertEquals("Stats should be there", 1, colStats.size());
+ assertEquals("Value should contain old data", 1, colStats.get(0).getStatsData().getLongStats().getHighValue());
+ assertEquals("Value should contain old data", 1, colStats.get(0).getStatsData().getLongStats().getLowValue());
+
+ txnHandler.compact(new CompactionRequest(dbName, tableName, CompactionType.MAJOR));
+ runWorker(conf);
+ // Make sure the statistics were updated for the table by the compaction
+ colStats = msClient.getTableColumnStatistics(dbName, tableName, colNames, Constants.HIVE_ENGINE);
+ assertEquals("Stats should be there", 1, colStats.size());
+ assertEquals("Value should contain new data", 2, colStats.get(0).getStatsData().getLongStats().getHighValue());
+ assertEquals("Value should contain new data", 1, colStats.get(0).getStatsData().getLongStats().getLowValue());
+
+ executeStatementOnDriver("insert into " + dbName + "." + tableName + " values(3)", driver);
+ HiveConf workerConf = new HiveConf(conf);
+ workerConf.setBoolVar(ConfVars.HIVE_MR_COMPACTOR_GATHER_STATS, false);
+
+ txnHandler.compact(new CompactionRequest(dbName, tableName, CompactionType.MAJOR));
+ runWorker(workerConf);
+ // Make sure the statistics are NOT updated: gather stats is off in workerConf, so values(3) is not reflected
+ colStats = msClient.getTableColumnStatistics(dbName, tableName, colNames, Constants.HIVE_ENGINE);
+ assertEquals("Stats should be there", 1, colStats.size());
+ assertEquals("Value should contain new data", 2, colStats.get(0).getStatsData().getLongStats().getHighValue());
+ assertEquals("Value should contain new data", 1, colStats.get(0).getStatsData().getLongStats().getLowValue());
+ }
+
/**
* Users have the choice of specifying compaction related tblproperties either in CREATE TABLE
* statement or in ALTER TABLE .. COMPACT statement. This tests both cases.
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/CompactorMR.java b/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/CompactorMR.java
index 018c733..0425142 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/CompactorMR.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/CompactorMR.java
@@ -211,6 +211,7 @@ public class CompactorMR {
* @param sd metastore storage descriptor
* @param writeIds list of valid write ids
* @param ci CompactionInfo
+ * @param su StatsUpdater which is null if no stats gathering is needed
* @throws java.io.IOException if the job fails
*/
void run(HiveConf conf, String jobName, Table t, Partition p, StorageDescriptor sd, ValidWriteIdList writeIds,
@@ -294,7 +295,9 @@ public class CompactorMR {
launchCompactionJob(job, baseDir, ci.type, dirsToSearch, dir.getCurrentDirectories(),
dir.getCurrentDirectories().size(), dir.getObsolete().size(), conf, msc, ci.id, jobName);
- su.gatherStats();
+ if (su != null) {
+ su.gatherStats();
+ }
}
/**
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/Worker.java b/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/Worker.java
index a96cf1e..8180adc 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/Worker.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/Worker.java
@@ -91,6 +91,7 @@ public class Worker extends RemoteCompactorThread implements MetaStoreThread {
@Override
public void run() {
LOG.info("Starting Worker thread");
+ boolean computeStats = conf.getBoolVar(HiveConf.ConfVars.HIVE_MR_COMPACTOR_GATHER_STATS);
do {
boolean launchedJob = false;
// Make sure nothing escapes this run method and kills the metastore at large,
@@ -201,10 +202,11 @@ public class Worker extends RemoteCompactorThread implements MetaStoreThread {
continue;
}
- LOG.info("Starting " + ci.type.toString() + " compaction for " + ci.getFullPartitionName() + " in " + JavaUtils.txnIdToString(compactorTxnId));
- final StatsUpdater su = StatsUpdater.init(ci, msc.findColumnsWithStats(
+ LOG.info("Starting " + ci.type.toString() + " compaction for " + ci.getFullPartitionName() + " in " +
+ JavaUtils.txnIdToString(compactorTxnId) + " with compute stats set to " + computeStats);
+ final StatsUpdater su = computeStats ? StatsUpdater.init(ci, msc.findColumnsWithStats(
CompactionInfo.compactionInfoToStruct(ci)), conf,
- runJobAsSelf(ci.runAs) ? ci.runAs : t.getOwner());
+ runJobAsSelf(ci.runAs) ? ci.runAs : t.getOwner()) : null;
final CompactorMR mr = new CompactorMR();
launchedJob = true;
try {