You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ha...@apache.org on 2020/05/25 23:14:16 UTC

[hive] branch master updated: HIVE-23536 : Provide an option to skip stats generation for major compaction (Peter Vary via Ashutosh Chauhan)

This is an automated email from the ASF dual-hosted git repository.

hashutosh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new 947b7a4  HIVE-23536 : Provide an option to skip stats generation for major compaction (Peter Vary via Ashutosh Chauhan)
947b7a4 is described below

commit 947b7a44896fa57bc4e2ddaa6014cc4cb2c7002e
Author: Peter Vary <pv...@cloudera.com>
AuthorDate: Mon May 25 16:13:32 2020 -0700

    HIVE-23536 : Provide an option to skip stats generation for major compaction (Peter Vary via Ashutosh Chauhan)
    
    Signed-off-by: Ashutosh Chauhan <ha...@apache.org>
---
 .../java/org/apache/hadoop/hive/conf/HiveConf.java |  6 +++
 .../hive/ql/txn/compactor/TestCompactor.java       | 52 ++++++++++++++++++++++
 .../hadoop/hive/ql/txn/compactor/CompactorMR.java  |  5 ++-
 .../hadoop/hive/ql/txn/compactor/Worker.java       |  8 ++--
 4 files changed, 67 insertions(+), 4 deletions(-)

diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index a00d907..8094d28 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -2908,6 +2908,12 @@ public class HiveConf extends Configuration {
 
     HIVE_COMPACTOR_WAIT_TIMEOUT("hive.compactor.wait.timeout", 300000L, "Time out in "
         + "milliseconds for blocking compaction. It's value has to be higher than 2000 milliseconds. "),
+
+    HIVE_MR_COMPACTOR_GATHER_STATS("hive.mr.compactor.gather.stats", true, "If set to true MAJOR compaction " +
+        "will gather stats if there are stats already associated with the table/partition.\n" +
+        "Turn this off to save some resources and the stats are not used anyway.\n" +
+        "Works only for MR based compaction, CRUD based compaction uses hive.stats.autogather."),
+
     /**
      * @deprecated Use MetastoreConf.COMPACTOR_INITIATOR_FAILED_THRESHOLD
      */
diff --git a/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/txn/compactor/TestCompactor.java b/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/txn/compactor/TestCompactor.java
index c687f14..32fe535 100644
--- a/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/txn/compactor/TestCompactor.java
+++ b/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/txn/compactor/TestCompactor.java
@@ -129,6 +129,7 @@ public class TestCompactor {
     hiveConf.setVar(HiveConf.ConfVars.POSTEXECHOOKS, "");
     hiveConf.setVar(HiveConf.ConfVars.METASTOREWAREHOUSE, TEST_WAREHOUSE_DIR);
     hiveConf.setVar(HiveConf.ConfVars.HIVEINPUTFORMAT, HiveInputFormat.class.getName());
+    hiveConf.setBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER, false);
 
     TxnDbUtil.setConfValues(hiveConf);
     TxnDbUtil.cleanDb(hiveConf);
@@ -1468,6 +1469,57 @@ public class TestCompactor {
     }
   }
 
+  @Test
+  public void testCompactorGatherStats() throws Exception {
+    String dbName = "default";
+    String tableName = "stats_comp_test";
+    List<String> colNames = Arrays.asList("a");
+    executeStatementOnDriver("drop table if exists " + dbName + "." + tableName, driver);
+    executeStatementOnDriver("create table " + dbName + "." + tableName +
+        " (a INT) STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver);
+    executeStatementOnDriver("insert into " + dbName + "." + tableName + " values(1)", driver);
+    executeStatementOnDriver("insert into " + dbName + "." + tableName + " values(1)", driver);
+
+    TxnStore txnHandler = TxnUtils.getTxnStore(conf);
+    txnHandler.compact(new CompactionRequest(dbName, tableName, CompactionType.MAJOR));
+    runWorker(conf);
+
+    // Make sure we do not have statistics for this table yet.
+    // Compaction regenerates stats only if some already exist for the table.
+    List<ColumnStatisticsObj> colStats = msClient.getTableColumnStatistics(dbName,
+        tableName, colNames, Constants.HIVE_ENGINE);
+    assertEquals("No stats should be there for the table", 0, colStats.size());
+
+    executeStatementOnDriver("analyze table " + dbName + "." + tableName + " compute statistics for columns", driver);
+    executeStatementOnDriver("insert into " + dbName + "." + tableName + " values(2)", driver);
+
+    // Make sure we have old statistics for the table
+    colStats = msClient.getTableColumnStatistics(dbName, tableName, colNames, Constants.HIVE_ENGINE);
+    assertEquals("Stats should be there", 1, colStats.size());
+    assertEquals("Value should contain old data", 1, colStats.get(0).getStatsData().getLongStats().getHighValue());
+    assertEquals("Value should contain old data", 1, colStats.get(0).getStatsData().getLongStats().getLowValue());
+
+    txnHandler.compact(new CompactionRequest(dbName, tableName, CompactionType.MAJOR));
+    runWorker(conf);
+    // Make sure the statistics are updated for the table
+    colStats = msClient.getTableColumnStatistics(dbName, tableName, colNames, Constants.HIVE_ENGINE);
+    assertEquals("Stats should be there", 1, colStats.size());
+    assertEquals("Value should contain new data", 2, colStats.get(0).getStatsData().getLongStats().getHighValue());
+    assertEquals("Value should contain new data", 1, colStats.get(0).getStatsData().getLongStats().getLowValue());
+
+    executeStatementOnDriver("insert into " + dbName + "." + tableName + " values(3)", driver);
+    HiveConf workerConf = new HiveConf(conf);
+    workerConf.setBoolVar(ConfVars.HIVE_MR_COMPACTOR_GATHER_STATS, false);
+
+    txnHandler.compact(new CompactionRequest(dbName, tableName, CompactionType.MAJOR));
+    runWorker(workerConf);
+    // Make sure the statistics are NOT updated for the table (high value stays 2 although 3 was inserted)
+    colStats = msClient.getTableColumnStatistics(dbName, tableName, colNames, Constants.HIVE_ENGINE);
+    assertEquals("Stats should be there", 1, colStats.size());
+    assertEquals("Value should contain new data", 2, colStats.get(0).getStatsData().getLongStats().getHighValue());
+    assertEquals("Value should contain new data", 1, colStats.get(0).getStatsData().getLongStats().getLowValue());
+  }
+
   /**
    * Users have the choice of specifying compaction related tblproperties either in CREATE TABLE
    * statement or in ALTER TABLE .. COMPACT statement. This tests both cases.
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/CompactorMR.java b/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/CompactorMR.java
index 018c733..0425142 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/CompactorMR.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/CompactorMR.java
@@ -211,6 +211,7 @@ public class CompactorMR {
    * @param sd metastore storage descriptor
    * @param writeIds list of valid write ids
    * @param ci CompactionInfo
+   * @param su StatsUpdater used to gather stats after the compaction job, or null if no stats gathering is needed
    * @throws java.io.IOException if the job fails
    */
   void run(HiveConf conf, String jobName, Table t, Partition p, StorageDescriptor sd, ValidWriteIdList writeIds,
@@ -294,7 +295,9 @@ public class CompactorMR {
     launchCompactionJob(job, baseDir, ci.type, dirsToSearch, dir.getCurrentDirectories(),
       dir.getCurrentDirectories().size(), dir.getObsolete().size(), conf, msc, ci.id, jobName);
 
-    su.gatherStats();
+    if (su != null) {
+      su.gatherStats();
+    }
   }
 
   /**
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/Worker.java b/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/Worker.java
index a96cf1e..8180adc 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/Worker.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/Worker.java
@@ -91,6 +91,7 @@ public class Worker extends RemoteCompactorThread implements MetaStoreThread {
   @Override
   public void run() {
     LOG.info("Starting Worker thread");
+    boolean computeStats = conf.getBoolVar(HiveConf.ConfVars.HIVE_MR_COMPACTOR_GATHER_STATS);
     do {
       boolean launchedJob = false;
       // Make sure nothing escapes this run method and kills the metastore at large,
@@ -201,10 +202,11 @@ public class Worker extends RemoteCompactorThread implements MetaStoreThread {
           continue;
         }
 
-        LOG.info("Starting " + ci.type.toString() + " compaction for " + ci.getFullPartitionName() + " in " + JavaUtils.txnIdToString(compactorTxnId));
-        final StatsUpdater su = StatsUpdater.init(ci, msc.findColumnsWithStats(
+        LOG.info("Starting " + ci.type.toString() + " compaction for " + ci.getFullPartitionName() + " in " +
+            JavaUtils.txnIdToString(compactorTxnId) + " with compute stats set to " + computeStats);
+        final StatsUpdater su = computeStats ? StatsUpdater.init(ci, msc.findColumnsWithStats(
             CompactionInfo.compactionInfoToStruct(ci)), conf,
-          runJobAsSelf(ci.runAs) ? ci.runAs : t.getOwner());
+          runJobAsSelf(ci.runAs) ? ci.runAs : t.getOwner()) : null;
         final CompactorMR mr = new CompactorMR();
         launchedJob = true;
         try {