You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ai...@apache.org on 2018/04/25 23:14:31 UTC

[2/2] hive git commit: HIVE-18986: Table rename will run into java.lang.StackOverflowError in DataNucleus if the table contains a large number of columns (Aihua Xu, reviewed by Yongzhi Chen)

HIVE-18986: Table rename will run into java.lang.StackOverflowError in DataNucleus if the table contains a large number of columns (Aihua Xu, reviewed by Yongzhi Chen)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/f30efbeb
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/f30efbeb
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/f30efbeb

Branch: refs/heads/master
Commit: f30efbebf2ff85c55a5d9e3e2f86e0a51341df78
Parents: 11b0d85
Author: Aihua Xu <ai...@apache.org>
Authored: Wed Apr 18 17:05:08 2018 -0700
Committer: Aihua Xu <ai...@apache.org>
Committed: Wed Apr 25 16:10:30 2018 -0700

----------------------------------------------------------------------
 .../queries/clientpositive/alter_rename_table.q | 12 ++-
 .../clientpositive/alter_rename_table.q.out     | 88 ++++++++++++++++++++
 .../apache/hadoop/hive/metastore/Batchable.java | 86 +++++++++++++++++++
 .../hive/metastore/MetaStoreDirectSql.java      | 61 ++------------
 .../hadoop/hive/metastore/ObjectStore.java      | 45 ++++++----
 .../hive/metastore/conf/MetastoreConf.java      |  5 ++
 6 files changed, 227 insertions(+), 70 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/f30efbeb/ql/src/test/queries/clientpositive/alter_rename_table.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/alter_rename_table.q b/ql/src/test/queries/clientpositive/alter_rename_table.q
index 53fb230..bcf6ad5 100644
--- a/ql/src/test/queries/clientpositive/alter_rename_table.q
+++ b/ql/src/test/queries/clientpositive/alter_rename_table.q
@@ -36,4 +36,14 @@ create table source.src1 like default.src;
 load data local inpath '../../data/files/kv1.txt' overwrite into table source.src;
 
 ALTER TABLE source.src RENAME TO target.src1;
-select * from target.src1 tablesample (10 rows);
\ No newline at end of file
+select * from target.src1 tablesample (10 rows);
+
+set metastore.rawstore.batch.size=1;
+set metastore.try.direct.sql=false;
+
+create table source.src2 like default.src;
+load data local inpath '../../data/files/kv1.txt' overwrite into table source.src2;
+ANALYZE TABlE source.src2 COMPUTE STATISTICS FOR COLUMNS;
+ALTER TABLE source.src2 RENAME TO target.src3;
+DESC FORMATTED target.src3;
+select * from target.src3 tablesample (10 rows);

http://git-wip-us.apache.org/repos/asf/hive/blob/f30efbeb/ql/src/test/results/clientpositive/alter_rename_table.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/alter_rename_table.q.out b/ql/src/test/results/clientpositive/alter_rename_table.q.out
index 732d8a2..9ac8fd2 100644
--- a/ql/src/test/results/clientpositive/alter_rename_table.q.out
+++ b/ql/src/test/results/clientpositive/alter_rename_table.q.out
@@ -261,3 +261,91 @@ POSTHOOK: Input: target@src1
 278	val_278
 98	val_98
 484	val_484
+PREHOOK: query: create table source.src2 like default.src
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:source
+PREHOOK: Output: source@src2
+POSTHOOK: query: create table source.src2 like default.src
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:source
+POSTHOOK: Output: source@src2
+PREHOOK: query: load data local inpath '../../data/files/kv1.txt' overwrite into table source.src2
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: source@src2
+POSTHOOK: query: load data local inpath '../../data/files/kv1.txt' overwrite into table source.src2
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: source@src2
+PREHOOK: query: ANALYZE TABlE source.src2 COMPUTE STATISTICS FOR COLUMNS
+PREHOOK: type: QUERY
+PREHOOK: Input: source@src2
+#### A masked pattern was here ####
+PREHOOK: Output: source@src2
+POSTHOOK: query: ANALYZE TABlE source.src2 COMPUTE STATISTICS FOR COLUMNS
+POSTHOOK: type: QUERY
+POSTHOOK: Input: source@src2
+#### A masked pattern was here ####
+POSTHOOK: Output: source@src2
+PREHOOK: query: ALTER TABLE source.src2 RENAME TO target.src3
+PREHOOK: type: ALTERTABLE_RENAME
+PREHOOK: Input: source@src2
+PREHOOK: Output: source@src2
+POSTHOOK: query: ALTER TABLE source.src2 RENAME TO target.src3
+POSTHOOK: type: ALTERTABLE_RENAME
+POSTHOOK: Input: source@src2
+POSTHOOK: Output: source@src2
+POSTHOOK: Output: target@src3
+PREHOOK: query: DESC FORMATTED target.src3
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: target@src3
+POSTHOOK: query: DESC FORMATTED target.src3
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: target@src3
+# col_name            	data_type           	comment             
+key                 	string              	default             
+value               	string              	default             
+	 	 
+# Detailed Table Information	 	 
+Database:           	target              	 
+#### A masked pattern was here ####
+Retention:          	0                   	 
+#### A masked pattern was here ####
+Table Type:         	MANAGED_TABLE       	 
+Table Parameters:	 	 
+	COLUMN_STATS_ACCURATE	{\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}}
+#### A masked pattern was here ####
+	numFiles            	1                   
+	numRows             	500                 
+	rawDataSize         	5312                
+	totalSize           	5812                
+#### A masked pattern was here ####
+	 	 
+# Storage Information	 	 
+SerDe Library:      	org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe	 
+InputFormat:        	org.apache.hadoop.mapred.TextInputFormat	 
+OutputFormat:       	org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat	 
+Compressed:         	No                  	 
+Num Buckets:        	-1                  	 
+Bucket Columns:     	[]                  	 
+Sort Columns:       	[]                  	 
+Storage Desc Params:	 	 
+	serialization.format	1                   
+PREHOOK: query: select * from target.src3 tablesample (10 rows)
+PREHOOK: type: QUERY
+PREHOOK: Input: target@src3
+#### A masked pattern was here ####
+POSTHOOK: query: select * from target.src3 tablesample (10 rows)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: target@src3
+#### A masked pattern was here ####
+238	val_238
+86	val_86
+311	val_311
+27	val_27
+165	val_165
+409	val_409
+255	val_255
+278	val_278
+98	val_98
+484	val_484

http://git-wip-us.apache.org/repos/asf/hive/blob/f30efbeb/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/Batchable.java
----------------------------------------------------------------------
diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/Batchable.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/Batchable.java
new file mode 100644
index 0000000..7e488a5
--- /dev/null
+++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/Batchable.java
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.metastore;
+
+import java.util.ArrayList;
+import java.util.List;
+import javax.jdo.Query;
+
+import org.apache.hadoop.hive.metastore.api.MetaException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Base class that adds batch processing to DirectSQL or RawStore queries.
+ * <ol>
+ *   <li>Implement {@link #run(List)} to process one batch.</li>
+ *   <li>Call {@link #runBatched(int, List, Batchable)} to process the whole dataset.</li>
+ * </ol>
+ *
+ * @param <I> input element type
+ * @param <R> result element type
+ */
+public abstract class Batchable<I, R> {
+  private static final Logger LOG = LoggerFactory.getLogger(Batchable.class);
+
+  /** Batch size that makes {@link #runBatched} pass the whole input to a single run() call. */
+  public static final int NO_BATCHING = -1;
+
+  /** Queries registered for closing; stays null until the first one is registered. */
+  private List<Query> queries = null;
+
+  /**
+   * Processes one batch.
+   *
+   * @param input the elements of the current batch
+   * @return the results for this batch; a null result is skipped when batches are combined
+   * @throws MetaException if the batch cannot be processed
+   */
+  public abstract List<R> run(List<I> input) throws MetaException;
+
+  /**
+   * Registers a query so that {@link #closeAllQueries()} closes it later.
+   *
+   * @param query the query to register
+   */
+  public void addQueryAfterUse(Query query) {
+    if (queries == null) {
+      queries = new ArrayList<Query>(1);
+    }
+    queries.add(query);
+  }
+
+  /**
+   * Registers every query already registered on another batchable, so that closing
+   * this instance closes them as well.
+   *
+   * @param b the batchable whose registered queries are copied; may have none
+   */
+  protected void addQueryAfterUse(Batchable<?, ?> b) {
+    if (b.queries == null) {
+      return;
+    }
+    if (queries == null) {
+      queries = new ArrayList<Query>(1);
+    }
+    queries.addAll(b.queries);
+  }
+
+  /**
+   * Closes every registered query. A failure to close one query is logged and does
+   * not stop the remaining queries from being closed.
+   */
+  public void closeAllQueries() {
+    // No query has been registered yet, so there is nothing to close; iterating the
+    // null list would throw a NullPointerException.
+    if (queries == null) {
+      return;
+    }
+    for (Query q : queries) {
+      try {
+        q.closeAll();
+      } catch (Throwable t) {
+        LOG.error("Failed to close a query", t);
+      }
+    }
+  }
+
+  /**
+   * Runs {@code runnable} over {@code input} in consecutive batches of at most
+   * {@code batchSize} elements and concatenates the non-null batch results in input order.
+   *
+   * @param batchSize maximum number of elements per batch; {@link #NO_BATCHING}, any other
+   *                  non-positive value, or a value of at least {@code input.size()} passes
+   *                  the whole input to a single {@code run} call
+   * @param input the complete input list
+   * @param runnable the per-batch processing
+   * @return the result of the single {@code run} call when no batching happens, otherwise
+   *         a new list holding all batch results
+   * @throws MetaException if processing any batch fails
+   */
+  public static <I, R> List<R> runBatched(
+      final int batchSize,
+      List<I> input,
+      Batchable<I, R> runnable) throws MetaException {
+    // A zero batch size would never advance the loop below and a negative one would build
+    // an invalid subList range, so every non-positive size is handled like NO_BATCHING.
+    if (batchSize <= 0 || batchSize >= input.size()) {
+      return runnable.run(input);
+    }
+    List<R> result = new ArrayList<R>(input.size());
+    for (int fromIndex = 0, toIndex = 0; toIndex < input.size(); fromIndex = toIndex) {
+      toIndex = Math.min(fromIndex + batchSize, input.size());
+      List<I> batchedInput = input.subList(fromIndex, toIndex);
+      List<R> batchedOutput = runnable.run(batchedInput);
+      if (batchedOutput != null) {
+        result.addAll(batchedOutput);
+      }
+    }
+    return result;
+  }
+}

http://git-wip-us.apache.org/repos/asf/hive/blob/f30efbeb/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java
----------------------------------------------------------------------
diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java
index 997f5fd..4e0e887 100644
--- a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java
+++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java
@@ -455,7 +455,7 @@ class MetaStoreDirectSql {
     if (partNames.isEmpty()) {
       return Collections.emptyList();
     }
-    return runBatched(partNames, new Batchable<String, Partition>() {
+    return Batchable.runBatched(batchSize, partNames, new Batchable<String, Partition>() {
       @Override
       public List<Partition> run(List<String> input) throws MetaException {
         String filter = "" + PARTITIONS + ".\"PART_NAME\" in (" + makeParams(input.size()) + ")";
@@ -596,7 +596,7 @@ class MetaStoreDirectSql {
     }
 
     // Get full objects. For Oracle/etc. do it in batches.
-    List<Partition> result = runBatched(sqlResult, new Batchable<Object, Partition>() {
+    List<Partition> result = Batchable.runBatched(batchSize, sqlResult, new Batchable<Object, Partition>() {
       @Override
       public List<Partition> run(List<Object> input) throws MetaException {
         return getPartitionsFromPartitionIds(catNameLcase, dbNameLcase, tblNameLcase, isView,
@@ -1374,7 +1374,7 @@ class MetaStoreDirectSql {
         return ensureList(qResult);
       }
     };
-    List<Object[]> list = runBatched(colNames, b);
+    List<Object[]> list = Batchable.runBatched(batchSize, colNames, b);
     if (list.isEmpty()) {
       return null;
     }
@@ -1460,10 +1460,10 @@ class MetaStoreDirectSql {
         + " where \"CAT_NAME\" = ? and \"DB_NAME\" = ? and \"TABLE_NAME\" = ? "
         + " and \"COLUMN_NAME\" in (%1$s) and \"PARTITION_NAME\" in (%2$s)"
         + " group by \"PARTITION_NAME\"";
-    List<Long> allCounts = runBatched(colNames, new Batchable<String, Long>() {
+    List<Long> allCounts = Batchable.runBatched(batchSize, colNames, new Batchable<String, Long>() {
       @Override
       public List<Long> run(final List<String> inputColName) throws MetaException {
-        return runBatched(partNames, new Batchable<String, Long>() {
+        return Batchable.runBatched(batchSize, partNames, new Batchable<String, Long>() {
           @Override
           public List<Long> run(List<String> inputPartNames) throws MetaException {
             long partsFound = 0;
@@ -1503,10 +1503,10 @@ class MetaStoreDirectSql {
     final String tableName, final List<String> partNames, List<String> colNames, long partsFound,
     final boolean useDensityFunctionForNDVEstimation, final double ndvTuner, final boolean enableBitVector) throws MetaException {
     final boolean areAllPartsFound = (partsFound == partNames.size());
-    return runBatched(colNames, new Batchable<String, ColumnStatisticsObj>() {
+    return Batchable.runBatched(batchSize, colNames, new Batchable<String, ColumnStatisticsObj>() {
       @Override
       public List<ColumnStatisticsObj> run(final List<String> inputColNames) throws MetaException {
-        return runBatched(partNames, new Batchable<String, ColumnStatisticsObj>() {
+        return Batchable.runBatched(batchSize, partNames, new Batchable<String, ColumnStatisticsObj>() {
           @Override
           public List<ColumnStatisticsObj> run(List<String> inputPartNames) throws MetaException {
             return columnStatisticsObjForPartitionsBatch(catName, dbName, tableName, inputPartNames,
@@ -1918,13 +1918,13 @@ class MetaStoreDirectSql {
           }
         };
         try {
-          return runBatched(partNames, b2);
+          return Batchable.runBatched(batchSize, partNames, b2);
         } finally {
           addQueryAfterUse(b2);
         }
       }
     };
-    List<Object[]> list = runBatched(colNames, b);
+    List<Object[]> list = Batchable.runBatched(batchSize, colNames, b);
 
     List<ColumnStatistics> result = new ArrayList<ColumnStatistics>(
         Math.min(list.size(), partNames.size()));
@@ -2027,49 +2027,6 @@ class MetaStoreDirectSql {
   }
 
 
-  private static abstract class Batchable<I, R> {
-    private List<Query> queries = null;
-    public abstract List<R> run(List<I> input) throws MetaException;
-    public void addQueryAfterUse(Query query) {
-      if (queries == null) {
-        queries = new ArrayList<Query>(1);
-      }
-      queries.add(query);
-    }
-    protected void addQueryAfterUse(Batchable<?, ?> b) {
-      if (b.queries == null) return;
-      if (queries == null) {
-        queries = new ArrayList<Query>(1);
-      }
-      queries.addAll(b.queries);
-    }
-    public void closeAllQueries() {
-      for (Query q : queries) {
-        try {
-          q.closeAll();
-        } catch (Throwable t) {
-          LOG.error("Failed to close a query", t);
-        }
-      }
-    }
-  }
-
-  private <I,R> List<R> runBatched(List<I> input, Batchable<I, R> runnable) throws MetaException {
-    if (batchSize == NO_BATCHING || batchSize >= input.size()) {
-      return runnable.run(input);
-    }
-    List<R> result = new ArrayList<R>(input.size());
-    for (int fromIndex = 0, toIndex = 0; toIndex < input.size(); fromIndex = toIndex) {
-      toIndex = Math.min(fromIndex + batchSize, input.size());
-      List<I> batchedInput = input.subList(fromIndex, toIndex);
-      List<R> batchedOutput = runnable.run(batchedInput);
-      if (batchedOutput != null) {
-        result.addAll(batchedOutput);
-      }
-    }
-    return result;
-  }
-
   public List<SQLForeignKey> getForeignKeys(String catName, String parent_db_name,
                                             String parent_tbl_name, String foreign_db_name,
                                             String foreign_tbl_name) throws MetaException {

http://git-wip-us.apache.org/repos/asf/hive/blob/f30efbeb/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/ObjectStore.java
----------------------------------------------------------------------
diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/ObjectStore.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/ObjectStore.java
index 184ecb6..1abd99d 100644
--- a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/ObjectStore.java
+++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/ObjectStore.java
@@ -244,6 +244,7 @@ public class ObjectStore implements RawStore, Configurable {
   private static Properties prop = null;
   private static PersistenceManagerFactory pmf = null;
   private static boolean forTwoMetastoreTesting = false;
+  private int batchSize = Batchable.NO_BATCHING;
 
   private static final DateTimeFormatter YMDHMS_FORMAT = DateTimeFormatter.ofPattern(
       "yyyy_MM_dd_HH_mm_ss");
@@ -385,6 +386,8 @@ public class ObjectStore implements RawStore, Configurable {
         directSqlErrors = Metrics.getOrCreateCounter(MetricsConstants.DIRECTSQL_ERRORS);
       }
 
+      this.batchSize = MetastoreConf.getIntVar(conf, ConfVars.RAWSTORE_PARTITION_BATCH_SIZE);
+
       if (!isInitialized) {
         throw new RuntimeException(
         "Unable to create persistence manager. Check dss.log for details");
@@ -8028,25 +8031,33 @@ public class ObjectStore implements RawStore, Configurable {
     try {
       openTransaction();
 
-      List<MTableColumnStatistics> result = null;
       validateTableCols(table, colNames);
       Query query = queryWrapper.query = pm.newQuery(MTableColumnStatistics.class);
-      String filter = "tableName == t1 && dbName == t2 && catName == t3 && (";
-      String paramStr = "java.lang.String t1, java.lang.String t2, java.lang.String t3";
-      Object[] params = new Object[colNames.size() + 3];
-      params[0] = table.getTableName();
-      params[1] = table.getDbName();
-      params[2] = table.getCatName();
-      for (int i = 0; i < colNames.size(); ++i) {
-        filter += ((i == 0) ? "" : " || ") + "colName == c" + i;
-        paramStr += ", java.lang.String c" + i;
-        params[i + 3] = colNames.get(i);
-      }
-      filter += ")";
-      query.setFilter(filter);
-      query.declareParameters(paramStr);
-      result = (List<MTableColumnStatistics>) query.executeWithArray(params);
-      pm.retrieveAll(result);
+      List<MTableColumnStatistics> result =
+          Batchable.runBatched(batchSize, colNames, new Batchable<String, MTableColumnStatistics>() {
+            @Override
+            public List<MTableColumnStatistics> run(List<String> input)
+                throws MetaException {
+              String filter = "tableName == t1 && dbName == t2 && catName == t3 && (";
+              String paramStr = "java.lang.String t1, java.lang.String t2, java.lang.String t3";
+              Object[] params = new Object[input.size() + 3];
+              params[0] = table.getTableName();
+              params[1] = table.getDbName();
+              params[2] = table.getCatName();
+              for (int i = 0; i < input.size(); ++i) {
+                filter += ((i == 0) ? "" : " || ") + "colName == c" + i;
+                paramStr += ", java.lang.String c" + i;
+                params[i + 3] = input.get(i);
+              }
+              filter += ")";
+              query.setFilter(filter);
+              query.declareParameters(paramStr);
+              List<MTableColumnStatistics> paritial = (List<MTableColumnStatistics>) query.executeWithArray(params);
+              pm.retrieveAll(paritial);
+              return paritial;
+            }
+          });
+
       if (result.size() > colNames.size()) {
         throw new MetaException("Unexpected " + result.size() + " statistics for "
             + colNames.size() + " columns");

http://git-wip-us.apache.org/repos/asf/hive/blob/f30efbeb/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java
----------------------------------------------------------------------
diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java
index 552eeca..35aa40c 100644
--- a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java
+++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java
@@ -856,6 +856,11 @@ public class MetastoreConf {
         "hive.metastore.wm.default.pool.size", 4,
         "The size of a default pool to create when creating an empty resource plan;\n" +
         "If not positive, no default pool will be created."),
+    RAWSTORE_PARTITION_BATCH_SIZE("metastore.rawstore.batch.size",
+        "metastore.rawstore.batch.size", -1,
+        "Batch size for partition and other object retrieval from the underlying DB in JDO.\n" +
+        "The JDO implementation such as DataNucleus may run into issues when the generated queries are\n" +
+        "too large. Use this parameter to break the query into multiple batches. -1 means no batching."),
 
     // Hive values we have copied and use as is
     // These two are used to indicate that we are running tests