You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jb...@apache.org on 2016/08/26 17:28:20 UTC
[3/3] incubator-impala git commit: IMPALA-3776: fix 'describe formatted' for Avro tables

IMPALA-3776: fix 'describe formatted' for Avro tables

For Avro tables the column information in the underlying database of the
Hive metastore can be different from what is specified in the avro
schema. HIVE-6308 aimed to improve upon this, but for older tables the
two don't necessarily align.

There are two possible cases:

1) Hive's underlying database contains a column which is not present in
the Avro schema file. In this case we encounter a NullPointerException
in DescribeResultFactory.java#L189 when trying to look up the column in
the internal table object.

2) The Avro schema contains a column, which is not present in the
underlying database. In this case the column will not be displayed in
describe formatted.

In addition to the automatic tests I verified this manually by creating
an Avro table with an external schema file in Hive. This populated the
underlying database with the column information. I then either removed
a column from the Avro schema file (case 1) or cleared the column
information from the "COLUMNS_V2" table in the underlying database
(case 2) and verified that the change fixed both cases.

Change-Id: Ieb69d3678e662465d40aee80ba23132ea13871a0
Reviewed-on: http://gerrit.cloudera.org:8080/4126
Reviewed-by: Lars Volker <lv...@cloudera.com>
Tested-by: Internal Jenkins
Reviewed-by: Jim Apple <jb...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/0e886618
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/0e886618
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/0e886618

Branch: refs/heads/master
Commit: 0e886618e26f8480aa8926918e7f80b7bada3884
Parents: c23dc3a
Author: Lars Volker <lv...@cloudera.com>
Authored: Thu Jun 23 04:39:25 2016 +0200
Committer: Jim Apple <jb...@cloudera.com>
Committed: Fri Aug 26 17:20:10 2016 +0000

----------------------------------------------------------------------
 .../com/cloudera/impala/catalog/Column.java     | 15 +++++
 .../java/com/cloudera/impala/catalog/Table.java | 12 ++--
 .../impala/service/DescribeResultFactory.java   | 15 ++---
 .../queries/QueryTest/avro-schema-changes.test  | 69 ++++++++++++++++++++
 tests/query_test/test_avro_schema_resolution.py |  5 +-
 5 files changed, 101 insertions(+), 15 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0e886618/fe/src/main/java/com/cloudera/impala/catalog/Column.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/com/cloudera/impala/catalog/Column.java b/fe/src/main/java/com/cloudera/impala/catalog/Column.java
index 46a7835..b2d7416 100644
--- a/fe/src/main/java/com/cloudera/impala/catalog/Column.java
+++ b/fe/src/main/java/com/cloudera/impala/catalog/Column.java
@@ -17,14 +17,19 @@
 
 package com.cloudera.impala.catalog;
 
+import java.util.List;
+
 import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
+import org.apache.hadoop.hive.metastore.api.FieldSchema;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import com.cloudera.impala.thrift.TColumn;
 import com.cloudera.impala.thrift.TColumnStats;
+import com.google.common.base.Function;
 import com.google.common.base.Objects;
 import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
 
 /**
  * Internal representation of column-related metadata.
@@ -114,4 +119,14 @@ public class Column {
     colDesc.setCol_stats(getStats().toThrift());
     return colDesc;
   }
+
+  public static List<FieldSchema> toFieldSchemas(List<Column> columns) {
+    return Lists.transform(columns, new Function<Column, FieldSchema>() {
+      public FieldSchema apply(Column column) {
+        Preconditions.checkNotNull(column.getType());
+        return new FieldSchema(column.getName(), column.getType().toSql(),
+            column.getComment());
+      }
+    });
+  }
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0e886618/fe/src/main/java/com/cloudera/impala/catalog/Table.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/com/cloudera/impala/catalog/Table.java b/fe/src/main/java/com/cloudera/impala/catalog/Table.java
index 8ac2499..f794d7e 100644
--- a/fe/src/main/java/com/cloudera/impala/catalog/Table.java
+++ b/fe/src/main/java/com/cloudera/impala/catalog/Table.java
@@ -393,10 +393,7 @@ public abstract class Table implements CatalogObject {
    */
   public ArrayList<Column> getColumnsInHiveOrder() {
     ArrayList<Column> columns = Lists.newArrayList(getNonClusteringColumns());
-
-    for (Column column: colsByPos_.subList(0, numClusteringCols_)) {
-      columns.add(column);
-    }
+    columns.addAll(getClusteringColumns());
     return columns;
   }
 
@@ -412,6 +409,13 @@ public abstract class Table implements CatalogObject {
   }
 
   /**
+   * Returns the list of all partition columns.
+   */
+  public List<Column> getClusteringColumns() {
+    return colsByPos_.subList(0, numClusteringCols_);
+  }
+
+  /**
    * Returns the list of all columns excluding any partition columns.
    */
   public List<Column> getNonClusteringColumns() {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0e886618/fe/src/main/java/com/cloudera/impala/service/DescribeResultFactory.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/com/cloudera/impala/service/DescribeResultFactory.java b/fe/src/main/java/com/cloudera/impala/service/DescribeResultFactory.java
index a7c5762..c1a9557 100644
--- a/fe/src/main/java/com/cloudera/impala/service/DescribeResultFactory.java
+++ b/fe/src/main/java/com/cloudera/impala/service/DescribeResultFactory.java
@@ -21,12 +21,12 @@ import java.util.List;
 import java.util.Map;
 import java.util.Objects;
 
-import org.apache.hadoop.hive.metastore.api.FieldSchema;
 import org.apache.hadoop.hive.metastore.api.PrincipalPrivilegeSet;
 import org.apache.hadoop.hive.metastore.api.PrincipalType;
 import org.apache.hadoop.hive.metastore.api.PrivilegeGrantInfo;
 import org.apache.hadoop.hive.ql.metadata.formatting.MetaDataFormatUtils;
 
+import com.cloudera.impala.catalog.Column;
 import com.cloudera.impala.catalog.Db;
 import com.cloudera.impala.catalog.StructField;
 import com.cloudera.impala.catalog.StructType;
@@ -185,14 +185,11 @@ public class DescribeResultFactory {
 
     org.apache.hadoop.hive.metastore.api.Table msTable =
         table.getMetaStoreTable().deepCopy();
-    // Fixup the metastore table so the output of DESCRIBE FORMATTED|EXTENDED matches
-    // Hive's. This is to distinguish between empty comments and no comments
-    // (value is null).
-    for (FieldSchema fs: msTable.getSd().getCols())
-      fs.setComment(table.getColumn(fs.getName()).getComment());
-    for (FieldSchema fs: msTable.getPartitionKeys()) {
-      fs.setComment(table.getColumn(fs.getName()).getComment());
-    }
+    // For some table formats (e.g. Avro) the column list in the table can differ from the
+    // one returned by the Hive metastore. To handle this we use the column list from the
+    // table which has already reconciled those differences.
+    msTable.getSd().setCols(Column.toFieldSchemas(table.getNonClusteringColumns()));
+    msTable.setPartitionKeys(Column.toFieldSchemas(table.getClusteringColumns()));
 
     // To avoid initializing any of the SerDe classes in the metastore table Thrift
     // struct, create the ql.metadata.Table object by calling the empty c'tor and

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0e886618/testdata/workloads/functional-query/queries/QueryTest/avro-schema-changes.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/avro-schema-changes.test b/testdata/workloads/functional-query/queries/QueryTest/avro-schema-changes.test
index e4b87c5..1e3eac9 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/avro-schema-changes.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/avro-schema-changes.test
@@ -67,3 +67,72 @@ select * from avro_alter_table_add_new_column
 ---- TYPES
 string, string, boolean, int, bigint, float, double, string, decimal, string
 ====
+---- QUERY
+# IMPALA-3776: Create an Avro table, add a column to the Avro schema and make sure
+# describe and describe formatted still work.
+CREATE TABLE avro_alter_schema_add_new_column (old_col string) STORED AS AVRO;
+
+ALTER TABLE avro_alter_schema_add_new_column SET TBLPROPERTIES (
+'avro.schema.literal'=' {
+"namespace": "org.apache.test",
+"name": "avro_alter_schema_add_new_column",
+"type": "record",
+"fields": [
+{ "name":"old_col", "type":"string" },
+{ "name":"new_col", "type":"string" }
+]
+}'
+);
+
+REFRESH avro_alter_schema_add_new_column;
+====
+---- QUERY
+# The new column now has to show up in describe.
+DESCRIBE avro_alter_schema_add_new_column;
+---- TYPES
+string,string,string
+---- RESULTS
+'old_col','string','from deserializer'
+'new_col','string','from deserializer'
+====
+---- QUERY
+# The new column now has to show up in describe formatted.
+DESCRIBE FORMATTED avro_alter_schema_add_new_column;
+---- TYPES
+string,string,string
+---- RESULTS: VERIFY_IS_SUBSET
+'old_col','STRING','from deserializer'
+'new_col','STRING','from deserializer'
+====
+---- QUERY
+# IMPALA-3776: Create an Avro table, remove a column from the Avro schema and make sure
+# describe and describe formatted still work.
+CREATE TABLE avro_alter_schema_remove_column (col1 string, col2 string) STORED AS AVRO;
+
+ALTER TABLE avro_alter_schema_remove_column SET TBLPROPERTIES (
+'avro.schema.literal'=' {
+"namespace": "org.apache.test",
+"name": "avro_alter_schema_remove_column",
+"type": "record",
+"fields": [
+{ "name":"col1", "type":"string" }
+]
+}'
+);
+REFRESH avro_alter_schema_remove_column;
+====
+---- QUERY
+# The new column now must not show up in describe.
+DESCRIBE avro_alter_schema_remove_column;
+---- TYPES
+string,string,string
+---- RESULTS
+'col1','string','from deserializer'
+====
+---- QUERY
+DESCRIBE FORMATTED avro_alter_schema_remove_column;
+---- TYPES
+string,string,string
+---- RESULTS: VERIFY_IS_SUBSET
+'col1','STRING','from deserializer'
+====

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0e886618/tests/query_test/test_avro_schema_resolution.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_avro_schema_resolution.py b/tests/query_test/test_avro_schema_resolution.py
index e861c63..b72be5b 100644
--- a/tests/query_test/test_avro_schema_resolution.py
+++ b/tests/query_test/test_avro_schema_resolution.py
@@ -48,9 +48,10 @@ class TestAvroSchemaResolution(ImpalaTestSuite):
       assert comparison.data[x] == result.data[x]
 
   def test_avro_schema_changes(self, vector, unique_database):
-    """Test for IMPALA-3314 and IMPALA-3513: Impalad shouldn't crash with stale avro
+    """Test for IMPALA-3314 and IMPALA-3513: Impalad shouldn't crash with stale Avro
     metadata. Instead, should provide a meaningful error message.
-    Test for IMPALA-3092: Impala should be able to query an avro table after ALTER TABLE
+    Test for IMPALA-3092: Impala should be able to query an Avro table after ALTER TABLE
     ... ADD COLUMN ...
+    Test for IMPALA-3776: Fix describe formatted when changing Avro schema.
     """
     self.run_test_case('QueryTest/avro-schema-changes', vector, unique_database)