You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by sz...@apache.org on 2014/08/05 20:57:18 UTC

svn commit: r1615979 - in /hive/trunk/ql/src: java/org/apache/hadoop/hive/ql/io/parquet/read/ test/queries/clientpositive/ test/results/clientpositive/

Author: szehon
Date: Tue Aug  5 18:57:18 2014
New Revision: 1615979

URL: http://svn.apache.org/r1615979
Log:
HIVE-7554 : Parquet Hive should resolve column names in case insensitive manner (Brock Noland via Szehon)

Added:
    hive/trunk/ql/src/test/queries/clientpositive/parquet_mixed_case.q
    hive/trunk/ql/src/test/results/clientpositive/parquet_mixed_case.q.out
Modified:
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java
    hive/trunk/ql/src/test/queries/clientpositive/parquet_columnar.q
    hive/trunk/ql/src/test/results/clientpositive/parquet_columnar.q.out

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java?rev=1615979&r1=1615978&r2=1615979&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java Tue Aug  5 18:57:18 2014
@@ -26,6 +26,7 @@ import org.apache.hadoop.hive.serde2.Col
 import org.apache.hadoop.io.ArrayWritable;
 import org.apache.hadoop.util.StringUtils;
 
+import parquet.column.ColumnDescriptor;
 import parquet.hadoop.api.ReadSupport;
 import parquet.io.api.RecordMaterializer;
 import parquet.schema.MessageType;
@@ -46,8 +47,8 @@ public class DataWritableReadSupport ext
 
   private static final String TABLE_SCHEMA = "table_schema";
   public static final String HIVE_SCHEMA_KEY = "HIVE_TABLE_SCHEMA";
-  public static final String PARQUET_COLUMN_INDEX_ACCESS = "parquet.column.index.access";  
-  
+  public static final String PARQUET_COLUMN_INDEX_ACCESS = "parquet.column.index.access";
+
   /**
    * From a string which columns names (including hive column), return a list
    * of string columns
@@ -75,12 +76,16 @@ public class DataWritableReadSupport ext
     final Map<String, String> contextMetadata = new HashMap<String, String>();
     if (columns != null) {
       final List<String> listColumns = getColumns(columns);
-
+      final Map<String, String> lowerCaseFileSchemaColumns = new HashMap<String,String>();
+      for (ColumnDescriptor c : fileSchema.getColumns()) {
+        lowerCaseFileSchemaColumns.put(c.getPath()[0].toLowerCase(), c.getPath()[0]);
+      }
       final List<Type> typeListTable = new ArrayList<Type>();
-      for (final String col : listColumns) {
+      for (String col : listColumns) {
+        col = col.toLowerCase();
         // listColumns contains partition columns which are metadata only
-        if (fileSchema.containsField(col)) {
-          typeListTable.add(fileSchema.getType(col));
+        if (lowerCaseFileSchemaColumns.containsKey(col)) {
+          typeListTable.add(fileSchema.getType(lowerCaseFileSchemaColumns.get(col)));
         } else {
           // below allows schema evolution
           typeListTable.add(new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, col));
@@ -93,10 +98,24 @@ public class DataWritableReadSupport ext
       final List<Integer> indexColumnsWanted = ColumnProjectionUtils.getReadColumnIDs(configuration);
 
       final List<Type> typeListWanted = new ArrayList<Type>();
+      final boolean indexAccess = configuration.getBoolean(PARQUET_COLUMN_INDEX_ACCESS, false);
       for (final Integer idx : indexColumnsWanted) {
-        typeListWanted.add(tableSchema.getType(listColumns.get(idx)));
+        String col = listColumns.get(idx);
+        if (indexAccess) {
+          typeListWanted.add(tableSchema.getType(col));
+        } else {
+          col = col.toLowerCase();
+          if (lowerCaseFileSchemaColumns.containsKey(col)) {
+            typeListWanted.add(tableSchema.getType(lowerCaseFileSchemaColumns.get(col)));
+          } else {
+            // should never occur?
+            String msg = "Column " + col + " at index " + idx + " does not exist in " +
+              lowerCaseFileSchemaColumns;
+            throw new IllegalStateException(msg);
+          }
+        }
       }
-      requestedSchemaByUser = resolveSchemaAccess(new MessageType(fileSchema.getName(), 
+      requestedSchemaByUser = resolveSchemaAccess(new MessageType(fileSchema.getName(),
               typeListWanted), fileSchema, configuration);
 
       return new ReadContext(requestedSchemaByUser, contextMetadata);
@@ -127,29 +146,24 @@ public class DataWritableReadSupport ext
     }
     final MessageType tableSchema = resolveSchemaAccess(MessageTypeParser.
         parseMessageType(metadata.get(HIVE_SCHEMA_KEY)), fileSchema, configuration);
-    
     return new DataWritableRecordConverter(readContext.getRequestedSchema(), tableSchema);
   }
-  
+
   /**
-  * Determine the file column names based on the position within the requested columns and 
+  * Determine the file column names based on the position within the requested columns and
   * use that as the requested schema.
   */
-  private MessageType resolveSchemaAccess(MessageType requestedSchema, MessageType fileSchema, 
+  private MessageType resolveSchemaAccess(MessageType requestedSchema, MessageType fileSchema,
           Configuration configuration) {
-    if(configuration.getBoolean(PARQUET_COLUMN_INDEX_ACCESS, false)) {
+    if (configuration.getBoolean(PARQUET_COLUMN_INDEX_ACCESS, false)) {
       final List<String> listColumns = getColumns(configuration.get(IOConstants.COLUMNS));
-        
       List<Type> requestedTypes = new ArrayList<Type>();
-        
       for(Type t : requestedSchema.getFields()) {
         int index = listColumns.indexOf(t.getName());
         requestedTypes.add(fileSchema.getType(index));
       }
-          
       requestedSchema = new MessageType(requestedSchema.getName(), requestedTypes);
     }
-      
     return requestedSchema;
   }
 }

Modified: hive/trunk/ql/src/test/queries/clientpositive/parquet_columnar.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/parquet_columnar.q?rev=1615979&r1=1615978&r2=1615979&view=diff
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/parquet_columnar.q (original)
+++ hive/trunk/ql/src/test/queries/clientpositive/parquet_columnar.q Tue Aug  5 18:57:18 2014
@@ -13,15 +13,16 @@ CREATE TABLE parquet_columnar_access_sta
 
 CREATE TABLE parquet_columnar_access (
     s string,
-    i int,
+    x int,
+    y int,
     f float
   ) STORED AS PARQUET;
 
 LOAD DATA LOCAL INPATH '../../data/files/parquet_columnar.txt' OVERWRITE INTO TABLE parquet_columnar_access_stage;
 
-INSERT OVERWRITE TABLE parquet_columnar_access SELECT * FROM parquet_columnar_access_stage;
+INSERT OVERWRITE TABLE parquet_columnar_access SELECT s, i, (i + 1), f FROM parquet_columnar_access_stage;
 SELECT * FROM parquet_columnar_access;
 
-ALTER TABLE parquet_columnar_access REPLACE COLUMNS (s1 string, i1 int, f1 float);
+ALTER TABLE parquet_columnar_access REPLACE COLUMNS (s1 string, x1 int, y1 int, f1 float);
 
 SELECT * FROM parquet_columnar_access;

Added: hive/trunk/ql/src/test/queries/clientpositive/parquet_mixed_case.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/parquet_mixed_case.q?rev=1615979&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/parquet_mixed_case.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/parquet_mixed_case.q Tue Aug  5 18:57:18 2014
@@ -0,0 +1,13 @@
+DROP TABLE parquet_mixed_case;
+
+CREATE TABLE parquet_mixed_case (
+  lowerCase string,
+  UPPERcase string,
+  stats bigint,
+  moreuppercase string,
+  MORELOWERCASE string
+) STORED AS PARQUET;
+
+LOAD DATA LOCAL INPATH '../../data/files/parquet_mixed_case' OVERWRITE INTO TABLE parquet_mixed_case;
+
+SELECT lowercase, "|", uppercase, "|", stats, "|", moreuppercase, "|", morelowercase FROM parquet_mixed_case;

Modified: hive/trunk/ql/src/test/results/clientpositive/parquet_columnar.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/parquet_columnar.q.out?rev=1615979&r1=1615978&r2=1615979&view=diff
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/parquet_columnar.q.out (original)
+++ hive/trunk/ql/src/test/results/clientpositive/parquet_columnar.q.out Tue Aug  5 18:57:18 2014
@@ -29,14 +29,16 @@ POSTHOOK: Output: database:default
 POSTHOOK: Output: default@parquet_columnar_access_stage
 PREHOOK: query: CREATE TABLE parquet_columnar_access (
     s string,
-    i int,
+    x int,
+    y int,
     f float
   ) STORED AS PARQUET
 PREHOOK: type: CREATETABLE
 PREHOOK: Output: database:default
 POSTHOOK: query: CREATE TABLE parquet_columnar_access (
     s string,
-    i int,
+    x int,
+    y int,
     f float
   ) STORED AS PARQUET
 POSTHOOK: type: CREATETABLE
@@ -50,17 +52,18 @@ POSTHOOK: query: LOAD DATA LOCAL INPATH 
 POSTHOOK: type: LOAD
 #### A masked pattern was here ####
 POSTHOOK: Output: default@parquet_columnar_access_stage
-PREHOOK: query: INSERT OVERWRITE TABLE parquet_columnar_access SELECT * FROM parquet_columnar_access_stage
+PREHOOK: query: INSERT OVERWRITE TABLE parquet_columnar_access SELECT s, i, (i + 1), f FROM parquet_columnar_access_stage
 PREHOOK: type: QUERY
 PREHOOK: Input: default@parquet_columnar_access_stage
 PREHOOK: Output: default@parquet_columnar_access
-POSTHOOK: query: INSERT OVERWRITE TABLE parquet_columnar_access SELECT * FROM parquet_columnar_access_stage
+POSTHOOK: query: INSERT OVERWRITE TABLE parquet_columnar_access SELECT s, i, (i + 1), f FROM parquet_columnar_access_stage
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@parquet_columnar_access_stage
 POSTHOOK: Output: default@parquet_columnar_access
 POSTHOOK: Lineage: parquet_columnar_access.f SIMPLE [(parquet_columnar_access_stage)parquet_columnar_access_stage.FieldSchema(name:f, type:float, comment:null), ]
-POSTHOOK: Lineage: parquet_columnar_access.i SIMPLE [(parquet_columnar_access_stage)parquet_columnar_access_stage.FieldSchema(name:i, type:int, comment:null), ]
 POSTHOOK: Lineage: parquet_columnar_access.s SIMPLE [(parquet_columnar_access_stage)parquet_columnar_access_stage.FieldSchema(name:s, type:string, comment:null), ]
+POSTHOOK: Lineage: parquet_columnar_access.x SIMPLE [(parquet_columnar_access_stage)parquet_columnar_access_stage.FieldSchema(name:i, type:int, comment:null), ]
+POSTHOOK: Lineage: parquet_columnar_access.y EXPRESSION [(parquet_columnar_access_stage)parquet_columnar_access_stage.FieldSchema(name:i, type:int, comment:null), ]
 PREHOOK: query: SELECT * FROM parquet_columnar_access
 PREHOOK: type: QUERY
 PREHOOK: Input: default@parquet_columnar_access
@@ -69,32 +72,32 @@ POSTHOOK: query: SELECT * FROM parquet_c
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@parquet_columnar_access
 #### A masked pattern was here ####
-1abc00	1	1.0
-1def01	2	1.1
-1ghi02	3	1.2
-1jkl03	1	1.3
-1mno04	2	1.4
-1pqr05	3	1.0
-1stu06	1	1.1
-1vwx07	2	1.2
-1yza08	3	1.3
-1bcd09	1	1.4
-1efg10	2	1.0
-1hij11	3	1.1
-1klm12	1	1.2
-1nop13	2	1.3
-1qrs14	3	1.4
-1tuv15	1	1.0
-1wxy16	2	1.1
-1zab17	3	1.2
-1cde18	1	1.3
-1fgh19	2	1.4
-1ijk20	3	1.0
-PREHOOK: query: ALTER TABLE parquet_columnar_access REPLACE COLUMNS (s1 string, i1 int, f1 float)
+1abc00	1	2	1.0
+1def01	2	3	1.1
+1ghi02	3	4	1.2
+1jkl03	1	2	1.3
+1mno04	2	3	1.4
+1pqr05	3	4	1.0
+1stu06	1	2	1.1
+1vwx07	2	3	1.2
+1yza08	3	4	1.3
+1bcd09	1	2	1.4
+1efg10	2	3	1.0
+1hij11	3	4	1.1
+1klm12	1	2	1.2
+1nop13	2	3	1.3
+1qrs14	3	4	1.4
+1tuv15	1	2	1.0
+1wxy16	2	3	1.1
+1zab17	3	4	1.2
+1cde18	1	2	1.3
+1fgh19	2	3	1.4
+1ijk20	3	4	1.0
+PREHOOK: query: ALTER TABLE parquet_columnar_access REPLACE COLUMNS (s1 string, x1 int, y1 int, f1 float)
 PREHOOK: type: ALTERTABLE_REPLACECOLS
 PREHOOK: Input: default@parquet_columnar_access
 PREHOOK: Output: default@parquet_columnar_access
-POSTHOOK: query: ALTER TABLE parquet_columnar_access REPLACE COLUMNS (s1 string, i1 int, f1 float)
+POSTHOOK: query: ALTER TABLE parquet_columnar_access REPLACE COLUMNS (s1 string, x1 int, y1 int, f1 float)
 POSTHOOK: type: ALTERTABLE_REPLACECOLS
 POSTHOOK: Input: default@parquet_columnar_access
 POSTHOOK: Output: default@parquet_columnar_access
@@ -106,24 +109,24 @@ POSTHOOK: query: SELECT * FROM parquet_c
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@parquet_columnar_access
 #### A masked pattern was here ####
-1abc00	1	1.0
-1def01	2	1.1
-1ghi02	3	1.2
-1jkl03	1	1.3
-1mno04	2	1.4
-1pqr05	3	1.0
-1stu06	1	1.1
-1vwx07	2	1.2
-1yza08	3	1.3
-1bcd09	1	1.4
-1efg10	2	1.0
-1hij11	3	1.1
-1klm12	1	1.2
-1nop13	2	1.3
-1qrs14	3	1.4
-1tuv15	1	1.0
-1wxy16	2	1.1
-1zab17	3	1.2
-1cde18	1	1.3
-1fgh19	2	1.4
-1ijk20	3	1.0
+1abc00	1	2	1.0
+1def01	2	3	1.1
+1ghi02	3	4	1.2
+1jkl03	1	2	1.3
+1mno04	2	3	1.4
+1pqr05	3	4	1.0
+1stu06	1	2	1.1
+1vwx07	2	3	1.2
+1yza08	3	4	1.3
+1bcd09	1	2	1.4
+1efg10	2	3	1.0
+1hij11	3	4	1.1
+1klm12	1	2	1.2
+1nop13	2	3	1.3
+1qrs14	3	4	1.4
+1tuv15	1	2	1.0
+1wxy16	2	3	1.1
+1zab17	3	4	1.2
+1cde18	1	2	1.3
+1fgh19	2	3	1.4
+1ijk20	3	4	1.0

Added: hive/trunk/ql/src/test/results/clientpositive/parquet_mixed_case.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/parquet_mixed_case.q.out?rev=1615979&view=auto
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/parquet_mixed_case.q.out (added)
+++ hive/trunk/ql/src/test/results/clientpositive/parquet_mixed_case.q.out Tue Aug  5 18:57:18 2014
@@ -0,0 +1,41 @@
+PREHOOK: query: DROP TABLE parquet_mixed_case
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE parquet_mixed_case
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE parquet_mixed_case (
+  lowerCase string,
+  UPPERcase string,
+  stats bigint,
+  moreuppercase string,
+  MORELOWERCASE string
+) STORED AS PARQUET
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+POSTHOOK: query: CREATE TABLE parquet_mixed_case (
+  lowerCase string,
+  UPPERcase string,
+  stats bigint,
+  moreuppercase string,
+  MORELOWERCASE string
+) STORED AS PARQUET
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@parquet_mixed_case
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_mixed_case' OVERWRITE INTO TABLE parquet_mixed_case
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@parquet_mixed_case
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_mixed_case' OVERWRITE INTO TABLE parquet_mixed_case
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@parquet_mixed_case
+PREHOOK: query: SELECT lowercase, "|", uppercase, "|", stats, "|", moreuppercase, "|", morelowercase FROM parquet_mixed_case
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_mixed_case
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT lowercase, "|", uppercase, "|", stats, "|", moreuppercase, "|", morelowercase FROM parquet_mixed_case
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_mixed_case
+#### A masked pattern was here ####
+test lowercase string	|	test upperCase string	|	NULL	|	more upperCase string	|	more lowercase string
+test lowercase string2	|	test upperCase string2	|	NULL	|	more upperCase string2	|	more lowercase string2