You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by sz...@apache.org on 2014/08/05 20:57:18 UTC
svn commit: r1615979 - in /hive/trunk/ql/src:
java/org/apache/hadoop/hive/ql/io/parquet/read/
test/queries/clientpositive/ test/results/clientpositive/
Author: szehon
Date: Tue Aug 5 18:57:18 2014
New Revision: 1615979
URL: http://svn.apache.org/r1615979
Log:
HIVE-7554 : Parquet Hive should resolve column names in case insensitive manner (Brock Noland via Szehon)
Added:
hive/trunk/ql/src/test/queries/clientpositive/parquet_mixed_case.q
hive/trunk/ql/src/test/results/clientpositive/parquet_mixed_case.q.out
Modified:
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java
hive/trunk/ql/src/test/queries/clientpositive/parquet_columnar.q
hive/trunk/ql/src/test/results/clientpositive/parquet_columnar.q.out
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java?rev=1615979&r1=1615978&r2=1615979&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java Tue Aug 5 18:57:18 2014
@@ -26,6 +26,7 @@ import org.apache.hadoop.hive.serde2.Col
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.util.StringUtils;
+import parquet.column.ColumnDescriptor;
import parquet.hadoop.api.ReadSupport;
import parquet.io.api.RecordMaterializer;
import parquet.schema.MessageType;
@@ -46,8 +47,8 @@ public class DataWritableReadSupport ext
private static final String TABLE_SCHEMA = "table_schema";
public static final String HIVE_SCHEMA_KEY = "HIVE_TABLE_SCHEMA";
- public static final String PARQUET_COLUMN_INDEX_ACCESS = "parquet.column.index.access";
-
+ public static final String PARQUET_COLUMN_INDEX_ACCESS = "parquet.column.index.access";
+
/**
* From a string which columns names (including hive column), return a list
* of string columns
@@ -75,12 +76,16 @@ public class DataWritableReadSupport ext
final Map<String, String> contextMetadata = new HashMap<String, String>();
if (columns != null) {
final List<String> listColumns = getColumns(columns);
-
+ final Map<String, String> lowerCaseFileSchemaColumns = new HashMap<String,String>();
+ for (ColumnDescriptor c : fileSchema.getColumns()) {
+ lowerCaseFileSchemaColumns.put(c.getPath()[0].toLowerCase(), c.getPath()[0]);
+ }
final List<Type> typeListTable = new ArrayList<Type>();
- for (final String col : listColumns) {
+ for (String col : listColumns) {
+ col = col.toLowerCase();
// listColumns contains partition columns which are metadata only
- if (fileSchema.containsField(col)) {
- typeListTable.add(fileSchema.getType(col));
+ if (lowerCaseFileSchemaColumns.containsKey(col)) {
+ typeListTable.add(fileSchema.getType(lowerCaseFileSchemaColumns.get(col)));
} else {
// below allows schema evolution
typeListTable.add(new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, col));
@@ -93,10 +98,24 @@ public class DataWritableReadSupport ext
final List<Integer> indexColumnsWanted = ColumnProjectionUtils.getReadColumnIDs(configuration);
final List<Type> typeListWanted = new ArrayList<Type>();
+ final boolean indexAccess = configuration.getBoolean(PARQUET_COLUMN_INDEX_ACCESS, false);
for (final Integer idx : indexColumnsWanted) {
- typeListWanted.add(tableSchema.getType(listColumns.get(idx)));
+ String col = listColumns.get(idx);
+ if (indexAccess) {
+ typeListWanted.add(tableSchema.getType(col));
+ } else {
+ col = col.toLowerCase();
+ if (lowerCaseFileSchemaColumns.containsKey(col)) {
+ typeListWanted.add(tableSchema.getType(lowerCaseFileSchemaColumns.get(col)));
+ } else {
+ // should never occur?
+ String msg = "Column " + col + " at index " + idx + " does not exist in " +
+ lowerCaseFileSchemaColumns;
+ throw new IllegalStateException(msg);
+ }
+ }
}
- requestedSchemaByUser = resolveSchemaAccess(new MessageType(fileSchema.getName(),
+ requestedSchemaByUser = resolveSchemaAccess(new MessageType(fileSchema.getName(),
typeListWanted), fileSchema, configuration);
return new ReadContext(requestedSchemaByUser, contextMetadata);
@@ -127,29 +146,24 @@ public class DataWritableReadSupport ext
}
final MessageType tableSchema = resolveSchemaAccess(MessageTypeParser.
parseMessageType(metadata.get(HIVE_SCHEMA_KEY)), fileSchema, configuration);
-
return new DataWritableRecordConverter(readContext.getRequestedSchema(), tableSchema);
}
-
+
/**
- * Determine the file column names based on the position within the requested columns and
+ * Determine the file column names based on the position within the requested columns and
* use that as the requested schema.
*/
- private MessageType resolveSchemaAccess(MessageType requestedSchema, MessageType fileSchema,
+ private MessageType resolveSchemaAccess(MessageType requestedSchema, MessageType fileSchema,
Configuration configuration) {
- if(configuration.getBoolean(PARQUET_COLUMN_INDEX_ACCESS, false)) {
+ if (configuration.getBoolean(PARQUET_COLUMN_INDEX_ACCESS, false)) {
final List<String> listColumns = getColumns(configuration.get(IOConstants.COLUMNS));
-
List<Type> requestedTypes = new ArrayList<Type>();
-
for(Type t : requestedSchema.getFields()) {
int index = listColumns.indexOf(t.getName());
requestedTypes.add(fileSchema.getType(index));
}
-
requestedSchema = new MessageType(requestedSchema.getName(), requestedTypes);
}
-
return requestedSchema;
}
}
Modified: hive/trunk/ql/src/test/queries/clientpositive/parquet_columnar.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/parquet_columnar.q?rev=1615979&r1=1615978&r2=1615979&view=diff
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/parquet_columnar.q (original)
+++ hive/trunk/ql/src/test/queries/clientpositive/parquet_columnar.q Tue Aug 5 18:57:18 2014
@@ -13,15 +13,16 @@ CREATE TABLE parquet_columnar_access_sta
CREATE TABLE parquet_columnar_access (
s string,
- i int,
+ x int,
+ y int,
f float
) STORED AS PARQUET;
LOAD DATA LOCAL INPATH '../../data/files/parquet_columnar.txt' OVERWRITE INTO TABLE parquet_columnar_access_stage;
-INSERT OVERWRITE TABLE parquet_columnar_access SELECT * FROM parquet_columnar_access_stage;
+INSERT OVERWRITE TABLE parquet_columnar_access SELECT s, i, (i + 1), f FROM parquet_columnar_access_stage;
SELECT * FROM parquet_columnar_access;
-ALTER TABLE parquet_columnar_access REPLACE COLUMNS (s1 string, i1 int, f1 float);
+ALTER TABLE parquet_columnar_access REPLACE COLUMNS (s1 string, x1 int, y1 int, f1 float);
SELECT * FROM parquet_columnar_access;
Added: hive/trunk/ql/src/test/queries/clientpositive/parquet_mixed_case.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/parquet_mixed_case.q?rev=1615979&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/parquet_mixed_case.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/parquet_mixed_case.q Tue Aug 5 18:57:18 2014
@@ -0,0 +1,13 @@
+DROP TABLE parquet_mixed_case;
+
+CREATE TABLE parquet_mixed_case (
+ lowerCase string,
+ UPPERcase string,
+ stats bigint,
+ moreuppercase string,
+ MORELOWERCASE string
+) STORED AS PARQUET;
+
+LOAD DATA LOCAL INPATH '../../data/files/parquet_mixed_case' OVERWRITE INTO TABLE parquet_mixed_case;
+
+SELECT lowercase, "|", uppercase, "|", stats, "|", moreuppercase, "|", morelowercase FROM parquet_mixed_case;
Modified: hive/trunk/ql/src/test/results/clientpositive/parquet_columnar.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/parquet_columnar.q.out?rev=1615979&r1=1615978&r2=1615979&view=diff
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/parquet_columnar.q.out (original)
+++ hive/trunk/ql/src/test/results/clientpositive/parquet_columnar.q.out Tue Aug 5 18:57:18 2014
@@ -29,14 +29,16 @@ POSTHOOK: Output: database:default
POSTHOOK: Output: default@parquet_columnar_access_stage
PREHOOK: query: CREATE TABLE parquet_columnar_access (
s string,
- i int,
+ x int,
+ y int,
f float
) STORED AS PARQUET
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
POSTHOOK: query: CREATE TABLE parquet_columnar_access (
s string,
- i int,
+ x int,
+ y int,
f float
) STORED AS PARQUET
POSTHOOK: type: CREATETABLE
@@ -50,17 +52,18 @@ POSTHOOK: query: LOAD DATA LOCAL INPATH
POSTHOOK: type: LOAD
#### A masked pattern was here ####
POSTHOOK: Output: default@parquet_columnar_access_stage
-PREHOOK: query: INSERT OVERWRITE TABLE parquet_columnar_access SELECT * FROM parquet_columnar_access_stage
+PREHOOK: query: INSERT OVERWRITE TABLE parquet_columnar_access SELECT s, i, (i + 1), f FROM parquet_columnar_access_stage
PREHOOK: type: QUERY
PREHOOK: Input: default@parquet_columnar_access_stage
PREHOOK: Output: default@parquet_columnar_access
-POSTHOOK: query: INSERT OVERWRITE TABLE parquet_columnar_access SELECT * FROM parquet_columnar_access_stage
+POSTHOOK: query: INSERT OVERWRITE TABLE parquet_columnar_access SELECT s, i, (i + 1), f FROM parquet_columnar_access_stage
POSTHOOK: type: QUERY
POSTHOOK: Input: default@parquet_columnar_access_stage
POSTHOOK: Output: default@parquet_columnar_access
POSTHOOK: Lineage: parquet_columnar_access.f SIMPLE [(parquet_columnar_access_stage)parquet_columnar_access_stage.FieldSchema(name:f, type:float, comment:null), ]
-POSTHOOK: Lineage: parquet_columnar_access.i SIMPLE [(parquet_columnar_access_stage)parquet_columnar_access_stage.FieldSchema(name:i, type:int, comment:null), ]
POSTHOOK: Lineage: parquet_columnar_access.s SIMPLE [(parquet_columnar_access_stage)parquet_columnar_access_stage.FieldSchema(name:s, type:string, comment:null), ]
+POSTHOOK: Lineage: parquet_columnar_access.x SIMPLE [(parquet_columnar_access_stage)parquet_columnar_access_stage.FieldSchema(name:i, type:int, comment:null), ]
+POSTHOOK: Lineage: parquet_columnar_access.y EXPRESSION [(parquet_columnar_access_stage)parquet_columnar_access_stage.FieldSchema(name:i, type:int, comment:null), ]
PREHOOK: query: SELECT * FROM parquet_columnar_access
PREHOOK: type: QUERY
PREHOOK: Input: default@parquet_columnar_access
@@ -69,32 +72,32 @@ POSTHOOK: query: SELECT * FROM parquet_c
POSTHOOK: type: QUERY
POSTHOOK: Input: default@parquet_columnar_access
#### A masked pattern was here ####
-1abc00 1 1.0
-1def01 2 1.1
-1ghi02 3 1.2
-1jkl03 1 1.3
-1mno04 2 1.4
-1pqr05 3 1.0
-1stu06 1 1.1
-1vwx07 2 1.2
-1yza08 3 1.3
-1bcd09 1 1.4
-1efg10 2 1.0
-1hij11 3 1.1
-1klm12 1 1.2
-1nop13 2 1.3
-1qrs14 3 1.4
-1tuv15 1 1.0
-1wxy16 2 1.1
-1zab17 3 1.2
-1cde18 1 1.3
-1fgh19 2 1.4
-1ijk20 3 1.0
-PREHOOK: query: ALTER TABLE parquet_columnar_access REPLACE COLUMNS (s1 string, i1 int, f1 float)
+1abc00 1 2 1.0
+1def01 2 3 1.1
+1ghi02 3 4 1.2
+1jkl03 1 2 1.3
+1mno04 2 3 1.4
+1pqr05 3 4 1.0
+1stu06 1 2 1.1
+1vwx07 2 3 1.2
+1yza08 3 4 1.3
+1bcd09 1 2 1.4
+1efg10 2 3 1.0
+1hij11 3 4 1.1
+1klm12 1 2 1.2
+1nop13 2 3 1.3
+1qrs14 3 4 1.4
+1tuv15 1 2 1.0
+1wxy16 2 3 1.1
+1zab17 3 4 1.2
+1cde18 1 2 1.3
+1fgh19 2 3 1.4
+1ijk20 3 4 1.0
+PREHOOK: query: ALTER TABLE parquet_columnar_access REPLACE COLUMNS (s1 string, x1 int, y1 int, f1 float)
PREHOOK: type: ALTERTABLE_REPLACECOLS
PREHOOK: Input: default@parquet_columnar_access
PREHOOK: Output: default@parquet_columnar_access
-POSTHOOK: query: ALTER TABLE parquet_columnar_access REPLACE COLUMNS (s1 string, i1 int, f1 float)
+POSTHOOK: query: ALTER TABLE parquet_columnar_access REPLACE COLUMNS (s1 string, x1 int, y1 int, f1 float)
POSTHOOK: type: ALTERTABLE_REPLACECOLS
POSTHOOK: Input: default@parquet_columnar_access
POSTHOOK: Output: default@parquet_columnar_access
@@ -106,24 +109,24 @@ POSTHOOK: query: SELECT * FROM parquet_c
POSTHOOK: type: QUERY
POSTHOOK: Input: default@parquet_columnar_access
#### A masked pattern was here ####
-1abc00 1 1.0
-1def01 2 1.1
-1ghi02 3 1.2
-1jkl03 1 1.3
-1mno04 2 1.4
-1pqr05 3 1.0
-1stu06 1 1.1
-1vwx07 2 1.2
-1yza08 3 1.3
-1bcd09 1 1.4
-1efg10 2 1.0
-1hij11 3 1.1
-1klm12 1 1.2
-1nop13 2 1.3
-1qrs14 3 1.4
-1tuv15 1 1.0
-1wxy16 2 1.1
-1zab17 3 1.2
-1cde18 1 1.3
-1fgh19 2 1.4
-1ijk20 3 1.0
+1abc00 1 2 1.0
+1def01 2 3 1.1
+1ghi02 3 4 1.2
+1jkl03 1 2 1.3
+1mno04 2 3 1.4
+1pqr05 3 4 1.0
+1stu06 1 2 1.1
+1vwx07 2 3 1.2
+1yza08 3 4 1.3
+1bcd09 1 2 1.4
+1efg10 2 3 1.0
+1hij11 3 4 1.1
+1klm12 1 2 1.2
+1nop13 2 3 1.3
+1qrs14 3 4 1.4
+1tuv15 1 2 1.0
+1wxy16 2 3 1.1
+1zab17 3 4 1.2
+1cde18 1 2 1.3
+1fgh19 2 3 1.4
+1ijk20 3 4 1.0
Added: hive/trunk/ql/src/test/results/clientpositive/parquet_mixed_case.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/parquet_mixed_case.q.out?rev=1615979&view=auto
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/parquet_mixed_case.q.out (added)
+++ hive/trunk/ql/src/test/results/clientpositive/parquet_mixed_case.q.out Tue Aug 5 18:57:18 2014
@@ -0,0 +1,41 @@
+PREHOOK: query: DROP TABLE parquet_mixed_case
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE parquet_mixed_case
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE parquet_mixed_case (
+ lowerCase string,
+ UPPERcase string,
+ stats bigint,
+ moreuppercase string,
+ MORELOWERCASE string
+) STORED AS PARQUET
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+POSTHOOK: query: CREATE TABLE parquet_mixed_case (
+ lowerCase string,
+ UPPERcase string,
+ stats bigint,
+ moreuppercase string,
+ MORELOWERCASE string
+) STORED AS PARQUET
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@parquet_mixed_case
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_mixed_case' OVERWRITE INTO TABLE parquet_mixed_case
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@parquet_mixed_case
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_mixed_case' OVERWRITE INTO TABLE parquet_mixed_case
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@parquet_mixed_case
+PREHOOK: query: SELECT lowercase, "|", uppercase, "|", stats, "|", moreuppercase, "|", morelowercase FROM parquet_mixed_case
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_mixed_case
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT lowercase, "|", uppercase, "|", stats, "|", moreuppercase, "|", morelowercase FROM parquet_mixed_case
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_mixed_case
+#### A masked pattern was here ####
+test lowercase string | test upperCase string | NULL | more upperCase string | more lowercase string
+test lowercase string2 | test upperCase string2 | NULL | more upperCase string2 | more lowercase string2