You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by he...@apache.org on 2017/03/05 23:37:07 UTC
[5/8] incubator-impala git commit: IMPALA-4675: Case-insensitive
matching of Parquet fields.
IMPALA-4675: Case-insensitive matching of Parquet fields.
The query option PARQUET_FALLBACK_SCHEMA_RESOLUTION
allows matching of Parquet fields by name instead of by
index (the default).
Parquet column names are case sensitive, but Impala treats
db/table/column/field names as case-insensitive. Today,
there is no way today to select Parquet columns with mixed
casing via SQL using the name-based field resolution policy.
This patch changes the matching of Parquet fields to be
case-insensitive.
Testing:
- Modified the data files backing complextypestbl
to contain fields with mixed casing.
- Several existing tests run against this table,
including the test for name-based resolution.
- I confirmed that without this fix, the existing
name-based resolution tests fail on the modified
data files.
- I locally ran test_scanners.py and test_nested_types.py
on exhaustive with this fix.
Change-Id: I87395f84ba29b4c3d8e41be1ea4e89e500b8a9f4
Reviewed-on: http://gerrit.cloudera.org:8080/5891
Reviewed-by: Alex Behm <al...@cloudera.com>
Tested-by: Impala Public Jenkins
Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/34353218
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/34353218
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/34353218
Branch: refs/heads/master
Commit: 34353218cebbcd02630b07681624c1a1d5c9fd5b
Parents: 7ea96c6
Author: Nathan Salmon <na...@gmail.com>
Authored: Wed Mar 1 13:23:41 2017 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Fri Mar 3 10:20:07 2017 +0000
----------------------------------------------------------------------
be/src/exec/parquet-metadata-utils.cc | 3 +-
be/src/exec/parquet-metadata-utils.h | 4 +-
testdata/ComplexTypesTbl/nonnullable.avsc | 16 ++--
testdata/ComplexTypesTbl/nonnullable.json | 14 ++--
testdata/ComplexTypesTbl/nonnullable.parq | Bin 3190 -> 3186 bytes
testdata/ComplexTypesTbl/nullable.avsc | 16 ++--
testdata/ComplexTypesTbl/nullable.json | 80 +++++++++----------
testdata/ComplexTypesTbl/nullable.parq | Bin 3900 -> 3896 bytes
.../QueryTest/parquet-resolution-by-name.test | 12 ---
tests/query_test/test_scanners.py | 2 +-
10 files changed, 71 insertions(+), 76 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34353218/be/src/exec/parquet-metadata-utils.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/parquet-metadata-utils.cc b/be/src/exec/parquet-metadata-utils.cc
index 7fb2313..2527219 100644
--- a/be/src/exec/parquet-metadata-utils.cc
+++ b/be/src/exec/parquet-metadata-utils.cc
@@ -20,6 +20,7 @@
#include <string>
#include <sstream>
#include <vector>
+#include <strings.h>
#include <boost/algorithm/string.hpp>
#include <gutil/strings/substitute.h>
@@ -546,7 +547,7 @@ int ParquetSchemaResolver::FindChildWithName(SchemaNode* node,
const string& name) const {
int idx;
for (idx = 0; idx < node->children.size(); ++idx) {
- if (node->children[idx].element->name == name) break;
+ if (strcasecmp(node->children[idx].element->name.c_str(), name.c_str()) == 0) break;
}
return idx;
}
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34353218/be/src/exec/parquet-metadata-utils.h
----------------------------------------------------------------------
diff --git a/be/src/exec/parquet-metadata-utils.h b/be/src/exec/parquet-metadata-utils.h
index fe7552e..fdfc463 100644
--- a/be/src/exec/parquet-metadata-utils.h
+++ b/be/src/exec/parquet-metadata-utils.h
@@ -188,7 +188,9 @@ class ParquetSchemaResolver {
int next_idx, SchemaNode* node, bool* missing_field) const;
/// Returns the index of 'node's child with 'name', or the number of children if not
- /// found.
+ /// found. The name comparison is case-insensitive because that's how Impala treats
+ /// db/table/column/field names. If there are several matches with different casing,
+ /// then the index of the first match is returned.
int FindChildWithName(SchemaNode* node, const string& name) const;
/// The ResolvePathHelper() logic for arrays.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34353218/testdata/ComplexTypesTbl/nonnullable.avsc
----------------------------------------------------------------------
diff --git a/testdata/ComplexTypesTbl/nonnullable.avsc b/testdata/ComplexTypesTbl/nonnullable.avsc
index 5f78dcf..b82c423 100644
--- a/testdata/ComplexTypesTbl/nonnullable.avsc
+++ b/testdata/ComplexTypesTbl/nonnullable.avsc
@@ -1,26 +1,28 @@
{"type": "record",
"namespace": "org.apache.impala",
"name": "ComplexTypesTbl",
+ /* Field names have mixed casing to test the case-insensitive matching of
+ fields in Parquet files. */
"fields": [
- {"name": "id", "type": "long"},
- {"name": "int_array", "type": {"type": "array", "items": "int"}},
+ {"name": "ID", "type": "long"},
+ {"name": "Int_Array", "type": {"type": "array", "items": "int"}},
{"name": "int_array_array", "type": {"type": "array", "items":
{"type": "array", "items": "int"}}},
- {"name": "int_map", "type": {"type": "map", "values": "int"}},
+ {"name": "Int_Map", "type": {"type": "map", "values": "int"}},
{"name": "int_map_array", "type": {"type": "array", "items":
{"type": "map", "values": "int"}}},
- {"name": "nested_struct", "type":
+ {"name": "nested_Struct", "type":
{"type": "record", "name": "r1", "fields": [
{"name": "a", "type": "int"},
- {"name": "b", "type": {"type": "array", "items": "int"}},
+ {"name": "B", "type": {"type": "array", "items": "int"}},
{"name": "c", "type": {"type": "record", "name": "r2", "fields": [
- {"name": "d", "type": {"type": "array", "items":
+ {"name": "D", "type": {"type": "array", "items":
{"type": "array", "items":
{"type": "record", "name": "r3", "fields": [
{"name": "e", "type": "int"},
{"name": "f", "type": "string"}]}}}}
]}},
- {"name": "g", "type": {"type": "map", "values": {
+ {"name": "G", "type": {"type": "map", "values": {
"type": "record", "name": "r4", "fields": [
{"name": "h", "type": {"type": "record", "name": "r5", "fields": [
{"name": "i", "type": {"type": "array", "items": "double"}}]}}
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34353218/testdata/ComplexTypesTbl/nonnullable.json
----------------------------------------------------------------------
diff --git a/testdata/ComplexTypesTbl/nonnullable.json b/testdata/ComplexTypesTbl/nonnullable.json
index 9c49460..4da4968 100644
--- a/testdata/ComplexTypesTbl/nonnullable.json
+++ b/testdata/ComplexTypesTbl/nonnullable.json
@@ -1,14 +1,14 @@
[
-{"id": 8,
- "int_array": [-1],
+{"ID": 8,
+ "Int_Array": [-1],
"int_array_array": [[-1,-2],[]],
- "int_map": {"k1": -1},
+ "Int_Map": {"k1": -1},
"int_map_array": [{}, {"k1": 1}, {}, {}],
- "nested_struct": {
+ "nested_Struct": {
"a": -1,
- "b": [-1],
+ "B": [-1],
"c": {
- "d": [
+ "D": [
[{"e": -1, "f": "nonnullable"}]]},
- "g": {}}}
+ "G": {}}}
]
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34353218/testdata/ComplexTypesTbl/nonnullable.parq
----------------------------------------------------------------------
diff --git a/testdata/ComplexTypesTbl/nonnullable.parq b/testdata/ComplexTypesTbl/nonnullable.parq
index e13d9eb..f4be082 100644
Binary files a/testdata/ComplexTypesTbl/nonnullable.parq and b/testdata/ComplexTypesTbl/nonnullable.parq differ
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34353218/testdata/ComplexTypesTbl/nullable.avsc
----------------------------------------------------------------------
diff --git a/testdata/ComplexTypesTbl/nullable.avsc b/testdata/ComplexTypesTbl/nullable.avsc
index 564bea8..b3bd874 100644
--- a/testdata/ComplexTypesTbl/nullable.avsc
+++ b/testdata/ComplexTypesTbl/nullable.avsc
@@ -1,28 +1,30 @@
{"type": "record",
"namespace": "org.apache.impala",
"name": "ComplexTypesTbl",
+ /* Field names have mixed casing to test the case-insensitive matching of
+ fields in Parquet files. */
"fields": [
{"name": "id", "type": ["null", "long"]},
{"name": "int_array", "type": ["null", {"type": "array", "items": ["null", "int"]}]},
- {"name": "int_array_array", "type": ["null", {"type": "array", "items":
+ {"name": "int_array_Array", "type": ["null", {"type": "array", "items":
["null", {"type": "array", "items": ["null", "int"]}]}]},
{"name": "int_map", "type": ["null", {"type": "map", "values": ["null", "int"]}]},
- {"name": "int_map_array", "type": ["null", {"type": "array", "items":
+ {"name": "int_Map_Array", "type": ["null", {"type": "array", "items":
["null", {"type": "map", "values": ["null", "int"]}]}]},
{"name": "nested_struct", "type":
["null", {"type": "record", "name": "r1", "fields": [
- {"name": "a", "type": ["null", "int"]},
+ {"name": "A", "type": ["null", "int"]},
{"name": "b", "type": ["null", {"type": "array", "items": ["null", "int"]}]},
- {"name": "c", "type": ["null", {"type": "record", "name": "r2", "fields": [
+ {"name": "C", "type": ["null", {"type": "record", "name": "r2", "fields": [
{"name": "d", "type": ["null", {"type": "array", "items":
["null", {"type": "array", "items":
["null", {"type": "record", "name": "r3", "fields": [
- {"name": "e", "type": ["null", "int"]},
- {"name": "f", "type": ["null", "string"]}]}]}]}]}
+ {"name": "E", "type": ["null", "int"]},
+ {"name": "F", "type": ["null", "string"]}]}]}]}]}
]}]},
{"name": "g", "type": ["null", {"type": "map", "values":
["null", {"type": "record", "name": "r4", "fields": [
- {"name": "h", "type":
+ {"name": "H", "type":
["null", {"type": "record", "name": "r5", "fields": [
{"name": "i", "type": ["null", {"type": "array", "items":
["null", "double"]}]}]}]}]}]}]}
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34353218/testdata/ComplexTypesTbl/nullable.json
----------------------------------------------------------------------
diff --git a/testdata/ComplexTypesTbl/nullable.json b/testdata/ComplexTypesTbl/nullable.json
index 251b522..6f49c20 100644
--- a/testdata/ComplexTypesTbl/nullable.json
+++ b/testdata/ComplexTypesTbl/nullable.json
@@ -1,86 +1,86 @@
[
{"id": 1,
"int_array": [1,2,3],
- "int_array_array": [[1,2],[3,4]],
+ "int_array_Array": [[1,2],[3,4]],
"int_map": {"k1": 1, "k2": 100},
- "int_map_array": [{"k1": 1}],
+ "int_Map_Array": [{"k1": 1}],
"nested_struct": {
- "a": 1,
+ "A": 1,
"b": [1],
- "c": {
+ "C": {
"d": [
- [{"e": 10, "f": "aaa"},
- {"e": -10, "f": "bbb"}],
- [{"e": 11, "f": "c"}]]},
- "g": {"foo": {"h": {"i": [1.1]}}}}},
+ [{"E": 10, "F": "aaa"},
+ {"E": -10, "F": "bbb"}],
+ [{"E": 11, "F": "c"}]]},
+ "g": {"foo": {"H": {"i": [1.1]}}}}},
{"id": 2,
"int_array": [null,1,2,null,3,null],
- "int_array_array": [[null,1,2,null],[3,null,4],[], null],
+ "int_array_Array": [[null,1,2,null],[3,null,4],[], null],
"int_map": {"k1": 2, "k2": null},
- "int_map_array": [{"k3": null, "k1": 1}, null, {}],
+ "int_Map_Array": [{"k3": null, "k1": 1}, null, {}],
"nested_struct": {
- "a": null,
+ "A": null,
"b": [null],
- "c": {"d": [
- [{"e": null, "f": null},
- {"e": 10, "f": "aaa"},
- {"e": null, "f": null},
- {"e": -10, "f": "bbb"},
- {"e": null, "f": null}],
- [{"e": 11, "f": "c"},
+ "C": {"d": [
+ [{"E": null, "F": null},
+ {"E": 10, "F": "aaa"},
+ {"E": null, "F": null},
+ {"E": -10, "F": "bbb"},
+ {"E": null, "F": null}],
+ [{"E": 11, "F": "c"},
null],
[],
null]},
"g": {
- "g1": {"h": {"i": [2.2, null]}},
- "g2": {"h": {"i": []}},
+ "g1": {"H": {"i": [2.2, null]}},
+ "g2": {"H": {"i": []}},
"g3": null,
- "g4": {"h": {"i": null}},
- "g5": {"h": null}}}},
+ "g4": {"H": {"i": null}},
+ "g5": {"H": null}}}},
{"id": 3,
"int_array": [],
- "int_array_array": [null],
+ "int_array_Array": [null],
"int_map": {},
- "int_map_array": [null, null],
+ "int_Map_Array": [null, null],
"nested_struct": {
- "a": null,
+ "A": null,
"b": null,
- "c": {"d": []},
+ "C": {"d": []},
"g": {}}},
{"id": 4,
"int_array": null,
- "int_array_array": [],
+ "int_array_Array": [],
"int_map": {},
- "int_map_array": [],
+ "int_Map_Array": [],
"nested_struct": {
- "a": null,
+ "A": null,
"b": null,
- "c": {"d": null},
+ "C": {"d": null},
"g": null}},
{"id": 5,
"int_array": null,
- "int_array_array": null,
+ "int_array_Array": null,
"int_map": {},
"nested_struct": {
- "a": null,
+ "A": null,
"b": null,
- "c": null,
- "g": {"foo": {"h": {"i": [2.2, 3.3]}}}}},
+ "C": null,
+ "g": {"foo": {"H": {"i": [2.2, 3.3]}}}}},
{"id": 6,
"int_array": null,
- "int_array_array": null,
+ "int_array_Array": null,
"int_map": null,
- "int_map_array": null,
+ "int_Map_Array": null,
"nested_struct": null},
{"id": 7,
"int_array": null,
- "int_array_array": [null,[5,6]],
+ "int_array_Array": [null,[5,6]],
"int_map": {"k1": null, "k3": null},
- "int_map_array": null,
+ "int_Map_Array": null,
"nested_struct": {
- "a": 7,
+ "A": 7,
"b": [2,3,null],
- "c": {"d": [
+ "C": {"d": [
[],
[null],
null]},
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34353218/testdata/ComplexTypesTbl/nullable.parq
----------------------------------------------------------------------
diff --git a/testdata/ComplexTypesTbl/nullable.parq b/testdata/ComplexTypesTbl/nullable.parq
index 01955a6..2c72f52 100644
Binary files a/testdata/ComplexTypesTbl/nullable.parq and b/testdata/ComplexTypesTbl/nullable.parq differ
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34353218/testdata/workloads/functional-query/queries/QueryTest/parquet-resolution-by-name.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-resolution-by-name.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-resolution-by-name.test
index 2546e9c..45c739e 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/parquet-resolution-by-name.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-resolution-by-name.test
@@ -1,7 +1,6 @@
====
---- QUERY
# Create a table and populate with data file
-drop table if exists resolution_by_name_test;
create table resolution_by_name_test stored as parquet
as select * from functional_parquet.tinytable;
select a, b from resolution_by_name_test;
@@ -69,11 +68,7 @@ string,string
'eeeeeeee','f'
====
---- QUERY
-drop table resolution_by_name_test;
-====
----- QUERY
# Test nested types resolution
-drop table if exists nested_resolution_by_name_test;
create table nested_resolution_by_name_test like functional_parquet.complextypestbl;
====
---- SHELL
@@ -191,11 +186,7 @@ string
'NULL'
====
---- QUERY
-drop table nested_resolution_by_name_test;
-====
----- QUERY
# Test switched key/value map fields
-drop table if exists switched_map_fields_resolution_test;
create table switched_map_fields_resolution_test (int_map map<string,int>)
stored as parquet;
====
@@ -227,9 +218,6 @@ switched_map.parq' has an incompatible Parquet schema for column
required int32 value [i:0 d:1 r:1]
====
---- QUERY
-drop table switched_map_fields_resolution_test
-====
----- QUERY
# Check that we handle bad options gracefully
set parquet_fallback_schema_resolution="FOO"
---- CATCH
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34353218/tests/query_test/test_scanners.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py
index de4f826..c286c3a 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -507,7 +507,7 @@ class TestParquet(ImpalaTestSuite):
assert d_schema_elt.converted_type == None
@SkipIfOldAggsJoins.nested_types
- def test_resolution_by_name(self, unique_database, vector):
+ def test_resolution_by_name(self, vector, unique_database):
self.run_test_case('QueryTest/parquet-resolution-by-name', vector,
use_db=unique_database)