You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by he...@apache.org on 2017/03/05 23:37:07 UTC

[5/8] incubator-impala git commit: IMPALA-4675: Case-insensitive matching of Parquet fields.

IMPALA-4675: Case-insensitive matching of Parquet fields.

The query option PARQUET_FALLBACK_SCHEMA_RESOLUTION
allows matching of Parquet fields by name instead of by
index (the default).

Parquet column names are case sensitive, but Impala treats
db/table/column/field names as case-insensitive. Today,
there is no way today to select Parquet columns with mixed
casing via SQL using the name-based field resolution policy.

This patch changes the matching of Parquet fields to be
case-insensitive.

Testing:
- Modified the data files backing complextypestbl
  to contain fields with mixed casing.
- Several existing tests run against this table,
  including the test for name-based resolution.
- I confirmed that without this fix, the existing
  name-based resolution tests fail on the modified
  data files.
- I locally ran test_scanners.py and test_nested_types.py
  on exhaustive with this fix.

Change-Id: I87395f84ba29b4c3d8e41be1ea4e89e500b8a9f4
Reviewed-on: http://gerrit.cloudera.org:8080/5891
Reviewed-by: Alex Behm <al...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/34353218
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/34353218
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/34353218

Branch: refs/heads/master
Commit: 34353218cebbcd02630b07681624c1a1d5c9fd5b
Parents: 7ea96c6
Author: Nathan Salmon <na...@gmail.com>
Authored: Wed Mar 1 13:23:41 2017 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Fri Mar 3 10:20:07 2017 +0000

----------------------------------------------------------------------
 be/src/exec/parquet-metadata-utils.cc           |   3 +-
 be/src/exec/parquet-metadata-utils.h            |   4 +-
 testdata/ComplexTypesTbl/nonnullable.avsc       |  16 ++--
 testdata/ComplexTypesTbl/nonnullable.json       |  14 ++--
 testdata/ComplexTypesTbl/nonnullable.parq       | Bin 3190 -> 3186 bytes
 testdata/ComplexTypesTbl/nullable.avsc          |  16 ++--
 testdata/ComplexTypesTbl/nullable.json          |  80 +++++++++----------
 testdata/ComplexTypesTbl/nullable.parq          | Bin 3900 -> 3896 bytes
 .../QueryTest/parquet-resolution-by-name.test   |  12 ---
 tests/query_test/test_scanners.py               |   2 +-
 10 files changed, 71 insertions(+), 76 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34353218/be/src/exec/parquet-metadata-utils.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/parquet-metadata-utils.cc b/be/src/exec/parquet-metadata-utils.cc
index 7fb2313..2527219 100644
--- a/be/src/exec/parquet-metadata-utils.cc
+++ b/be/src/exec/parquet-metadata-utils.cc
@@ -20,6 +20,7 @@
 #include <string>
 #include <sstream>
 #include <vector>
+#include <strings.h>
 
 #include <boost/algorithm/string.hpp>
 #include <gutil/strings/substitute.h>
@@ -546,7 +547,7 @@ int ParquetSchemaResolver::FindChildWithName(SchemaNode* node,
     const string& name) const {
   int idx;
   for (idx = 0; idx < node->children.size(); ++idx) {
-    if (node->children[idx].element->name == name) break;
+    if (strcasecmp(node->children[idx].element->name.c_str(), name.c_str()) == 0) break;
   }
   return idx;
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34353218/be/src/exec/parquet-metadata-utils.h
----------------------------------------------------------------------
diff --git a/be/src/exec/parquet-metadata-utils.h b/be/src/exec/parquet-metadata-utils.h
index fe7552e..fdfc463 100644
--- a/be/src/exec/parquet-metadata-utils.h
+++ b/be/src/exec/parquet-metadata-utils.h
@@ -188,7 +188,9 @@ class ParquetSchemaResolver {
       int next_idx, SchemaNode* node, bool* missing_field) const;
 
   /// Returns the index of 'node's child with 'name', or the number of children if not
-  /// found.
+  /// found. The name comparison is case-insensitive because that's how Impala treats
+  /// db/table/column/field names. If there are several matches with different casing,
+  /// then the index of the first match is returned.
   int FindChildWithName(SchemaNode* node, const string& name) const;
 
   /// The ResolvePathHelper() logic for arrays.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34353218/testdata/ComplexTypesTbl/nonnullable.avsc
----------------------------------------------------------------------
diff --git a/testdata/ComplexTypesTbl/nonnullable.avsc b/testdata/ComplexTypesTbl/nonnullable.avsc
index 5f78dcf..b82c423 100644
--- a/testdata/ComplexTypesTbl/nonnullable.avsc
+++ b/testdata/ComplexTypesTbl/nonnullable.avsc
@@ -1,26 +1,28 @@
 {"type": "record",
  "namespace": "org.apache.impala",
  "name": "ComplexTypesTbl",
+ /* Field names have mixed casing to test the case-insensitive matching of
+    fields in Parquet files. */
  "fields": [
-     {"name": "id", "type": "long"},
-     {"name": "int_array", "type": {"type": "array", "items": "int"}},
+     {"name": "ID", "type": "long"},
+     {"name": "Int_Array", "type": {"type": "array", "items": "int"}},
      {"name": "int_array_array", "type": {"type": "array", "items":
          {"type": "array", "items": "int"}}},
-     {"name": "int_map", "type": {"type": "map", "values": "int"}},
+     {"name": "Int_Map", "type": {"type": "map", "values": "int"}},
      {"name": "int_map_array", "type": {"type": "array", "items":
          {"type": "map", "values": "int"}}},
-     {"name": "nested_struct", "type":
+     {"name": "nested_Struct", "type":
          {"type": "record", "name": "r1", "fields": [
               {"name": "a", "type": "int"},
-              {"name": "b", "type": {"type": "array", "items": "int"}},
+              {"name": "B", "type": {"type": "array", "items": "int"}},
               {"name": "c", "type": {"type": "record", "name": "r2", "fields": [
-                  {"name": "d", "type": {"type": "array", "items":
+                  {"name": "D", "type": {"type": "array", "items":
                       {"type": "array", "items":
                           {"type": "record", "name": "r3", "fields": [
                               {"name": "e", "type": "int"},
                               {"name": "f", "type": "string"}]}}}}
               ]}},
-              {"name": "g", "type": {"type": "map", "values": {
+              {"name": "G", "type": {"type": "map", "values": {
                    "type": "record", "name": "r4", "fields": [
                        {"name": "h", "type": {"type": "record", "name": "r5", "fields": [
                            {"name": "i", "type": {"type": "array", "items": "double"}}]}}

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34353218/testdata/ComplexTypesTbl/nonnullable.json
----------------------------------------------------------------------
diff --git a/testdata/ComplexTypesTbl/nonnullable.json b/testdata/ComplexTypesTbl/nonnullable.json
index 9c49460..4da4968 100644
--- a/testdata/ComplexTypesTbl/nonnullable.json
+++ b/testdata/ComplexTypesTbl/nonnullable.json
@@ -1,14 +1,14 @@
 [
-{"id": 8,
- "int_array": [-1],
+{"ID": 8,
+ "Int_Array": [-1],
  "int_array_array": [[-1,-2],[]],
- "int_map": {"k1": -1},
+ "Int_Map": {"k1": -1},
  "int_map_array": [{}, {"k1": 1}, {}, {}],
- "nested_struct": {
+ "nested_Struct": {
    "a": -1,
-   "b": [-1],
+   "B": [-1],
    "c": {
-     "d": [
+     "D": [
        [{"e": -1, "f": "nonnullable"}]]},
-   "g": {}}}
+   "G": {}}}
 ]

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34353218/testdata/ComplexTypesTbl/nonnullable.parq
----------------------------------------------------------------------
diff --git a/testdata/ComplexTypesTbl/nonnullable.parq b/testdata/ComplexTypesTbl/nonnullable.parq
index e13d9eb..f4be082 100644
Binary files a/testdata/ComplexTypesTbl/nonnullable.parq and b/testdata/ComplexTypesTbl/nonnullable.parq differ

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34353218/testdata/ComplexTypesTbl/nullable.avsc
----------------------------------------------------------------------
diff --git a/testdata/ComplexTypesTbl/nullable.avsc b/testdata/ComplexTypesTbl/nullable.avsc
index 564bea8..b3bd874 100644
--- a/testdata/ComplexTypesTbl/nullable.avsc
+++ b/testdata/ComplexTypesTbl/nullable.avsc
@@ -1,28 +1,30 @@
 {"type": "record",
  "namespace": "org.apache.impala",
  "name": "ComplexTypesTbl",
+  /* Field names have mixed casing to test the case-insensitive matching of
+     fields in Parquet files. */
  "fields": [
      {"name": "id", "type": ["null", "long"]},
      {"name": "int_array", "type": ["null", {"type": "array", "items": ["null", "int"]}]},
-     {"name": "int_array_array", "type": ["null", {"type": "array", "items":
+     {"name": "int_array_Array", "type": ["null", {"type": "array", "items":
          ["null", {"type": "array", "items": ["null", "int"]}]}]},
      {"name": "int_map", "type": ["null", {"type": "map", "values": ["null", "int"]}]},
-     {"name": "int_map_array", "type": ["null", {"type": "array", "items":
+     {"name": "int_Map_Array", "type": ["null", {"type": "array", "items":
          ["null", {"type": "map", "values": ["null", "int"]}]}]},
      {"name": "nested_struct", "type":
          ["null", {"type": "record", "name": "r1", "fields": [
-              {"name": "a", "type": ["null", "int"]},
+              {"name": "A", "type": ["null", "int"]},
               {"name": "b", "type": ["null", {"type": "array", "items": ["null", "int"]}]},
-              {"name": "c", "type": ["null", {"type": "record", "name": "r2", "fields": [
+              {"name": "C", "type": ["null", {"type": "record", "name": "r2", "fields": [
                   {"name": "d", "type": ["null", {"type": "array", "items":
                       ["null", {"type": "array", "items":
                           ["null", {"type": "record", "name": "r3", "fields": [
-                              {"name": "e", "type": ["null", "int"]},
-                              {"name": "f", "type": ["null", "string"]}]}]}]}]}
+                              {"name": "E", "type": ["null", "int"]},
+                              {"name": "F", "type": ["null", "string"]}]}]}]}]}
                ]}]},
                {"name": "g", "type": ["null", {"type": "map", "values":
                    ["null", {"type": "record", "name": "r4", "fields": [
-                       {"name": "h", "type":
+                       {"name": "H", "type":
                            ["null", {"type": "record", "name": "r5", "fields": [
                                {"name": "i", "type": ["null", {"type": "array", "items":
                                    ["null", "double"]}]}]}]}]}]}]}

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34353218/testdata/ComplexTypesTbl/nullable.json
----------------------------------------------------------------------
diff --git a/testdata/ComplexTypesTbl/nullable.json b/testdata/ComplexTypesTbl/nullable.json
index 251b522..6f49c20 100644
--- a/testdata/ComplexTypesTbl/nullable.json
+++ b/testdata/ComplexTypesTbl/nullable.json
@@ -1,86 +1,86 @@
 [
 {"id": 1,
  "int_array": [1,2,3],
- "int_array_array": [[1,2],[3,4]],
+ "int_array_Array": [[1,2],[3,4]],
  "int_map": {"k1": 1, "k2": 100},
- "int_map_array": [{"k1": 1}],
+ "int_Map_Array": [{"k1": 1}],
  "nested_struct": {
-   "a": 1,
+   "A": 1,
    "b": [1],
-   "c": {
+   "C": {
      "d": [
-       [{"e": 10, "f": "aaa"},
-        {"e": -10, "f": "bbb"}],
-       [{"e": 11, "f": "c"}]]},
-   "g": {"foo": {"h": {"i": [1.1]}}}}},
+       [{"E": 10, "F": "aaa"},
+        {"E": -10, "F": "bbb"}],
+       [{"E": 11, "F": "c"}]]},
+   "g": {"foo": {"H": {"i": [1.1]}}}}},
 {"id": 2,
  "int_array": [null,1,2,null,3,null],
- "int_array_array": [[null,1,2,null],[3,null,4],[], null],
+ "int_array_Array": [[null,1,2,null],[3,null,4],[], null],
  "int_map": {"k1": 2, "k2": null},
- "int_map_array": [{"k3": null, "k1": 1}, null, {}],
+ "int_Map_Array": [{"k3": null, "k1": 1}, null, {}],
  "nested_struct": {
-   "a": null,
+   "A": null,
    "b": [null],
-   "c": {"d": [
-     [{"e": null, "f": null},
-      {"e": 10, "f": "aaa"},
-      {"e": null, "f": null},
-      {"e": -10, "f": "bbb"},
-      {"e": null, "f": null}],
-     [{"e": 11, "f": "c"},
+   "C": {"d": [
+     [{"E": null, "F": null},
+      {"E": 10, "F": "aaa"},
+      {"E": null, "F": null},
+      {"E": -10, "F": "bbb"},
+      {"E": null, "F": null}],
+     [{"E": 11, "F": "c"},
       null],
      [],
      null]},
    "g": {
-     "g1": {"h": {"i": [2.2, null]}},
-     "g2": {"h": {"i": []}},
+     "g1": {"H": {"i": [2.2, null]}},
+     "g2": {"H": {"i": []}},
      "g3": null,
-     "g4": {"h": {"i": null}},
-     "g5": {"h": null}}}},
+     "g4": {"H": {"i": null}},
+     "g5": {"H": null}}}},
 {"id": 3,
  "int_array": [],
- "int_array_array": [null],
+ "int_array_Array": [null],
  "int_map": {},
- "int_map_array": [null, null],
+ "int_Map_Array": [null, null],
  "nested_struct": {
-   "a": null,
+   "A": null,
    "b": null,
-   "c": {"d": []},
+   "C": {"d": []},
    "g": {}}},
 {"id": 4,
  "int_array": null,
- "int_array_array": [],
+ "int_array_Array": [],
  "int_map": {},
- "int_map_array": [],
+ "int_Map_Array": [],
  "nested_struct": {
-   "a": null,
+   "A": null,
    "b": null,
-   "c": {"d": null},
+   "C": {"d": null},
    "g": null}},
 {"id": 5,
  "int_array": null,
- "int_array_array": null,
+ "int_array_Array": null,
  "int_map": {},
  "nested_struct": {
-   "a": null,
+   "A": null,
    "b": null,
-   "c": null,
-   "g": {"foo": {"h": {"i": [2.2, 3.3]}}}}},
+   "C": null,
+   "g": {"foo": {"H": {"i": [2.2, 3.3]}}}}},
 {"id": 6,
  "int_array": null,
- "int_array_array": null,
+ "int_array_Array": null,
  "int_map": null,
- "int_map_array": null,
+ "int_Map_Array": null,
  "nested_struct": null},
 {"id": 7,
  "int_array": null,
- "int_array_array": [null,[5,6]],
+ "int_array_Array": [null,[5,6]],
  "int_map": {"k1": null, "k3": null},
- "int_map_array": null,
+ "int_Map_Array": null,
  "nested_struct": {
-   "a": 7,
+   "A": 7,
    "b": [2,3,null],
-   "c": {"d": [
+   "C": {"d": [
      [],
      [null],
      null]},

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34353218/testdata/ComplexTypesTbl/nullable.parq
----------------------------------------------------------------------
diff --git a/testdata/ComplexTypesTbl/nullable.parq b/testdata/ComplexTypesTbl/nullable.parq
index 01955a6..2c72f52 100644
Binary files a/testdata/ComplexTypesTbl/nullable.parq and b/testdata/ComplexTypesTbl/nullable.parq differ

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34353218/testdata/workloads/functional-query/queries/QueryTest/parquet-resolution-by-name.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-resolution-by-name.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-resolution-by-name.test
index 2546e9c..45c739e 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/parquet-resolution-by-name.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-resolution-by-name.test
@@ -1,7 +1,6 @@
 ====
 ---- QUERY
 # Create a table and populate with data file
-drop table if exists resolution_by_name_test;
 create table resolution_by_name_test stored as parquet
 as select * from functional_parquet.tinytable;
 select a, b from resolution_by_name_test;
@@ -69,11 +68,7 @@ string,string
 'eeeeeeee','f'
 ====
 ---- QUERY
-drop table resolution_by_name_test;
-====
----- QUERY
 # Test nested types resolution
-drop table if exists nested_resolution_by_name_test;
 create table nested_resolution_by_name_test like functional_parquet.complextypestbl;
 ====
 ---- SHELL
@@ -191,11 +186,7 @@ string
 'NULL'
 ====
 ---- QUERY
-drop table nested_resolution_by_name_test;
-====
----- QUERY
 # Test switched key/value map fields
-drop table if exists switched_map_fields_resolution_test;
 create table switched_map_fields_resolution_test (int_map map<string,int>)
 stored as parquet;
 ====
@@ -227,9 +218,6 @@ switched_map.parq' has an incompatible Parquet schema for column
 required int32 value [i:0 d:1 r:1]
 ====
 ---- QUERY
-drop table switched_map_fields_resolution_test
-====
----- QUERY
 # Check that we handle bad options gracefully
 set parquet_fallback_schema_resolution="FOO"
 ---- CATCH

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34353218/tests/query_test/test_scanners.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py
index de4f826..c286c3a 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -507,7 +507,7 @@ class TestParquet(ImpalaTestSuite):
     assert d_schema_elt.converted_type == None
 
   @SkipIfOldAggsJoins.nested_types
-  def test_resolution_by_name(self, unique_database, vector):
+  def test_resolution_by_name(self, vector, unique_database):
     self.run_test_case('QueryTest/parquet-resolution-by-name', vector,
                        use_db=unique_database)