You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2017/09/06 21:08:47 UTC

[3/3] incubator-impala git commit: IMPALA-4826: Fix error during a scan on repeated root schema in Parquet.

IMPALA-4826: Fix error during a scan on repeated root schema in Parquet.

Having the repetition level set to REPEATED on the root schema
caused a scan to fail with an error when Impala tried to parse that
table.

As a solution, the 'REPEATED' repetition level is ignored when the
root schema is processed. The reasoning behind this is that the Parquet
format description says that the repetition level of the root schema
should not be set to REPEATED anyway, so it's safe to ignore it in
case it is set to this value for some reason.

Change-Id: I7ea84589e1d122ad9d43adde46893ec0ecc5f9c4
Reviewed-on: http://gerrit.cloudera.org:8080/7870
Reviewed-by: Dan Hecht <dh...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/545eab6d
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/545eab6d
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/545eab6d

Branch: refs/heads/master
Commit: 545eab6d6202ca3968279d14fad28bd2a6d566f6
Parents: 387bde0
Author: Gabor Kaszab <ga...@cloudera.com>
Authored: Mon Aug 28 17:03:31 2017 +0200
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Wed Sep 6 20:07:56 2017 +0000

----------------------------------------------------------------------
 be/src/exec/parquet-metadata-utils.cc      |   4 +++-
 testdata/data/README                       |  10 ++++++++++
 testdata/data/repeated_root_schema.parquet | Bin 0 -> 7598 bytes
 tests/query_test/test_scanners.py          |  14 ++++++++++++++
 4 files changed, 27 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/545eab6d/be/src/exec/parquet-metadata-utils.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/parquet-metadata-utils.cc b/be/src/exec/parquet-metadata-utils.cc
index 8ec0abd..fc34eda 100644
--- a/be/src/exec/parquet-metadata-utils.cc
+++ b/be/src/exec/parquet-metadata-utils.cc
@@ -339,6 +339,7 @@ Status ParquetSchemaResolver::CreateSchemaTree(
     return Status(Substitute("File '$0' corrupt: could not reconstruct schema tree from "
             "flattened schema in file metadata", filename_));
   }
+  bool is_root_schema = (*idx == 0);
   node->element = &schema[*idx];
   ++(*idx);
 
@@ -363,7 +364,8 @@ Status ParquetSchemaResolver::CreateSchemaTree(
 
   if (node->element->repetition_type == parquet::FieldRepetitionType::OPTIONAL) {
     ++max_def_level;
-  } else if (node->element->repetition_type == parquet::FieldRepetitionType::REPEATED) {
+  } else if (node->element->repetition_type == parquet::FieldRepetitionType::REPEATED &&
+             !is_root_schema /*PARQUET-843*/) {
     ++max_rep_level;
     // Repeated fields add a definition level. This is used to distinguish between an
     // empty list and a list with an item in it.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/545eab6d/testdata/data/README
----------------------------------------------------------------------
diff --git a/testdata/data/README b/testdata/data/README
index 4066a8d..231a901 100644
--- a/testdata/data/README
+++ b/testdata/data/README
@@ -106,3 +106,13 @@ deprecated_statistics.parquet:
 Generated with with hive shell, which uses parquet-mr version 1.5.0-cdh5.12.0-SNAPSHOT
 Contains a copy of the data in functional.alltypessmall with statistics that use the old
 'min'/'max' fields.
+
+repeated_root_schema.parquet:
+Generated by hacking Impala's Parquet writer.
+Created to reproduce IMPALA-4826. Contains a table of 300 rows where the
+repetition level of the root schema is set to REPEATED.
+Reproduction steps:
+1: Extend HdfsParquetTableWriter::CreateSchema with the following line:
+   file_metadata_.schema[0].__set_repetition_type(FieldRepetitionType::REPEATED);
+2: Run test_compute_stats and grab the created Parquet file for
+   alltypes_parquet table.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/545eab6d/testdata/data/repeated_root_schema.parquet
----------------------------------------------------------------------
diff --git a/testdata/data/repeated_root_schema.parquet b/testdata/data/repeated_root_schema.parquet
new file mode 100755
index 0000000..4e0fb6e
Binary files /dev/null and b/testdata/data/repeated_root_schema.parquet differ

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/545eab6d/tests/query_test/test_scanners.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py
index e9fd457..d355081 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -298,6 +298,20 @@ class TestParquet(ImpalaTestSuite):
     vector.get_value('exec_option')['abort_on_error'] = 1
     self.run_test_case('QueryTest/parquet-zero-rows', vector, unique_database)
 
+  def test_repeated_root_schema(self, vector, unique_database):
+    """IMPALA-4826: Tests that running a scan on a schema where the root schema's
+       repetition level is set to REPEATED succeeds without errors."""
+    self.client.execute("create table %s.repeated_root_schema (i int) "
+        "stored as parquet" % unique_database)
+    repeated_root_schema_loc = get_fs_path(
+        "/test-warehouse/%s.db/%s" % (unique_database, "repeated_root_schema"))
+    check_call(['hdfs', 'dfs', '-copyFromLocal',
+        os.environ['IMPALA_HOME'] + "/testdata/data/repeated_root_schema.parquet",
+        repeated_root_schema_loc])
+
+    result = self.client.execute("select * from %s.repeated_root_schema" % unique_database)
+    assert len(result.data) == 300
+
   def test_huge_num_rows(self, vector, unique_database):
     """IMPALA-5021: Tests that a zero-slot scan on a file with a huge num_rows in the
     footer succeeds without errors."""