Posted to commits@impala.apache.org by to...@apache.org on 2019/05/21 17:14:25 UTC

[impala] branch master updated (7af981f -> f9bf62e)

This is an automated email from the ASF dual-hosted git repository.

todd pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git.


    from 7af981f  fe: clean up POM and improve m2e integration
     new 578efe6  Add a README for data-loading file format, remove LOAD_LOCAL
     new f9bf62e  IMPALA-8566. Fix computation of num_nulls for incremental stats

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 be/src/exec/incr-stats-util.cc                     |   6 +-
 testdata/bin/generate-schema-statements.py         |  10 +-
 testdata/datasets/README                           |  66 ++++++++---
 .../QueryTest/compute-stats-incremental.test       | 122 ++++++++++-----------
 .../queries/QueryTest/truncate-table.test          |  26 ++---
 5 files changed, 131 insertions(+), 99 deletions(-)


[impala] 02/02: IMPALA-8566. Fix computation of num_nulls for incremental stats

Posted by to...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

todd pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit f9bf62eefab7fb807f4e5d6900064b612b455a5e
Author: Todd Lipcon <to...@apache.org>
AuthorDate: Mon May 20 13:45:29 2019 -0700

    IMPALA-8566. Fix computation of num_nulls for incremental stats
    
    The calculation for num_nulls in the incremental stats code path
    initialized the counter to -1 instead of 0. This meant that, if there
    were no nulls (reasonably common), the num_nulls counter would be set to
    -1, indicating unknown, rather than 0.
    
    This simply fixes the initialization and updates the tests.
    
    Change-Id: Ie42103ad21d719cac45abc160c8d5422dd33fb28
    Reviewed-on: http://gerrit.cloudera.org:8080/13378
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/exec/incr-stats-util.cc                     |   6 +-
 .../QueryTest/compute-stats-incremental.test       | 122 ++++++++++-----------
 .../queries/QueryTest/truncate-table.test          |  26 ++---
 3 files changed, 77 insertions(+), 77 deletions(-)

diff --git a/be/src/exec/incr-stats-util.cc b/be/src/exec/incr-stats-util.cc
index f0bb73f..6cc9f2f 100644
--- a/be/src/exec/incr-stats-util.cc
+++ b/be/src/exec/incr-stats-util.cc
@@ -140,7 +140,7 @@ struct PerColumnStats {
   double avg_width;
 
   PerColumnStats()
-      : intermediate_ndv(AggregateFunctions::HLL_LEN, 0), num_nulls(-1),
+      : intermediate_ndv(AggregateFunctions::HLL_LEN, 0), num_nulls(0),
         max_width(0), num_rows(0), avg_width(0) { }
 
   // Updates all aggregate statistics with a new set of measurements.
@@ -150,11 +150,11 @@ struct PerColumnStats {
     DCHECK_GE(num_new_rows, 0);
     DCHECK_GE(max_new_width, 0);
     DCHECK_GE(new_avg_width, 0);
-    DCHECK_GE(num_new_nulls, -1);
+    DCHECK_GE(num_new_nulls, 0);
     for (int j = 0; j < ndv.size(); ++j) {
       intermediate_ndv[j] = ::max(intermediate_ndv[j], ndv[j]);
     }
-    if (num_new_nulls >= 0) num_nulls += num_new_nulls;
+    num_nulls += num_new_nulls;
     max_width = ::max(max_width, max_new_width);
     avg_width += (new_avg_width * num_new_rows);
     num_rows += num_new_rows;
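
To make the bug concrete, the following is a minimal standalone C++ sketch
(simplified, invented names; not the actual PerColumnStats code) of the
aggregation above. Besides the no-nulls case described in the commit message,
the -1 initialization also left any nonzero aggregate one below the true
count, since the sentinel itself was folded into the sum:

  #include <cassert>
  #include <cstdint>

  // Simplified model of the null-count aggregation in PerColumnStats
  // (illustrative only; the real struct also tracks NDV, widths, and rows).
  struct NullCounter {
    int64_t num_nulls;
    explicit NullCounter(int64_t init) : num_nulls(init) {}

    // Mirrors the pre-fix Update(): negative (unknown) inputs are skipped,
    // but the initial value itself is never corrected.
    void Update(int64_t num_new_nulls) {
      if (num_new_nulls >= 0) num_nulls += num_new_nulls;
    }
  };

  int main() {
    NullCounter buggy(-1);              // pre-fix: starts at the -1 sentinel
    buggy.Update(0);                    // a partition with no nulls...
    assert(buggy.num_nulls == -1);      // ...leaves the count at "unknown"

    NullCounter off_by_one(-1);
    off_by_one.Update(5);
    assert(off_by_one.num_nulls == 4);  // one below the true count

    NullCounter fixed(0);               // post-fix: starts at 0
    fixed.Update(0);
    fixed.Update(5);
    assert(fixed.num_nulls == 5);       // exact sum
    return 0;
  }

With the initialization fixed to 0, the sentinel never enters the sum, which
is why the expected #NULLS values in the tests below change from -1 to 0.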
diff --git a/testdata/workloads/functional-query/queries/QueryTest/compute-stats-incremental.test b/testdata/workloads/functional-query/queries/QueryTest/compute-stats-incremental.test
index 064c23a..e76170a 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/compute-stats-incremental.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/compute-stats-incremental.test
@@ -48,17 +48,17 @@ show column stats alltypes_incremental
 ---- LABELS
 COLUMN, TYPE, #DISTINCT VALUES, #NULLS, MAX SIZE, AVG SIZE
 ---- RESULTS
-'id','INT',7300,-1,4,4
-'bool_col','BOOLEAN',2,-1,1,1
-'tinyint_col','TINYINT',10,-1,1,1
-'smallint_col','SMALLINT',10,-1,2,2
-'int_col','INT',10,-1,4,4
-'bigint_col','BIGINT',10,-1,8,8
-'float_col','FLOAT',10,-1,4,4
-'double_col','DOUBLE',10,-1,8,8
-'date_string_col','STRING',736,-1,8,8
-'string_col','STRING',10,-1,1,1
-'timestamp_col','TIMESTAMP',7300,-1,16,16
+'id','INT',7300,0,4,4
+'bool_col','BOOLEAN',2,0,1,1
+'tinyint_col','TINYINT',10,0,1,1
+'smallint_col','SMALLINT',10,0,2,2
+'int_col','INT',10,0,4,4
+'bigint_col','BIGINT',10,0,8,8
+'float_col','FLOAT',10,0,4,4
+'double_col','DOUBLE',10,0,8,8
+'date_string_col','STRING',736,0,8,8
+'string_col','STRING',10,0,1,1
+'timestamp_col','TIMESTAMP',7300,0,16,16
 'year','INT',2,0,4,4
 'month','INT',12,0,4,4
 ---- TYPES
@@ -141,17 +141,17 @@ show column stats alltypes_incremental
 ---- LABELS
 COLUMN, TYPE, #DISTINCT VALUES, #NULLS, MAX SIZE, AVG SIZE
 ---- RESULTS
-'id','INT',7300,-1,4,4
-'bool_col','BOOLEAN',2,-1,1,1
-'tinyint_col','TINYINT',10,-1,1,1
-'smallint_col','SMALLINT',10,-1,2,2
-'int_col','INT',10,-1,4,4
-'bigint_col','BIGINT',10,-1,8,8
-'float_col','FLOAT',10,-1,4,4
-'double_col','DOUBLE',10,-1,8,8
-'date_string_col','STRING',736,-1,8,8
-'string_col','STRING',10,-1,1,1
-'timestamp_col','TIMESTAMP',7300,-1,16,16
+'id','INT',7300,0,4,4
+'bool_col','BOOLEAN',2,0,1,1
+'tinyint_col','TINYINT',10,0,1,1
+'smallint_col','SMALLINT',10,0,2,2
+'int_col','INT',10,0,4,4
+'bigint_col','BIGINT',10,0,8,8
+'float_col','FLOAT',10,0,4,4
+'double_col','DOUBLE',10,0,8,8
+'date_string_col','STRING',736,0,8,8
+'string_col','STRING',10,0,1,1
+'timestamp_col','TIMESTAMP',7300,0,16,16
 'year','INT',2,0,4,4
 'month','INT',12,0,4,4
 ---- TYPES
@@ -242,17 +242,17 @@ show column stats alltypes_incremental
 ---- LABELS
 COLUMN, TYPE, #DISTINCT VALUES, #NULLS, MAX SIZE, AVG SIZE
 ---- RESULTS
-'id','INT',6990,-1,4,4
-'bool_col','BOOLEAN',2,-1,1,1
-'tinyint_col','TINYINT',10,-1,1,1
-'smallint_col','SMALLINT',10,-1,2,2
-'int_col','INT',10,-1,4,4
-'bigint_col','BIGINT',10,-1,8,8
-'float_col','FLOAT',10,-1,4,4
-'double_col','DOUBLE',10,-1,8,8
-'date_string_col','STRING',688,-1,8,8
-'string_col','STRING',10,-1,1,1
-'timestamp_col','TIMESTAMP',6990,-1,16,16
+'id','INT',6990,0,4,4
+'bool_col','BOOLEAN',2,0,1,1
+'tinyint_col','TINYINT',10,0,1,1
+'smallint_col','SMALLINT',10,0,2,2
+'int_col','INT',10,0,4,4
+'bigint_col','BIGINT',10,0,8,8
+'float_col','FLOAT',10,0,4,4
+'double_col','DOUBLE',10,0,8,8
+'date_string_col','STRING',688,0,8,8
+'string_col','STRING',10,0,1,1
+'timestamp_col','TIMESTAMP',6990,0,16,16
 'year','INT',2,0,4,4
 'month','INT',12,0,4,4
 ---- TYPES
@@ -305,17 +305,17 @@ show column stats alltypes_incremental
 ---- LABELS
 COLUMN, TYPE, #DISTINCT VALUES, #NULLS, MAX SIZE, AVG SIZE
 ---- RESULTS
-'id','INT',7300,-1,4,4
-'bool_col','BOOLEAN',2,-1,1,1
-'tinyint_col','TINYINT',10,-1,1,1
-'smallint_col','SMALLINT',10,-1,2,2
-'int_col','INT',10,-1,4,4
-'bigint_col','BIGINT',10,-1,8,8
-'float_col','FLOAT',10,-1,4,4
-'double_col','DOUBLE',10,-1,8,8
-'date_string_col','STRING',736,-1,8,8
-'string_col','STRING',10,-1,1,1
-'timestamp_col','TIMESTAMP',7300,-1,16,16
+'id','INT',7300,0,4,4
+'bool_col','BOOLEAN',2,0,1,1
+'tinyint_col','TINYINT',10,0,1,1
+'smallint_col','SMALLINT',10,0,2,2
+'int_col','INT',10,0,4,4
+'bigint_col','BIGINT',10,0,8,8
+'float_col','FLOAT',10,0,4,4
+'double_col','DOUBLE',10,0,8,8
+'date_string_col','STRING',736,0,8,8
+'string_col','STRING',10,0,1,1
+'timestamp_col','TIMESTAMP',7300,0,16,16
 'year','INT',2,0,4,4
 'month','INT',12,0,4,4
 ---- TYPES
@@ -546,14 +546,14 @@ show column stats chars_tbl
 ---- LABELS
 COLUMN, TYPE, #DISTINCT VALUES, #NULLS, MAX SIZE, AVG SIZE
 ---- RESULTS
-'id','INT',2915,-1,4,4
-'ch1','CHAR(1)',1,-1,1,1
-'ch2','CHAR(8)',10,-1,8,8
-'ch3','CHAR(20)',10,-1,8,8
-'ts','TIMESTAMP',2871,-1,16,16
-'vc1','VARCHAR(1)',1,-1,1,1
-'vc2','VARCHAR(8)',10,-1,8,8
-'vc3','VARCHAR(20)',10,-1,8,8
+'id','INT',2915,0,4,4
+'ch1','CHAR(1)',1,0,1,1
+'ch2','CHAR(8)',10,0,8,8
+'ch3','CHAR(20)',10,0,8,8
+'ts','TIMESTAMP',2871,0,16,16
+'vc1','VARCHAR(1)',1,0,1,1
+'vc2','VARCHAR(8)',10,0,8,8
+'vc3','VARCHAR(20)',10,0,8,8
 'year','CHAR(5)',1,0,5,5
 'day','VARCHAR(13)',3,1,-1,-1
 ---- TYPES
@@ -576,14 +576,14 @@ show column stats chars_tbl
 ---- LABELS
 COLUMN, TYPE, #DISTINCT VALUES, #NULLS, MAX SIZE, AVG SIZE
 ---- RESULTS
-'id','INT',2915,0,4,4
-'ch1','CHAR(1)',2,-1,1,1
-'ch2','CHAR(8)',11,-1,8,7.99766731262207
-'ch3','CHAR(20)',11,-1,8,7.99766731262207
-'ts','TIMESTAMP',2871,0,16,16
-'vc1','VARCHAR(1)',2,-1,1,1
-'vc2','VARCHAR(8)',11,-1,8,7.99766731262207
-'vc3','VARCHAR(20)',11,-1,8,7.99766731262207
+'id','INT',2915,1,4,4
+'ch1','CHAR(1)',2,0,1,1
+'ch2','CHAR(8)',11,0,8,7.99766731262207
+'ch3','CHAR(20)',11,0,8,7.99766731262207
+'ts','TIMESTAMP',2871,1,16,16
+'vc1','VARCHAR(1)',2,0,1,1
+'vc2','VARCHAR(8)',11,0,8,7.99766731262207
+'vc3','VARCHAR(20)',11,0,8,7.99766731262207
 'year','CHAR(5)',2,0,5,5
 'day','VARCHAR(13)',4,1,-1,-1
 ---- TYPES
@@ -621,4 +621,4 @@ compute incremental stats complextypestbl_part;
 'Updated 1 partition(s) and 1 column(s).'
 ---- TYPES
 STRING
-====
\ No newline at end of file
+====
diff --git a/testdata/workloads/functional-query/queries/QueryTest/truncate-table.test b/testdata/workloads/functional-query/queries/QueryTest/truncate-table.test
index a8d2a80..9ad769f 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/truncate-table.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/truncate-table.test
@@ -42,17 +42,17 @@ show column stats t1;
 ---- LABELS
 COLUMN, TYPE, #DISTINCT VALUES, #NULLS, MAX SIZE, AVG SIZE
 ---- RESULTS
-'id','INT',7300,-1,4,4
-'bool_col','BOOLEAN',2,-1,1,1
-'tinyint_col','TINYINT',10,-1,1,1
-'smallint_col','SMALLINT',10,-1,2,2
-'int_col','INT',10,-1,4,4
-'bigint_col','BIGINT',10,-1,8,8
-'float_col','FLOAT',10,-1,4,4
-'double_col','DOUBLE',10,-1,8,8
-'date_string_col','STRING',736,-1,8,8
-'string_col','STRING',10,-1,1,1
-'timestamp_col','TIMESTAMP',7300,-1,16,16
+'id','INT',7300,0,4,4
+'bool_col','BOOLEAN',2,0,1,1
+'tinyint_col','TINYINT',10,0,1,1
+'smallint_col','SMALLINT',10,0,2,2
+'int_col','INT',10,0,4,4
+'bigint_col','BIGINT',10,0,8,8
+'float_col','FLOAT',10,0,4,4
+'double_col','DOUBLE',10,0,8,8
+'date_string_col','STRING',736,0,8,8
+'string_col','STRING',10,0,1,1
+'timestamp_col','TIMESTAMP',7300,0,16,16
 'year','INT',2,0,4,4
 'month','INT',12,0,4,4
 ---- TYPES
@@ -135,8 +135,8 @@ show column stats t2;
 ---- LABELS
 COLUMN, TYPE, #DISTINCT VALUES, #NULLS, MAX SIZE, AVG SIZE
 ---- RESULTS
-'a','STRING',3,-1,8,6.666666507720947
-'b','STRING',3,-1,7,4
+'a','STRING',3,0,8,6.666666507720947
+'b','STRING',3,0,7,4
 ---- TYPES
 STRING, STRING, BIGINT, BIGINT, BIGINT, DOUBLE
 ====


[impala] 01/02: Add a README for data-loading file format, remove LOAD_LOCAL

Posted by to...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

todd pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 578efe61d9cf696a5a735ef80b798c571f7b1d23
Author: Todd Lipcon <to...@apache.org>
AuthorDate: Tue May 14 10:14:02 2019 -0700

    Add a README for data-loading file format, remove LOAD_LOCAL
    
    The LOAD_LOCAL support appears to be unused, so remove it for
    cleanliness.
    
    Change-Id: Id7938d293e144c95ad3752ebc0238ee0e8cf11eb
    Reviewed-on: http://gerrit.cloudera.org:8080/13370
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
    Reviewed-by: Todd Lipcon <to...@apache.org>
---
 testdata/bin/generate-schema-statements.py | 10 +----
 testdata/datasets/README                   | 66 +++++++++++++++++++++++-------
 2 files changed, 54 insertions(+), 22 deletions(-)

diff --git a/testdata/bin/generate-schema-statements.py b/testdata/bin/generate-schema-statements.py
index 6826c8a..95df878 100755
--- a/testdata/bin/generate-schema-statements.py
+++ b/testdata/bin/generate-schema-statements.py
@@ -599,12 +599,6 @@ def generate_statements(output_name, test_vectors, sections,
       else:
         create_kudu = None
 
-      # For some datasets we may want to use a different load strategy when running local
-      # tests versus tests against large scale factors. The most common reason is to
-      # reduce he number of partitions for the local test environment
-      if not options.scale_factor and section['LOAD_LOCAL']:
-        load = section['LOAD_LOCAL']
-
       columns = eval_section(section['COLUMNS']).strip()
       partition_columns = section['PARTITION_COLUMNS'].strip()
       row_format = section['ROW_FORMAT'].strip()
@@ -635,7 +629,7 @@ def generate_statements(output_name, test_vectors, sections,
       # HBASE we need to create these tables with a supported insert format.
       create_file_format = file_format
       create_codec = codec
-      if not (section['LOAD'] or section['LOAD_LOCAL'] or section['DEPENDENT_LOAD'] \
+      if not (section['LOAD'] or section['DEPENDENT_LOAD']
               or section['DEPENDENT_LOAD_HIVE']):
         create_codec = 'none'
         create_file_format = file_format
@@ -778,7 +772,7 @@ def parse_schema_template_file(file_name):
   VALID_SECTION_NAMES = ['DATASET', 'BASE_TABLE_NAME', 'COLUMNS', 'PARTITION_COLUMNS',
                          'ROW_FORMAT', 'CREATE', 'CREATE_HIVE', 'CREATE_KUDU',
                          'DEPENDENT_LOAD', 'DEPENDENT_LOAD_KUDU', 'DEPENDENT_LOAD_HIVE',
-                         'LOAD', 'LOAD_LOCAL', 'ALTER', 'HBASE_COLUMN_FAMILIES',
+                         'LOAD', 'ALTER', 'HBASE_COLUMN_FAMILIES',
                          'TABLE_PROPERTIES', 'HBASE_REGION_SPLITS']
   return parse_test_file(file_name, VALID_SECTION_NAMES, skip_unknown_sections=False)
 
diff --git a/testdata/datasets/README b/testdata/datasets/README
index b4000b3..bf12a56 100644
--- a/testdata/datasets/README
+++ b/testdata/datasets/README
@@ -21,18 +21,56 @@ The schema template SQL files have the following format:
  to generate all the schema for the Impala benchmark tests.
 
   Each table is defined as a new section in the file with the following format:
+
   ====
-  --- DATASET <- Start new section
-  Data set name - Used to group sets of tables together
-  ---- <- End sub-section
-  Base table name
-  ---- <- End sub-section
-  CREATE TABLE statement - Statement to drop and create a table
-  ---- <- End sub-section
-  INSERT/SELECT * - The INSERT/SELECT * command for loading from the base table
-  ---- <- End sub-section
-  Parquet loading code executed by bash.
-  ---- <- End sub-section
-  LOAD from LOCAL - How to load data for the the base table
-  ---- <- End sub-section
-  ANALYZE TABLE ... COMPUTE STATISTICS - Compute statistics statement for table
+  ---- SECTION NAME
+  section contents
+  ...
+  ---- ANOTHER SECTION
+  ... section contents
+  ---- ... more sections...
+
+  Note that tables are delimited by '====' and that even the first table in the
+  file must include this header line.
+
+  The supported section names are:
+
+  DATASET
+      Data set name - Used to group sets of tables together
+  BASE_TABLE_NAME
+      The name of the table within the database
+  CREATE
+      Explicit CREATE statement used to create the table (executed by Impala)
+  CREATE_HIVE
+      Same as the above, but will be executed by Hive instead. If specified,
+      'CREATE' must not be specified.
+  CREATE_KUDU
+      Customized CREATE TABLE statement used to create the table for Kudu-specific
+      syntax.
+
+  COLUMNS
+  PARTITION_COLUMNS
+  ROW_FORMAT
+  HBASE_COLUMN_FAMILIES
+  TABLE_PROPERTIES
+  HBASE_REGION_SPLITS
+      If no explicit CREATE statement is provided, a CREATE statement is generated
+      from these sections (see 'build_table_template' function in
+      'generate-schema-statements.py' for details)
+
+  ALTER
+      A set of ALTER statements to be executed after the table is created
+      (typically to add partitions, but may also be used for other settings that
+      cannot be specified directly in the CREATE TABLE statement).
+
+      These statements are ignored for HBase and Kudu tables.
+
+  LOAD
+      The statement used to load the base (text) form of the table. This is
+      typically a LOAD DATA statement.
+
+  DEPENDENT_LOAD
+  DEPENDENT_LOAD_KUDU
+  DEPENDENT_LOAD_HIVE
+      Statements to be executed during the "dependent load" phase. These statements
+      are run after the initial (base table) load is complete.
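
For orientation, a hypothetical minimal table entry in this format might look
like the following (dataset, table, and column names are invented for
illustration; see the template files under testdata/datasets/ for real
entries):

  ====
  ---- DATASET
  example_dataset
  ---- BASE_TABLE_NAME
  example_tbl
  ---- COLUMNS
  id int
  name string
  ---- LOAD
  LOAD DATA INPATH '/path/to/example_tbl.txt' OVERWRITE INTO TABLE example_tbl;
  ====

Here no explicit CREATE section is given, so the CREATE statement would be
generated from the COLUMNS section as described above.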