You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by to...@apache.org on 2019/05/21 17:14:26 UTC
[impala] 01/02: Add a README for data-loading file format, remove LOAD_LOCAL

This is an automated email from the ASF dual-hosted git repository.

todd pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 578efe61d9cf696a5a735ef80b798c571f7b1d23
Author: Todd Lipcon <to...@apache.org>
AuthorDate: Tue May 14 10:14:02 2019 -0700

    Add a README for data-loading file format, remove LOAD_LOCAL
    
    The LOAD_LOCAL support appears to be unused, so remove it for
    cleanliness.
    
    Change-Id: Id7938d293e144c95ad3752ebc0238ee0e8cf11eb
    Reviewed-on: http://gerrit.cloudera.org:8080/13370
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
    Reviewed-by: Todd Lipcon <to...@apache.org>
---
 testdata/bin/generate-schema-statements.py | 10 +----
 testdata/datasets/README                   | 66 +++++++++++++++++++++++-------
 2 files changed, 54 insertions(+), 22 deletions(-)

diff --git a/testdata/bin/generate-schema-statements.py b/testdata/bin/generate-schema-statements.py
index 6826c8a..95df878 100755
--- a/testdata/bin/generate-schema-statements.py
+++ b/testdata/bin/generate-schema-statements.py
@@ -599,12 +599,6 @@ def generate_statements(output_name, test_vectors, sections,
       else:
         create_kudu = None
 
-      # For some datasets we may want to use a different load strategy when running local
-      # tests versus tests against large scale factors. The most common reason is to
-      # reduce he number of partitions for the local test environment
-      if not options.scale_factor and section['LOAD_LOCAL']:
-        load = section['LOAD_LOCAL']
-
       columns = eval_section(section['COLUMNS']).strip()
       partition_columns = section['PARTITION_COLUMNS'].strip()
       row_format = section['ROW_FORMAT'].strip()
@@ -635,7 +629,7 @@ def generate_statements(output_name, test_vectors, sections,
       # HBASE we need to create these tables with a supported insert format.
       create_file_format = file_format
       create_codec = codec
-      if not (section['LOAD'] or section['LOAD_LOCAL'] or section['DEPENDENT_LOAD'] \
+      if not (section['LOAD'] or section['DEPENDENT_LOAD']
               or section['DEPENDENT_LOAD_HIVE']):
         create_codec = 'none'
         create_file_format = file_format
@@ -778,7 +772,7 @@ def parse_schema_template_file(file_name):
   VALID_SECTION_NAMES = ['DATASET', 'BASE_TABLE_NAME', 'COLUMNS', 'PARTITION_COLUMNS',
                          'ROW_FORMAT', 'CREATE', 'CREATE_HIVE', 'CREATE_KUDU',
                          'DEPENDENT_LOAD', 'DEPENDENT_LOAD_KUDU', 'DEPENDENT_LOAD_HIVE',
-                         'LOAD', 'LOAD_LOCAL', 'ALTER', 'HBASE_COLUMN_FAMILIES',
+                         'LOAD', 'ALTER', 'HBASE_COLUMN_FAMILIES',
                          'TABLE_PROPERTIES', 'HBASE_REGION_SPLITS']
   return parse_test_file(file_name, VALID_SECTION_NAMES, skip_unknown_sections=False)
 
diff --git a/testdata/datasets/README b/testdata/datasets/README
index b4000b3..bf12a56 100644
--- a/testdata/datasets/README
+++ b/testdata/datasets/README
@@ -21,18 +21,56 @@ The schema template SQL files have the following format:
   to generate all the schema for the Imapla benchmark tests.
 
   Each table is defined as a new section in the file with the following format:
+
   ====
-  --- DATASET <- Start new section
-  Data set name - Used to group sets of tables together
-  ---- <- End sub-section
-  Base table name
-  ---- <- End sub-section
-  CREATE TABLE statement - Statement to drop and create a table
-  ---- <- End sub-section
-  INSERT/SELECT * - The INSERT/SELECT * command for loading from the base table
-  ---- <- End sub-section
-  Parquet loading code executed by bash.
-  ---- <- End sub-section
-  LOAD from LOCAL - How to load data for the the base table
-  ---- <- End sub-section
-  ANALYZE TABLE ... COMPUTE STATISTICS - Compute statistics statement for table
+  ---- SECTION NAME
+  section contents
+  ...
+  ---- ANOTHER SECTION
+  ... section contents
+  ---- ... more sections...
+
+  Note that tables are delimited by '====' and that even the first table in the
+  file must include this header line.
+
+  The supported section names are:
+
+  DATASET
+      Data set name - Used to group sets of tables together
+  BASE_TABLE_NAME
+      The name of the table within the database
+  CREATE
+      Explicit CREATE statement used to create the table (executed by Impala)
+  CREATE_HIVE
+      Same as the above, but will be executed by Hive instead. If specified,
+      'CREATE' must not be specified.
+  CREATE_KUDU
+      Customized CREATE TABLE statement used to create the table for Kudu-specific
+      syntax.
+
+  COLUMNS
+  PARTITION_COLUMNS
+  ROW_FORMAT
+  HBASE_COLUMN_FAMILIES
+  TABLE_PROPERTIES
+  HBASE_REGION_SPLITS
+      If no explicit CREATE statement is provided, a CREATE statement is generated
+      from these sections (see 'build_table_template' function in
+      'generate-schema-statements.py' for details)
+
+  ALTER
+      A set of ALTER statements to be executed after the table is created
+      (typically to add partitions, but may also be used for other settings that
+      cannot be specified directly in the CREATE TABLE statement).
+
+      These statements are ignored for HBase and Kudu tables.
+
+  LOAD
+      The statement used to load the base (text) form of the table. This is
+      typically a LOAD DATA statement.
+
+  DEPENDENT_LOAD
+  DEPENDENT_LOAD_KUDU
+  DEPENDENT_LOAD_HIVE
+      Statements to be executed during the "dependent load" phase. These statements
+      are run after the initial (base table) load is complete.