You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by to...@apache.org on 2019/05/21 17:14:26 UTC
[impala] 01/02: Add a README for data-loading file format,
remove LOAD_LOCAL
This is an automated email from the ASF dual-hosted git repository.
todd pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit 578efe61d9cf696a5a735ef80b798c571f7b1d23
Author: Todd Lipcon <to...@apache.org>
AuthorDate: Tue May 14 10:14:02 2019 -0700
Add a README for data-loading file format, remove LOAD_LOCAL
The LOAD_LOCAL support appears to be unused, so removed it for
cleanliness.
Change-Id: Id7938d293e144c95ad3752ebc0238ee0e8cf11eb
Reviewed-on: http://gerrit.cloudera.org:8080/13370
Tested-by: Impala Public Jenkins <im...@cloudera.com>
Reviewed-by: Todd Lipcon <to...@apache.org>
---
testdata/bin/generate-schema-statements.py | 10 +----
testdata/datasets/README | 66 +++++++++++++++++++++++-------
2 files changed, 54 insertions(+), 22 deletions(-)
diff --git a/testdata/bin/generate-schema-statements.py b/testdata/bin/generate-schema-statements.py
index 6826c8a..95df878 100755
--- a/testdata/bin/generate-schema-statements.py
+++ b/testdata/bin/generate-schema-statements.py
@@ -599,12 +599,6 @@ def generate_statements(output_name, test_vectors, sections,
else:
create_kudu = None
- # For some datasets we may want to use a different load strategy when running local
- # tests versus tests against large scale factors. The most common reason is to
- # reduce he number of partitions for the local test environment
- if not options.scale_factor and section['LOAD_LOCAL']:
- load = section['LOAD_LOCAL']
-
columns = eval_section(section['COLUMNS']).strip()
partition_columns = section['PARTITION_COLUMNS'].strip()
row_format = section['ROW_FORMAT'].strip()
@@ -635,7 +629,7 @@ def generate_statements(output_name, test_vectors, sections,
# HBASE we need to create these tables with a supported insert format.
create_file_format = file_format
create_codec = codec
- if not (section['LOAD'] or section['LOAD_LOCAL'] or section['DEPENDENT_LOAD'] \
+ if not (section['LOAD'] or section['DEPENDENT_LOAD']
or section['DEPENDENT_LOAD_HIVE']):
create_codec = 'none'
create_file_format = file_format
@@ -778,7 +772,7 @@ def parse_schema_template_file(file_name):
VALID_SECTION_NAMES = ['DATASET', 'BASE_TABLE_NAME', 'COLUMNS', 'PARTITION_COLUMNS',
'ROW_FORMAT', 'CREATE', 'CREATE_HIVE', 'CREATE_KUDU',
'DEPENDENT_LOAD', 'DEPENDENT_LOAD_KUDU', 'DEPENDENT_LOAD_HIVE',
- 'LOAD', 'LOAD_LOCAL', 'ALTER', 'HBASE_COLUMN_FAMILIES',
+ 'LOAD', 'ALTER', 'HBASE_COLUMN_FAMILIES',
'TABLE_PROPERTIES', 'HBASE_REGION_SPLITS']
return parse_test_file(file_name, VALID_SECTION_NAMES, skip_unknown_sections=False)
diff --git a/testdata/datasets/README b/testdata/datasets/README
index b4000b3..bf12a56 100644
--- a/testdata/datasets/README
+++ b/testdata/datasets/README
@@ -21,18 +21,56 @@ The schema template SQL files have the following format:
to generate all the schema for the Imapla benchmark tests.
Each table is defined as a new section in the file with the following format:
+
====
- --- DATASET <- Start new section
- Data set name - Used to group sets of tables together
- ---- <- End sub-section
- Base table name
- ---- <- End sub-section
- CREATE TABLE statement - Statement to drop and create a table
- ---- <- End sub-section
- INSERT/SELECT * - The INSERT/SELECT * command for loading from the base table
- ---- <- End sub-section
- Parquet loading code executed by bash.
- ---- <- End sub-section
- LOAD from LOCAL - How to load data for the the base table
- ---- <- End sub-section
- ANALYZE TABLE ... COMPUTE STATISTICS - Compute statistics statement for table
+ ---- SECTION NAME
+ section contents
+ ...
+ ---- ANOTHER SECTION
+ ... section contents
+ ---- ... more sections...
+
+ Note that tables are delimited by '====' and that even the first table in the
+ file must include this header line.
+
+ The supported section names are:
+
+ DATASET
+ Data set name - Used to group sets of tables together
+ BASE_TABLE_NAME
+ The name of the table within the database
+ CREATE
+ Explicit CREATE statement used to create the table (executed by Impala)
+ CREATE_HIVE
+ Same as the above, but will be executed by Hive instead. If specified,
+ 'CREATE' must not be specified.
+ CREATE_KUDU
+ Customized CREATE TABLE statement used to create the table for Kudu-specific
+ syntax.
+
+ COLUMNS
+ PARTITION_COLUMNS
+ ROW_FORMAT
+ HBASE_COLUMN_FAMILIES
+ TABLE_PROPERTIES
+ HBASE_REGION_SPLITS
+ If no explicit CREATE statement is provided, a CREATE statement is generated
+ from these sections (see 'build_table_template' function in
+ 'generate-schema-statements.py' for details)
+
+ ALTER
+ A set of ALTER statements to be executed after the table is created
+ (typically to add partitions, but may also be used for other settings that
+ cannot be specified directly in the CREATE TABLE statement).
+
+ These statements are ignored for HBase and Kudu tables.
+
+ LOAD
+ The statement used to load the base (text) form of the table. This is
+ typically a LOAD DATA statement.
+
+ DEPENDENT_LOAD
+ DEPENDENT_LOAD_KUDU
+ DEPENDENT_LOAD_HIVE
+ Statements to be executed during the "dependent load" phase. These statements
+ are run after the initial (base table) load is complete.