Posted to commits@impala.apache.org by ta...@apache.org on 2017/12/06 01:55:58 UTC

[1/8] impala git commit: IMPALA-6068: Scale back fixing functional-types

Repository: impala
Updated Branches:
  refs/heads/master 7e368b8f0 -> c505a8159


IMPALA-6068: Scale back fixing functional-types

I re-created the original patch for IMPALA-6068, but performed only
the limited data load transformation that I believe is provably safe:
DEPENDENT_LOAD -> DEPENDENT_LOAD_HIVE.

Any place that uploads directly via hadoop or hdfs commands was left
alone, since changing it cannot be proven correct.
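
For context, the two sections route differently: DEPENDENT_LOAD
statements are sometimes executed by Impala, while DEPENDENT_LOAD_HIVE
statements must always run in Hive. A minimal Python sketch of that
routing rule (route_dependent_load is a hypothetical helper; the real
logic is in testdata/bin/generate-schema-statements.py below):

def route_dependent_load(section):
    insert = section.get('DEPENDENT_LOAD')
    insert_hive = section.get('DEPENDENT_LOAD_HIVE')
    assert not (insert and insert_hive), \
        "Can't set both DEPENDENT_LOAD and DEPENDENT_LOAD_HIVE"
    if insert_hive:
        return ('hive', insert_hive)  # must always be executed by Hive
    if insert:
        return ('impala', insert)     # may be executed by Impala
    return (None, None)               # nothing to load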

Change-Id: I6c242cca209a7138b10ad517076707709b5cd204
Testing: Doing a full data load. I mistakenly changed a variable
name, causing the first two dry runs to fail.
Reviewed-on: http://gerrit.cloudera.org:8080/8690
Reviewed-by: Zach Amsden <za...@cloudera.com>
Tested-by: Zach Amsden <za...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/66704f91
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/66704f91
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/66704f91

Branch: refs/heads/master
Commit: 66704f915e6145b003fa9d1dffddb31d9699ffbd
Parents: 7e368b8
Author: Zachary Amsden <za...@cloudera.com>
Authored: Wed Nov 29 16:18:01 2017 -0800
Committer: Zach Amsden <za...@cloudera.com>
Committed: Mon Dec 4 23:46:44 2017 +0000

----------------------------------------------------------------------
 testdata/bin/generate-schema-statements.py      | 24 +++++--
 testdata/common/widetable.py                    |  5 +-
 .../functional/functional_schema_template.sql   | 70 ++++++++++++--------
 3 files changed, 61 insertions(+), 38 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/66704f91/testdata/bin/generate-schema-statements.py
----------------------------------------------------------------------
diff --git a/testdata/bin/generate-schema-statements.py b/testdata/bin/generate-schema-statements.py
index a31eca1..b8f6e8c 100755
--- a/testdata/bin/generate-schema-statements.py
+++ b/testdata/bin/generate-schema-statements.py
@@ -523,9 +523,13 @@ def generate_statements(output_name, test_vectors, sections,
       alter = section.get('ALTER')
       create = section['CREATE']
       create_hive = section['CREATE_HIVE']
+      assert not (create and create_hive), "Can't set both CREATE and CREATE_HIVE"
 
       table_properties = section['TABLE_PROPERTIES']
       insert = eval_section(section['DEPENDENT_LOAD'])
+      insert_hive = eval_section(section['DEPENDENT_LOAD_HIVE'])
+      assert not (insert and insert_hive),\
+          "Can't set both DEPENDENT_LOAD and DEPENDENT_LOAD_HIVE"
       load = eval_section(section['LOAD'])
 
       if file_format == 'kudu':
@@ -570,7 +574,8 @@ def generate_statements(output_name, test_vectors, sections,
       # HBASE we need to create these tables with a supported insert format.
       create_file_format = file_format
       create_codec = codec
-      if not (section['LOAD'] or section['LOAD_LOCAL'] or section['DEPENDENT_LOAD']):
+      if not (section['LOAD'] or section['LOAD_LOCAL'] or section['DEPENDENT_LOAD'] \
+              or section['DEPENDENT_LOAD_HIVE']):
         create_codec = 'none'
         create_file_format = file_format
         if file_format not in IMPALA_SUPPORTED_INSERT_FORMATS:
@@ -665,19 +670,23 @@ def generate_statements(output_name, test_vectors, sections,
           else:
             print 'Empty base table load for %s. Skipping load generation' % table_name
         elif file_format in ['kudu', 'parquet']:
-          if insert:
+          if insert_hive:
+            hive_output.load.append(build_insert(insert_hive, db_name, db_suffix,
+                file_format, codec, compression_type, table_name, data_path))
+          elif insert:
             impala_load.load.append(build_insert_into_statement(insert, db_name,
                 db_suffix, table_name, file_format, data_path, for_impala=True))
           else:
             print 'Empty parquet/kudu load for table %s. Skipping insert generation' \
               % table_name
         else:
+          if insert_hive:
+            insert = insert_hive
           if insert:
             hive_output.load.append(build_insert(insert, db_name, db_suffix, file_format,
-                                        codec, compression_type, table_name, data_path,
-                                        create_hive=create_hive))
+                codec, compression_type, table_name, data_path, create_hive=create_hive))
           else:
-              print 'Empty insert for table %s. Skipping insert generation' % table_name
+            print 'Empty insert for table %s. Skipping insert generation' % table_name
 
     impala_output.write_to_file("load-%s-impala-generated-%s-%s-%s.sql" %
         (output_name, file_format, codec, compression_type))
@@ -694,8 +703,9 @@ def generate_statements(output_name, test_vectors, sections,
 def parse_schema_template_file(file_name):
   VALID_SECTION_NAMES = ['DATASET', 'BASE_TABLE_NAME', 'COLUMNS', 'PARTITION_COLUMNS',
                          'ROW_FORMAT', 'CREATE', 'CREATE_HIVE', 'CREATE_KUDU',
-                         'DEPENDENT_LOAD', 'DEPENDENT_LOAD_KUDU', 'LOAD',
-                         'LOAD_LOCAL', 'ALTER', 'HBASE_COLUMN_FAMILIES', 'TABLE_PROPERTIES']
+                         'DEPENDENT_LOAD', 'DEPENDENT_LOAD_KUDU', 'DEPENDENT_LOAD_HIVE',
+                         'LOAD', 'LOAD_LOCAL', 'ALTER', 'HBASE_COLUMN_FAMILIES',
+                         'TABLE_PROPERTIES']
   return parse_test_file(file_name, VALID_SECTION_NAMES, skip_unknown_sections=False)
 
 if __name__ == "__main__":

http://git-wip-us.apache.org/repos/asf/impala/blob/66704f91/testdata/common/widetable.py
----------------------------------------------------------------------
diff --git a/testdata/common/widetable.py b/testdata/common/widetable.py
index c95d5e6..d83c62e 100755
--- a/testdata/common/widetable.py
+++ b/testdata/common/widetable.py
@@ -19,7 +19,8 @@
 
 # Functions for creating wide (i.e. many-column) tables. When run from the command line,
 # specify either --get_columns to generate column descriptors, or --create_data to
-# generate a CSV data file.
+# generate a CSV data file and prints a SQL load statement to incorporate
+# into dataload SQL script generation.
 
 from datetime import datetime, timedelta
 import itertools
@@ -121,7 +122,7 @@ if __name__ == "__main__":
     print '\n'.join(get_columns(options.num_columns))
 
   if options.create_data:
-    # Generate data locally, and output the command template to load it into HDFS
+    # Generate data locally, and output the SQL load command for use in dataload
     if not options.output_file:
       parser.error("--output_file option must be specified")
 

http://git-wip-us.apache.org/repos/asf/impala/blob/66704f91/testdata/datasets/functional/functional_schema_template.sql
----------------------------------------------------------------------
diff --git a/testdata/datasets/functional/functional_schema_template.sql b/testdata/datasets/functional/functional_schema_template.sql
index ff7b00d..3fcb5f7 100644
--- a/testdata/datasets/functional/functional_schema_template.sql
+++ b/testdata/datasets/functional/functional_schema_template.sql
@@ -712,13 +712,12 @@ CREATE TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name} (
   a array<int>,
   m map<string,bigint>)
 STORED AS {file_format};
----- ALTER
--- This INSERT is placed in the ALTER section and not in the DEPENDENT_LOAD section because
--- it must always be executed in Hive. The DEPENDENT_LOAD section is sometimes executed in
--- Impala, but Impala currently does not support inserting into tables with complex types.
-INSERT OVERWRITE TABLE {table_name} SELECT * FROM functional.{table_name};
 ---- LOAD
 INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT id, named_struct("f1",string_col,"f2",int_col), array(1, 2, 3), map("k", cast(0 as bigint)) FROM functional.alltypestiny;
+---- DEPENDENT_LOAD_HIVE
+-- This INSERT must run in Hive, because Impala doesn't support inserting into tables
+-- with complex types.
+INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM functional.{table_name};
 ====
 ---- DATASET
 functional
@@ -1477,8 +1476,9 @@ old_rcfile_table
 ---- COLUMNS
 key INT
 value STRING
----- DEPENDENT_LOAD
-LOAD DATA LOCAL INPATH '${{env:IMPALA_HOME}}/testdata/data/oldrcfile.rc' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
+---- DEPENDENT_LOAD_HIVE
+LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/oldrcfile.rc'
+OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
 ====
 ---- DATASET
 functional
@@ -1486,9 +1486,10 @@ functional
 bad_text_lzo
 ---- COLUMNS
 field STRING
----- DEPENDENT_LOAD
+---- DEPENDENT_LOAD_HIVE
 -- Error recovery test data for LZO compression.
-LOAD DATA LOCAL INPATH '${{env:IMPALA_HOME}}/testdata/bad_text_lzo/bad_text.lzo' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
+LOAD DATA LOCAL INPATH '{impala_home}/testdata/bad_text_lzo/bad_text.lzo'
+OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
 ====
 ---- DATASET
 functional
@@ -1497,8 +1498,9 @@ bad_text_gzip
 ---- COLUMNS
 s STRING
 i INT
----- DEPENDENT_LOAD
-LOAD DATA LOCAL INPATH '${{env:IMPALA_HOME}}/testdata/bad_text_gzip/file_not_finished.gz' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
+---- DEPENDENT_LOAD_HIVE
+LOAD DATA LOCAL INPATH '{impala_home}/testdata/bad_text_gzip/file_not_finished.gz'
+OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
 ====
 ---- DATASET
 functional
@@ -1506,9 +1508,10 @@ functional
 bad_seq_snap
 ---- COLUMNS
 field STRING
----- DEPENDENT_LOAD
+---- DEPENDENT_LOAD_HIVE
 -- This data file contains format errors and is accessed by the unit test: sequence-file-recover-test.
-LOAD DATA LOCAL INPATH '${{env:IMPALA_HOME}}/testdata/bad_seq_snap/bad_file' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
+LOAD DATA LOCAL INPATH '{impala_home}/testdata/bad_seq_snap/bad_file'
+OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
 ====
 ---- DATASET
 functional
@@ -1516,10 +1519,13 @@ functional
 bad_avro_snap_strings
 ---- COLUMNS
 s STRING
----- DEPENDENT_LOAD
-LOAD DATA LOCAL INPATH '${{env:IMPALA_HOME}}/testdata/bad_avro_snap/negative_string_len.avro' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
-LOAD DATA LOCAL INPATH '${{env:IMPALA_HOME}}/testdata/bad_avro_snap/invalid_union.avro' INTO TABLE {db_name}{db_suffix}.{table_name};
-LOAD DATA LOCAL INPATH '${{env:IMPALA_HOME}}/testdata/bad_avro_snap/truncated_string.avro' INTO TABLE {db_name}{db_suffix}.{table_name};
+---- DEPENDENT_LOAD_HIVE
+LOAD DATA LOCAL INPATH '{impala_home}/testdata/bad_avro_snap/negative_string_len.avro'
+OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
+LOAD DATA LOCAL INPATH '{impala_home}/testdata/bad_avro_snap/invalid_union.avro'
+INTO TABLE {db_name}{db_suffix}.{table_name};
+LOAD DATA LOCAL INPATH '{impala_home}/testdata/bad_avro_snap/truncated_string.avro'
+INTO TABLE {db_name}{db_suffix}.{table_name};
 ====
 ---- DATASET
 functional
@@ -1527,8 +1533,9 @@ functional
 bad_avro_snap_floats
 ---- COLUMNS
 c1 FLOAT
----- DEPENDENT_LOAD
-LOAD DATA LOCAL INPATH '${{env:IMPALA_HOME}}/testdata/bad_avro_snap/truncated_float.avro' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
+---- DEPENDENT_LOAD_HIVE
+LOAD DATA LOCAL INPATH '{impala_home}/testdata/bad_avro_snap/truncated_float.avro'
+OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
 ====
 ---- DATASET
 functional
@@ -1537,8 +1544,9 @@ bad_avro_decimal_schema
 ---- COLUMNS
 name STRING
 value DECIMAL(5,2)
----- DEPENDENT_LOAD
-LOAD DATA LOCAL INPATH '${{env:IMPALA_HOME}}/testdata/bad_avro_snap/invalid_decimal_schema.avro' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
+---- DEPENDENT_LOAD_HIVE
+LOAD DATA LOCAL INPATH '{impala_home}/testdata/bad_avro_snap/invalid_decimal_schema.avro'
+OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
 ====
 ---- DATASET
 -- IMPALA-694: uses data file produced by parquet-mr version 1.2.5-cdh4.5.0
@@ -1802,8 +1810,8 @@ avro_decimal_tbl
 ---- COLUMNS
 name STRING
 value DECIMAL(5,2)
----- DEPENDENT_LOAD
-LOAD DATA LOCAL INPATH '${{env:IMPALA_HOME}}/testdata/data/avro_decimal_tbl.avro'
+---- DEPENDENT_LOAD_HIVE
+LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/avro_decimal_tbl.avro'
 OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
 ====
 ---- DATASET
@@ -2094,9 +2102,11 @@ delimited fields terminated by ','  escaped by '\\'
 ---- ALTER
 ALTER TABLE {table_name} SET TBLPROPERTIES('skip.header.line.count'='1');
 ---- LOAD
-LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/table_with_header.csv' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
----- DEPENDENT_LOAD
-LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/table_with_header.gz' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
+LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/table_with_header.csv'
+OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
+---- DEPENDENT_LOAD_HIVE
+LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/table_with_header.gz'
+OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
 ====
 ---- DATASET
 functional
@@ -2110,9 +2120,11 @@ delimited fields terminated by ','  escaped by '\\'
 ---- ALTER
 ALTER TABLE {table_name} SET TBLPROPERTIES('skip.header.line.count'='2');
 ---- LOAD
-LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/table_with_header_2.csv' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
----- DEPENDENT_LOAD
-LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/table_with_header_2.gz' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
+LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/table_with_header_2.csv'
+OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
+---- DEPENDENT_LOAD_HIVE
+LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/table_with_header_2.gz'
+OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
 ====
 ---- DATASET
 functional


[6/8] impala git commit: IMPALA-6256: Incorrect principal will be used for internal connections if FLAGS_be_principal is set

Posted by ta...@apache.org.
IMPALA-6256: Incorrect principal will be used for internal connections if FLAGS_be_principal is set

In Impala, we have FLAGS_principal and FLAGS_be_principal flags.
If only FLAGS_principal is set, we use it as both the internal
and external principals.

If both FLAGS_principal and FLAGS_be_principal are set, we use
FLAGS_be_principal as the internal principal and FLAGS_principal
as the external principal.

However, Kudu sources the internal principal only from FLAGS_principal
and is not aware of a flag called FLAGS_be_principal. So, as of the
time IMPALA-5129 went in, if FLAGS_be_principal was explicitly set to
something different from FLAGS_principal, we would be using the
external principal as the internal principal, which is incorrect.
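
A minimal Python sketch of the intended flag semantics described above
(resolve_principals is a hypothetical helper; the real logic lives in
be/src/rpc/authentication.cc):

def resolve_principals(principal, be_principal):
    # FLAGS_principal is always the external (client-facing) principal.
    external = principal
    # FLAGS_be_principal, if set, overrides the internal (backend) one.
    internal = be_principal if be_principal else principal
    return internal, external

# Before this fix, the Kudu kinit path effectively ignored
# be_principal and always used the external principal internally.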

This is fixed on the Kudu side by the following commit:
https://github.com/apache/kudu/commit/d42c2916467b83347f064ddea59f7a65202f7247

That commit has been cherry-picked into Impala, and we now use the new
API to fix this problem.

Testing: Made the MiniKdc explicitly set FLAGS_principal and
FLAGS_be_principal. Confirmed that the tests fail without this
patch and with FLAGS_be_principal set.

Change-Id: If5af4398467857da09878075439b6612a04d7a01
Reviewed-on: http://gerrit.cloudera.org:8080/8761
Reviewed-by: Dan Hecht <dh...@cloudera.com>
Reviewed-by: Michael Ho <kw...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/e79f6446
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/e79f6446
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/e79f6446

Branch: refs/heads/master
Commit: e79f6446b1eb9647ecbf61ed6b8fe7e992142de1
Parents: e27b01f
Author: Sailesh Mukil <sa...@apache.org>
Authored: Mon Dec 4 22:31:52 2017 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Tue Dec 5 23:24:08 2017 +0000

----------------------------------------------------------------------
 be/src/rpc/authentication.cc        | 4 ++--
 be/src/testutil/mini-kdc-wrapper.cc | 8 +++++++-
 2 files changed, 9 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/e79f6446/be/src/rpc/authentication.cc
----------------------------------------------------------------------
diff --git a/be/src/rpc/authentication.cc b/be/src/rpc/authentication.cc
index 151dad0..29fa8f7 100644
--- a/be/src/rpc/authentication.cc
+++ b/be/src/rpc/authentication.cc
@@ -833,8 +833,8 @@ Status SaslAuthProvider::Start() {
     if (FLAGS_use_kudu_kinit) {
       // Starts a thread that periodically does a 'kinit'. The thread lives as long as the
       // process does.
-      KUDU_RETURN_IF_ERROR(kudu::security::InitKerberosForServer(KRB5CCNAME_PATH, false),
-          "Could not init kerberos");
+      KUDU_RETURN_IF_ERROR(kudu::security::InitKerberosForServer(principal_,
+          KRB5CCNAME_PATH, false), "Could not init kerberos");
     } else {
       Promise<Status> first_kinit;
       stringstream thread_name;

http://git-wip-us.apache.org/repos/asf/impala/blob/e79f6446/be/src/testutil/mini-kdc-wrapper.cc
----------------------------------------------------------------------
diff --git a/be/src/testutil/mini-kdc-wrapper.cc b/be/src/testutil/mini-kdc-wrapper.cc
index b4e0f25..d378ea4 100644
--- a/be/src/testutil/mini-kdc-wrapper.cc
+++ b/be/src/testutil/mini-kdc-wrapper.cc
@@ -34,6 +34,7 @@ DECLARE_bool(use_kudu_kinit);
 
 DECLARE_string(keytab_file);
 DECLARE_string(principal);
+DECLARE_string(be_principal);
 DECLARE_string(krb5_conf);
 
 Status MiniKdcWrapper::StartKdc(string keytab_dir) {
@@ -79,7 +80,11 @@ Status MiniKdcWrapper::SetupAndStartMiniKDC(KerberosSwitch k) {
     // Set the appropriate flags based on how we've set up the kerberos environment.
     FLAGS_krb5_conf = strings::Substitute("$0/$1", keytab_dir, "krb5.conf");
     FLAGS_keytab_file = kt_path;
-    FLAGS_principal = strings::Substitute("$0@$1", spn_, realm_);
+
+    // We explicitly set 'principal' and 'be_principal' even though 'principal' won't be
+    // used to test IMPALA-6256.
+    FLAGS_principal = "dummy-service/host@realm";
+    FLAGS_be_principal = strings::Substitute("$0@$1", spn_, realm_);
   }
   return Status::OK();
 }
@@ -91,6 +96,7 @@ Status MiniKdcWrapper::TearDownMiniKDC(KerberosSwitch k) {
     // Clear the flags so we don't step on other tests that may run in the same process.
     FLAGS_keytab_file.clear();
     FLAGS_principal.clear();
+    FLAGS_be_principal.clear();
     FLAGS_krb5_conf.clear();
 
     // Remove test directory.


[5/8] impala git commit: IMPALA-6268: KerberosOnAndOff/RpcMgrKerberizedTest.MultipleServices failing

Posted by ta...@apache.org.
IMPALA-6268: KerberosOnAndOff/RpcMgrKerberizedTest.MultipleServices failing

This patch just disables the failing kerberized test configurations to
unblock builds. We will investigate the root cause of these failures
in parallel and post a real fix.

Change-Id: I6c750850ff916617a06e3cfac330072d8e2179e8
Reviewed-on: http://gerrit.cloudera.org:8080/8766
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Reviewed-by: Michael Ho <kw...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/e27b01f4
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/e27b01f4
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/e27b01f4

Branch: refs/heads/master
Commit: e27b01f4bb73a56261ef075eb29b030b60881d73
Parents: 7512181
Author: Sailesh Mukil <sa...@apache.org>
Authored: Tue Dec 5 11:21:26 2017 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Tue Dec 5 22:58:23 2017 +0000

----------------------------------------------------------------------
 be/src/rpc/rpc-mgr-test.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/e27b01f4/be/src/rpc/rpc-mgr-test.cc
----------------------------------------------------------------------
diff --git a/be/src/rpc/rpc-mgr-test.cc b/be/src/rpc/rpc-mgr-test.cc
index 5410af4..9acd39b 100644
--- a/be/src/rpc/rpc-mgr-test.cc
+++ b/be/src/rpc/rpc-mgr-test.cc
@@ -191,11 +191,11 @@ class ScanMemServiceImpl : public ScanMemServiceIf {
   }
 };
 
+// TODO: Disabled 'USE_KUDU_KERBEROS' and 'USE_IMPALA_KERBEROS' due to IMPALA-6268.
+// Reenable after fixing.
 INSTANTIATE_TEST_CASE_P(KerberosOnAndOff,
                         RpcMgrKerberizedTest,
-                        ::testing::Values(KERBEROS_OFF,
-                                          USE_KUDU_KERBEROS,
-                                          USE_IMPALA_KERBEROS));
+                        ::testing::Values(KERBEROS_OFF));
 
 TEST_P(RpcMgrKerberizedTest, MultipleServices) {
   // Test that a service can be started, and will respond to requests.


[2/8] impala git commit: IMPALA-6238: Enhance TErrorCode::DATASTREAM_SENDER_TIMEOUT message

Posted by ta...@apache.org.
IMPALA-6238: Enhance TErrorCode::DATASTREAM_SENDER_TIMEOUT message

This change augments the TErrorCode::DATASTREAM_SENDER_TIMEOUT message
to include the source address when KRPC is enabled; the source address
is not readily available in Thrift. The new message also includes the
destination plan node id, in case there are multiple exchange nodes in
a fragment instance.
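
For illustration, a Python sketch of how the new template's positional
placeholders get filled (the values below are made up; the real
substitution happens through ErrorMsg in the backend):

template = ("Sender$0 timed out waiting for receiver fragment "
            "instance: $1, dest node: $2")

def format_msg(remote_addr, finst_id, dest_node_id):
    # With KRPC, $0 carries " <source host>"; with Thrift it stays empty.
    return (template.replace("$0", remote_addr)
                    .replace("$1", finst_id)
                    .replace("$2", str(dest_node_id)))

print(format_msg(" 10.20.30.40", "d4e7b941a7c1f3a2:b0e85d1c00000000", 5))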

Testing done: Confirmed the error message by testing with the following
options: "--stress_datastream_recvr_delay_ms=90000
--datastream_sender_timeout_ms=1000"

Change-Id: Ie3e83773fe6feda057296e7d5544690aa9271fa0
Reviewed-on: http://gerrit.cloudera.org:8080/8751
Reviewed-by: Michael Ho <kw...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/e4a2f5d2
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/e4a2f5d2
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/e4a2f5d2

Branch: refs/heads/master
Commit: e4a2f5d2123508dbd9281980a395d4f9e1851dd7
Parents: 66704f9
Author: Michael Ho <kw...@cloudera.com>
Authored: Sat Dec 2 19:49:48 2017 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Tue Dec 5 02:09:21 2017 +0000

----------------------------------------------------------------------
 be/src/runtime/data-stream-mgr.cc         | 6 ++++--
 be/src/runtime/data-stream-sender.cc      | 3 ++-
 be/src/runtime/krpc-data-stream-mgr.cc    | 5 +++--
 be/src/runtime/krpc-data-stream-sender.cc | 3 ++-
 common/thrift/generate_error_codes.py     | 4 ++--
 5 files changed, 13 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/e4a2f5d2/be/src/runtime/data-stream-mgr.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/data-stream-mgr.cc b/be/src/runtime/data-stream-mgr.cc
index 503cd29..93c524e 100644
--- a/be/src/runtime/data-stream-mgr.cc
+++ b/be/src/runtime/data-stream-mgr.cc
@@ -186,7 +186,8 @@ Status DataStreamMgr::AddData(const TUniqueId& fragment_instance_id,
     // FindRecvrOrWait() timed out, which is unexpected and suggests a query setup error;
     // we return DATASTREAM_SENDER_TIMEOUT to trigger tear-down of the query.
     if (already_unregistered) return Status::OK();
-    ErrorMsg msg(TErrorCode::DATASTREAM_SENDER_TIMEOUT, PrintId(fragment_instance_id));
+    ErrorMsg msg(TErrorCode::DATASTREAM_SENDER_TIMEOUT, "", PrintId(fragment_instance_id),
+        dest_node_id);
     VLOG_QUERY << "DataStreamMgr::AddData(): " << msg.msg();
     return Status::Expected(msg);
   }
@@ -210,7 +211,8 @@ Status DataStreamMgr::CloseSender(const TUniqueId& fragment_instance_id,
       // Was not able to notify the receiver that this was the end of stream. Notify the
       // sender that this failed so that they can take appropriate action (i.e. failing
       // the query).
-      ErrorMsg msg(TErrorCode::DATASTREAM_SENDER_TIMEOUT, PrintId(fragment_instance_id));
+      ErrorMsg msg(TErrorCode::DATASTREAM_SENDER_TIMEOUT, "",
+          PrintId(fragment_instance_id), dest_node_id);
       VLOG_QUERY << "DataStreamMgr::CloseSender(): " << msg.msg();
       status = Status::Expected(msg);
     }

http://git-wip-us.apache.org/repos/asf/impala/blob/e4a2f5d2/be/src/runtime/data-stream-sender.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/data-stream-sender.cc b/be/src/runtime/data-stream-sender.cc
index cf749cd..c76e626 100644
--- a/be/src/runtime/data-stream-sender.cc
+++ b/be/src/runtime/data-stream-sender.cc
@@ -275,7 +275,8 @@ Status DataStreamSender::Channel::SendCurrentBatch() {
 Status DataStreamSender::Channel::GetSendStatus() {
   WaitForRpc();
   if (!rpc_status_.ok()) {
-    LOG(ERROR) << "channel send status: " << rpc_status_.GetDetail();
+    LOG(ERROR) << "channel send to " << TNetworkAddressToString(address_) << " failed: "
+               << rpc_status_.GetDetail();
   }
   return rpc_status_;
 }

http://git-wip-us.apache.org/repos/asf/impala/blob/e4a2f5d2/be/src/runtime/krpc-data-stream-mgr.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/krpc-data-stream-mgr.cc b/be/src/runtime/krpc-data-stream-mgr.cc
index 7c36191..348b9ab 100644
--- a/be/src/runtime/krpc-data-stream-mgr.cc
+++ b/be/src/runtime/krpc-data-stream-mgr.cc
@@ -334,8 +334,9 @@ void KrpcDataStreamMgr::RespondToTimedOutSender(const std::unique_ptr<ContextTyp
   TUniqueId finst_id;
   finst_id.__set_lo(request->dest_fragment_instance_id().lo());
   finst_id.__set_hi(request->dest_fragment_instance_id().hi());
-
-  ErrorMsg msg(TErrorCode::DATASTREAM_SENDER_TIMEOUT, PrintId(finst_id));
+  string remote_addr = Substitute(" $0", ctx->rpc_context->remote_address().host());
+  ErrorMsg msg(TErrorCode::DATASTREAM_SENDER_TIMEOUT, remote_addr, PrintId(finst_id),
+      ctx->request->dest_node_id());
   VLOG_QUERY << msg.msg();
   Status::Expected(msg).ToProto(ctx->response->mutable_status());
   ctx->rpc_context->RespondSuccess();

http://git-wip-us.apache.org/repos/asf/impala/blob/e4a2f5d2/be/src/runtime/krpc-data-stream-sender.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/krpc-data-stream-sender.cc b/be/src/runtime/krpc-data-stream-sender.cc
index 32e20cd..0c2a295 100644
--- a/be/src/runtime/krpc-data-stream-sender.cc
+++ b/be/src/runtime/krpc-data-stream-sender.cc
@@ -331,7 +331,8 @@ Status KrpcDataStreamSender::Channel::WaitForRpc(std::unique_lock<SpinLock>* loc
 
   DCHECK(!rpc_in_flight_);
   if (UNLIKELY(!rpc_status_.ok())) {
-    LOG(ERROR) << "channel send status: " << rpc_status_.GetDetail();
+    LOG(ERROR) << "channel send to " << TNetworkAddressToString(address_) << " failed: "
+               << rpc_status_.GetDetail();
     return rpc_status_;
   }
   return Status::OK();

http://git-wip-us.apache.org/repos/asf/impala/blob/e4a2f5d2/common/thrift/generate_error_codes.py
----------------------------------------------------------------------
diff --git a/common/thrift/generate_error_codes.py b/common/thrift/generate_error_codes.py
index bf1953e..b137b5e 100755
--- a/common/thrift/generate_error_codes.py
+++ b/common/thrift/generate_error_codes.py
@@ -224,8 +224,8 @@ error_codes = (
   ("COMPRESSED_FILE_TRUNCATED", 70,
    "Unexpected end of compressed file. File may be truncated. file=$0"),
 
-  ("DATASTREAM_SENDER_TIMEOUT", 71, "Sender timed out waiting for receiver fragment "
-   "instance: $0"),
+  ("DATASTREAM_SENDER_TIMEOUT", 71, "Sender$0 timed out waiting for receiver fragment "
+   "instance: $1, dest node: $2"),
 
   ("KUDU_IMPALA_TYPE_MISSING", 72, "Kudu type $0 is not available in Impala."),
 


[4/8] impala git commit: IMPALA-6265 Query cancellation test enhancements

Posted by ta...@apache.org.
IMPALA-6265 Query cancellation test enhancements

In the query cancellation tests it is essential to wait until the
query reaches a desired state (waiting_to_finish, fetching) and then
cancel it. Query execution under ASAN is slower than on a Release
build, so a hard-coded timeout threshold is either insufficient to
cover all builds or has to be set to a wastefully high value.
As a solution, the query state is now polled on the Impala debug page
at regular intervals until it reaches the desired state or the maximum
number of retry attempts is reached.
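
The pattern, as a minimal Python sketch (wait_for_state and get_state
are hypothetical; the actual implementation is wait_for_query_state()
in tests/shell/test_shell_commandline.py below):

import time

def wait_for_state(get_state, desired_state, max_retry=15, interval=1.0):
    for _ in range(max_retry + 1):
        if get_state() == desired_state:
            return
        time.sleep(interval)
    raise Exception("Query didn't reach desired state: " + desired_state)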

Change-Id: Ie0bff485a872df7be8efd784314a6ca91aaadd11
Reviewed-on: http://gerrit.cloudera.org:8080/8713
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/75121819
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/75121819
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/75121819

Branch: refs/heads/master
Commit: 75121819bee10b7f8f0093b694c9fefc63fb9bdf
Parents: 1f1bff8
Author: Gabor Kaszab <ga...@cloudera.com>
Authored: Fri Dec 1 18:09:49 2017 +0100
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Tue Dec 5 21:40:11 2017 +0000

----------------------------------------------------------------------
 tests/shell/test_shell_commandline.py | 45 ++++++++++++++++++++++++------
 1 file changed, 36 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/75121819/tests/shell/test_shell_commandline.py
----------------------------------------------------------------------
diff --git a/tests/shell/test_shell_commandline.py b/tests/shell/test_shell_commandline.py
index 1ecdbd5..9bd6b8c 100644
--- a/tests/shell/test_shell_commandline.py
+++ b/tests/shell/test_shell_commandline.py
@@ -325,31 +325,58 @@ class TestImpalaShell(ImpalaTestSuite):
 
     assert "Cancelling Query" in result.stderr, result.stderr
 
+  @pytest.mark.execute_serially
   def test_query_cancellation_during_fetch(self):
     """IMPALA-1144: Test cancellation (CTRL+C) while results are being
     fetched"""
     # A select query where fetch takes several seconds
-    args = '-q "with v as (values (1 as x), (2), (3), (4)) ' + \
-        'select * from v, v v2, v v3, v v4, v v5, v v6, v v7, v v8, ' + \
-        'v v9, v v10, v v11;"'
+    stmt = "with v as (values (1 as x), (2), (3), (4)) " + \
+        "select * from v, v v2, v v3, v v4, v v5, v v6, v v7, v v8, " + \
+        "v v9, v v10, v v11"
     # Kill happens when the results are being fetched
-    self.run_and_verify_query_cancellation_test(args)
+    self.run_and_verify_query_cancellation_test(stmt, "FINISHED")
 
+  @pytest.mark.execute_serially
   def test_query_cancellation_during_wait_to_finish(self):
     """IMPALA-1144: Test cancellation (CTRL+C) while the query is in the
     wait_to_finish state"""
     # A select where wait_to_finish takes several seconds
-    args = '-q "select * from tpch.customer c1, tpch.customer c2, ' + \
-           'tpch.customer c3 order by c1.c_name"'
+    stmt = "select * from tpch.customer c1, tpch.customer c2, " + \
+           "tpch.customer c3 order by c1.c_name"
     # Kill happens in wait_to_finish state
-    self.run_and_verify_query_cancellation_test(args)
+    self.run_and_verify_query_cancellation_test(stmt, "RUNNING")
 
-  def run_and_verify_query_cancellation_test(self, args):
+  def wait_for_query_state(self, stmt, state, max_retry=15):
+    """Checks the in flight queries on Impala debug page. Polls the state of
+    the query statement from parameter every second until the query gets to
+    a state given via parameter or a maximum retry count is reached.
+    Restriction: Only works if there is only one in flight query."""
+    impalad_service = ImpaladService(IMPALAD.split(':')[0])
+    if not impalad_service.wait_for_num_in_flight_queries(1):
+      raise Exception("No in flight query found")
+
+    retry_count = 0
+    while retry_count <= max_retry:
+      query_info = impalad_service.get_in_flight_queries()[0]
+      if query_info['stmt'] != stmt:
+        exc_text = "The found in flight query is not the one under test: " + \
+            query_info['stmt']
+        raise Exception(exc_text)
+      if query_info['state'] == state:
+        return
+      retry_count += 1
+      sleep(1.0)
+    raise Exception("Query didn't reach desired state: " + state)
+
+  def run_and_verify_query_cancellation_test(self, stmt, cancel_at_state):
     """Starts the execution of the received query, waits until the query
     execution in fact starts and then cancels it. Expects the query
     cancellation to succeed."""
+    args = "-q \"" + stmt + ";\""
     p = ImpalaShell(args)
-    sleep(2.0)
+
+    self.wait_for_query_state(stmt, cancel_at_state)
+
     os.kill(p.pid(), signal.SIGINT)
     result = p.get_result()
     assert "Cancelling Query" in result.stderr


[3/8] impala git commit: IMPALA-6232: Disable file handle cache by default

Posted by ta...@apache.org.
IMPALA-6232: Disable file handle cache by default

There are scenarios where HDFS file appends or HDFS file overwrites
can lead to HDFS disabling short-circuit reads. Since this can be a
performance regression, this change sets the default value of
max_cached_file_handles to 0, disabling the file handle cache by
default. It also changes the default value of
unused_file_handle_timeout_sec to 270. If users enable the file handle
cache, this timeout prevents some of the scenarios that disable
short-circuit reads.
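
Operators who still want the cache can opt back in explicitly at
startup with the flags defined in disk-io-mgr.cc below, e.g.
(illustrative invocation):

  impalad --max_cached_file_handles=20000 --unused_file_handle_timeout_sec=270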

Ran existing file handle cache tests to verify that there
is no impact.

Change-Id: Iea7f943f63b72b42286a9e8b9987308baa79d7b0
Reviewed-on: http://gerrit.cloudera.org:8080/8750
Reviewed-by: Joe McDonnell <jo...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/1f1bff8e
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/1f1bff8e
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/1f1bff8e

Branch: refs/heads/master
Commit: 1f1bff8e8d35b66308a1e865cdc8bce41ce89873
Parents: e4a2f5d
Author: Joe McDonnell <jo...@cloudera.com>
Authored: Mon Dec 4 10:21:33 2017 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Tue Dec 5 21:03:00 2017 +0000

----------------------------------------------------------------------
 be/src/runtime/io/disk-io-mgr.cc | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/1f1bff8e/be/src/runtime/io/disk-io-mgr.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/io/disk-io-mgr.cc b/be/src/runtime/io/disk-io-mgr.cc
index 4f4074c..668fc75 100644
--- a/be/src/runtime/io/disk-io-mgr.cc
+++ b/be/src/runtime/io/disk-io-mgr.cc
@@ -94,7 +94,10 @@ DEFINE_int32(max_free_io_buffers, 128,
 // uses about 6kB of memory. 20k file handles will thus reserve ~120MB of memory.
 // The actual amount of memory that is associated with a file handle can be larger
 // or smaller, depending on the replication factor for this file or the path name.
-DEFINE_uint64(max_cached_file_handles, 20000, "Maximum number of HDFS file handles "
+// TODO: This is currently disabled due to HDFS-12528, which can disable short circuit
+// reads when file handle caching is enabled. This should be reenabled by default
+// when that issue is fixed.
+DEFINE_uint64(max_cached_file_handles, 0, "Maximum number of HDFS file handles "
     "that will be cached. Disabled if set to 0.");
 
 // The unused file handle timeout specifies how long a file handle will remain in the
@@ -106,7 +109,10 @@ DEFINE_uint64(max_cached_file_handles, 20000, "Maximum number of HDFS file handl
 // from being freed. When the metadata sees that a file has been deleted, the file handle
 // will no longer be used by future queries. Aging out this file handle allows the
 // disk space to be freed in an appropriate period of time.
-DEFINE_uint64(unused_file_handle_timeout_sec, 21600, "Maximum time, in seconds, that an "
+// TODO: HDFS-12528 (which can disable short circuit reads) is more likely to happen
+// if file handles are cached for longer than 5 minutes. Use a conservative value for
+// the unused file handle cache timeout until HDFS-12528 is fixed.
+DEFINE_uint64(unused_file_handle_timeout_sec, 270, "Maximum time, in seconds, that an "
     "unused HDFS file handle will remain in the file handle cache. Disabled if set "
     "to 0.");
 


[8/8] impala git commit: IMPALA-6210: Add query id to lineage graph logging

Posted by ta...@apache.org.
IMPALA-6210: Add query id to lineage graph logging

Some tools use lineage graph logging to collect query metrics.
Currently only the query hash is present in this log; adding the query
id makes such accounting easier.
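
The query id is rendered as two 64-bit halves in hex separated by a
colon. A Python sketch of that format (mirroring TUniqueIdUtil.PrintId
in the diff below; the mask emulates Java's unsigned
Long.toHexString):

def print_id(hi, lo):
    mask = (1 << 64) - 1
    return "%x:%x" % (hi & mask, lo & mask)

print(print_id(0xfeedbeeff00d7777, 0x20))  # -> feedbeeff00d7777:20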

Testing: test_lineage.py checks that the query id in the lineage log
matches the one in the query profile. A test for TUniqueIdUtil is
added to the FE tests.

Change-Id: I4adbd02df37a234dbb79f58b7c46ca11a914229f
Reviewed-on: http://gerrit.cloudera.org:8080/8589
Reviewed-by: Alex Behm <al...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/c505a815
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/c505a815
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/c505a815

Branch: refs/heads/master
Commit: c505a8159be73879e0a75d6199f60aa5ca1726fb
Parents: 16c5f51
Author: Tianyi Wang <tw...@cloudera.com>
Authored: Fri Nov 17 14:05:45 2017 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Wed Dec 6 00:52:19 2017 +0000

----------------------------------------------------------------------
 be/src/util/lineage-util.h                      |  2 +
 common/thrift/LineageGraph.thrift               |  5 ++
 .../impala/analysis/ColumnLineageGraph.java     | 17 ++++-
 .../org/apache/impala/util/TUniqueIdUtil.java   | 38 ++++++++++
 .../apache/impala/util/TUniqueIdUtilTest.java   | 78 ++++++++++++++++++++
 .../queries/PlannerTest/lineage.test            | 44 +++++++++++
 tests/custom_cluster/test_lineage.py            |  9 ++-
 7 files changed, 186 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/c505a815/be/src/util/lineage-util.h
----------------------------------------------------------------------
diff --git a/be/src/util/lineage-util.h b/be/src/util/lineage-util.h
index 342e4f4..e52bb4c 100644
--- a/be/src/util/lineage-util.h
+++ b/be/src/util/lineage-util.h
@@ -79,6 +79,8 @@ class LineageUtil {
       writer.StartObject();
       writer.String("queryText");
       writer.String(lineage.query_text.c_str());
+      writer.String("queryId");
+      writer.String(PrintId(lineage.query_id).c_str());
       writer.String("hash");
       writer.String(lineage.hash.c_str());
       writer.String("user");

http://git-wip-us.apache.org/repos/asf/impala/blob/c505a815/common/thrift/LineageGraph.thrift
----------------------------------------------------------------------
diff --git a/common/thrift/LineageGraph.thrift b/common/thrift/LineageGraph.thrift
index f0d3540..1fb0b55 100644
--- a/common/thrift/LineageGraph.thrift
+++ b/common/thrift/LineageGraph.thrift
@@ -18,6 +18,8 @@
 namespace cpp impala
 namespace java org.apache.impala.thrift
 
+include "Types.thrift"
+
 struct TVertex {
   // Vertex id
   1: required i64 id
@@ -62,4 +64,7 @@ struct TLineageGraph {
   6: list<TMultiEdge> edges
 
   7: list<TVertex> vertices
+
+  // Query id in TQueryCtx
+  8: required Types.TUniqueId query_id
 }

http://git-wip-us.apache.org/repos/asf/impala/blob/c505a815/fe/src/main/java/org/apache/impala/analysis/ColumnLineageGraph.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/analysis/ColumnLineageGraph.java b/fe/src/main/java/org/apache/impala/analysis/ColumnLineageGraph.java
index bc535ff..e7c66e2 100644
--- a/fe/src/main/java/org/apache/impala/analysis/ColumnLineageGraph.java
+++ b/fe/src/main/java/org/apache/impala/analysis/ColumnLineageGraph.java
@@ -40,7 +40,9 @@ import org.apache.impala.thrift.TEdgeType;
 import org.apache.impala.thrift.TQueryCtx;
 import org.apache.impala.thrift.TLineageGraph;
 import org.apache.impala.thrift.TMultiEdge;
+import org.apache.impala.thrift.TUniqueId;
 import org.apache.impala.thrift.TVertex;
+import org.apache.impala.util.TUniqueIdUtil;
 import com.google.common.base.Joiner;
 import com.google.common.base.Preconditions;
 import com.google.common.base.Strings;
@@ -287,6 +289,8 @@ public class ColumnLineageGraph {
   // Query statement
   private String queryStr_;
 
+  private TUniqueId queryId_;
+
   // Name of the user that issued this query
   private String user_;
 
@@ -320,8 +324,10 @@ public class ColumnLineageGraph {
   /**
    * Private c'tor, used only for testing.
    */
-  private ColumnLineageGraph(String stmt, String user, long timestamp) {
+  private ColumnLineageGraph(String stmt, TUniqueId queryId, String user, long timestamp)
+  {
     queryStr_ = stmt;
+    queryId_ = queryId;
     user_ = user;
     timestamp_ = timestamp;
   }
@@ -394,6 +400,7 @@ public class ColumnLineageGraph {
     timestamp_ = queryCtx.start_unix_millis / 1000;
     descTbl_ = analyzer.getDescTbl();
     user_ = analyzer.getUser().getName();
+    queryId_ = queryCtx.query_id;
   }
 
   private void computeProjectionDependencies(List<Expr> resultExprs) {
@@ -525,6 +532,7 @@ public class ColumnLineageGraph {
     if (Strings.isNullOrEmpty(queryStr_)) return "";
     Map obj = new LinkedHashMap();
     obj.put("queryText", queryStr_);
+    obj.put("queryId", TUniqueIdUtil.PrintId(queryId_));
     obj.put("hash", getQueryHash(queryStr_));
     obj.put("user", user_);
     obj.put("timestamp", timestamp_);
@@ -551,6 +559,7 @@ public class ColumnLineageGraph {
     TLineageGraph graph = new TLineageGraph();
     if (Strings.isNullOrEmpty(queryStr_)) return graph;
     graph.setQuery_text(queryStr_);
+    graph.setQuery_id(queryId_);
     graph.setHash(getQueryHash(queryStr_));
     graph.setUser(user_);
     graph.setStarted(timestamp_);
@@ -575,7 +584,7 @@ public class ColumnLineageGraph {
    */
   public static ColumnLineageGraph fromThrift(TLineageGraph obj) {
     ColumnLineageGraph lineage =
-        new ColumnLineageGraph(obj.query_text, obj.user, obj.started);
+        new ColumnLineageGraph(obj.query_text, obj.query_id, obj.user, obj.started);
     TreeSet<Vertex> vertices = Sets.newTreeSet();
     for (TVertex vertex: obj.vertices) {
       vertices.add(Vertex.fromThrift(vertex));
@@ -611,10 +620,10 @@ public class ColumnLineageGraph {
     if (!(obj instanceof JSONObject)) return null;
     JSONObject jsonObj = (JSONObject) obj;
     String stmt = (String) jsonObj.get("queryText");
-    String hash = (String) jsonObj.get("hash");
+    TUniqueId queryId = TUniqueIdUtil.ParseId((String) jsonObj.get("queryId"));
     String user = (String) jsonObj.get("user");
     long timestamp = (Long) jsonObj.get("timestamp");
-    ColumnLineageGraph graph = new ColumnLineageGraph(stmt, user, timestamp);
+    ColumnLineageGraph graph = new ColumnLineageGraph(stmt, queryId, user, timestamp);
     JSONArray serializedVertices = (JSONArray) jsonObj.get("vertices");
     Set<Vertex> vertices = Sets.newHashSet();
     for (int i = 0; i < serializedVertices.size(); ++i) {

http://git-wip-us.apache.org/repos/asf/impala/blob/c505a815/fe/src/main/java/org/apache/impala/util/TUniqueIdUtil.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/util/TUniqueIdUtil.java b/fe/src/main/java/org/apache/impala/util/TUniqueIdUtil.java
new file mode 100644
index 0000000..0d0e67a
--- /dev/null
+++ b/fe/src/main/java/org/apache/impala/util/TUniqueIdUtil.java
@@ -0,0 +1,38 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.impala.util;
+
+import org.apache.impala.thrift.TUniqueId;
+import com.google.common.primitives.UnsignedLong;
+
+/**
+ * Utility functions for working with TUniqueId objects.
+ */
+public class TUniqueIdUtil {
+  public static String PrintId(TUniqueId id) {
+    return Long.toHexString(id.hi) + ":" + Long.toHexString(id.lo);
+  }
+
+  public static TUniqueId ParseId(String id) {
+    String[] splitted = id.split(":");
+    if (splitted.length != 2) throw new NumberFormatException(
+        "Invalid unique id format: " + id);
+    return new TUniqueId(UnsignedLong.valueOf(splitted[0], 16).longValue(),
+        UnsignedLong.valueOf(splitted[1], 16).longValue());
+  }
+}

http://git-wip-us.apache.org/repos/asf/impala/blob/c505a815/fe/src/test/java/org/apache/impala/util/TUniqueIdUtilTest.java
----------------------------------------------------------------------
diff --git a/fe/src/test/java/org/apache/impala/util/TUniqueIdUtilTest.java b/fe/src/test/java/org/apache/impala/util/TUniqueIdUtilTest.java
new file mode 100644
index 0000000..c95e721
--- /dev/null
+++ b/fe/src/test/java/org/apache/impala/util/TUniqueIdUtilTest.java
@@ -0,0 +1,78 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.impala.util;
+
+import org.apache.impala.thrift.TUniqueId;
+import org.junit.Test;
+
+import static org.apache.impala.util.TUniqueIdUtil.*;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+
+public class TUniqueIdUtilTest {
+  @Test
+  public void testUniqueId() {
+    TUniqueId unique_id = new TUniqueId();
+    unique_id.hi = 0xfeedbeeff00d7777L;
+    unique_id.lo = 0x2020202020202020L;
+    String str = "feedbeeff00d7777:2020202020202020";
+    assertEquals(str, PrintId(unique_id));
+    unique_id.lo = 0x20L;
+    assertEquals("feedbeeff00d7777:20", PrintId(unique_id));
+  }
+
+  @Test
+  public void QueryIdParsing() {
+    try {
+      ParseId("abcd");
+      fail();
+    } catch (NumberFormatException e) {}
+    try {
+      ParseId("abcdabcdabcdabcdabcdabcdabcdabcda");
+      fail();
+    } catch (NumberFormatException e) {}
+    try {
+      ParseId("zbcdabcdabcdabcd:abcdabcdabcdabcd");
+      fail();
+    } catch (NumberFormatException e) {}
+    try {
+      ParseId("~bcdabcdabcdabcd:abcdabcdabcdabcd");
+      fail();
+    } catch (NumberFormatException e) {}
+    try {
+      ParseId("abcdabcdabcdabcd:!bcdabcdabcdabcd");
+      fail();
+    } catch (NumberFormatException e) {}
+
+    TUniqueId id = ParseId("abcdabcdabcdabcd:abcdabcdabcdabcd");
+    assertEquals(id.hi, 0xabcdabcdabcdabcdL);
+    assertEquals(id.lo, 0xabcdabcdabcdabcdL);
+
+    id = ParseId("abcdabcdabcdabcd:1234abcdabcd5678");
+    assertEquals(id.hi, 0xabcdabcdabcdabcdL);
+    assertEquals(id.lo, 0x1234abcdabcd5678L);
+
+    id = ParseId("cdabcdabcdabcd:1234abcdabcd5678");
+    assertEquals(id.hi, 0xcdabcdabcdabcdL);
+    assertEquals(id.lo, 0x1234abcdabcd5678L);
+
+    id = ParseId("cdabcdabcdabcd:abcdabcd5678");
+    assertEquals(id.hi, 0xcdabcdabcdabcdL);
+    assertEquals(id.lo, 0xabcdabcd5678L);
+  }
+}

http://git-wip-us.apache.org/repos/asf/impala/blob/c505a815/testdata/workloads/functional-planner/queries/PlannerTest/lineage.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/lineage.test b/testdata/workloads/functional-planner/queries/PlannerTest/lineage.test
index 7bb79a9..02ac733 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/lineage.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/lineage.test
@@ -7,6 +7,7 @@ select * from (
 ---- LINEAGE
 {
     "queryText":"select * from (\n  select tinyint_col + int_col x from functional.alltypes\n  union all\n  select sum(bigint_col) y from (select bigint_col from functional.alltypes) v1) v2",
+    "queryId":"0:0",
     "hash":"25456c60a2e874a20732f42c7af27553",
     "user":"dev",
     "timestamp":1446159271,
@@ -60,6 +61,7 @@ order by b.bigint_col limit 10
 ---- LINEAGE
 {
     "queryText":"select sum(a.tinyint_col) over (partition by a.smallint_col order by a.id),\n  count(b.string_col), b.timestamp_col\nfrom functional.alltypes a join functional.alltypessmall b on (a.id = b.id)\nwhere a.year = 2010 and b.float_col > 0\ngroup by a.tinyint_col, a.smallint_col, a.id, b.string_col, b.timestamp_col, b.bigint_col\nhaving count(a.int_col) > 10\norder by b.bigint_col limit 10",
+    "queryId":"0:0",
     "hash":"e0309eeff9811f53c82657d62c1e04eb",
     "user":"dev",
     "timestamp":1446159271,
@@ -196,6 +198,7 @@ create table lineage_test_tbl as select int_col, tinyint_col from functional.all
 ---- LINEAGE
 {
     "queryText":"create table lineage_test_tbl as select int_col, tinyint_col from functional.alltypes",
+    "queryId":"0:0",
     "hash":"f7666959b65ce1aa2a695ae90adb7c85",
     "user":"dev",
     "timestamp":1446159271,
@@ -250,6 +253,7 @@ where a.year = 2009 and b.month = 2
 ---- LINEAGE
 {
     "queryText":"create table lineage_test_tbl as\nselect distinct a.int_col, a.string_col from functional.alltypes a\ninner join functional.alltypessmall b on (a.id = b.id)\nwhere a.year = 2009 and b.month = 2",
+    "queryId":"0:0",
     "hash":"6d83126f8e34eec31ed4e111e1c32e78",
     "user":"dev",
     "timestamp":1446159271,
@@ -337,6 +341,7 @@ select * from
 ---- LINEAGE
 {
     "queryText":"create table lineage_test_tbl as\nselect * from\n  (select * from\n     (select int_col from functional.alltypestiny limit 1) v1 ) v2",
+    "queryId":"0:0",
     "hash":"f719f8eba46eda75e9cc560310885558",
     "user":"dev",
     "timestamp":1446159271,
@@ -370,6 +375,7 @@ create table lineage_test_tblm as select * from functional_hbase.alltypes limit
 ---- LINEAGE
 {
     "queryText":"create table lineage_test_tblm as select * from functional_hbase.alltypes limit 5",
+    "queryId":"0:0",
     "hash":"bedebc5bc72bbc6aec385c514944daae",
     "user":"dev",
     "timestamp":1446159271,
@@ -634,6 +640,7 @@ functional_hbase.alltypes
 ---- LINEAGE
 {
     "queryText":"insert into\nfunctional_hbase.alltypes\n  values (1, 1, true, \"1999-12-01\", 2.0, 1.0, 1, 12, 2, \"abs\",\n  cast(now() as timestamp), 1, 1999)",
+    "queryId":"0:0",
     "hash":"b923425ce9cc2d53d36523ec83971e67",
     "user":"dev",
     "timestamp":1446159271,
@@ -818,6 +825,7 @@ from functional.alltypes
 ---- LINEAGE
 {
     "queryText":"insert into table functional.alltypesnopart (id, bool_col, timestamp_col)\nselect id, bool_col, timestamp_col\nfrom functional.alltypes",
+    "queryId":"0:0",
     "hash":"b7b9474fc6b97f104bd031209438ee0e",
     "user":"dev",
     "timestamp":1446159271,
@@ -996,6 +1004,7 @@ where year=2009 and month=05
 ---- LINEAGE
 {
     "queryText":"insert into table functional.alltypessmall (smallint_col, int_col)\npartition (year=2009, month=04)\nselect smallint_col, int_col\nfrom functional.alltypes\nwhere year=2009 and month=05",
+    "queryId":"0:0",
     "hash":"2ed3a6c784e1c0c7fcef226d71375180",
     "user":"dev",
     "timestamp":1446159271,
@@ -1226,6 +1235,7 @@ where year=2009 and month>10
 ---- LINEAGE
 {
     "queryText":"insert into table functional.alltypessmall (id, string_col, int_col)\npartition (year, month)\nselect id, string_col, int_col, year, month\nfrom functional_seq_snap.alltypes\nwhere year=2009 and month>10",
+    "queryId":"0:0",
     "hash":"39ac95ce0632ef1ee8b474be644971f3",
     "user":"dev",
     "timestamp":1446159271,
@@ -1468,6 +1478,7 @@ having min(id) > 10
 ---- LINEAGE
 {
     "queryText":"insert into table functional.alltypessmall\npartition (year=2009, month)\nselect min(id), min(bool_col), min(tinyint_col), min(smallint_col), min(int_col),\nmin(bigint_col), min(float_col), min(double_col), min(date_string_col), min(string_col),\nmin(timestamp_col), month\nfrom functional.alltypes\nwhere year=2009 and month>10\ngroup by month\nhaving min(id) > 10",
+    "queryId":"0:0",
     "hash":"e6969c2cc67e9d6f3f985ddc6431f915",
     "user":"dev",
     "timestamp":1446159271,
@@ -1754,6 +1765,7 @@ group by int_col, tinyint_col
 ---- LINEAGE
 {
     "queryText":"select\nmax(tinyint_col) over(partition by int_col)\nfrom functional.alltypes\ngroup by int_col, tinyint_col",
+    "queryId":"0:0",
     "hash":"83c78528e6f5325c56a3f3521b08a78d",
     "user":"dev",
     "timestamp":1446159271,
@@ -1800,6 +1812,7 @@ select int_col, rank() over(order by int_col) from functional.alltypesagg
 ---- LINEAGE
 {
     "queryText":"select int_col, rank() over(order by int_col) from functional.alltypesagg",
+    "queryId":"0:0",
     "hash":"4f1ecaaed571d2ed9f09f091f399c311",
     "user":"dev",
     "timestamp":1446159272,
@@ -1857,6 +1870,7 @@ order by a.tinyint_col, a.int_col
 ---- LINEAGE
 {
     "queryText":"select a.tinyint_col, a.int_col, count(a.double_col)\n  over(partition by a.tinyint_col order by a.int_col desc rows between 1 preceding and 1 following)\nfrom functional.alltypes a inner join functional.alltypessmall b on a.id = b.id\norder by a.tinyint_col, a.int_col",
+    "queryId":"0:0",
     "hash":"b6e26c00b2ef17f0592ebadb0ecc21f6",
     "user":"dev",
     "timestamp":1446159272,
@@ -1967,6 +1981,7 @@ order by 2, 3, 4
 ---- LINEAGE
 {
     "queryText":"with v2 as\n  (select\n   double_col,\n   count(int_col) over() a,\n   sum(int_col + bigint_col) over(partition by bool_col) b\n   from\n     (select * from functional.alltypes) v1)\nselect double_col, a, b, a + b, double_col + a from v2\norder by 2, 3, 4",
+    "queryId":"0:0",
     "hash":"6bf993cea0d1ab9e613674ef178916c9",
     "user":"dev",
     "timestamp":1446159272,
@@ -2098,6 +2113,7 @@ order by 2, 3, 4
 ---- LINEAGE
 {
     "queryText":"select double_col, a, b, a + b, double_col + a from\n  (select\n   double_col,\n   count(int_col) over() a,\n   sum(int_col + bigint_col) over(partition by bool_col) b\n   from\n     (select * from functional.alltypes) v1) v2\norder by 2, 3, 4",
+    "queryId":"0:0",
     "hash":"811403c86e86fe630dea7bd0a6c89273",
     "user":"dev",
     "timestamp":1446159272,
@@ -2231,6 +2247,7 @@ where b.month = 1
 ---- LINEAGE
 {
     "queryText":"select a.month, a.year, b.int_col, b.month\nfrom\n  (select year, month from functional.alltypes\n   union all\n   select year, month from functional.alltypes) a\n  inner join\n  functional.alltypessmall b\n  on (a.month = b.month)\nwhere b.month = 1",
+    "queryId":"0:0",
     "hash":"e3000cd5edf2a02e1f5407810f3cc09a",
     "user":"dev",
     "timestamp":1446159272,
@@ -2338,6 +2355,7 @@ where month = 1
 ---- LINEAGE
 {
     "queryText":"select t1.int_col, t2.month, t2.int_col + 1\nfrom (\n  select int_col, count(*)\n  from functional.alltypessmall\n  where month = 1\n  group by int_col\n  having count(*) > 1\n  order by count(*) desc limit 5\n  ) t1\njoin functional.alltypes t2 on (t1.int_col = t2.int_col)\nwhere month = 1",
+    "queryId":"0:0",
     "hash":"3f1ecf7239e205342aee4979e7cb4877",
     "user":"dev",
     "timestamp":1446159272,
@@ -2439,6 +2457,7 @@ and x.int_col + x.float_col + cast(c.string_col as float) < 1000
 ---- LINEAGE
 {
     "queryText":"select x.smallint_col, x.id, x.tinyint_col, c.id, x.int_col, x.float_col, c.string_col\nfrom functional.alltypessmall c\njoin (\n   select a.smallint_col smallint_col, a.tinyint_col tinyint_col, a.day day,\n           a.int_col int_col, a.month month, b.float_col float_col, b.id id\n   from ( select * from functional.alltypesagg a where month=1 ) a\n   join functional.alltypessmall b on (a.smallint_col = b.id)\n ) x on (x.tinyint_col = c.id)\nwhere x.day=1\nand x.int_col > 899\nand x.float_col > 4.5\nand c.string_col < '7'\nand x.int_col + x.float_col + cast(c.string_col as float) < 1000",
+    "queryId":"0:0",
     "hash":"4edf165aed5982ede63f7c91074f4b44",
     "user":"dev",
     "timestamp":1446159272,
@@ -2615,6 +2634,7 @@ from
 ---- LINEAGE
 {
     "queryText":"select c1, c2, c3\nfrom\n  (select c1, c2, c3\n   from\n     (select int_col c1, sum(float_col) c2, min(float_col) c3\n      from functional_hbase.alltypessmall\n      group by 1) x\n    order by 2,3 desc\n    limit 5\n) y",
+    "queryId":"0:0",
     "hash":"8b4d1ab11721d9ebdf26666d4195eb18",
     "user":"dev",
     "timestamp":1446159272,
@@ -2708,6 +2728,7 @@ limit 0
 ---- LINEAGE
 {
     "queryText":"select c1, x2\nfrom (\n  select c1, min(c2) x2\n  from (\n    select c1, c2, c3\n    from (\n      select int_col c1, tinyint_col c2, min(float_col) c3\n      from functional_hbase.alltypessmall\n      group by 1, 2\n      order by 1,2\n      limit 1\n    ) x\n  ) x2\n  group by c1\n) y\norder by 2,1 desc\nlimit 0",
+    "queryId":"0:0",
     "hash":"50d3b4f249f038b0711ea75c17640fc9",
     "user":"dev",
     "timestamp":1446159272,
@@ -2771,6 +2792,7 @@ select int_col, string_col from functional.view_view
 ---- LINEAGE
 {
     "queryText":"select int_col, string_col from functional.view_view",
+    "queryId":"0:0",
     "hash":"9073496459077de1332e5017977dedf5",
     "user":"dev",
     "timestamp":1446159272,
@@ -2823,6 +2845,7 @@ where t.id < 10
 ---- LINEAGE
 {
     "queryText":"select t.id from (select id from functional.alltypes_view) t\nwhere t.id < 10",
+    "queryId":"0:0",
     "hash":"8ba7998033f90e1e358f4fdc7ea4251b",
     "user":"dev",
     "timestamp":1446159272,
@@ -2868,6 +2891,7 @@ where id in
 ---- LINEAGE
 {
     "queryText":"select string_col, float_col, bool_col\nfrom functional.alltypes\nwhere id in\n  (select id from functional.alltypesagg)",
+    "queryId":"0:0",
     "hash":"e8ad1371d2a13e1ee9ec45689b62cdc9",
     "user":"dev",
     "timestamp":1446159272,
@@ -2967,6 +2991,7 @@ and tinyint_col < 10
 ---- LINEAGE
 {
     "queryText":"select 1\nfrom functional.alltypesagg a\nwhere exists\n  (select id, count(int_col) over (partition by bool_col)\n   from functional.alltypestiny b\n   where a.tinyint_col = b.tinyint_col\n   group by id, int_col, bool_col)\nand tinyint_col < 10",
+    "queryId":"0:0",
     "hash":"a7500c022d29c583c31b287868a848bf",
     "user":"dev",
     "timestamp":1446159272,
@@ -3017,6 +3042,7 @@ and a.bigint_col > 10
 ---- LINEAGE
 {
     "queryText":"select int_col + 1, tinyint_col - 1\nfrom functional.alltypes a\nwhere a.int_col <\n  (select max(int_col) from functional.alltypesagg g where g.bool_col = true)\nand a.bigint_col > 10",
+    "queryId":"0:0",
     "hash":"5e6227f323793ea4441e2a3119af2f09",
     "user":"dev",
     "timestamp":1446159272,
@@ -3097,6 +3123,7 @@ with t as (select int_col x, bigint_col y from functional.alltypes) select x, y
 ---- LINEAGE
 {
     "queryText":"with t as (select int_col x, bigint_col y from functional.alltypes) select x, y from t",
+    "queryId":"0:0",
     "hash":"a7ab58d90540f28a8dfd69703632ad7a",
     "user":"dev",
     "timestamp":1446159272,
@@ -3150,6 +3177,7 @@ select id, int_col, string_col, year, month from t1
 ---- LINEAGE
 {
     "queryText":"with t1 as (select * from functional.alltypestiny)\ninsert into functional.alltypesinsert (id, int_col, string_col) partition(year, month)\nselect id, int_col, string_col, year, month from t1",
+    "queryId":"0:0",
     "hash":"0bc5b3e66cc72387f74893b1f1934946",
     "user":"dev",
     "timestamp":1446159272,
@@ -3368,6 +3396,7 @@ from
 ---- LINEAGE
 {
     "queryText":"select lead(a) over (partition by b order by c)\nfrom\n  (select lead(id) over (partition by int_col order by bigint_col) as a,\n   max(id) over (partition by tinyint_col order by int_col) as b,\n   min(int_col) over (partition by string_col order by bool_col) as c\n   from functional.alltypes) v",
+    "queryId":"0:0",
     "hash":"aa95e5e6f39fc80bb3c318a2515dc77d",
     "user":"dev",
     "timestamp":1446159272,
@@ -3440,6 +3469,7 @@ create view test_view_lineage as select id from functional.alltypestiny
 ---- LINEAGE
 {
     "queryText":"create view test_view_lineage as select id from functional.alltypestiny",
+    "queryId":"0:0",
     "hash":"ff6b1ecb265afe4f03355a07238cfe37",
     "user":"dev",
     "timestamp":1446159272,
@@ -3489,6 +3519,7 @@ limit 0
 ---- LINEAGE
 {
     "queryText":"create view test_view_lineage (a, b) as select c1, x2\nfrom (\n  select c1, min(c2) x2\n  from (\n    select c1, c2, c3\n    from (\n      select int_col c1, tinyint_col c2, min(float_col) c3\n      from functional_hbase.alltypessmall\n      group by 1, 2\n      order by 1,2\n      limit 1\n    ) x\n  ) x2\n  group by c1\n) y\norder by 2,1 desc\nlimit 0",
+    "queryId":"0:0",
     "hash":"b96adf892b897da1e562c5be98724fb5",
     "user":"dev",
     "timestamp":1446159272,
@@ -3565,6 +3596,7 @@ create view test_view_lineage (a1, a2, a3, a4, a5, a6, a7) as
 ---- LINEAGE
 {
     "queryText":"create view test_view_lineage (a1, a2, a3, a4, a5, a6, a7) as\n  select x.smallint_col, x.id, x.tinyint_col, c.id, x.int_col, x.float_col, c.string_col\n  from functional.alltypessmall c\n  join (\n     select a.smallint_col smallint_col, a.tinyint_col tinyint_col, a.day day,\n           a.int_col int_col, a.month month, b.float_col float_col, b.id id\n     from ( select * from functional.alltypesagg a where month=1 ) a\n     join functional.alltypessmall b on (a.smallint_col = b.id)\n   ) x on (x.tinyint_col = c.id)\n  where x.day=1\n  and x.int_col > 899\n  and x.float_col > 4.5\n  and c.string_col < '7'\n  and x.int_col + x.float_col + cast(c.string_col as float) < 1000",
+    "queryId":"0:0",
     "hash":"ffbe643df8f26e92907fb45de1aeda36",
     "user":"dev",
     "timestamp":1446159272,
@@ -3747,6 +3779,7 @@ create view test_view_lineage as
 ---- LINEAGE
 {
     "queryText":"create view test_view_lineage as\n  select * from (\n    select sum(a.tinyint_col) over (partition by a.smallint_col order by a.id),\n      count(b.string_col), b.timestamp_col\n    from functional.alltypes a join functional.alltypessmall b on (a.id = b.id)\n    where a.year = 2010 and b.float_col > 0\n    group by a.tinyint_col, a.smallint_col, a.id, b.string_col, b.timestamp_col, b.bigint_col\n    having count(a.int_col) > 10\n    order by b.bigint_col limit 10) t",
+    "queryId":"0:0",
     "hash":"d4b9e2d63548088f911816b2ae29d7c2",
     "user":"dev",
     "timestamp":1446159272,
@@ -3883,6 +3916,7 @@ alter view functional.alltypes_view as select id from functional.alltypestiny
 ---- LINEAGE
 {
     "queryText":"alter view functional.alltypes_view as select id from functional.alltypestiny",
+    "queryId":"0:0",
     "hash":"8c9367afc562a4c04d2d40e1276646c2",
     "user":"dev",
     "timestamp":1446159272,
@@ -3923,6 +3957,7 @@ select * from (
 ---- LINEAGE
 {
     "queryText":"select * from (\n  select int_struct_col.f1 + int_struct_col.f2 x from functional.allcomplextypes\n  where year = 2000\n  order by nested_struct_col.f2.f12.f21 limit 10\n  union all\n  select sum(f1) y from\n    (select complex_struct_col.f1 f1 from functional.allcomplextypes\n     group by 1) v1) v2",
+    "queryId":"0:0",
     "hash":"4fb3ceddbf596097335af607d528f5a7",
     "user":"dev",
     "timestamp":1446159272,
@@ -3990,6 +4025,7 @@ select * from functional.allcomplextypes.int_array_col a inner join
 ---- LINEAGE
 {
     "queryText":"select * from functional.allcomplextypes.int_array_col a inner join\n  functional.allcomplextypes.struct_map_col m on (a.item = m.f1)",
+    "queryId":"0:0",
     "hash":"8c0c64f8a4c08b82ad343ab439101957",
     "user":"dev",
     "timestamp":1446159272,
@@ -4095,6 +4131,7 @@ select * from functional.allcomplextypes t, t.int_array_col a, t.struct_map_col
 ---- LINEAGE
 {
     "queryText":"select * from functional.allcomplextypes t, t.int_array_col a, t.struct_map_col m\n  where a.item = m.f1",
+    "queryId":"0:0",
     "hash":"1b0db371b32e90d33629ed7779332cf7",
     "user":"dev",
     "timestamp":1446159272,
@@ -4279,6 +4316,7 @@ select a + b as ab, c, d, e from functional.allcomplextypes t,
 ---- LINEAGE
 {
     "queryText":"select a + b as ab, c, d, e from functional.allcomplextypes t,\n  (select sum(item) a from t.int_array_col\n   where item < 10) v1,\n  (select count(f1) b from t.struct_map_col\n   group by key) v2,\n  (select avg(value) over(partition by key) c from t.map_map_col.value) v3,\n  (select item d from t.int_array_col\n   union all\n   select value from t.int_map_col) v4,\n  (select f21 e from t.complex_nested_struct_col.f2.f12 order by key limit 10) v5",
+    "queryId":"0:0",
     "hash":"4affc0d1e384475d1ff2fc2e19643064",
     "user":"dev",
     "timestamp":1446159272,
@@ -4416,6 +4454,7 @@ where not exists (select 1 from functional.alltypes a where v.id = a.id)
 ---- LINEAGE
 {
     "queryText":"create view test_view_lineage as\nselect id from functional.alltypes_view v\nwhere not exists (select 1 from functional.alltypes a where v.id = a.id)",
+    "queryId":"0:0",
     "hash":"e79b8abc8a682d9e0f6b2c30a6c885f3",
     "user":"dev",
     "timestamp":1475094005,
@@ -4459,6 +4498,7 @@ where k.int_col < 10
 ---- LINEAGE
 {
     "queryText":"select count(*) from functional_kudu.alltypes k join functional.alltypes h on k.id = h.id\nwhere k.int_col < 10",
+    "queryId":"0:0",
     "hash":"7b7c92d488186d869bb6b78c97666f41",
     "user":"dev",
     "timestamp":1479538352,
@@ -4513,6 +4553,7 @@ functional.alltypes a where a.id < 100
 ---- LINEAGE
 {
     "queryText":"insert into functional_kudu.testtbl select id, string_col as name, int_col as zip from\nfunctional.alltypes a where a.id < 100",
+    "queryId":"0:0",
     "hash":"87a59bac56c6ad27f7af6e71af46d552",
     "user":"dev",
     "timestamp":1479539012,
@@ -4596,6 +4637,7 @@ functional.alltypes where id < 10
 ---- LINEAGE
 {
     "queryText":"insert into functional_kudu.testtbl (name, id) select string_col as name, id from\nfunctional.alltypes where id < 10",
+    "queryId":"0:0",
     "hash":"0bccfdbf4118e6d5a3d94062ecb5130a",
     "user":"dev",
     "timestamp":1479933751,
@@ -4659,6 +4701,7 @@ functional.alltypes where id < 10
 ---- LINEAGE
 {
     "queryText":"upsert into functional_kudu.testtbl (name, id) select string_col as name, id from\nfunctional.alltypes where id < 10",
+    "queryId":"0:0",
     "hash":"f4c1e7b016e75012f7268f2f42ae5630",
     "user":"dev",
     "timestamp":1479933751,
@@ -4724,6 +4767,7 @@ from functional.alltypestiny
 ---- LINEAGE
 {
     "queryText":"create table kudu_ctas primary key (id) partition by hash (id) partitions 3\nstored as kudu as select id, bool_col, tinyint_col, smallint_col, int_col,\nbigint_col, float_col, double_col, date_string_col, string_col\nfrom functional.alltypestiny",
+    "queryId":"0:0",
     "hash":"6e3e192c7fb8bb6b22674a9b7b488b55",
     "user":"dev",
     "timestamp":1479933751,

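Each entry above is a single lineage record; the patch pins the new
queryId field to the placeholder "0:0" in the test data. As context
only, a minimal sketch of loading one such record and checking the
fields visible in the hunks above (the record literal is illustrative,
not copied verbatim from the file):

  import json

  # Illustrative lineage record mirroring the fields visible above;
  # "0:0" is the placeholder the .test data uses for queryId.
  record_text = """
  {
      "queryText": "select int_col, string_col from functional.view_view",
      "queryId": "0:0",
      "hash": "9073496459077de1332e5017977dedf5",
      "user": "dev",
      "timestamp": 1446159272
  }
  """

  record = json.loads(record_text)
  assert record["queryId"] == "0:0"  # placeholder, not a real query id
  assert len(record["hash"]) == 32   # 32-char hex digest of the query
  assert record["timestamp"] > 0     # seconds since the epoch
  print(record["user"], record["queryText"])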
http://git-wip-us.apache.org/repos/asf/impala/blob/c505a815/tests/custom_cluster/test_lineage.py
----------------------------------------------------------------------
diff --git a/tests/custom_cluster/test_lineage.py b/tests/custom_cluster/test_lineage.py
index 825e1b7..240f064 100644
--- a/tests/custom_cluster/test_lineage.py
+++ b/tests/custom_cluster/test_lineage.py
@@ -22,6 +22,7 @@ import json
 import logging
 import os
 import pytest
+import re
 import shutil
 import stat
 import tempfile
@@ -33,7 +34,7 @@ LOG = logging.getLogger(__name__)
 
 class TestLineage(CustomClusterTestSuite):
 
-  lineage_log_dir = tempfile.mkdtemp();
+  lineage_log_dir = tempfile.mkdtemp()
 
   query = """
       select count(*) from functional.alltypes
@@ -45,7 +46,7 @@ class TestLineage(CustomClusterTestSuite):
 
   @classmethod
   def teardown_class(cls):
-    shutil.rmtree(cls.lineage_log_dir);
+    shutil.rmtree(cls.lineage_log_dir)
 
   @pytest.mark.execute_serially
   @CustomClusterTestSuite.with_args("--lineage_event_log_dir=%s" % lineage_log_dir)
@@ -54,7 +55,8 @@ class TestLineage(CustomClusterTestSuite):
        UNIX times."""
     LOG.info("lineage_event_log_dir is " + self.lineage_log_dir)
     before_time = int(time.time())
-    self.execute_query_expect_success(self.client, self.query)
+    result = self.execute_query_expect_success(self.client, self.query)
+    profile_query_id = re.search("Query \(id=(.*)\):", result.runtime_profile).group(1)
     after_time = int(time.time())
     LOG.info("before_time " + str(before_time) + " after_time " + str(after_time))
 
@@ -68,6 +70,7 @@ class TestLineage(CustomClusterTestSuite):
         LOG.info("examining file: " + log_path)
         with open(log_path) as log_file:
           lineage_json = json.load(log_file)
+          assert lineage_json["queryId"] == profile_query_id
           timestamp = int(lineage_json["timestamp"])
           end_time = int(lineage_json["endTime"])
           assert before_time <= timestamp

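The new assertion above ties each lineage record to the query that
produced it by pulling the id out of the runtime profile header. A
standalone sketch of that extraction (the profile text here is
fabricated; real ids are two 64-bit hex halves joined by a colon):

  import re

  # Fabricated first line of a runtime profile; only the header
  # format matters. The pattern is the one the test uses, written
  # here as a raw string.
  runtime_profile = "Query (id=f04ac00a7bd5b61f:a53a8a9b00000000):\n  Summary:"

  match = re.search(r"Query \(id=(.*)\):", runtime_profile)
  assert match is not None
  profile_query_id = match.group(1)
  print(profile_query_id)  # f04ac00a7bd5b61f:a53a8a9b00000000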

[7/8] impala git commit: IMPALA-6273: fixes subquery tests for functional_hbase

Posted by ta...@apache.org.
IMPALA-6273: fixes subquery tests for functional_hbase

IMPALA-1422 introduced tests that do not work with
the HBase test setup: tinyinttable is defined in the
functional database but not in functional_hbase.
Exhaustive test runs uncovered the issue.

This patch makes two changes so that the tests work with
functional_hbase:
1) Use a table that is present in both functional and
   functional_hbase. The tests need a subquery result with
   a single int column, so tinyinttable is replaced with an
   inline view that provides that column in a portable
   manner (a runnable sketch follows this list).
2) NULLs are handled differently with HBase (see IMPALA-728),
   so the null table used in the tests is qualified as
   functional.nulltable to avoid inconsistent results across
   input formats.
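A sketch of the rewrite in item 1, runnable against a local
minicluster via impyla (the client library, host, and port are
assumptions of this sketch; the query and its expected count come
from the diff below):

  from impala.dbapi import connect  # impyla; not part of this patch

  conn = connect(host="localhost", port=21050)  # assumed local impalad
  cur = conn.cursor()
  cur.execute("USE functional")
  # alltypestiny exists in every test database, unlike tinyinttable,
  # and MIN(int_col) yields the single int value the IN predicate needs.
  cur.execute(
      "SELECT count(*) FROM alltypessmall a "
      "WHERE 0 IN (SELECT MIN(int_col) FROM alltypestiny)")
  print(cur.fetchall())  # [(100,)] per the expected RESULTS below
  conn.close()

The same statement should run unchanged against functional_hbase,
which is the point of the rewrite.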

Testing:
- Ran e2e tests with the exhaustive exploration strategy for
  the previously broken test.

Change-Id: Ibaa3a3df7362ac6d3ed07aff133dc4b3520bb4e0
Reviewed-on: http://gerrit.cloudera.org:8080/8765
Reviewed-by: Dimitris Tsirogiannis <dt...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/16c5f514
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/16c5f514
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/16c5f514

Branch: refs/heads/master
Commit: 16c5f514e0315dfd63ee88d98d25099547e42cc4
Parents: e79f644
Author: Vuk Ercegovac <ve...@cloudera.com>
Authored: Tue Dec 5 10:13:32 2017 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Wed Dec 6 00:41:11 2017 +0000

----------------------------------------------------------------------
 .../QueryTest/subquery-in-constant-lhs.test     | 21 ++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/16c5f514/testdata/workloads/functional-query/queries/QueryTest/subquery-in-constant-lhs.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/subquery-in-constant-lhs.test b/testdata/workloads/functional-query/queries/QueryTest/subquery-in-constant-lhs.test
index df9303b..56627cb 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/subquery-in-constant-lhs.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/subquery-in-constant-lhs.test
@@ -112,8 +112,7 @@ INT, INT, STRING
 ====
 ---- QUERY
 # LHS null, Predicate IN, RHS not-empty with match. Uncorrelated. Expect no results.
-SELECT count(*) FROM alltypessmall a
-WHERE NULL IN (SELECT d FROM nulltable);
+SELECT count(*) FROM alltypessmall a WHERE NULL IN (SELECT d FROM nulltable);
 ---- RESULTS
 0
 ---- TYPES
@@ -215,7 +214,7 @@ INT, INT, STRING
 ====
 ---- QUERY
 # LHS is non-null, Predicate is IN, RHS is an aggregate.
-SELECT count(*) FROM alltypessmall a WHERE 0 IN (SELECT MIN(int_col) from tinyinttable);
+SELECT count(*) FROM alltypessmall a WHERE 0 IN (SELECT MIN(int_col) from alltypestiny);
 ---- RESULTS
 100
 ---- TYPES
@@ -223,7 +222,7 @@ BIGINT
 ====
 ---- QUERY
 # LHS is non-null, Predicate is NOT IN, RHS is an aggregate.
-SELECT a.id FROM alltypessmall a WHERE 0 NOT IN (SELECT MIN(int_col) from tinyinttable);
+SELECT a.id FROM alltypessmall a WHERE 0 NOT IN (SELECT MIN(int_col) from alltypestiny);
 ---- RESULTS
 ---- TYPES
 INT
@@ -245,14 +244,18 @@ INT
 ====
 ---- QUERY
 # LHS is null, Predicate is NOT IN, RHS is group by without aggregation. Expect no results.
-SELECT a.id from alltypessmall a where NULL NOT IN (SELECT d from nulltable group by d);
+# Note: using functional.nulltable explicitly to avoid inconsistent results due to
+# different null handling across formats (specifically, hbase). See IMPALA-6276.
+SELECT a.id from alltypessmall a where
+NULL NOT IN (SELECT d from functional.nulltable group by d);
 ---- RESULTS
 ---- TYPES
 INT
 ====
 ---- QUERY
 # LHS is non-null, Predicate is IN, RHS is select list is "*".
-SELECT a.id FROM alltypessmall a WHERE 1 IN (SELECT * FROM tinyinttable) and a.id < 3;
+SELECT a.id FROM alltypessmall a WHERE
+1 IN (SELECT * FROM (select int_col from alltypestiny) tmp) and a.id < 3;
 ---- RESULTS
 0
 1
@@ -262,7 +265,8 @@ INT
 ====
 ---- QUERY
 # LHS is non-null, Predicate is NOT IN, RHS is select list is "*".
-SELECT a.id FROM alltypessmall a WHERE 1 NOT IN (SELECT * FROM tinyinttable);
+SELECT a.id FROM alltypessmall a WHERE
+1 NOT IN (SELECT * FROM (select int_col from alltypestiny) tmp);
 ---- RESULTS
 ---- TYPES
 INT
@@ -289,7 +293,8 @@ BIGINT
 ---- QUERY
 # LHS is non-null, Predicate is IN, RHS includes nested subqueries.
 SELECT a.id FROM alltypessmall a WHERE
-1 IN (SELECT int_col FROM alltypessmall WHERE -10000 IN (SELECT * FROM tinyinttable));
+1 IN (SELECT int_col FROM alltypessmall WHERE
+-10000 IN (SELECT * FROM (select int_col from alltypestiny) tmp));
 ---- RESULTS
 ---- TYPES
 INT
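
Closing note: the empty expected results for the NULL cases above
fall out of SQL's three-valued logic for IN. A small Python model of
the rule (illustrative only; None stands in for SQL NULL):

  def sql_in(lhs, rhs_values):
      """Model `lhs IN (rhs_values)` under SQL three-valued logic.

      Returns True, False, or None (unknown). A WHERE clause keeps a
      row only when the predicate is True, so a None result filters
      the row out, and so does the NOT IN of a None result.
      """
      if lhs is None:
          return False if not rhs_values else None  # empty RHS -> false
      if lhs in [v for v in rhs_values if v is not None]:
          return True
      return None if any(v is None for v in rhs_values) else False

  assert sql_in(None, [1, 2]) is None  # NULL IN non-empty -> unknown
  assert sql_in(0, [0]) is True        # the "0 IN (SELECT MIN...)" case
  assert sql_in(1, [0, None]) is None  # no match + NULL in RHS -> unknown

That is why "SELECT count(*) ... WHERE NULL IN (...)" above returns 0
and the NULL NOT IN queries return no rows: the predicate is never
True for any input row.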