You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by mi...@apache.org on 2023/12/15 18:33:24 UTC

(impala) 01/02: Revert "Revert "IMPALA-9923: Load ORC serially to hack around ...""

This is an automated email from the ASF dual-hosted git repository.

michaelsmith pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 378169be1f571f4d16db2d98b418903c8593889f
Author: Riza Suminto <ri...@cloudera.com>
AuthorDate: Thu Dec 14 12:46:16 2023 -0800

    Revert "Revert "IMPALA-9923: Load ORC serially to hack around ...""
    
    This reverts commit b03e8ef95c856f499d17ea7815831e30e2e9f467.
    
    IMPALA-12630 reports that several tests were broken due to loading ORC
    in parallel with other non-text table formats. ORC tables go back to
    being loaded serially after this commit.
    
    Change-Id: I5d3f2ee1c15f9aff6aa632a78d86ba32c640e53d
    Reviewed-on: http://gerrit.cloudera.org:8080/20795
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 bin/load-data.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/bin/load-data.py b/bin/load-data.py
index 090524cf5..a4cfd5a97 100755
--- a/bin/load-data.py
+++ b/bin/load-data.py
@@ -396,6 +396,7 @@ def main():
 
     impala_create_files = []
     hive_load_text_files = []
+    hive_load_orc_files = []
     hive_load_nontext_files = []
     hbase_create_files = []
     hbase_postload_files = []
@@ -407,6 +408,8 @@ def main():
       elif hive_load_match in filename:
         if 'text-none-none' in filename:
           hive_load_text_files.append(filename)
+        elif 'orc-def-block' in filename:
+          hive_load_orc_files.append(filename)
         else:
           hive_load_nontext_files.append(filename)
       elif hbase_create_match in filename:
@@ -429,6 +432,7 @@ def main():
 
     log_file_list("Impala Create Files:", impala_create_files)
     log_file_list("Hive Load Text Files:", hive_load_text_files)
+    log_file_list("Hive Load Orc Files:", hive_load_orc_files)
     log_file_list("Hive Load Non-Text Files:", hive_load_nontext_files)
     log_file_list("HBase Create Files:", hbase_create_files)
     log_file_list("HBase Post-Load Files:", hbase_postload_files)
@@ -453,6 +457,13 @@ def main():
     # need to be loaded first
     assert(len(hive_load_text_files) <= 1)
     hive_exec_query_files_parallel(thread_pool, hive_load_text_files)
+    # IMPALA-9923: Run ORC serially separately from other non-text formats. This hacks
+    # around flakiness seen when loading this in parallel. This should be removed as
+    # soon as possible.
+    assert(len(hive_load_orc_files) <= 1)
+    hive_exec_query_files_parallel(thread_pool, hive_load_orc_files)
+
+    # Load all non-text formats (goes parallel)
     hive_exec_query_files_parallel(thread_pool, hive_load_nontext_files)
 
     assert(len(hbase_postload_files) <= 1)