You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by st...@apache.org on 2022/03/18 23:31:54 UTC

[impala] 01/02: IMPALA-11185: Reuse orc row batch in the scanner life-cycle

This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 4d32ab7122557ca3336354301a3a467a206913a9
Author: stiga-huang <hu...@gmail.com>
AuthorDate: Wed Mar 16 13:12:13 2022 +0800

    IMPALA-11185: Reuse orc row batch in the scanner life-cycle
    
    In HdfsOrcScanner::AssembleRows(), we always re-create an
    orc::ColumnVectorBatch. The ideal pattern is to reuse the batch and
    destroy it only when the scanner is closed.
    
    This saves half of the scanner time in some TPCH queries. See the flame
    graph in the JIRA description.
    
    Tests:
     - Run CORE test
    
    Change-Id: I03887ed94af2ff03d67cd00c79375c734a75af62
    Reviewed-on: http://gerrit.cloudera.org:8080/18325
    Reviewed-by: Quanlong Huang <hu...@gmail.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/exec/hdfs-orc-scanner.cc | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/be/src/exec/hdfs-orc-scanner.cc b/be/src/exec/hdfs-orc-scanner.cc
index 5809ac7..bc81fc4 100644
--- a/be/src/exec/hdfs-orc-scanner.cc
+++ b/be/src/exec/hdfs-orc-scanner.cc
@@ -437,6 +437,8 @@ Status HdfsOrcScanner::Open(ScannerContext* context) {
     }
     orc_root_reader_ = this->obj_pool_.Add(
         new OrcStructReader(root_type, scan_node_->tuple_desc(), this));
+    orc_root_batch_ = tmp_row_reader->createRowBatch(state_->batch_size());
+    DCHECK_EQ(orc_root_batch_->numElements, 0);
   } RETURN_ON_ORC_EXCEPTION(
       "Encountered parse error during schema selection in ORC file $0: $1");
 
@@ -934,11 +936,6 @@ Status HdfsOrcScanner::AssembleRows(RowBatch* row_batch) {
   // We're going to free the previous batch. Clear the reference first.
   RETURN_IF_ERROR(orc_root_reader_->UpdateInputBatch(nullptr));
 
-  try {
-    orc_root_batch_ = row_reader_->createRowBatch(row_batch->capacity());
-    DCHECK_EQ(orc_root_batch_->numElements, 0);
-  } RETURN_ON_ORC_EXCEPTION("Encounter error in creating ORC row batch for file $0: $1.");
-
   int64_t num_rows_read = 0;
   while (continue_execution) {  // one ORC batch (ColumnVectorBatch) in a round
     if (orc_root_reader_->EndOfBatch()) {