You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2020/06/24 17:27:02 UTC
[impala] branch master updated: IMPALA-9747: More fine-grained codegen for text file scanners

This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


The following commit(s) were added to refs/heads/master by this push:
     new e6c930a  IMPALA-9747: More fine-grained codegen for text file scanners
e6c930a is described below

commit e6c930a38f54899e66ad83ab88d886dcd4c869f9
Author: Daniel Becker <da...@cloudera.com>
AuthorDate: Wed May 20 19:31:41 2020 +0200

    IMPALA-9747: More fine-grained codegen for text file scanners
    
    Currently if the materialization of any column cannot be codegen'd
    because its type is unsupported (e.g. CHAR(N)), the whole codegen is
    cancelled for the text scanner.
    
    This commit adds the function TextConverter::SupportsCodegenWriteSlot
    that returns whether the given ColumnType is supported. If the type is
    not supported, HdfsScanner codegens code that calls the interpreted
    version instead of failing codegen. For other columns codegen is used as
    usually.
    
    Benchmarks:
      Copied and modified a TPCH table with scale factor 5 to add a CHAR
      column to it::
    
        USE tpch5;
        CREATE TABLE IF NOT EXISTS lineitem_char AS
        SELECT *, CAST(l_shipdate AS CHAR(10)) l_shipdate_char
        FROM lineitem;
    
      Run the following query 100 times after one warm-up run with and
      without this change:
    
        SELECT *
        FROM tpch5.lineitem_char
        WHERE
          l_partkey BETWEEN 500 AND 500000 AND
          l_linestatus = 'F' AND
          l_quantity < 35 AND
          l_extendedprice BETWEEN 2000 AND 8000 AND
          l_discount > 0 AND
          l_tax BETWEEN 0.04 AND 0.06 AND
          l_returnflag IN ('A', 'N') AND
          l_shipdate_char < '1996-06-20'
        ORDER BY l_shipdate_char
        LIMIT 10;
    
      Without this commit: mean: 2.92, standard deviation: 0.13.
      With this commit:    mean: 2.21, standard deviation: 0.072.
    
    Testing:
      The interesting cases regarding char are covered in
      https://github.com/apache/impala/blob/0167c5b4242fcebf6be19aba5ecfb440204278ad/testdata/workloads/functional-query/queries/QueryTest/chars.test
    
    Change-Id: Id370193af578ecf23ed3c6bfcc65fec448156fa3
    Reviewed-on: http://gerrit.cloudera.org:8080/16059
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/codegen/gen_ir_descriptions.py              |  2 ++
 be/src/exec/hdfs-scanner-ir.cc                     | 17 +++++++++++
 be/src/exec/hdfs-scanner.cc                        | 34 +++++++++++++++++-----
 be/src/exec/hdfs-scanner.h                         |  9 ++++++
 be/src/exec/text-converter.cc                      | 11 ++++---
 be/src/exec/text-converter.h                       |  7 ++++-
 .../QueryTest/datastream-sender-codegen.test       |  1 -
 tests/query_test/test_codegen.py                   | 11 ++++---
 8 files changed, 73 insertions(+), 19 deletions(-)

diff --git a/be/src/codegen/gen_ir_descriptions.py b/be/src/codegen/gen_ir_descriptions.py
index 49d5c4e..1fd3efe 100755
--- a/be/src/codegen/gen_ir_descriptions.py
+++ b/be/src/codegen/gen_ir_descriptions.py
@@ -185,6 +185,8 @@ ir_functions = [
    "_ZN6impala19HdfsColumnarScanner19ProcessScratchBatchEPNS_8RowBatchE"],
   ["HDFS_SCANNER_EVAL_RUNTIME_FILTER",
    "_ZN6impala11HdfsScanner17EvalRuntimeFilterEiPNS_8TupleRowE"],
+  ["HDFS_SCANNER_TEXT_CONVERTER_WRITE_SLOT_INTERPRETED_IR",
+   "_ZN6impala11HdfsScanner35TextConverterWriteSlotInterpretedIREPS0_iPNS_5TupleEPKciPNS_7MemPoolE"],
   ["STRING_TO_BOOL", "IrStringToBool"],
   ["STRING_TO_INT8", "IrStringToInt8"],
   ["STRING_TO_INT16", "IrStringToInt16"],
diff --git a/be/src/exec/hdfs-scanner-ir.cc b/be/src/exec/hdfs-scanner-ir.cc
index 436974c..1008ccf 100644
--- a/be/src/exec/hdfs-scanner-ir.cc
+++ b/be/src/exec/hdfs-scanner-ir.cc
@@ -16,6 +16,8 @@
 // under the License.
 
 #include "exec/hdfs-scanner.h"
+#include "exec/text-converter.h"
+#include "exec/text-converter.inline.h"
 #include "runtime/row-batch.h"
 #include "util/string-parser.h"
 #include "runtime/string-value.inline.h"
@@ -109,6 +111,21 @@ bool HdfsScanner::EvalRuntimeFilter(int i, TupleRow* row) {
   return true;
 }
 
+bool HdfsScanner::TextConverterWriteSlotInterpretedIR(HdfsScanner* hdfs_scanner,
+    int slot_idx, Tuple* tuple, const char* data, int len, MemPool* pool) {
+  constexpr bool copy_string = false;
+
+  int need_escape = false;
+  if (UNLIKELY(len < 0)) {
+    len = -len;
+    need_escape = true;
+  }
+
+  SlotDescriptor* desc = hdfs_scanner->scan_node_->materialized_slots()[slot_idx];
+  return hdfs_scanner->text_converter_->WriteSlot(desc, tuple, data, len, copy_string,
+      need_escape, pool);
+}
+
 // Define the string parsing functions for llvm.  Stamp out the templated functions
 #ifdef IR_COMPILE
 using ParseResult = StringParser::ParseResult;
diff --git a/be/src/exec/hdfs-scanner.cc b/be/src/exec/hdfs-scanner.cc
index f8f9a4d..afa9dbf 100644
--- a/be/src/exec/hdfs-scanner.cc
+++ b/be/src/exec/hdfs-scanner.cc
@@ -342,11 +342,17 @@ Status HdfsScanner::CodegenWriteCompleteTuple(const HdfsScanPlanNode* node,
   for (int i = 0; i < node->materialized_slots_.size(); ++i) {
     llvm::Function* fn = nullptr;
     SlotDescriptor* slot_desc = node->materialized_slots_[i];
-    RETURN_IF_ERROR(TextConverter::CodegenWriteSlot(codegen, tuple_desc, slot_desc, &fn,
-        node->hdfs_table_->null_column_value().data(),
-        node->hdfs_table_->null_column_value().size(), true,
-        state->query_options().strict_mode));
-    if (i >= LlvmCodeGen::CODEGEN_INLINE_EXPRS_THRESHOLD) codegen->SetNoInline(fn);
+
+    // If the type is CHAR, WriteSlot for this slot cannot be codegen'd. To keep codegen
+    // for other things, we call the interpreted code for this slot from the codegen'd
+    // code instead of failing codegen. See IMPALA-9747.
+    if (TextConverter::SupportsCodegenWriteSlot(slot_desc->type())) {
+      RETURN_IF_ERROR(TextConverter::CodegenWriteSlot(codegen, tuple_desc, slot_desc, &fn,
+          node->hdfs_table_->null_column_value().data(),
+          node->hdfs_table_->null_column_value().size(), true,
+          state->query_options().strict_mode));
+      if (i >= LlvmCodeGen::CODEGEN_INLINE_EXPRS_THRESHOLD) codegen->SetNoInline(fn);
+    }
     slot_fns.push_back(fn);
   }
 
@@ -392,6 +398,7 @@ Status HdfsScanner::CodegenWriteCompleteTuple(const HdfsScanPlanNode* node,
 
   // Extract the input args
   llvm::Value* this_arg = args[0];
+  llvm::Value* mem_pool_arg = args[1];
   llvm::Value* fields_arg = args[2];
   llvm::Value* opaque_tuple_arg = args[3];
   llvm::Value* tuple_arg =
@@ -474,8 +481,21 @@ Status HdfsScanner::CodegenWriteCompleteTuple(const HdfsScanPlanNode* node,
 
       // Call slot parse function
       llvm::Function* slot_fn = slot_fns[slot_idx];
-      llvm::Value* slot_parsed = builder.CreateCall(
-          slot_fn, llvm::ArrayRef<llvm::Value*>({tuple_arg, data, len}));
+
+      llvm::Value* slot_parsed = nullptr;
+      if (LIKELY(slot_fn != nullptr)) {
+        slot_parsed = builder.CreateCall(
+            slot_fn, llvm::ArrayRef<llvm::Value*>({tuple_arg, data, len}));
+      } else {
+        llvm::Value* slot_idx_value = codegen->GetI32Constant(slot_idx);
+        llvm::Function* interpreted_slot_fn = codegen->GetFunction(
+            IRFunction::HDFS_SCANNER_TEXT_CONVERTER_WRITE_SLOT_INTERPRETED_IR, false);
+        DCHECK(interpreted_slot_fn != nullptr);
+
+        slot_parsed = builder.CreateCall(interpreted_slot_fn,
+            {this_arg, slot_idx_value, opaque_tuple_arg, data, len, mem_pool_arg});
+      }
+
       llvm::Value* slot_error = builder.CreateNot(slot_parsed, "slot_parse_error");
       error_in_row = builder.CreateOr(error_in_row, slot_error, "error_in_row");
       slot_error = builder.CreateZExt(slot_error, codegen->i8_type());
diff --git a/be/src/exec/hdfs-scanner.h b/be/src/exec/hdfs-scanner.h
index 961653d..cb127f1 100644
--- a/be/src/exec/hdfs-scanner.h
+++ b/be/src/exec/hdfs-scanner.h
@@ -588,6 +588,15 @@ class HdfsScanner {
     return reinterpret_cast<TupleRow*>(mem + sizeof(Tuple*));
   }
 
+  /// This is cross-compiled to IR. Used to call the interpreted version of
+  /// TextConverter::WriteSlot from codegen'd code. For types for which
+  /// TextConverter::SupportsCodegenWriteSlot returns false (for example CHAR),
+  /// TextConverter does not support codegenning WriteSlot, but other columns may still
+  /// benefit from codegen so instead of disabling it entirely we call the interpreted
+  /// version from codegen'd code.
+  static bool TextConverterWriteSlotInterpretedIR(HdfsScanner* hdfs_scanner,
+      int slot_idx, Tuple* tuple, const char* data, int len, MemPool* pool);
+
   /// Simple wrapper around conjunct_evals_[idx]. Used in the codegen'd version of
   /// WriteCompleteTuple() because it's easier than writing IR to access
   /// conjunct_evals_.
diff --git a/be/src/exec/text-converter.cc b/be/src/exec/text-converter.cc
index f208021..16d01f2 100644
--- a/be/src/exec/text-converter.cc
+++ b/be/src/exec/text-converter.cc
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include <sstream>
+
 #include <boost/algorithm/string.hpp>
 
 #include "codegen/llvm-codegen.h"
@@ -108,10 +110,7 @@ Status TextConverter::CodegenWriteSlot(LlvmCodeGen* codegen,
     TupleDescriptor* tuple_desc, SlotDescriptor* slot_desc, llvm::Function** fn,
     const char* null_col_val, int len, bool check_null, bool strict_mode) {
   DCHECK(fn != nullptr);
-  if (slot_desc->type().type == TYPE_CHAR) {
-    return Status::Expected("TextConverter::CodegenWriteSlot(): Char isn't supported for"
-        " CodegenWriteSlot");
-  }
+  DCHECK(SupportsCodegenWriteSlot(slot_desc->type()));
 
   // Codegen is_null_string
   bool is_default_null = (len == 2 && null_col_val[0] == '\\' && null_col_val[1] == 'N');
@@ -330,3 +329,7 @@ Status TextConverter::CodegenWriteSlot(LlvmCodeGen* codegen,
   }
   return Status::OK();
 }
+
+bool TextConverter::SupportsCodegenWriteSlot(const ColumnType& col_type) {
+  return col_type.type != TYPE_CHAR;
+}
diff --git a/be/src/exec/text-converter.h b/be/src/exec/text-converter.h
index 59d1968..77c9799 100644
--- a/be/src/exec/text-converter.h
+++ b/be/src/exec/text-converter.h
@@ -29,6 +29,7 @@ namespace llvm {
 
 namespace impala {
 
+struct ColumnType;
 class LlvmCodeGen;
 class MemPool;
 class SlotDescriptor;
@@ -70,8 +71,10 @@ class TextConverter {
   void UnescapeString(const char* src, char* dest, int* len, int64_t maxlen = -1);
 
   /// Codegen the function to write a slot for slot_desc.
+  /// Should only be called if the column type is supported, i.e.
+  /// SupportsCodegenWriteSlot(slot_desc->type()) returns true.
   /// Returns Status::OK() if codegen was successful. If codegen was successful
-  /// llvm::Function** fn points to the codegen'd function
+  /// llvm::Function** fn points to the codegen'd function.
   /// The signature of the generated function is:
   /// bool WriteSlot(Tuple* tuple, const char* data, int len);
   /// The codegen function returns true if the slot could be written and false
@@ -86,6 +89,8 @@ class TextConverter {
       TupleDescriptor* tuple_desc, SlotDescriptor* slot_desc, llvm::Function** fn,
       const char* null_col_val, int len, bool check_null, bool strict_mode = false);
 
+  /// Returns whether codegen is supported for the given type.
+  static bool SupportsCodegenWriteSlot(const ColumnType& col_type);
  private:
   char escape_char_;
   /// Special string to indicate NULL column values.
diff --git a/testdata/workloads/functional-query/queries/QueryTest/datastream-sender-codegen.test b/testdata/workloads/functional-query/queries/QueryTest/datastream-sender-codegen.test
index 07c15c2..a7e9202 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/datastream-sender-codegen.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/datastream-sender-codegen.test
@@ -39,5 +39,4 @@ bigint
 # Verify that CHAR codegen was enabled for hash partitioning even though CHAR
 # codegen isn't supported everywhere.
 row_regex: .*Hash Partitioned Sender Codegen Enabled.*
-row_regex: .*Char isn't supported for CodegenWriteSlot.*
 ====
diff --git a/tests/query_test/test_codegen.py b/tests/query_test/test_codegen.py
index af723c8..0622d7d 100644
--- a/tests/query_test/test_codegen.py
+++ b/tests/query_test/test_codegen.py
@@ -60,6 +60,11 @@ class TestCodegen(ImpalaTestSuite):
     """IMPALA-7288: Regression tests for the codegen failure path when working with a
     CHAR column type. Until IMPALA-3207 is completely fixed there are various paths where
     we need to bail out of codegen."""
+    # Previously codegen failed in TextConverter::CodegenWriteSlot() if there was a CHAR
+    # column, but IMPALA-9747 changed this so that instead of failing codegen for all
+    # columns, the codegen'd code for the CHAR column calls the interpreted version of the
+    # function.
+
     # Previously codegen for this join failed in HashTableCtx::CodegenEquals() because of
     # missing ScalarFnCall codegen support, which was added in IMPALA-7331.
     result = self.execute_query("select 1 from functional.chars_tiny t1, "
@@ -68,8 +73,6 @@ class TestCodegen(ImpalaTestSuite):
     profile_str = str(result.runtime_profile)
     assert "Probe Side Codegen Enabled" in profile_str, profile_str
     assert "Build Side Codegen Enabled" in profile_str, profile_str
-    assert ("TextConverter::CodegenWriteSlot(): Char isn't supported for CodegenWriteSlot"
-            in profile_str), profile_str
 
     # Codegen for this join fails because it is joining two CHAR exprs.
     result = self.execute_query("select 1 from functional.chars_tiny t1, "
@@ -80,8 +83,6 @@ class TestCodegen(ImpalaTestSuite):
             in profile_str), profile_str
     assert ("Build Side Codegen Disabled: HashTableCtx::CodegenHashRow(): CHAR NYI"
             in profile_str), profile_str
-    assert ("TextConverter::CodegenWriteSlot(): Char isn't supported for CodegenWriteSlot"
-            in profile_str), profile_str
 
     # Previously codegen for this join failed in HashTableCtx::CodegenEvalRow() because of
     # missing ScalarFnCall codegen support, which was added in IMPALA-7331.
@@ -91,5 +92,3 @@ class TestCodegen(ImpalaTestSuite):
     profile_str = str(result.runtime_profile)
     assert "Probe Side Codegen Enabled" in profile_str, profile_str
     assert "Build Side Codegen Enabled" in profile_str, profile_str
-    assert ("TextConverter::CodegenWriteSlot(): Char isn't supported for CodegenWriteSlot"
-            in profile_str), profile_str