You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by ab...@apache.org on 2019/12/09 22:34:07 UTC

[kudu] branch master updated (6b69108 -> 3175c35)

This is an automated email from the ASF dual-hosted git repository.

abukor pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git.


    from 6b69108  KUDU-3001 Multi-thread to load containers in a data directory
     new c9dd2b5  Improve SIMD code generation for primitive predicates
     new dff0349  client: optimize destruction of WriteRpc
     new 3175c35  KUDU-1938 Add VARCHAR to schema design docs

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 docs/known_issues.adoc              |  7 ++--
 docs/schema_design.adoc             |  3 +-
 src/kudu/client/batcher.cc          | 79 +++++++++++++++++++++++++++++++++++--
 src/kudu/common/column_predicate.cc | 29 ++++++++------
 src/kudu/common/column_predicate.h  |  1 +
 src/kudu/common/partial_row.h       |  4 ++
 6 files changed, 104 insertions(+), 19 deletions(-)


[kudu] 01/03: Improve SIMD code generation for primitive predicates

Posted by ab...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

abukor pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit c9dd2b520a8b82500eb6a56961b9da0ccd2ed752
Author: Todd Lipcon <to...@apache.org>
AuthorDate: Thu Dec 5 11:26:09 2019 -0800

    Improve SIMD code generation for primitive predicates
    
    This adds a local (on-stack) copy of the bounds for range and equality
    predicates before evaluating them against the columns. These on-stack
    copies help the compiler realize that the stores to the selection vector
    can't overwrite the predicate itself, and thus allows SIMD code
    generation.
    
    Benchmarked with column_predicate-test. Highlighting the 'NOT NULL'
    results (since this doesn't change the evaluation of nulls):
    
    Before:
     int8   NOT NULL   (c >= 0 AND c < 2) 1363.5M evals/sec	2.09 cycles/eval
     int16  NOT NULL   (c >= 0 AND c < 2) 1238.3M evals/sec	2.30 cycles/eval
     int32  NOT NULL   (c >= 0 AND c < 2) 1321.3M evals/sec	2.15 cycles/eval
     int64  NOT NULL   (c >= 0 AND c < 2) 1408.3M evals/sec	2.02 cycles/eval
     float  NOT NULL   (c >= 0 AND c < 2) 1134.8M evals/sec	2.52 cycles/eval
     double NOT NULL   (c >= 0 AND c < 2) 1144.2M evals/sec	2.49 cycles/eval
    
    After:
     int8   NOT NULL   (c >= 0 AND c < 2) 3152.2M evals/sec	0.88 cycles/eval
     int16  NOT NULL   (c >= 0 AND c < 2) 3309.6M evals/sec	0.85 cycles/eval
     int32  NOT NULL   (c >= 0 AND c < 2) 3384.0M evals/sec	0.85 cycles/eval
     int64  NOT NULL   (c >= 0 AND c < 2) 1847.6M evals/sec	1.57 cycles/eval
     float  NOT NULL   (c >= 0 AND c < 2) 3268.3M evals/sec	0.88 cycles/eval
     double NOT NULL   (c >= 0 AND c < 2) 2245.2M evals/sec	1.27 cycles/eval
    
    The numbers for non-range predicates didn't seem to change here.
    
    Change-Id: I1772584c1d0c53128608ea26248dd4ab069b8108
    Reviewed-on: http://gerrit.cloudera.org:8080/14855
    Reviewed-by: Adar Dembo <ad...@cloudera.com>
    Tested-by: Kudu Jenkins
---
 src/kudu/common/column_predicate.cc | 29 ++++++++++++++++++-----------
 src/kudu/common/column_predicate.h  |  1 +
 2 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/src/kudu/common/column_predicate.cc b/src/kudu/common/column_predicate.cc
index bea1142..2c84e85 100644
--- a/src/kudu/common/column_predicate.cc
+++ b/src/kudu/common/column_predicate.cc
@@ -673,7 +673,7 @@ int ApplyPredicatePrimitive(const ColumnBlock& block, uint8_t* __restrict__ sel_
   const cpp_type* data = reinterpret_cast<const cpp_type*>(block.data());
   const int n_chunks = block.nrows() / 8;
   for (int i = 0; i < n_chunks; i++) {
-    uint8_t res_8 = 0;;
+    uint8_t res_8 = 0;
     for (int j = 0; j < 8; j++) {
       res_8 |= p(data++) << j;
     }
@@ -733,27 +733,34 @@ void ApplyNullPredicate(const ColumnBlock& block, uint8_t* __restrict__ sel_vec)
 template <DataType PhysicalType>
 void ColumnPredicate::EvaluateForPhysicalType(const ColumnBlock& block,
                                               SelectionVector* sel) const {
+  using traits = DataTypeTraits<PhysicalType>;
+  using cpp_type = typename traits::cpp_type;
+
   switch (predicate_type()) {
     case PredicateType::Range: {
+      cpp_type local_lower = lower_ ? *static_cast<const cpp_type*>(lower_) : cpp_type();
+      cpp_type local_upper = upper_ ? *static_cast<const cpp_type*>(upper_) : cpp_type();
+
       if (lower_ == nullptr) {
-        ApplyPredicate<PhysicalType>(block, sel, [this] (const void* cell) {
-          return DataTypeTraits<PhysicalType>::Compare(cell, this->upper_) < 0;
+        ApplyPredicate<PhysicalType>(block, sel, [local_upper] (const void* cell) {
+            return traits::Compare(cell, &local_upper) < 0;
         });
       } else if (upper_ == nullptr) {
-        ApplyPredicate<PhysicalType>(block, sel, [this] (const void* cell) {
-          return DataTypeTraits<PhysicalType>::Compare(cell, this->lower_) >= 0;
+        ApplyPredicate<PhysicalType>(block, sel, [local_lower] (const void* cell) {
+            return traits::Compare(cell, &local_lower) >= 0;
         });
       } else {
-        ApplyPredicate<PhysicalType>(block, sel, [this] (const void* cell) {
-          return DataTypeTraits<PhysicalType>::Compare(cell, this->upper_) < 0 &&
-                 DataTypeTraits<PhysicalType>::Compare(cell, this->lower_) >= 0;
+        ApplyPredicate<PhysicalType>(block, sel, [local_lower, local_upper] (const void* cell) {
+            return traits::Compare(cell, &local_upper) < 0 &&
+                   traits::Compare(cell, &local_lower) >= 0;
         });
       }
       return;
     };
     case PredicateType::Equality: {
-      ApplyPredicate<PhysicalType>(block, sel, [this] (const void* cell) {
-        return DataTypeTraits<PhysicalType>::Compare(cell, this->lower_) == 0;
+      cpp_type local_lower = lower_ ? *static_cast<const cpp_type*>(lower_) : cpp_type();
+      ApplyPredicate<PhysicalType>(block, sel, [local_lower] (const void* cell) {
+            return traits::Compare(cell, &local_lower) == 0;
       });
       return;
     };
@@ -774,7 +781,7 @@ void ColumnPredicate::EvaluateForPhysicalType(const ColumnBlock& block,
       ApplyPredicate<PhysicalType>(block, sel, [this] (const void* cell) {
         return std::binary_search(values_.begin(), values_.end(), cell,
                                   [] (const void* lhs, const void* rhs) {
-                                    return DataTypeTraits<PhysicalType>::Compare(lhs, rhs) < 0;
+                                    return traits::Compare(lhs, rhs) < 0;
                                   });
       });
       return;
diff --git a/src/kudu/common/column_predicate.h b/src/kudu/common/column_predicate.h
index 2527fe1..963a172 100644
--- a/src/kudu/common/column_predicate.h
+++ b/src/kudu/common/column_predicate.h
@@ -22,6 +22,7 @@
 #include <cstdint>
 #include <ostream>
 #include <string>
+#include <utility>
 #include <vector>
 
 #include <boost/optional/optional.hpp>


[kudu] 03/03: KUDU-1938 Add VARCHAR to schema design docs

Posted by ab...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

abukor pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit 3175c35c7d721aef0c4c6b358cc3b422089c1ba7
Author: Attila Bukor <ab...@apache.org>
AuthorDate: Mon Dec 9 13:43:59 2019 -0800

    KUDU-1938 Add VARCHAR to schema design docs
    
    Change-Id: Ieae3000dde14f68a900dd63624d2778ce01cfa26
    Reviewed-on: http://gerrit.cloudera.org:8080/14865
    Reviewed-by: Grant Henke <gr...@apache.org>
    Tested-by: Kudu Jenkins
---
 docs/known_issues.adoc  | 7 +++----
 docs/schema_design.adoc | 3 ++-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/docs/known_issues.adoc b/docs/known_issues.adoc
index c8e2e5d..84f2925 100644
--- a/docs/known_issues.adoc
+++ b/docs/known_issues.adoc
@@ -51,11 +51,10 @@
 
 === Columns
 
-* CHAR, VARCHAR, DATE, and complex types like ARRAY, MAP, and STRUCT are not supported.
+* CHAR, DATE, and complex types like ARRAY, MAP, and STRUCT are not supported.
 
-* Type and nullability of existing columns cannot be changed by altering the table.
-
-* The precision and scale of `DECIMAL` columns cannot be changed by altering the table.
+* Type, nullability and type attributes (i.e. precision and scale of `DECIMAL`,
+  length of `VARCHAR`) of existing columns cannot be changed by altering the table.
 
 * Tables can have a maximum of 300 columns by default.
 
diff --git a/docs/schema_design.adoc b/docs/schema_design.adoc
index db12adb..3ecceed 100644
--- a/docs/schema_design.adoc
+++ b/docs/schema_design.adoc
@@ -78,6 +78,7 @@ column types include:
 * decimal (see <<decimal>> for details)
 * UTF-8 encoded string (up to 64KB uncompressed)
 * binary (up to 64KB uncompressed)
+* VARCHAR type with configurable maximum length (up to 64KB uncompressed)
 
 Kudu takes advantage of strongly-typed columns and a columnar on-disk storage
 format to provide efficient encoding and serialization. To make the most of
@@ -149,7 +150,7 @@ of the column.
 | int64, unixtime_micros  | plain, bitshuffle, run length  | bitshuffle
 | float, double, decimal  | plain, bitshuffle              | bitshuffle
 | bool                    | plain, run length              | run length
-| string, binary          | plain, prefix, dictionary      | dictionary
+| string, binary, varchar | plain, prefix, dictionary      | dictionary
 |===
 
 [[plain]]


[kudu] 02/03: client: optimize destruction of WriteRpc

Posted by ab...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

abukor pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit dff0349f9f9c6dde759bcd97e2f76957d8043df7
Author: Todd Lipcon <to...@apache.org>
AuthorDate: Fri Dec 6 15:12:59 2019 -0800

    client: optimize destruction of WriteRpc
    
    When writing batches with lots of operations, the WriteRpc destructor
    ends up cache-miss bound, since the various InFlightOp and WriteOps are
    strewn all about memory. This adds some prefetching which sped things up
    noticeably (~37%) in a benchmark which ends up bound by the reactor thread on
    the client side.
    
      $ perf stat ./build/thinlto/bin/kudu perf loadgen localhost -num_rows_per_thread=10000000 -num_threads=8
    
    Before:
      Generator report
        time total  : 51403.6 ms
        time per row: 0.000642545 ms
      Dropping auto-created table 'default.loadgen_auto_d289807fc12a4b1c861f79b19af9ec8e'
    
       Performance counter stats for './build/thinlto/bin/kudu perf loadgen localhost -num_rows_per_thread=10000000 -num_threads=8':
    
              180,585.24 msec task-clock                #    3.508 CPUs utilized
                  25,373      context-switches          #    0.141 K/sec
                   1,648      cpu-migrations            #    0.009 K/sec
                  50,927      page-faults               #    0.282 K/sec
         726,022,544,856      cycles                    #    4.020 GHz                      (83.33%)
          71,782,315,500      stalled-cycles-frontend   #    9.89% frontend cycles idle     (83.36%)
         412,273,652,207      stalled-cycles-backend    #   56.79% backend cycles idle      (83.29%)
         408,271,477,858      instructions              #    0.56  insn per cycle
                                                        #    1.01  stalled cycles per insn  (83.35%)
          75,750,045,948      branches                  #  419.470 M/sec                    (83.33%)
             296,247,270      branch-misses             #    0.39% of all branches          (83.34%)
    
            51.475433628 seconds time elapsed
    
           178.590913000 seconds user
             1.935099000 seconds sys
    
    After:
      Generator report
        time total  : 37293.8 ms
        time per row: 0.000466172 ms
      Dropping auto-created table 'default.loadgen_auto_ece2f41beef94a9fa032c77899f7e61c'
    
       Performance counter stats for './build/thinlto/bin/kudu perf loadgen localhost -num_rows_per_thread=10000000 -num_threads=8':
    
              189,125.49 msec task-clock                #    5.060 CPUs utilized
                  29,363      context-switches          #    0.155 K/sec
                   2,043      cpu-migrations            #    0.011 K/sec
                  48,405      page-faults               #    0.256 K/sec
         772,496,448,279      cycles                    #    4.085 GHz                      (83.33%)
         129,999,474,226      stalled-cycles-frontend   #   16.83% frontend cycles idle     (83.36%)
         300,049,388,250      stalled-cycles-backend    #   38.84% backend cycles idle      (83.30%)
         414,415,517,571      instructions              #    0.54  insn per cycle
                                                        #    0.72  stalled cycles per insn  (83.32%)
          76,829,647,882      branches                  #  406.236 M/sec                    (83.34%)
             352,749,453      branch-misses             #    0.46% of all branches          (83.35%)
    
            37.376785122 seconds time elapsed
    
           186.834651000 seconds user
             2.143945000 seconds sys
    
    Change-Id: I538f995f7ec161e746885c6b31cd1dccd72139b0
    Reviewed-on: http://gerrit.cloudera.org:8080/14868
    Reviewed-by: Adar Dembo <ad...@cloudera.com>
    Tested-by: Todd Lipcon <to...@apache.org>
---
 src/kudu/client/batcher.cc    | 79 +++++++++++++++++++++++++++++++++++++++++--
 src/kudu/common/partial_row.h |  4 +++
 2 files changed, 80 insertions(+), 3 deletions(-)

diff --git a/src/kudu/client/batcher.cc b/src/kudu/client/batcher.cc
index e6ce9dd..c68e77d 100644
--- a/src/kudu/client/batcher.cc
+++ b/src/kudu/client/batcher.cc
@@ -41,6 +41,7 @@
 #include "kudu/client/write_op-internal.h"
 #include "kudu/client/write_op.h"
 #include "kudu/common/common.pb.h"
+#include "kudu/common/partial_row.h"
 #include "kudu/common/partition.h"
 #include "kudu/common/row_operations.h"
 #include "kudu/common/wire_protocol.h"
@@ -50,7 +51,7 @@
 #include "kudu/gutil/bind_helpers.h"
 #include "kudu/gutil/gscoped_ptr.h"
 #include "kudu/gutil/map-util.h"
-#include "kudu/gutil/stl_util.h"
+#include "kudu/gutil/port.h"
 #include "kudu/gutil/strings/substitute.h"
 #include "kudu/rpc/connection.h"
 #include "kudu/rpc/request_tracker.h"
@@ -76,7 +77,6 @@ using strings::Substitute;
 
 namespace kudu {
 
-class KuduPartialRow;
 class Schema;
 
 namespace rpc {
@@ -342,7 +342,80 @@ WriteRpc::WriteRpc(const scoped_refptr<Batcher>& batcher,
 }
 
 WriteRpc::~WriteRpc() {
-  STLDeleteElements(&ops_);
+  // Since the WriteRpc is destructed a while after all of the
+  // InFlightOps and other associated objects were last touched,
+  // and because those operations were not all allocated together,
+  // they're likely to be strewn all around in RAM. This function
+  // then ends up cache-miss-bound.
+  //
+  // Ideally, we could change the allocation pattern to make them
+  // more contiguous, but it's a bit tricky -- this is client code,
+  // so we don't really have great control over how the write ops
+  // themselves are allocated.
+  //
+  // So, instead, we do some prefetching. The pointer graph looks like:
+  //
+  // vector<InFlightOp*>
+  //    [i] InFlightOp* pointer
+  //         \----> InFlightOp instance
+  //                | WriteOp* pointer
+  //                |   \-----> WriteOp instance
+  //                            | KuduPartialRow (embedded)
+  //                            | | isset_bitmap_
+  //                                   \-----> heap allocated memory
+  //
+  //
+  // So, we need to do three "layers" of prefetch. First, prefetch the
+  // InFlightOp instance. Then, prefetch the KuduPartialRow contained by
+  // the WriteOp that it points to. Then, prefetch the isset bitmap that
+  // the PartialRow points to.
+  //
+  // In order to get parallelism here, we need to stagger the prefetches:
+  // the "root" of the tree needs to look farthest in the future, then
+  // prefetch the next level, then prefetch the closest level, before
+  // eventually calling the destructor.
+  //
+  // Experimentally, it seems we get enough benefit from only prefetching
+  // one entry "ahead" in between each.
+  constexpr static int kPrefetchDistance = 1;
+  const int size = ops_.size();
+
+  auto iter = [this, size](int i) {
+    int ifo_prefetch = i + kPrefetchDistance * 3;
+    int op_prefetch = i + kPrefetchDistance * 2;
+    int row_prefetch = i + kPrefetchDistance;
+    if (ifo_prefetch >= 0 && ifo_prefetch < size) {
+      __builtin_prefetch(ops_[ifo_prefetch], 0, PREFETCH_HINT_T0);
+    }
+    if (op_prefetch >= 0 && op_prefetch < size) {
+      const auto* op = ops_[op_prefetch]->write_op.get();
+      if (op) {
+        __builtin_prefetch(&op->row().isset_bitmap_, 0, PREFETCH_HINT_T0);
+      }
+    }
+    if (row_prefetch >= 0 && row_prefetch < size) {
+      const auto* op = ops_[row_prefetch]->write_op.get();
+      if (op) {
+        __builtin_prefetch(op->row().isset_bitmap_, 0, PREFETCH_HINT_T0);
+      }
+    }
+    if (i >= 0) {
+      delete ops_[i];
+    }
+  };
+
+  // Explicitly perform "loop splitting" to avoid the branches in the main
+  // body of the loop.
+  int i = -kPrefetchDistance * 3;
+  while (i < 0) {
+    iter(i++);
+  }
+  while (i < size - kPrefetchDistance * 3) {
+    iter(i++);
+  }
+  while (i < size) {
+    iter(i++);
+  }
 }
 
 string WriteRpc::ToString() const {
diff --git a/src/kudu/common/partial_row.h b/src/kudu/common/partial_row.h
index 3a50907..d2fc631 100644
--- a/src/kudu/common/partial_row.h
+++ b/src/kudu/common/partial_row.h
@@ -44,6 +44,9 @@ class ColumnSchema;
 namespace client {
 class ClientTest_TestProjectionPredicatesFuzz_Test;
 class KuduWriteOperation;
+namespace internal {
+class WriteRpc;
+} // namespace internal
 template<typename KeyTypeWrapper> struct SliceKeysTestSetup;// IWYU pragma: keep
 template<typename KeyTypeWrapper> struct IntKeysTestSetup;  // IWYU pragma: keep
 } // namespace client
@@ -586,6 +589,7 @@ class KUDU_EXPORT KuduPartialRow {
 
  private:
   friend class client::KuduWriteOperation;   // for row_data_.
+  friend class client::internal::WriteRpc;   // for isset_bitmap_.
   friend class KeyUtilTest;
   friend class PartitionSchema;
   friend class RowOperationsPBDecoder;