You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by ab...@apache.org on 2019/12/09 22:34:07 UTC
[kudu] branch master updated (6b69108 -> 3175c35)
This is an automated email from the ASF dual-hosted git repository.
abukor pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git.
from 6b69108 KUDU-3001 Multi-thread to load containers in a data directory
new c9dd2b5 Improve SIMD code generation for primitive predicates
new dff0349 client: optimize destruction of WriteRpc
new 3175c35 KUDU-1938 Add VARCHAR to schema design docs
The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails. The revisions
listed as "add" were already present in the repository and have only
been added to this reference.
Summary of changes:
docs/known_issues.adoc | 7 ++--
docs/schema_design.adoc | 3 +-
src/kudu/client/batcher.cc | 79 +++++++++++++++++++++++++++++++++++--
src/kudu/common/column_predicate.cc | 29 ++++++++------
src/kudu/common/column_predicate.h | 1 +
src/kudu/common/partial_row.h | 4 ++
6 files changed, 104 insertions(+), 19 deletions(-)
[kudu] 01/03: Improve SIMD code generation for primitive predicates
Posted by ab...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
abukor pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
commit c9dd2b520a8b82500eb6a56961b9da0ccd2ed752
Author: Todd Lipcon <to...@apache.org>
AuthorDate: Thu Dec 5 11:26:09 2019 -0800
Improve SIMD code generation for primitive predicates
This adds a local (on-stack) copy of the bounds for range and equality
predicates before evaluating them against the columns. These on-stack
copies help the compiler realize that the stores to the selection vector
can't overwrite the predicate itself, and thus allow SIMD code
generation.
Benchmarked with column_predicate-test. Highlighting the 'NOT NULL'
results (since this doesn't change the evaluation of nulls):
Before:
int8 NOT NULL (c >= 0 AND c < 2) 1363.5M evals/sec 2.09 cycles/eval
int16 NOT NULL (c >= 0 AND c < 2) 1238.3M evals/sec 2.30 cycles/eval
int32 NOT NULL (c >= 0 AND c < 2) 1321.3M evals/sec 2.15 cycles/eval
int64 NOT NULL (c >= 0 AND c < 2) 1408.3M evals/sec 2.02 cycles/eval
float NOT NULL (c >= 0 AND c < 2) 1134.8M evals/sec 2.52 cycles/eval
double NOT NULL (c >= 0 AND c < 2) 1144.2M evals/sec 2.49 cycles/eval
After:
int8 NOT NULL (c >= 0 AND c < 2) 3152.2M evals/sec 0.88 cycles/eval
int16 NOT NULL (c >= 0 AND c < 2) 3309.6M evals/sec 0.85 cycles/eval
int32 NOT NULL (c >= 0 AND c < 2) 3384.0M evals/sec 0.85 cycles/eval
int64 NOT NULL (c >= 0 AND c < 2) 1847.6M evals/sec 1.57 cycles/eval
float NOT NULL (c >= 0 AND c < 2) 3268.3M evals/sec 0.88 cycles/eval
double NOT NULL (c >= 0 AND c < 2) 2245.2M evals/sec 1.27 cycles/eval
The numbers for non-range predicates didn't seem to change here.
Change-Id: I1772584c1d0c53128608ea26248dd4ab069b8108
Reviewed-on: http://gerrit.cloudera.org:8080/14855
Reviewed-by: Adar Dembo <ad...@cloudera.com>
Tested-by: Kudu Jenkins
---
src/kudu/common/column_predicate.cc | 29 ++++++++++++++++++-----------
src/kudu/common/column_predicate.h | 1 +
2 files changed, 19 insertions(+), 11 deletions(-)
diff --git a/src/kudu/common/column_predicate.cc b/src/kudu/common/column_predicate.cc
index bea1142..2c84e85 100644
--- a/src/kudu/common/column_predicate.cc
+++ b/src/kudu/common/column_predicate.cc
@@ -673,7 +673,7 @@ int ApplyPredicatePrimitive(const ColumnBlock& block, uint8_t* __restrict__ sel_
const cpp_type* data = reinterpret_cast<const cpp_type*>(block.data());
const int n_chunks = block.nrows() / 8;
for (int i = 0; i < n_chunks; i++) {
- uint8_t res_8 = 0;;
+ uint8_t res_8 = 0;
for (int j = 0; j < 8; j++) {
res_8 |= p(data++) << j;
}
@@ -733,27 +733,34 @@ void ApplyNullPredicate(const ColumnBlock& block, uint8_t* __restrict__ sel_vec)
template <DataType PhysicalType>
void ColumnPredicate::EvaluateForPhysicalType(const ColumnBlock& block,
SelectionVector* sel) const {
+ using traits = DataTypeTraits<PhysicalType>;
+ using cpp_type = typename traits::cpp_type;
+
switch (predicate_type()) {
case PredicateType::Range: {
+ cpp_type local_lower = lower_ ? *static_cast<const cpp_type*>(lower_) : cpp_type();
+ cpp_type local_upper = upper_ ? *static_cast<const cpp_type*>(upper_) : cpp_type();
+
if (lower_ == nullptr) {
- ApplyPredicate<PhysicalType>(block, sel, [this] (const void* cell) {
- return DataTypeTraits<PhysicalType>::Compare(cell, this->upper_) < 0;
+ ApplyPredicate<PhysicalType>(block, sel, [local_upper] (const void* cell) {
+ return traits::Compare(cell, &local_upper) < 0;
});
} else if (upper_ == nullptr) {
- ApplyPredicate<PhysicalType>(block, sel, [this] (const void* cell) {
- return DataTypeTraits<PhysicalType>::Compare(cell, this->lower_) >= 0;
+ ApplyPredicate<PhysicalType>(block, sel, [local_lower] (const void* cell) {
+ return traits::Compare(cell, &local_lower) >= 0;
});
} else {
- ApplyPredicate<PhysicalType>(block, sel, [this] (const void* cell) {
- return DataTypeTraits<PhysicalType>::Compare(cell, this->upper_) < 0 &&
- DataTypeTraits<PhysicalType>::Compare(cell, this->lower_) >= 0;
+ ApplyPredicate<PhysicalType>(block, sel, [local_lower, local_upper] (const void* cell) {
+ return traits::Compare(cell, &local_upper) < 0 &&
+ traits::Compare(cell, &local_lower) >= 0;
});
}
return;
};
case PredicateType::Equality: {
- ApplyPredicate<PhysicalType>(block, sel, [this] (const void* cell) {
- return DataTypeTraits<PhysicalType>::Compare(cell, this->lower_) == 0;
+ cpp_type local_lower = lower_ ? *static_cast<const cpp_type*>(lower_) : cpp_type();
+ ApplyPredicate<PhysicalType>(block, sel, [local_lower] (const void* cell) {
+ return traits::Compare(cell, &local_lower) == 0;
});
return;
};
@@ -774,7 +781,7 @@ void ColumnPredicate::EvaluateForPhysicalType(const ColumnBlock& block,
ApplyPredicate<PhysicalType>(block, sel, [this] (const void* cell) {
return std::binary_search(values_.begin(), values_.end(), cell,
[] (const void* lhs, const void* rhs) {
- return DataTypeTraits<PhysicalType>::Compare(lhs, rhs) < 0;
+ return traits::Compare(lhs, rhs) < 0;
});
});
return;
diff --git a/src/kudu/common/column_predicate.h b/src/kudu/common/column_predicate.h
index 2527fe1..963a172 100644
--- a/src/kudu/common/column_predicate.h
+++ b/src/kudu/common/column_predicate.h
@@ -22,6 +22,7 @@
#include <cstdint>
#include <ostream>
#include <string>
+#include <utility>
#include <vector>
#include <boost/optional/optional.hpp>
[kudu] 03/03: KUDU-1938 Add VARCHAR to schema design docs
Posted by ab...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
abukor pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
commit 3175c35c7d721aef0c4c6b358cc3b422089c1ba7
Author: Attila Bukor <ab...@apache.org>
AuthorDate: Mon Dec 9 13:43:59 2019 -0800
KUDU-1938 Add VARCHAR to schema design docs
Change-Id: Ieae3000dde14f68a900dd63624d2778ce01cfa26
Reviewed-on: http://gerrit.cloudera.org:8080/14865
Reviewed-by: Grant Henke <gr...@apache.org>
Tested-by: Kudu Jenkins
---
docs/known_issues.adoc | 7 +++----
docs/schema_design.adoc | 3 ++-
2 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/docs/known_issues.adoc b/docs/known_issues.adoc
index c8e2e5d..84f2925 100644
--- a/docs/known_issues.adoc
+++ b/docs/known_issues.adoc
@@ -51,11 +51,10 @@
=== Columns
-* CHAR, VARCHAR, DATE, and complex types like ARRAY, MAP, and STRUCT are not supported.
+* CHAR, DATE, and complex types like ARRAY, MAP, and STRUCT are not supported.
-* Type and nullability of existing columns cannot be changed by altering the table.
-
-* The precision and scale of `DECIMAL` columns cannot be changed by altering the table.
+* Type, nullability and type attributes (i.e. precision and scale of `DECIMAL`,
+ length of `VARCHAR`) of existing columns cannot be changed by altering the table.
* Tables can have a maximum of 300 columns by default.
diff --git a/docs/schema_design.adoc b/docs/schema_design.adoc
index db12adb..3ecceed 100644
--- a/docs/schema_design.adoc
+++ b/docs/schema_design.adoc
@@ -78,6 +78,7 @@ column types include:
* decimal (see <<decimal>> for details)
* UTF-8 encoded string (up to 64KB uncompressed)
* binary (up to 64KB uncompressed)
+* VARCHAR type with configurable maximum length (up to 64KB uncompressed)
Kudu takes advantage of strongly-typed columns and a columnar on-disk storage
format to provide efficient encoding and serialization. To make the most of
@@ -149,7 +150,7 @@ of the column.
| int64, unixtime_micros | plain, bitshuffle, run length | bitshuffle
| float, double, decimal | plain, bitshuffle | bitshuffle
| bool | plain, run length | run length
-| string, binary | plain, prefix, dictionary | dictionary
+| string, binary, varchar | plain, prefix, dictionary | dictionary
|===
[[plain]]
[kudu] 02/03: client: optimize destruction of WriteRpc
Posted by ab...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
abukor pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
commit dff0349f9f9c6dde759bcd97e2f76957d8043df7
Author: Todd Lipcon <to...@apache.org>
AuthorDate: Fri Dec 6 15:12:59 2019 -0800
client: optimize destruction of WriteRpc
When writing batches with lots of operations, the WriteRpc destructor
ends up cache-miss bound, since the various InFlightOp and WriteOps are
strewn all about memory. This adds some prefetching which sped things up
noticeably (~37%) in a benchmark which ends up bound by the reactor thread on
the client side.
$ perf stat ./build/thinlto/bin/kudu perf loadgen localhost -num_rows_per_thread=10000000 -num_threads=8
Before:
Generator report
time total : 51403.6 ms
time per row: 0.000642545 ms
Dropping auto-created table 'default.loadgen_auto_d289807fc12a4b1c861f79b19af9ec8e'
Performance counter stats for './build/thinlto/bin/kudu perf loadgen localhost -num_rows_per_thread=10000000 -num_threads=8':
180,585.24 msec task-clock # 3.508 CPUs utilized
25,373 context-switches # 0.141 K/sec
1,648 cpu-migrations # 0.009 K/sec
50,927 page-faults # 0.282 K/sec
726,022,544,856 cycles # 4.020 GHz (83.33%)
71,782,315,500 stalled-cycles-frontend # 9.89% frontend cycles idle (83.36%)
412,273,652,207 stalled-cycles-backend # 56.79% backend cycles idle (83.29%)
408,271,477,858 instructions # 0.56 insn per cycle
# 1.01 stalled cycles per insn (83.35%)
75,750,045,948 branches # 419.470 M/sec (83.33%)
296,247,270 branch-misses # 0.39% of all branches (83.34%)
51.475433628 seconds time elapsed
178.590913000 seconds user
1.935099000 seconds sys
After:
Generator report
time total : 37293.8 ms
time per row: 0.000466172 ms
Dropping auto-created table 'default.loadgen_auto_ece2f41beef94a9fa032c77899f7e61c'
Performance counter stats for './build/thinlto/bin/kudu perf loadgen localhost -num_rows_per_thread=10000000 -num_threads=8':
189,125.49 msec task-clock # 5.060 CPUs utilized
29,363 context-switches # 0.155 K/sec
2,043 cpu-migrations # 0.011 K/sec
48,405 page-faults # 0.256 K/sec
772,496,448,279 cycles # 4.085 GHz (83.33%)
129,999,474,226 stalled-cycles-frontend # 16.83% frontend cycles idle (83.36%)
300,049,388,250 stalled-cycles-backend # 38.84% backend cycles idle (83.30%)
414,415,517,571 instructions # 0.54 insn per cycle
# 0.72 stalled cycles per insn (83.32%)
76,829,647,882 branches # 406.236 M/sec (83.34%)
352,749,453 branch-misses # 0.46% of all branches (83.35%)
37.376785122 seconds time elapsed
186.834651000 seconds user
2.143945000 seconds sys
Change-Id: I538f995f7ec161e746885c6b31cd1dccd72139b0
Reviewed-on: http://gerrit.cloudera.org:8080/14868
Reviewed-by: Adar Dembo <ad...@cloudera.com>
Tested-by: Todd Lipcon <to...@apache.org>
---
src/kudu/client/batcher.cc | 79 +++++++++++++++++++++++++++++++++++++++++--
src/kudu/common/partial_row.h | 4 +++
2 files changed, 80 insertions(+), 3 deletions(-)
diff --git a/src/kudu/client/batcher.cc b/src/kudu/client/batcher.cc
index e6ce9dd..c68e77d 100644
--- a/src/kudu/client/batcher.cc
+++ b/src/kudu/client/batcher.cc
@@ -41,6 +41,7 @@
#include "kudu/client/write_op-internal.h"
#include "kudu/client/write_op.h"
#include "kudu/common/common.pb.h"
+#include "kudu/common/partial_row.h"
#include "kudu/common/partition.h"
#include "kudu/common/row_operations.h"
#include "kudu/common/wire_protocol.h"
@@ -50,7 +51,7 @@
#include "kudu/gutil/bind_helpers.h"
#include "kudu/gutil/gscoped_ptr.h"
#include "kudu/gutil/map-util.h"
-#include "kudu/gutil/stl_util.h"
+#include "kudu/gutil/port.h"
#include "kudu/gutil/strings/substitute.h"
#include "kudu/rpc/connection.h"
#include "kudu/rpc/request_tracker.h"
@@ -76,7 +77,6 @@ using strings::Substitute;
namespace kudu {
-class KuduPartialRow;
class Schema;
namespace rpc {
@@ -342,7 +342,80 @@ WriteRpc::WriteRpc(const scoped_refptr<Batcher>& batcher,
}
WriteRpc::~WriteRpc() {
- STLDeleteElements(&ops_);
+ // Since the WriteRpc is destructed a while after all of the
+ // InFlightOps and other associated objects were last touched,
+ // and because those operations were not all allocated together,
+ // they're likely to be strewn all around in RAM. This function
+ // then ends up cache-miss-bound.
+ //
+ // Ideally, we could change the allocation pattern to make them
+ // more contiguous, but it's a bit tricky -- this is client code,
+ // so we don't really have great control over how the write ops
+ // themselves are allocated.
+ //
+ // So, instead, we do some prefetching. The pointer graph looks like:
+ //
+ // vector<InFlightOp*>
+ // [i] InFlightOp* pointer
+ // \----> InFlightOp instance
+ // | WriteOp* pointer
+ // | \-----> WriteOp instance
+ // | KuduPartialRow (embedded)
+ // | | isset_bitmap_
+ // \-----> heap allocated memory
+ //
+ //
+ // So, we need to do three "layers" of prefetch. First, prefetch the
+ // InFlightOp instance. Then, prefetch the KuduPartialRow contained by
+ // the WriteOp that it points to. Then, prefetch the isset bitmap that
+ // the PartialRow points to.
+ //
+ // In order to get parallelism here, we need to stagger the prefetches:
+ // the "root" of the tree needs to look farthest in the future, then
+ // prefetch the next level, then prefetch the closest level, before
+ // eventually calling the destructor.
+ //
+ // Experimentally, it seems we get enough benefit from only prefetching
+ // one entry "ahead" in between each.
+ constexpr static int kPrefetchDistance = 1;
+ const int size = ops_.size();
+
+ auto iter = [this, size](int i) {
+ int ifo_prefetch = i + kPrefetchDistance * 3;
+ int op_prefetch = i + kPrefetchDistance * 2;
+ int row_prefetch = i + kPrefetchDistance;
+ if (ifo_prefetch >= 0 && ifo_prefetch < size) {
+ __builtin_prefetch(ops_[ifo_prefetch], 0, PREFETCH_HINT_T0);
+ }
+ if (op_prefetch >= 0 && op_prefetch < size) {
+ const auto* op = ops_[op_prefetch]->write_op.get();
+ if (op) {
+ __builtin_prefetch(&op->row().isset_bitmap_, 0, PREFETCH_HINT_T0);
+ }
+ }
+ if (row_prefetch >= 0 && row_prefetch < size) {
+ const auto* op = ops_[row_prefetch]->write_op.get();
+ if (op) {
+ __builtin_prefetch(op->row().isset_bitmap_, 0, PREFETCH_HINT_T0);
+ }
+ }
+ if (i >= 0) {
+ delete ops_[i];
+ }
+ };
+
+ // Explicitly perform "loop splitting" to avoid the branches in the main
+ // body of the loop.
+ int i = -kPrefetchDistance * 3;
+ while (i < 0) {
+ iter(i++);
+ }
+ while (i < size - kPrefetchDistance * 3) {
+ iter(i++);
+ }
+ while (i < size) {
+ iter(i++);
+ }
}
string WriteRpc::ToString() const {
diff --git a/src/kudu/common/partial_row.h b/src/kudu/common/partial_row.h
index 3a50907..d2fc631 100644
--- a/src/kudu/common/partial_row.h
+++ b/src/kudu/common/partial_row.h
@@ -44,6 +44,9 @@ class ColumnSchema;
namespace client {
class ClientTest_TestProjectionPredicatesFuzz_Test;
class KuduWriteOperation;
+namespace internal {
+class WriteRpc;
+} // namespace internal
template<typename KeyTypeWrapper> struct SliceKeysTestSetup;// IWYU pragma: keep
template<typename KeyTypeWrapper> struct IntKeysTestSetup; // IWYU pragma: keep
} // namespace client
@@ -586,6 +589,7 @@ class KUDU_EXPORT KuduPartialRow {
private:
friend class client::KuduWriteOperation; // for row_data_.
+ friend class client::internal::WriteRpc; // for isset_bitmap_.
friend class KeyUtilTest;
friend class PartitionSchema;
friend class RowOperationsPBDecoder;