You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by to...@apache.org on 2018/02/23 00:19:20 UTC
[5/5] kudu git commit: row: optimize copying of MRS rows into the Arena
row: optimize copying of MRS rows into the Arena
I tried a stress workload using YCSB with 100 columns, each a 10-byte
string. I expected this to be roughly the same performance as 10 columns
containing 100-byte strings, but in fact it was about 3x as slow. A
profile showed most of the CPU consumed in MemRowSet::Insert,
specifically in the inlined Arena::AllocateBytes call. Apparently with
many threads trying to allocate each cell of each row separately from
the arena, this became a point of contention.
This patch batches the allocation to do a single allocation for all of
the strings to be copied.
I didn't do a full run to measure throughput, but roughly it seems about
20% faster and cluster-wide CPU usage is down about 50%. The
MemRowSet::Insert call went from about 50% of the cycles down to <2%.
Change-Id: I6eea882d1d9a7355fb0bbad12c388908ec399a39
Reviewed-on: http://gerrit.cloudera.org:8080/9404
Tested-by: Kudu Jenkins
Reviewed-by: David Ribeiro Alves <da...@gmail.com>
Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/40ba6c14
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/40ba6c14
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/40ba6c14
Branch: refs/heads/master
Commit: 40ba6c1436abc2785516160326af4b156e418580
Parents: 8ea955d
Author: Todd Lipcon <to...@apache.org>
Authored: Thu Feb 22 12:33:39 2018 -0800
Committer: Todd Lipcon <to...@apache.org>
Committed: Fri Feb 23 00:17:54 2018 +0000
----------------------------------------------------------------------
src/kudu/common/row.h | 28 +++++++++++++++++++++++-----
1 file changed, 23 insertions(+), 5 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/kudu/blob/40ba6c14/src/kudu/common/row.h
----------------------------------------------------------------------
diff --git a/src/kudu/common/row.h b/src/kudu/common/row.h
index ab45cb6..8015cdd 100644
--- a/src/kudu/common/row.h
+++ b/src/kudu/common/row.h
@@ -343,8 +343,8 @@ class DeltaProjector {
template <class RowType, class ArenaType>
inline Status RelocateIndirectDataToArena(RowType *row, ArenaType *dst_arena) {
const Schema* schema = row->schema();
- // For any Slice columns, copy the sliced data into the arena
- // and update the pointers
+ // First calculate the total size we'll need to allocate in the arena.
+ int size = 0;
for (int i = 0; i < schema->num_columns(); i++) {
typename RowType::Cell cell = row->cell(i);
if (cell.typeinfo()->physical_type() == BINARY) {
@@ -352,10 +352,28 @@ inline Status RelocateIndirectDataToArena(RowType *row, ArenaType *dst_arena) {
continue;
}
- Slice *slice = reinterpret_cast<Slice *>(cell.mutable_ptr());
- if (!dst_arena->RelocateSlice(*slice, slice)) {
- return Status::IOError("Unable to relocate slice");
+ const Slice *slice = reinterpret_cast<const Slice *>(cell.ptr());
+ size += slice->size();
+ }
+ }
+ if (size == 0) return Status::OK();
+
+ // Then allocate it in one shot and copy the actual data.
+ // Even though Arena allocation is cheap, a row may have hundreds of
+ // small string columns and each operation is at least one CAS. With
+ // many concurrent threads copying into a single arena, this avoids
+ // a lot of contention.
+ uint8_t* dst = static_cast<uint8_t*>(dst_arena->AllocateBytes(size));
+ for (int i = 0; i < schema->num_columns(); i++) {
+ typename RowType::Cell cell = row->cell(i);
+ if (cell.typeinfo()->physical_type() == BINARY) {
+ if (cell.is_nullable() && cell.is_null()) {
+ continue;
}
+
+ Slice *slice = reinterpret_cast<Slice *>(cell.mutable_ptr());
+ slice->relocate(dst);
+ dst += slice->size();
}
}
return Status::OK();