You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by al...@apache.org on 2020/01/22 19:24:27 UTC
[kudu] 02/04: schema: use dense_hash_map instead of
std::unordered_map
This is an automated email from the ASF dual-hosted git repository.
alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
commit c3170a9fc6b5c30e85b94be615d8064ed7d26cd6
Author: Todd Lipcon <to...@apache.org>
AuthorDate: Fri Jan 17 09:04:18 2020 -0800
schema: use dense_hash_map instead of std::unordered_map
In a time series benchmark I'm working on, the client spent 12% of its
CPU in Schema::FindColumn. In particular, most of the CPU went to the
bucket calculation in std::unordered_map, which required a 'divq'
instruction that can take hundreds of cycles.
This switches Schema to use a dense_hash_map instead which performs
better. After this change, the percent of CPU used by my benchmark
worker thread in Schema::FindColumn dropped from ~12% to ~1.5% which
resulted in a few percent overall throughput increase.
This also made the fancy allocator which tried to count memory usage
unnecessary, since dense_hash_map is a simple enough data structure that
we can directly compute the memory usage. Now we can also simplify the
constructors since we no longer need to pass an allocator instance.
Change-Id: I8e8f80229b2dcfad05e204a6f6e50ce7dc3f4c73
Reviewed-on: http://gerrit.cloudera.org:8080/15064
Reviewed-by: Adar Dembo <ad...@cloudera.com>
Tested-by: Kudu Jenkins
---
src/kudu/client/client-test.cc | 6 +-
src/kudu/common/schema.cc | 64 +++++++++-------------
src/kudu/common/schema.h | 61 ++++++++-------------
src/kudu/common/wire_protocol.cc | 9 +--
thirdparty/download-thirdparty.sh | 5 +-
...und-for-dense_hashtable-move-constructor-.patch | 29 ++++++++++
6 files changed, 88 insertions(+), 86 deletions(-)
diff --git a/src/kudu/client/client-test.cc b/src/kudu/client/client-test.cc
index d020987..87f0642 100644
--- a/src/kudu/client/client-test.cc
+++ b/src/kudu/client/client-test.cc
@@ -3853,8 +3853,10 @@ TEST_F(ClientTest, TestBasicAlterOperations) {
unique_ptr<KuduTableAlterer> table_alterer(client_->NewTableAlterer(kTableName));
table_alterer->AlterColumn("int_val")->RenameTo(bad_name);
Status s = table_alterer->Alter();
- ASSERT_TRUE(s.IsInvalidArgument());
- ASSERT_STR_CONTAINS(s.ToString(), "invalid column name");
+ EXPECT_TRUE(s.IsInvalidArgument());
+ EXPECT_THAT(s.ToString(), testing::AnyOf(
+ testing::HasSubstr("invalid column name"),
+ testing::HasSubstr("column name must be non-empty")));
}
// Test that renaming a table to an invalid name throws an error.
diff --git a/src/kudu/common/schema.cc b/src/kudu/common/schema.cc
index 11a9966..e723bc4 100644
--- a/src/kudu/common/schema.cc
+++ b/src/kudu/common/schema.cc
@@ -191,11 +191,8 @@ size_t ColumnSchema::memory_footprint_including_this() const {
const int Schema::kColumnNotFound = -1;
Schema::Schema(const Schema& other)
- : name_to_index_bytes_(0),
- name_to_index_(/*bucket_count*/ 10,
- NameToIndexMap::hasher(),
- NameToIndexMap::key_equal(),
- NameToIndexMapAllocator(&name_to_index_bytes_)) {
+ : name_to_index_(other.num_columns()) {
+ name_to_index_.set_empty_key(StringPiece());
CopyFrom(other);
}
@@ -216,7 +213,7 @@ void Schema::CopyFrom(const Schema& other) {
// We can't simply copy name_to_index_ since the StringPiece keys
// reference the other Schema's ColumnSchema objects.
- name_to_index_.clear();
+ name_to_index_.clear_no_resize();
int i = 0;
for (const ColumnSchema &col : cols_) {
// The map uses the 'name' string from within the ColumnSchema object.
@@ -233,25 +230,10 @@ Schema::Schema(Schema&& other) noexcept
col_ids_(std::move(other.col_ids_)),
max_col_id_(other.max_col_id_),
col_offsets_(std::move(other.col_offsets_)),
- name_to_index_bytes_(0),
- name_to_index_(/*bucket_count*/ 10,
- NameToIndexMap::hasher(),
- NameToIndexMap::key_equal(),
- NameToIndexMapAllocator(&name_to_index_bytes_)),
+ name_to_index_(std::move(other.name_to_index_)),
id_to_index_(std::move(other.id_to_index_)),
first_is_deleted_virtual_column_idx_(other.first_is_deleted_virtual_column_idx_),
has_nullables_(other.has_nullables_) {
- // 'name_to_index_' uses a customer allocator which holds a pointer to
- // 'name_to_index_bytes_'. swap() will swap the contents but not the
- // allocators; std::move will move the allocator[1], which will mean the moved
- // to value will be holding a pointer to the moved-from's member, which is
- // wrong and will likely lead to use-after-free. The 'name_to_index_bytes_'
- // values should also be swapped so both schemas end up in a valid state.
- //
- // [1] STLCountingAllocator::propagate_on_container_move_assignment::value
- // is std::true_type, which it inherits from the default Allocator.
- std::swap(name_to_index_bytes_, other.name_to_index_bytes_);
- name_to_index_.swap(other.name_to_index_);
}
Schema& Schema::operator=(Schema&& other) noexcept {
@@ -264,18 +246,15 @@ Schema& Schema::operator=(Schema&& other) noexcept {
id_to_index_ = std::move(other.id_to_index_);
first_is_deleted_virtual_column_idx_ = other.first_is_deleted_virtual_column_idx_;
has_nullables_ = other.has_nullables_;
-
- // See the comment in the move constructor implementation for why we swap.
- std::swap(name_to_index_bytes_, other.name_to_index_bytes_);
- name_to_index_.swap(other.name_to_index_);
+ name_to_index_ = std::move(other.name_to_index_);
}
return *this;
}
-Status Schema::Reset(const vector<ColumnSchema>& cols,
- const vector<ColumnId>& ids,
+Status Schema::Reset(vector<ColumnSchema> cols,
+ vector<ColumnId> ids,
int key_columns) {
- cols_ = cols;
+ cols_ = std::move(cols);
num_key_columns_ = key_columns;
if (PREDICT_FALSE(key_columns > cols_.size())) {
@@ -306,8 +285,11 @@ Status Schema::Reset(const vector<ColumnSchema>& cols,
col_offsets_.reserve(cols_.size() + 1); // Include space for total byte size at the end.
size_t off = 0;
size_t i = 0;
- name_to_index_.clear();
+ name_to_index_.clear_no_resize();
for (const ColumnSchema &col : cols_) {
+ if (col.name().empty()) {
+ return Status::InvalidArgument("column names must be non-empty");
+ }
// The map uses the 'name' string from within the ColumnSchema object.
if (!InsertIfNotPresent(&name_to_index_, col.name(), i++)) {
return Status::InvalidArgument("Duplicate column name", col.name());
@@ -322,14 +304,14 @@ Status Schema::Reset(const vector<ColumnSchema>& cols,
col_offsets_.push_back(off);
// Initialize IDs mapping
- col_ids_ = ids;
+ col_ids_ = std::move(ids);
id_to_index_.clear();
max_col_id_ = 0;
- for (int i = 0; i < ids.size(); ++i) {
- if (ids[i] > max_col_id_) {
- max_col_id_ = ids[i];
+ for (int i = 0; i < col_ids_.size(); ++i) {
+ if (col_ids_[i] > max_col_id_) {
+ max_col_id_ = col_ids_[i];
}
- id_to_index_.set(ids[i], i);
+ id_to_index_.set(col_ids_[i], i);
}
RETURN_NOT_OK(FindFirstIsDeletedVirtualColumnIdx(
@@ -371,7 +353,7 @@ Status Schema::CreateProjectionByNames(const std::vector<StringPiece>& col_names
}
cols.push_back(column(idx));
}
- return out->Reset(cols, ids, 0);
+ return out->Reset(std::move(cols), std::move(ids), 0);
}
Status Schema::CreateProjectionByIdsIgnoreMissing(const std::vector<ColumnId>& col_ids,
@@ -386,7 +368,7 @@ Status Schema::CreateProjectionByIdsIgnoreMissing(const std::vector<ColumnId>& c
cols.push_back(column(idx));
filtered_col_ids.push_back(id);
}
- return out->Reset(cols, filtered_col_ids, 0);
+ return out->Reset(std::move(cols), std::move(filtered_col_ids), 0);
}
Schema Schema::CopyWithColumnIds() const {
@@ -556,7 +538,7 @@ size_t Schema::memory_footprint_excluding_this() const {
if (col_offsets_.capacity() > 0) {
size += kudu_malloc_usable_size(col_offsets_.data());
}
- size += name_to_index_bytes_;
+ size += name_to_index_.bucket_count() * sizeof(NameToIndexMap::value_type);
size += id_to_index_.memory_footprint_excluding_this();
return size;
@@ -610,6 +592,9 @@ Status SchemaBuilder::AddColumn(const string& name,
bool is_nullable,
const void *read_default,
const void *write_default) {
+ if (name.empty()) {
+ return Status::InvalidArgument("column name must be non-empty");
+ }
return AddColumn(ColumnSchema(name, type, is_nullable, read_default, write_default), false);
}
@@ -636,6 +621,9 @@ Status SchemaBuilder::RemoveColumn(const string& name) {
}
Status SchemaBuilder::RenameColumn(const string& old_name, const string& new_name) {
+ if (new_name.empty()) {
+ return Status::InvalidArgument("column name must be non-empty");
+ }
// check if 'new_name' is already in use
if (col_names_.find(new_name) != col_names_.end()) {
return Status::AlreadyPresent("The column already exists", new_name);
diff --git a/src/kudu/common/schema.h b/src/kudu/common/schema.h
index 489bb00..5c6fd16 100644
--- a/src/kudu/common/schema.h
+++ b/src/kudu/common/schema.h
@@ -23,12 +23,12 @@
#include <memory>
#include <ostream>
#include <string>
-#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include <boost/optional/optional.hpp>
+#include <sparsehash/dense_hash_map>
#include <glog/logging.h>
#include "kudu/common/common.pb.h"
@@ -452,16 +452,12 @@ class Schema {
Schema()
: num_key_columns_(0),
- name_to_index_bytes_(0),
- // TODO(wdberkeley): C++11 provides a single-argument constructor, but
- // it's not supported in GCC < 4.9. This (and the other occurrences here
- // and in schema.cc) can be fixed if we adopt C++14 or a later standard.
- name_to_index_(/*bucket_count=*/10,
- NameToIndexMap::hasher(),
- NameToIndexMap::key_equal(),
- NameToIndexMapAllocator(&name_to_index_bytes_)),
+ // Initialize for 1 expected element since 0 gets replaced with
+ // the default (32).
+ name_to_index_(1),
first_is_deleted_virtual_column_idx_(kColumnNotFound),
has_nullables_(false) {
+ name_to_index_.set_empty_key(StringPiece());
}
Schema(const Schema& other);
@@ -478,14 +474,11 @@ class Schema {
// empty schema and then use Reset(...) so that errors can be
// caught. If an invalid schema is passed to this constructor, an
// assertion will be fired!
- Schema(const std::vector<ColumnSchema>& cols,
+ Schema(std::vector<ColumnSchema> cols,
int key_columns)
- : name_to_index_bytes_(0),
- name_to_index_(/*bucket_count=*/10,
- NameToIndexMap::hasher(),
- NameToIndexMap::key_equal(),
- NameToIndexMapAllocator(&name_to_index_bytes_)) {
- CHECK_OK(Reset(cols, key_columns));
+ : name_to_index_(/*expected_max_items_in_table=*/cols.size()) {
+ name_to_index_.set_empty_key(StringPiece());
+ CHECK_OK(Reset(std::move(cols), key_columns));
}
// Construct a schema with the given information.
@@ -494,30 +487,27 @@ class Schema {
// empty schema and then use Reset(...) so that errors can be
// caught. If an invalid schema is passed to this constructor, an
// assertion will be fired!
- Schema(const std::vector<ColumnSchema>& cols,
- const std::vector<ColumnId>& ids,
+ Schema(std::vector<ColumnSchema> cols,
+ std::vector<ColumnId> ids,
int key_columns)
- : name_to_index_bytes_(0),
- name_to_index_(/*bucket_count=*/10,
- NameToIndexMap::hasher(),
- NameToIndexMap::key_equal(),
- NameToIndexMapAllocator(&name_to_index_bytes_)) {
- CHECK_OK(Reset(cols, ids, key_columns));
+ : name_to_index_(/*expected_max_items_in_table=*/cols.size()) {
+ name_to_index_.set_empty_key(StringPiece());
+ CHECK_OK(Reset(std::move(cols), std::move(ids), key_columns));
}
// Reset this Schema object to the given schema.
// If this fails, the Schema object is left in an inconsistent
// state and may not be used.
- Status Reset(const std::vector<ColumnSchema>& cols, int key_columns) {
+ Status Reset(std::vector<ColumnSchema> cols, int key_columns) {
std::vector<ColumnId> ids;
- return Reset(cols, ids, key_columns);
+ return Reset(std::move(cols), ids, key_columns);
}
// Reset this Schema object to the given schema.
// If this fails, the Schema object is left in an inconsistent
// state and may not be used.
- Status Reset(const std::vector<ColumnSchema>& cols,
- const std::vector<ColumnId>& ids,
+ Status Reset(std::vector<ColumnSchema> cols,
+ std::vector<ColumnId> ids,
int key_columns);
// Find the column index corresponding to the given column name,
@@ -732,7 +722,7 @@ class Schema {
col_ids.assign(col_ids_.begin(), col_ids_.begin() + num_key_columns_);
}
- return Schema(key_cols, col_ids, num_key_columns_);
+ return Schema(std::move(key_cols), std::move(col_ids), num_key_columns_);
}
// Return a new Schema which is the same as this one, but with IDs assigned.
@@ -973,17 +963,12 @@ class Schema {
// ColumnSchema objects inside cols_. This avoids an extra copy of those strings,
// and also allows us to do lookups on the map using StringPiece keys, sometimes
// avoiding copies.
- //
- // The map is instrumented with a counting allocator so that we can accurately
- // measure its memory footprint.
- int64_t name_to_index_bytes_;
- typedef STLCountingAllocator<std::pair<const StringPiece, size_t> > NameToIndexMapAllocator;
- typedef std::unordered_map<
+ typedef google::dense_hash_map<
StringPiece,
size_t,
- std::hash<StringPiece>,
- std::equal_to<StringPiece>,
- NameToIndexMapAllocator> NameToIndexMap;
+ GoodFastHash<StringPiece>,
+ std::equal_to<StringPiece>> NameToIndexMap;
+
NameToIndexMap name_to_index_;
IdMapping id_to_index_;
diff --git a/src/kudu/common/wire_protocol.cc b/src/kudu/common/wire_protocol.cc
index d5f3826..5c473a3 100644
--- a/src/kudu/common/wire_protocol.cc
+++ b/src/kudu/common/wire_protocol.cc
@@ -19,11 +19,11 @@
#include <time.h>
+#include <algorithm>
#include <cstdint>
#include <cstring>
#include <ostream>
#include <string>
-#include <utility>
#include <vector>
#include <boost/optional/optional.hpp>
@@ -391,7 +391,7 @@ Status ColumnPBsToSchema(const RepeatedPtrField<ColumnSchemaPB>& column_pbs,
for (const ColumnSchemaPB& pb : column_pbs) {
boost::optional<ColumnSchema> column;
RETURN_NOT_OK(ColumnSchemaFromPB(pb, &column));
- columns.push_back(*column);
+ columns.emplace_back(std::move(*column));
if (pb.is_key()) {
if (!is_handling_key) {
return Status::InvalidArgument(
@@ -408,10 +408,7 @@ Status ColumnPBsToSchema(const RepeatedPtrField<ColumnSchemaPB>& column_pbs,
DCHECK_LE(num_key_columns, columns.size());
- // TODO(perf): could make the following faster by adding a
- // Reset() variant which actually takes ownership of the column
- // vector.
- return schema->Reset(columns, column_ids, num_key_columns);
+ return schema->Reset(std::move(columns), std::move(column_ids), num_key_columns);
}
Status SchemaToColumnPBs(const Schema& schema,
diff --git a/thirdparty/download-thirdparty.sh b/thirdparty/download-thirdparty.sh
index 89ac43d..88cda30 100755
--- a/thirdparty/download-thirdparty.sh
+++ b/thirdparty/download-thirdparty.sh
@@ -369,12 +369,13 @@ fetch_and_patch \
$BREAKPAD_PATCHLEVEL \
"patch -p1 < $TP_DIR/patches/breakpad-add-basic-support-for-dwz-dwarf-extension.patch"
-SPARSEHASH_PATCHLEVEL=2
+SPARSEHASH_PATCHLEVEL=3
fetch_and_patch \
sparsehash-c11-${SPARSEHASH_VERSION}.tar.gz \
$SPARSEHASH_SOURCE \
$SPARSEHASH_PATCHLEVEL \
- "patch -p1 < $TP_DIR/patches/sparsehash-0001-Add-compatibily-for-gcc-4.x-in-traits.patch"
+ "patch -p1 < $TP_DIR/patches/sparsehash-0001-Add-compatibily-for-gcc-4.x-in-traits.patch" \
+ "patch -p1 < $TP_DIR/patches/sparsehash-0002-Add-workaround-for-dense_hashtable-move-constructor-.patch"
SPARSEPP_PATCHLEVEL=0
fetch_and_patch \
diff --git a/thirdparty/patches/sparsehash-0002-Add-workaround-for-dense_hashtable-move-constructor-.patch b/thirdparty/patches/sparsehash-0002-Add-workaround-for-dense_hashtable-move-constructor-.patch
new file mode 100644
index 0000000..d786757
--- /dev/null
+++ b/thirdparty/patches/sparsehash-0002-Add-workaround-for-dense_hashtable-move-constructor-.patch
@@ -0,0 +1,29 @@
+From 517c4bce0f1c30f8868da9bf5a568c4db40e95ea Mon Sep 17 00:00:00 2001
+From: Todd Lipcon <to...@cloudera.com>
+Date: Tue, 21 Jan 2020 13:52:41 -0800
+Subject: [PATCH] Add workaround for dense_hashtable move constructor in gcc
+ 4.8
+
+---
+ sparsehash/internal/densehashtable.h | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+diff --git a/sparsehash/internal/densehashtable.h b/sparsehash/internal/densehashtable.h
+index 72b3607..cc7ff69 100644
+--- a/sparsehash/internal/densehashtable.h
++++ b/sparsehash/internal/densehashtable.h
+@@ -739,7 +739,10 @@ class dense_hashtable {
+ }
+
+ dense_hashtable(dense_hashtable&& ht)
+- : dense_hashtable() {
++ // NOTE: redundantly set num_buckets = 0 for gcc 4.8.x compatibility.
++ // It can't find the dense_hashtable constructor unless at least one
++ // argument is set.
++ : dense_hashtable(0) {
+ swap(ht);
+ }
+
+--
+2.17.1
+