You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by al...@apache.org on 2020/01/22 19:24:27 UTC

[kudu] 02/04: schema: use dense_hash_map instead of std::unordered_map

This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit c3170a9fc6b5c30e85b94be615d8064ed7d26cd6
Author: Todd Lipcon <to...@apache.org>
AuthorDate: Fri Jan 17 09:04:18 2020 -0800

    schema: use dense_hash_map instead of std::unordered_map
    
    In a time series benchmark I'm working on, the client spent 12% of its
    CPU in Schema::FindColumn. In particular, most of the CPU went to the
    bucket calculation in std::unordered_map, which required a 'divq'
    instruction that can take hundreds of cycles.
    
    This switches Schema to use a dense_hash_map instead which performs
    better. After this change, the percent of CPU used by my benchmark
    worker thread in Schema::FindColumn dropped from ~12% to ~1.5% which
    resulted in a few percent overall throughput increase.
    
    This also made the fancy allocator which tried to count memory usage
    unnecessary, since dense_hash_map is a simple enough data structure that
    we can directly compute the memory usage. Now we can also simplify the
    constructors since we no longer need to pass an allocator instance.
    
    Change-Id: I8e8f80229b2dcfad05e204a6f6e50ce7dc3f4c73
    Reviewed-on: http://gerrit.cloudera.org:8080/15064
    Reviewed-by: Adar Dembo <ad...@cloudera.com>
    Tested-by: Kudu Jenkins
---
 src/kudu/client/client-test.cc                     |  6 +-
 src/kudu/common/schema.cc                          | 64 +++++++++-------------
 src/kudu/common/schema.h                           | 61 ++++++++-------------
 src/kudu/common/wire_protocol.cc                   |  9 +--
 thirdparty/download-thirdparty.sh                  |  5 +-
 ...und-for-dense_hashtable-move-constructor-.patch | 29 ++++++++++
 6 files changed, 88 insertions(+), 86 deletions(-)

diff --git a/src/kudu/client/client-test.cc b/src/kudu/client/client-test.cc
index d020987..87f0642 100644
--- a/src/kudu/client/client-test.cc
+++ b/src/kudu/client/client-test.cc
@@ -3853,8 +3853,10 @@ TEST_F(ClientTest, TestBasicAlterOperations) {
     unique_ptr<KuduTableAlterer> table_alterer(client_->NewTableAlterer(kTableName));
     table_alterer->AlterColumn("int_val")->RenameTo(bad_name);
     Status s = table_alterer->Alter();
-    ASSERT_TRUE(s.IsInvalidArgument());
-    ASSERT_STR_CONTAINS(s.ToString(), "invalid column name");
+    EXPECT_TRUE(s.IsInvalidArgument());
+    EXPECT_THAT(s.ToString(), testing::AnyOf(
+        testing::HasSubstr("invalid column name"),
+        testing::HasSubstr("column name must be non-empty")));
   }
 
   // Test that renaming a table to an invalid name throws an error.
diff --git a/src/kudu/common/schema.cc b/src/kudu/common/schema.cc
index 11a9966..e723bc4 100644
--- a/src/kudu/common/schema.cc
+++ b/src/kudu/common/schema.cc
@@ -191,11 +191,8 @@ size_t ColumnSchema::memory_footprint_including_this() const {
 const int Schema::kColumnNotFound = -1;
 
 Schema::Schema(const Schema& other)
-  : name_to_index_bytes_(0),
-    name_to_index_(/*bucket_count*/ 10,
-                   NameToIndexMap::hasher(),
-                   NameToIndexMap::key_equal(),
-                   NameToIndexMapAllocator(&name_to_index_bytes_)) {
+  : name_to_index_(other.num_columns()) {
+  name_to_index_.set_empty_key(StringPiece());
   CopyFrom(other);
 }
 
@@ -216,7 +213,7 @@ void Schema::CopyFrom(const Schema& other) {
 
   // We can't simply copy name_to_index_ since the StringPiece keys
   // reference the other Schema's ColumnSchema objects.
-  name_to_index_.clear();
+  name_to_index_.clear_no_resize();
   int i = 0;
   for (const ColumnSchema &col : cols_) {
     // The map uses the 'name' string from within the ColumnSchema object.
@@ -233,25 +230,10 @@ Schema::Schema(Schema&& other) noexcept
       col_ids_(std::move(other.col_ids_)),
       max_col_id_(other.max_col_id_),
       col_offsets_(std::move(other.col_offsets_)),
-      name_to_index_bytes_(0),
-      name_to_index_(/*bucket_count*/ 10,
-                     NameToIndexMap::hasher(),
-                     NameToIndexMap::key_equal(),
-                     NameToIndexMapAllocator(&name_to_index_bytes_)),
+      name_to_index_(std::move(other.name_to_index_)),
       id_to_index_(std::move(other.id_to_index_)),
       first_is_deleted_virtual_column_idx_(other.first_is_deleted_virtual_column_idx_),
       has_nullables_(other.has_nullables_) {
-  // 'name_to_index_' uses a customer allocator which holds a pointer to
-  // 'name_to_index_bytes_'. swap() will swap the contents but not the
-  // allocators; std::move will move the allocator[1], which will mean the moved
-  // to value will be holding a pointer to the moved-from's member, which is
-  // wrong and will likely lead to use-after-free. The 'name_to_index_bytes_'
-  // values should also be swapped so both schemas end up in a valid state.
-  //
-  // [1] STLCountingAllocator::propagate_on_container_move_assignment::value
-  //     is std::true_type, which it inherits from the default Allocator.
-  std::swap(name_to_index_bytes_, other.name_to_index_bytes_);
-  name_to_index_.swap(other.name_to_index_);
 }
 
 Schema& Schema::operator=(Schema&& other) noexcept {
@@ -264,18 +246,15 @@ Schema& Schema::operator=(Schema&& other) noexcept {
     id_to_index_ = std::move(other.id_to_index_);
     first_is_deleted_virtual_column_idx_ = other.first_is_deleted_virtual_column_idx_;
     has_nullables_ = other.has_nullables_;
-
-    // See the comment in the move constructor implementation for why we swap.
-    std::swap(name_to_index_bytes_, other.name_to_index_bytes_);
-    name_to_index_.swap(other.name_to_index_);
+    name_to_index_ = std::move(other.name_to_index_);
   }
   return *this;
 }
 
-Status Schema::Reset(const vector<ColumnSchema>& cols,
-                     const vector<ColumnId>& ids,
+Status Schema::Reset(vector<ColumnSchema> cols,
+                     vector<ColumnId> ids,
                      int key_columns) {
-  cols_ = cols;
+  cols_ = std::move(cols);
   num_key_columns_ = key_columns;
 
   if (PREDICT_FALSE(key_columns > cols_.size())) {
@@ -306,8 +285,11 @@ Status Schema::Reset(const vector<ColumnSchema>& cols,
   col_offsets_.reserve(cols_.size() + 1);  // Include space for total byte size at the end.
   size_t off = 0;
   size_t i = 0;
-  name_to_index_.clear();
+  name_to_index_.clear_no_resize();
   for (const ColumnSchema &col : cols_) {
+    if (col.name().empty()) {
+      return Status::InvalidArgument("column names must be non-empty");
+    }
     // The map uses the 'name' string from within the ColumnSchema object.
     if (!InsertIfNotPresent(&name_to_index_, col.name(), i++)) {
       return Status::InvalidArgument("Duplicate column name", col.name());
@@ -322,14 +304,14 @@ Status Schema::Reset(const vector<ColumnSchema>& cols,
   col_offsets_.push_back(off);
 
   // Initialize IDs mapping
-  col_ids_ = ids;
+  col_ids_ = std::move(ids);
   id_to_index_.clear();
   max_col_id_ = 0;
-  for (int i = 0; i < ids.size(); ++i) {
-    if (ids[i] > max_col_id_) {
-      max_col_id_ = ids[i];
+  for (int i = 0; i < col_ids_.size(); ++i) {
+    if (col_ids_[i] > max_col_id_) {
+      max_col_id_ = col_ids_[i];
     }
-    id_to_index_.set(ids[i], i);
+    id_to_index_.set(col_ids_[i], i);
   }
 
   RETURN_NOT_OK(FindFirstIsDeletedVirtualColumnIdx(
@@ -371,7 +353,7 @@ Status Schema::CreateProjectionByNames(const std::vector<StringPiece>& col_names
     }
     cols.push_back(column(idx));
   }
-  return out->Reset(cols, ids, 0);
+  return out->Reset(std::move(cols), std::move(ids), 0);
 }
 
 Status Schema::CreateProjectionByIdsIgnoreMissing(const std::vector<ColumnId>& col_ids,
@@ -386,7 +368,7 @@ Status Schema::CreateProjectionByIdsIgnoreMissing(const std::vector<ColumnId>& c
     cols.push_back(column(idx));
     filtered_col_ids.push_back(id);
   }
-  return out->Reset(cols, filtered_col_ids, 0);
+  return out->Reset(std::move(cols), std::move(filtered_col_ids), 0);
 }
 
 Schema Schema::CopyWithColumnIds() const {
@@ -556,7 +538,7 @@ size_t Schema::memory_footprint_excluding_this() const {
   if (col_offsets_.capacity() > 0) {
     size += kudu_malloc_usable_size(col_offsets_.data());
   }
-  size += name_to_index_bytes_;
+  size += name_to_index_.bucket_count() * sizeof(NameToIndexMap::value_type);
   size += id_to_index_.memory_footprint_excluding_this();
 
   return size;
@@ -610,6 +592,9 @@ Status SchemaBuilder::AddColumn(const string& name,
                                 bool is_nullable,
                                 const void *read_default,
                                 const void *write_default) {
+  if (name.empty()) {
+    return Status::InvalidArgument("column name must be non-empty");
+  }
   return AddColumn(ColumnSchema(name, type, is_nullable, read_default, write_default), false);
 }
 
@@ -636,6 +621,9 @@ Status SchemaBuilder::RemoveColumn(const string& name) {
 }
 
 Status SchemaBuilder::RenameColumn(const string& old_name, const string& new_name) {
+  if (new_name.empty()) {
+    return Status::InvalidArgument("column name must be non-empty");
+  }
   // check if 'new_name' is already in use
   if (col_names_.find(new_name) != col_names_.end()) {
     return Status::AlreadyPresent("The column already exists", new_name);
diff --git a/src/kudu/common/schema.h b/src/kudu/common/schema.h
index 489bb00..5c6fd16 100644
--- a/src/kudu/common/schema.h
+++ b/src/kudu/common/schema.h
@@ -23,12 +23,12 @@
 #include <memory>
 #include <ostream>
 #include <string>
-#include <unordered_map>
 #include <unordered_set>
 #include <utility>
 #include <vector>
 
 #include <boost/optional/optional.hpp>
+#include <sparsehash/dense_hash_map>
 #include <glog/logging.h>
 
 #include "kudu/common/common.pb.h"
@@ -452,16 +452,12 @@ class Schema {
 
   Schema()
     : num_key_columns_(0),
-      name_to_index_bytes_(0),
-      // TODO(wdberkeley): C++11 provides a single-argument constructor, but
-      // it's not supported in GCC < 4.9. This (and the other occurrences here
-      // and in schema.cc) can be fixed if we adopt C++14 or a later standard.
-      name_to_index_(/*bucket_count=*/10,
-                     NameToIndexMap::hasher(),
-                     NameToIndexMap::key_equal(),
-                     NameToIndexMapAllocator(&name_to_index_bytes_)),
+      // Initialize for 1 expected element since 0 gets replaced with
+      // the default (32).
+      name_to_index_(1),
       first_is_deleted_virtual_column_idx_(kColumnNotFound),
       has_nullables_(false) {
+    name_to_index_.set_empty_key(StringPiece());
   }
 
   Schema(const Schema& other);
@@ -478,14 +474,11 @@ class Schema {
   // empty schema and then use Reset(...)  so that errors can be
   // caught. If an invalid schema is passed to this constructor, an
   // assertion will be fired!
-  Schema(const std::vector<ColumnSchema>& cols,
+  Schema(std::vector<ColumnSchema> cols,
          int key_columns)
-    : name_to_index_bytes_(0),
-      name_to_index_(/*bucket_count=*/10,
-                     NameToIndexMap::hasher(),
-                     NameToIndexMap::key_equal(),
-                     NameToIndexMapAllocator(&name_to_index_bytes_)) {
-    CHECK_OK(Reset(cols, key_columns));
+      : name_to_index_(/*expected_max_items_in_table=*/cols.size()) {
+    name_to_index_.set_empty_key(StringPiece());
+    CHECK_OK(Reset(std::move(cols), key_columns));
   }
 
   // Construct a schema with the given information.
@@ -494,30 +487,27 @@ class Schema {
   // empty schema and then use Reset(...)  so that errors can be
   // caught. If an invalid schema is passed to this constructor, an
   // assertion will be fired!
-  Schema(const std::vector<ColumnSchema>& cols,
-         const std::vector<ColumnId>& ids,
+  Schema(std::vector<ColumnSchema> cols,
+         std::vector<ColumnId> ids,
          int key_columns)
-    : name_to_index_bytes_(0),
-      name_to_index_(/*bucket_count=*/10,
-                     NameToIndexMap::hasher(),
-                     NameToIndexMap::key_equal(),
-                     NameToIndexMapAllocator(&name_to_index_bytes_)) {
-    CHECK_OK(Reset(cols, ids, key_columns));
+      : name_to_index_(/*expected_max_items_in_table=*/cols.size()) {
+    name_to_index_.set_empty_key(StringPiece());
+    CHECK_OK(Reset(std::move(cols), std::move(ids), key_columns));
   }
 
   // Reset this Schema object to the given schema.
   // If this fails, the Schema object is left in an inconsistent
   // state and may not be used.
-  Status Reset(const std::vector<ColumnSchema>& cols, int key_columns) {
+  Status Reset(std::vector<ColumnSchema> cols, int key_columns) {
     std::vector<ColumnId> ids;
-    return Reset(cols, ids, key_columns);
+    return Reset(std::move(cols), ids, key_columns);
   }
 
   // Reset this Schema object to the given schema.
   // If this fails, the Schema object is left in an inconsistent
   // state and may not be used.
-  Status Reset(const std::vector<ColumnSchema>& cols,
-               const std::vector<ColumnId>& ids,
+  Status Reset(std::vector<ColumnSchema> cols,
+               std::vector<ColumnId> ids,
                int key_columns);
 
   // Find the column index corresponding to the given column name,
@@ -732,7 +722,7 @@ class Schema {
       col_ids.assign(col_ids_.begin(), col_ids_.begin() + num_key_columns_);
     }
 
-    return Schema(key_cols, col_ids, num_key_columns_);
+    return Schema(std::move(key_cols), std::move(col_ids), num_key_columns_);
   }
 
   // Return a new Schema which is the same as this one, but with IDs assigned.
@@ -973,17 +963,12 @@ class Schema {
   // ColumnSchema objects inside cols_. This avoids an extra copy of those strings,
   // and also allows us to do lookups on the map using StringPiece keys, sometimes
   // avoiding copies.
-  //
-  // The map is instrumented with a counting allocator so that we can accurately
-  // measure its memory footprint.
-  int64_t name_to_index_bytes_;
-  typedef STLCountingAllocator<std::pair<const StringPiece, size_t> > NameToIndexMapAllocator;
-  typedef std::unordered_map<
+  typedef google::dense_hash_map<
       StringPiece,
       size_t,
-      std::hash<StringPiece>,
-      std::equal_to<StringPiece>,
-      NameToIndexMapAllocator> NameToIndexMap;
+      GoodFastHash<StringPiece>,
+    std::equal_to<StringPiece>> NameToIndexMap;
+
   NameToIndexMap name_to_index_;
 
   IdMapping id_to_index_;
diff --git a/src/kudu/common/wire_protocol.cc b/src/kudu/common/wire_protocol.cc
index d5f3826..5c473a3 100644
--- a/src/kudu/common/wire_protocol.cc
+++ b/src/kudu/common/wire_protocol.cc
@@ -19,11 +19,11 @@
 
 #include <time.h>
 
+#include <algorithm>
 #include <cstdint>
 #include <cstring>
 #include <ostream>
 #include <string>
-#include <utility>
 #include <vector>
 
 #include <boost/optional/optional.hpp>
@@ -391,7 +391,7 @@ Status ColumnPBsToSchema(const RepeatedPtrField<ColumnSchemaPB>& column_pbs,
   for (const ColumnSchemaPB& pb : column_pbs) {
     boost::optional<ColumnSchema> column;
     RETURN_NOT_OK(ColumnSchemaFromPB(pb, &column));
-    columns.push_back(*column);
+    columns.emplace_back(std::move(*column));
     if (pb.is_key()) {
       if (!is_handling_key) {
         return Status::InvalidArgument(
@@ -408,10 +408,7 @@ Status ColumnPBsToSchema(const RepeatedPtrField<ColumnSchemaPB>& column_pbs,
 
   DCHECK_LE(num_key_columns, columns.size());
 
-  // TODO(perf): could make the following faster by adding a
-  // Reset() variant which actually takes ownership of the column
-  // vector.
-  return schema->Reset(columns, column_ids, num_key_columns);
+  return schema->Reset(std::move(columns), std::move(column_ids), num_key_columns);
 }
 
 Status SchemaToColumnPBs(const Schema& schema,
diff --git a/thirdparty/download-thirdparty.sh b/thirdparty/download-thirdparty.sh
index 89ac43d..88cda30 100755
--- a/thirdparty/download-thirdparty.sh
+++ b/thirdparty/download-thirdparty.sh
@@ -369,12 +369,13 @@ fetch_and_patch \
  $BREAKPAD_PATCHLEVEL \
  "patch -p1 < $TP_DIR/patches/breakpad-add-basic-support-for-dwz-dwarf-extension.patch"
 
-SPARSEHASH_PATCHLEVEL=2
+SPARSEHASH_PATCHLEVEL=3
 fetch_and_patch \
  sparsehash-c11-${SPARSEHASH_VERSION}.tar.gz \
  $SPARSEHASH_SOURCE \
  $SPARSEHASH_PATCHLEVEL \
- "patch -p1 < $TP_DIR/patches/sparsehash-0001-Add-compatibily-for-gcc-4.x-in-traits.patch"
+ "patch -p1 < $TP_DIR/patches/sparsehash-0001-Add-compatibily-for-gcc-4.x-in-traits.patch" \
+ "patch -p1 < $TP_DIR/patches/sparsehash-0002-Add-workaround-for-dense_hashtable-move-constructor-.patch"
 
 SPARSEPP_PATCHLEVEL=0
 fetch_and_patch \
diff --git a/thirdparty/patches/sparsehash-0002-Add-workaround-for-dense_hashtable-move-constructor-.patch b/thirdparty/patches/sparsehash-0002-Add-workaround-for-dense_hashtable-move-constructor-.patch
new file mode 100644
index 0000000..d786757
--- /dev/null
+++ b/thirdparty/patches/sparsehash-0002-Add-workaround-for-dense_hashtable-move-constructor-.patch
@@ -0,0 +1,29 @@
+From 517c4bce0f1c30f8868da9bf5a568c4db40e95ea Mon Sep 17 00:00:00 2001
+From: Todd Lipcon <to...@cloudera.com>
+Date: Tue, 21 Jan 2020 13:52:41 -0800
+Subject: [PATCH] Add workaround for dense_hashtable move constructor in gcc
+ 4.8
+
+---
+ sparsehash/internal/densehashtable.h | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+diff --git a/sparsehash/internal/densehashtable.h b/sparsehash/internal/densehashtable.h
+index 72b3607..cc7ff69 100644
+--- a/sparsehash/internal/densehashtable.h
++++ b/sparsehash/internal/densehashtable.h
+@@ -739,7 +739,10 @@ class dense_hashtable {
+   }
+ 
+   dense_hashtable(dense_hashtable&& ht)
+-      : dense_hashtable() {
++      // NOTE: redundantly set num_buckets = 0 for gcc 4.8.x compatibility.
++      // It can't find the dense_hashtable constructor unless at least one
++      // argument is set.
++      : dense_hashtable(0) {
+     swap(ht);
+   }
+ 
+-- 
+2.17.1
+