You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@quickstep.apache.org by na...@apache.org on 2016/09/02 23:36:07 UTC

[2/7] incubator-quickstep git commit: WIP: Add partial bulk insert function in InsertDestination.

WIP: Add partial bulk insert function in InsertDestination.


Project: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/commit/69cc69b0
Tree: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/tree/69cc69b0
Diff: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/diff/69cc69b0

Branch: refs/heads/col_vec_opt
Commit: 69cc69b098d9d58caa11c9e501e718bf4aad9134
Parents: cc47933
Author: Navneet Potti <na...@gmail.com>
Authored: Wed Aug 10 16:41:46 2016 -0500
Committer: Navneet Potti <na...@gmail.com>
Committed: Wed Aug 10 16:41:46 2016 -0500

----------------------------------------------------------------------
 storage/InsertDestination.cpp          | 66 +++++++++++++++++++++++++++++
 storage/InsertDestination.hpp          | 16 +++++++
 storage/InsertDestinationInterface.hpp | 22 ++++++++++
 storage/StorageBlock.cpp               |  6 ++-
 storage/StorageBlock.hpp               | 49 ++++++++++++++++++++-
 storage/TupleStorageSubBlock.hpp       | 46 ++++++++++++++++++++
 6 files changed, 202 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/69cc69b0/storage/InsertDestination.cpp
----------------------------------------------------------------------
diff --git a/storage/InsertDestination.cpp b/storage/InsertDestination.cpp
index 9897aed..6b20bb5 100644
--- a/storage/InsertDestination.cpp
+++ b/storage/InsertDestination.cpp
@@ -247,6 +247,72 @@ void InsertDestination::bulkInsertTuplesWithRemappedAttributes(
   });
 }
 
+// Inserts tuples whose columns are spread over several ValueAccessors,
+// filling one output block per round of the loop below.
+void InsertDestination::bulkInsertTuplesFromValueAccessors(
+    std::vector<std::pair<ValueAccessor *, std::vector<attribute_id>>> accessor_attribute_map,
+    bool always_mark_full) {
+  // Nothing to insert, and no first accessor to inspect, without accessors.
+  if (accessor_attribute_map.empty()) {
+    return;
+  }
+
+  // First off, initialize iteration through each accessor.
+  for (auto &p : accessor_attribute_map) {
+    p.first->beginIterationVirtual();
+  }
+
+  // We assume that all input accessors have the same number of tuples, so
+  // the iterations finish together. Therefore, we can just check the first one.
+  ValueAccessor *first_accessor = accessor_attribute_map[0].first;
+  while (!first_accessor->iterationFinishedVirtual()) {
+    tuple_id num_tuples_to_insert = kCatalogMaxID;
+    tuple_id num_tuples_inserted = 0;
+    MutableBlockReference output_block = this->getBlockForInsertion();
+
+    // One round: bulk-insert partial tuples from every accessor into
+    // output_block. Inserting from the first ValueAccessor reserves space for
+    // all the columns, including those coming from the other ValueAccessors,
+    // which then only fill in their own columns of the same tuples.
+    for (auto &p : accessor_attribute_map) {
+      ValueAccessor *accessor = p.first;
+      const std::vector<attribute_id> &attribute_map = p.second;
+
+      InvokeOnAnyValueAccessor(
+          accessor,
+          [&](auto *accessor) -> void {  // NOLINT(build/c++11)
+            num_tuples_inserted = output_block->bulkInsertPartialTuples(
+                attribute_map, accessor, num_tuples_to_insert);
+      });
+
+      if (accessor == first_accessor) {
+        // Now we know how many full tuples can be inserted into this
+        // output_block (viz. number of tuples inserted from first ValueAccessor).
+        // We should only insert that many tuples from the remaining
+        // ValueAccessors as well.
+        num_tuples_to_insert = num_tuples_inserted;
+      } else {
+        // Since the bulk insertion of the first ValueAccessor should already
+        // have reserved the space for all the other ValueAccessors' columns,
+        // we must have been able to insert all the tuples we asked to insert.
+        DCHECK(num_tuples_inserted == num_tuples_to_insert);
+      }
+    }
+
+    // After one round of insertions, we have successfully inserted as many
+    // tuples as possible into the output_block. Strictly speaking, it's
+    // possible that there is more space for insertions because the size
+    // estimation of variable length columns is conservative. But we will ignore
+    // that case and proceed assuming that this output_block is full.
+
+    // Update the header for output_block and then return it.
+    output_block->bulkInsertPartialTuplesFinalize(num_tuples_inserted);
+    const bool mark_full = always_mark_full
+                           || !first_accessor->iterationFinishedVirtual();
+    this->returnBlock(std::move(output_block), mark_full);
+  }
+}
+
 void InsertDestination::insertTuplesFromVector(std::vector<Tuple>::const_iterator begin,
                                                std::vector<Tuple>::const_iterator end) {
   if (begin == end) {

http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/69cc69b0/storage/InsertDestination.hpp
----------------------------------------------------------------------
diff --git a/storage/InsertDestination.hpp b/storage/InsertDestination.hpp
index 3dae9a0..41d0c96 100644
--- a/storage/InsertDestination.hpp
+++ b/storage/InsertDestination.hpp
@@ -152,6 +152,10 @@ class InsertDestination : public InsertDestinationInterface {
       ValueAccessor *accessor,
       bool always_mark_full = false) override;
 
+  void bulkInsertTuplesFromValueAccessors(
+      std::vector<std::pair<ValueAccessor *, std::vector<attribute_id>>> accessor_attribute_map,
+      bool always_mark_full = false) override;  // Contract: see InsertDestinationInterface.
+
   void insertTuplesFromVector(std::vector<Tuple>::const_iterator begin,
                               std::vector<Tuple>::const_iterator end) override;
 
@@ -315,6 +319,12 @@ class AlwaysCreateBlockInsertDestination : public InsertDestination {
   ~AlwaysCreateBlockInsertDestination() override {
   }
 
+  void bulkInsertTuplesFromValueAccessors(
+      std::vector<std::pair<ValueAccessor *, std::vector<attribute_id>>> accessor_attribute_map,
+      bool always_mark_full = false) override {  // Unsupported for this destination type.
+    FATAL_ERROR("bulkInsertTuplesFromValueAccessors is not implemented for AlwaysCreateBlockInsertDestination");
+  }
+
  protected:
   MutableBlockReference getBlockForInsertion() override;
 
@@ -519,6 +529,12 @@ class PartitionAwareInsertDestination : public InsertDestination {
       ValueAccessor *accessor,
       bool always_mark_full = false) override;
 
+  void bulkInsertTuplesFromValueAccessors(
+      std::vector<std::pair<ValueAccessor *, std::vector<attribute_id>>> accessor_attribute_map,
+      bool always_mark_full = false) override {  // Unsupported for this destination type.
+    FATAL_ERROR("bulkInsertTuplesFromValueAccessors is not implemented for PartitionAwareInsertDestination");
+  }
+
   void insertTuplesFromVector(std::vector<Tuple>::const_iterator begin,
                               std::vector<Tuple>::const_iterator end) override;
 

http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/69cc69b0/storage/InsertDestinationInterface.hpp
----------------------------------------------------------------------
diff --git a/storage/InsertDestinationInterface.hpp b/storage/InsertDestinationInterface.hpp
index 423dff1..cf1a35d 100644
--- a/storage/InsertDestinationInterface.hpp
+++ b/storage/InsertDestinationInterface.hpp
@@ -20,6 +20,7 @@
 #ifndef QUICKSTEP_STORAGE_INSERT_DESTINATION_INTERFACE_HPP_
 #define QUICKSTEP_STORAGE_INSERT_DESTINATION_INTERFACE_HPP_
 
+#include <unordered_map>
 #include <vector>
 
 #include "catalog/CatalogTypedefs.hpp"
@@ -122,6 +123,27 @@ class InsertDestinationInterface {
       bool always_mark_full = false) = 0;
 
   /**
+   * @brief Bulk-insert tuples from one or more ValueAccessors
+   *        into blocks managed by this InsertDestination.
+   *
+   * @warning It is implicitly assumed that all the input ValueAccessors have
+   *          the same number of tuples in them.
+   *
+   * @param accessor_attribute_map A vector of pairs, each holding a
+   *        ValueAccessor and the attribute map to use with it. The i-th
+   *        entry of an attribute map is "n" if attribute_id "i" of the
+   *        output relation is attribute_id "n" of that pair's ValueAccessor,
+   *        or kInvalidCatalogId if output attribute "i" does not come from
+   *        that ValueAccessor. The vector itself is taken by value; the
+   *        ValueAccessors it points to are not copied.
+   * @param always_mark_full If \c true, always mark the blocks full after
+   *        insertion from ValueAccessor even when partially full.
+   **/
+  virtual void bulkInsertTuplesFromValueAccessors(
+      std::vector<std::pair<ValueAccessor *, std::vector<attribute_id>>> accessor_attribute_map,
+      bool always_mark_full = false) = 0;
+
+  /**
    * @brief Insert tuples from a range of Tuples in a vector.
    * @warning Unlike bulkInsertTuples(), this is not well-optimized and not
    *          intended for general use. It should only be used by

http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/69cc69b0/storage/StorageBlock.cpp
----------------------------------------------------------------------
diff --git a/storage/StorageBlock.cpp b/storage/StorageBlock.cpp
index 21aa12c..6493d55 100644
--- a/storage/StorageBlock.cpp
+++ b/storage/StorageBlock.cpp
@@ -269,10 +269,12 @@ tuple_id StorageBlock::bulkInsertTuples(ValueAccessor *accessor) {
 
 tuple_id StorageBlock::bulkInsertTuplesWithRemappedAttributes(
     const std::vector<attribute_id> &attribute_map,
-    ValueAccessor *accessor) {
+    ValueAccessor *accessor,
+    const tuple_id max_tuples_to_insert) {
   const tuple_id num_inserted
       = tuple_store_->bulkInsertTuplesWithRemappedAttributes(attribute_map,
-                                                             accessor);
+                                                             accessor,
+                                                             max_tuples_to_insert);
   if (num_inserted != 0) {
     invalidateAllIndexes();
     dirty_ = true;

http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/69cc69b0/storage/StorageBlock.hpp
----------------------------------------------------------------------
diff --git a/storage/StorageBlock.hpp b/storage/StorageBlock.hpp
index 97b4773..1562f9d 100644
--- a/storage/StorageBlock.hpp
+++ b/storage/StorageBlock.hpp
@@ -306,13 +306,60 @@ class StorageBlock : public StorageBlockBase {
    *        iteration will be advanced to the first non-inserted tuple or, if
    *        all accessible tuples were inserted in this block, to the end
    *        position.
+   * @param max_tuples_to_insert Insert at most these many tuples
    * @return The number of tuples inserted from accessor.
    **/
   tuple_id bulkInsertTuplesWithRemappedAttributes(
       const std::vector<attribute_id> &attribute_map,
-      ValueAccessor *accessor);
+      ValueAccessor *accessor); 
 
   /**
+   * @brief Insert up to max_num_tuples_to_insert tuples from a ValueAccessor
+   *        as a single batch, using the attribute_map to project and reorder
+   *        columns from the input ValueAccessor. Does not update header.
+   *
+   * @note Typical usage is bulk-inserting columns that come from two or more
+   *       value accessors: instead of first writing the columns out into
+   *       column vector value accessors, call this function with the
+   *       appropriate attribute_map for each value accessor, as
+   *       InsertDestination::bulkInsertTuplesFromValueAccessors does.
+   *
+   * @warning Must call bulkInsertPartialTuplesFinalize() to update the header,
+   *          until which point, the insertion is not visible to others.
+   * @warning The inserted tuples may be placed in "incorrect" or sub-optimal
+   *          locations in this block. The only methods which are safe to
+   *          call between bulkInsertTuples() and rebuild() are
+   *          insertTupleInBatch(), bulkInsertTuples(), and
+   *          bulkInsertTuplesWithRemappedAttributes().
+   *          NOTE(review): declared pure virtual (= 0); confirm that
+   *          StorageBlock is really meant to become an abstract class.
+   *
+   * @param attribute_map Maps each attribute of this block's relation to
+   *        its source in accessor (kInvalidCatalogId marks a gap).
+   * @param accessor A ValueAccessor to insert tuples from. Its iteration will
+   *        be advanced to the first non-inserted tuple or, if all accessible
+   *        tuples were inserted in this block, to the end position.
+   * @param max_num_tuples_to_insert Insert at most this many tuples.
+   * @return The number of tuples inserted from accessor.
+   **/
+  virtual tuple_id bulkInsertPartialTuples(
+      const std::vector<attribute_id> &attribute_map,
+      ValueAccessor *accessor,
+      tuple_id max_num_tuples_to_insert) = 0;
+
+  /**
+   * @brief Update the header after a round of bulkInsertPartialTuples() calls.
+   *        NOTE(review): pure virtual in StorageBlock -- confirm intended.
+   *
+   * @warning Only call this after bulkInsertPartialTuples(), passing in the
+   *          number of tuples that were inserted (its return value).
+   *
+   * @param num_tuples_inserted Number of tuples inserted, i.e. how much to
+   *        advance header.num_tuples by; the bulkInsertPartialTuples() result.
+   **/
+  virtual void bulkInsertPartialTuplesFinalize(tuple_id num_tuples_inserted) = 0;
+  
+  /**
    * @brief Perform a random sampling of data on  the StorageBlock. The number
    *       of records sampled is determined by the sample percentage in case of
    *       tuple sample. For block sample all the records in a block are taken.

http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/69cc69b0/storage/TupleStorageSubBlock.hpp
----------------------------------------------------------------------
diff --git a/storage/TupleStorageSubBlock.hpp b/storage/TupleStorageSubBlock.hpp
index aed6eea..f937080 100644
--- a/storage/TupleStorageSubBlock.hpp
+++ b/storage/TupleStorageSubBlock.hpp
@@ -272,6 +272,52 @@ class TupleStorageSubBlock {
       ValueAccessor *accessor) = 0;
 
   /**
+   * @brief Insert up to max_num_tuples_to_insert tuples from a ValueAccessor
+   *        as a single batch, using the attribute_map to project and reorder
+   *        columns from the input ValueAccessor. Does not update header.
+   *
+   * @note Typical usage is where you want to bulk-insert columns from two
+   *       or more value accessors. Instead of writing out the columns into
+   *       one or more column vector value accessors, you can simply use this
+   *       function with the appropriate attribute_map for each value
+   *       accessor (InsertDestination::bulkInsertTuplesFromValueAccessors
+   *       handles all the details) to insert tuples without an extra temp copy.
+   *
+   * @warning Must call bulkInsertPartialTuplesFinalize() to update the header,
+   *          until which point, the insertion is not visible to others.
+   * @warning The inserted tuples may be placed in "incorrect" or sub-optimal
+   *          locations in this TupleStorageSubBlock. The only methods which
+   *          are safe to call between bulkInsertTuples() and rebuild() are
+   *          insertTupleInBatch(), bulkInsertTuples(), and
+   *          bulkInsertTuplesWithRemappedAttributes().
+   *          NOTE(review): confirm this list also applies to this method.
+   *
+   * @param attribute_map Maps each attribute of this TupleStorageSubBlock's
+   *        relation to its source in accessor (kInvalidCatalogId marks a gap).
+   * @param accessor A ValueAccessor to insert tuples from. Its iteration will
+   *        be advanced to the first non-inserted tuple or, if all accessible
+   *        tuples were inserted in this sub-block, to the end position.
+   * @param max_num_tuples_to_insert Insert at most this many tuples.
+   * @return The number of tuples inserted from accessor.
+   **/
+  virtual tuple_id bulkInsertPartialTuples(
+      const std::vector<attribute_id> &attribute_map,
+      ValueAccessor *accessor,
+      tuple_id max_num_tuples_to_insert) = 0;
+
+  /**
+   * @brief Update the header after one or more bulkInsertPartialTuples()
+   *        calls that together filled in the same batch of tuples.
+   *
+   * @warning Only call this after bulkInsertPartialTuples(), passing in the
+   *          number of tuples that were inserted (its return value).
+   *
+   * @param num_tuples_inserted Number of tuples inserted, i.e. how much to
+   *        advance header.num_tuples by; the bulkInsertPartialTuples() result.
+   **/
+  virtual void bulkInsertPartialTuplesFinalize(tuple_id num_tuples_inserted) = 0;
+  
+  /**
    * @brief Get the (untyped) value of an attribute in a tuple in this buffer.
    * @warning This method may not be supported for all implementations of
    *          TupleStorageSubBlock. supportsUntypedGetAttributeValue() MUST be