You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2019/06/26 19:06:23 UTC

[arrow] branch master updated: ARROW-5635: [C++] Added a Compact() method to Table.

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 4b777f6  ARROW-5635: [C++] Added a Compact() method to Table.
4b777f6 is described below

commit 4b777f60be1ad1bb671985a6e0611c41ce661d0d
Author: Zhuo Peng <18...@users.noreply.github.com>
AuthorDate: Wed Jun 26 14:06:13 2019 -0500

    ARROW-5635: [C++] Added a Compact() method to Table.
    
    A column in a table may consist of multiple chunks. This PR proposes a Table.Compact() method that returns a table in which each column has a single chunk, namely the concatenation of the corresponding column's original chunks.
    
    This method is useful when a table has become fragmented (after Table.slice(), then Table.concatenate()) and one wants to run vectorized computation on it through to_numpy().
    
    Author: Zhuo Peng <18...@users.noreply.github.com>
    
    Closes #4598 from brills/compact and squashes the following commits:
    
    6bbd0174a <Zhuo Peng> Rename as CombineChunks and fixed doc.
    db640d6cd <Zhuo Peng> Addressed PR comments.
    18aa9ebd8 <Zhuo Peng> Added a Compact() method to Table.
---
 cpp/src/arrow/table-test.cc | 35 +++++++++++++++++++++++++++++++++++
 cpp/src/arrow/table.cc      | 30 ++++++++++++++++++++++++++++++
 cpp/src/arrow/table.h       |  9 +++++++++
 3 files changed, 74 insertions(+)

diff --git a/cpp/src/arrow/table-test.cc b/cpp/src/arrow/table-test.cc
index 47cb3d1..b0a870e 100644
--- a/cpp/src/arrow/table-test.cc
+++ b/cpp/src/arrow/table-test.cc
@@ -423,6 +423,41 @@ TEST_F(TestTable, FromRecordBatchesZeroLength) {
   ASSERT_TRUE(result->schema()->Equals(*schema_));
 }
 
+TEST_F(TestTable, CombineChunksEmptyTable) {
+  MakeExample1(10);
+  // A table built from zero record batches has no rows to combine.
+  std::shared_ptr<Table> table;
+  ASSERT_OK(Table::FromRecordBatches(schema_, {}, &table));
+  ASSERT_EQ(0, table->num_rows());
+  // Combining the chunks of an empty table must succeed and compare equal.
+  std::shared_ptr<Table> compacted;
+  ASSERT_OK(table->CombineChunks(default_memory_pool(), &compacted));
+
+  EXPECT_TRUE(compacted->Equals(*table));
+}
+
+TEST_F(TestTable, CombineChunks) {
+  MakeExample1(10);
+  auto batch1 = RecordBatch::Make(schema_, 10, arrays_);
+
+  MakeExample1(15);
+  auto batch2 = RecordBatch::Make(schema_, 15, arrays_);
+  // Building the table from two batches gives every column two chunks.
+  std::shared_ptr<Table> table;
+  ASSERT_OK(Table::FromRecordBatches({batch1, batch2}, &table));
+  for (int i = 0; i < table->num_columns(); ++i) {
+    ASSERT_EQ(2, table->column(i)->data()->num_chunks());
+  }
+  // After combining, the table compares equal but each column has one chunk.
+  std::shared_ptr<Table> compacted;
+  ASSERT_OK(table->CombineChunks(default_memory_pool(), &compacted));
+
+  EXPECT_TRUE(compacted->Equals(*table));
+  for (int i = 0; i < compacted->num_columns(); ++i) {
+    EXPECT_EQ(1, compacted->column(i)->data()->num_chunks());
+  }
+}
+
 TEST_F(TestTable, ConcatenateTables) {
   const int64_t length = 10;
 
diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc
index 506ce93..addb255 100644
--- a/cpp/src/arrow/table.cc
+++ b/cpp/src/arrow/table.cc
@@ -24,6 +24,7 @@
 #include <utility>
 
 #include "arrow/array.h"
+#include "arrow/array/concatenate.h"
 #include "arrow/record_batch.h"
 #include "arrow/status.h"
 #include "arrow/type.h"
@@ -35,6 +36,25 @@ namespace arrow {
 
 using internal::checked_cast;
 
+namespace {
+
+// Merges a column's chunks via Concatenate() (given `pool`) into one array and
+// wraps it in a new Column with the same field. A column with zero or one
+// chunk is returned as-is; a failed Concatenate() status goes to the caller.
+Status CompactColumn(const std::shared_ptr<Column>& column, MemoryPool* pool,
+                     std::shared_ptr<Column>* compacted) {
+  if (column->data()->num_chunks() <= 1) {  // nothing to merge
+    *compacted = column;
+    return Status::OK();
+  }
+  std::shared_ptr<Array> merged_data_array;
+  RETURN_NOT_OK(Concatenate(column->data()->chunks(), pool, &merged_data_array));
+  *compacted = std::make_shared<Column>(column->field(), merged_data_array);
+  return Status::OK();
+}
+
+}  // namespace
+
 // ----------------------------------------------------------------------
 // ChunkedArray and Column methods
 
@@ -573,6 +593,16 @@ bool Table::Equals(const Table& other) const {
   return true;
 }
 
+Status Table::CombineChunks(MemoryPool* pool, std::shared_ptr<Table>* out) const {
+  const int ncolumns = num_columns();
+  std::vector<std::shared_ptr<Column>> compacted_columns(ncolumns);
+  for (int i = 0; i < ncolumns; ++i) {  // one CompactColumn() call per column
+    RETURN_NOT_OK(CompactColumn(column(i), pool, &compacted_columns[i]));
+  }
+  *out = Table::Make(schema(), compacted_columns);  // same schema, new columns
+  return Status::OK();
+}
+
 // ----------------------------------------------------------------------
 // Convert a table to a sequence of record batches
 
diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h
index 8eaa650..2e7dcee 100644
--- a/cpp/src/arrow/table.h
+++ b/cpp/src/arrow/table.h
@@ -337,6 +337,15 @@ class ARROW_EXPORT Table {
   /// However, they may be equal even if they have different chunkings.
   bool Equals(const Table& other) const;
 
+  /// \brief Make a new table by combining the chunks this table has.
+  ///
+  /// All the underlying chunks in the ChunkedArray of each column are
+  /// concatenated into zero or one chunk; the schema is carried over as-is.
+  /// \param[in] pool The pool for buffer allocations
+  /// \param[out] out The table with chunks combined
+  /// \return Status
+  Status CombineChunks(MemoryPool* pool, std::shared_ptr<Table>* out) const;
+
  protected:
   Table();