You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/03/30 19:04:14 UTC
[2/2] arrow git commit: ARROW-728: [C++/Python] Add Table::RemoveColumn method, remove name member, some other code cleaning
ARROW-728: [C++/Python] Add Table::RemoveColumn method, remove name member, some other code cleaning
* Consolidated column.h and table.h
* Consolidated schema.h and type.h
* Removed some `Equals(const std::shared_ptr<T>&)` methods, better to use `const T&` methods
Author: Wes McKinney <we...@twosigma.com>
Closes #457 from wesm/ARROW-728 and squashes the following commits:
961783d [Wes McKinney] Fix glib test suite
1645ea2 [Wes McKinney] Return new vector from DeleteVectorElement
ea36a8c [Wes McKinney] Fix GLib bindings for removal of name Table member
77d363c [Wes McKinney] Incorporate API changes in pyarrow, add Table.remove_column function. Make nicer repr
b73c4d7 [Wes McKinney] Remove Table name attribute, implement and test Table::RemoveColumn
6a7f022 [Wes McKinney] Move Schema to type.h, remove Equals with shared_ptr function
818c46f [Wes McKinney] Consolidate column.h into table.h
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/47fad3f4
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/47fad3f4
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/47fad3f4
Branch: refs/heads/master
Commit: 47fad3f42c05bd4139796b93375dfb3cba74e87b
Parents: 642b753
Author: Wes McKinney <we...@twosigma.com>
Authored: Thu Mar 30 15:04:07 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Thu Mar 30 15:04:07 2017 -0400
----------------------------------------------------------------------
c_glib/arrow-glib/table.cpp | 20 +-
c_glib/arrow-glib/table.h | 4 +-
c_glib/test/test-table.rb | 9 +-
cpp/CMakeLists.txt | 2 -
cpp/src/arrow/CMakeLists.txt | 3 -
cpp/src/arrow/api.h | 2 -
cpp/src/arrow/column-test.cc | 191 ------------------
cpp/src/arrow/column.cc | 132 -------------
cpp/src/arrow/column.h | 104 ----------
cpp/src/arrow/ipc/feather.cc | 2 +-
cpp/src/arrow/ipc/ipc-json-test.cc | 6 +-
cpp/src/arrow/ipc/ipc-read-write-test.cc | 2 +-
cpp/src/arrow/ipc/json-integration-test.cc | 6 +-
cpp/src/arrow/ipc/json-internal.cc | 1 -
cpp/src/arrow/ipc/json.cc | 1 -
cpp/src/arrow/ipc/metadata.cc | 1 -
cpp/src/arrow/ipc/reader.cc | 2 +-
cpp/src/arrow/ipc/test-common.h | 2 +-
cpp/src/arrow/ipc/writer.cc | 1 -
cpp/src/arrow/python/pandas-test.cc | 3 +-
cpp/src/arrow/python/pandas_convert.cc | 1 -
cpp/src/arrow/schema.cc | 72 -------
cpp/src/arrow/schema.h | 59 ------
cpp/src/arrow/table-test.cc | 246 ++++++++++++++++++++----
cpp/src/arrow/table.cc | 149 ++++++++++++--
cpp/src/arrow/table.h | 90 +++++++--
cpp/src/arrow/test-common.h | 1 -
cpp/src/arrow/test-util.h | 2 -
cpp/src/arrow/type-test.cc | 8 +-
cpp/src/arrow/type.cc | 53 +++++
cpp/src/arrow/type.h | 33 +++-
cpp/src/arrow/util/stl.h | 40 ++++
python/pyarrow/array.pyx | 4 +-
python/pyarrow/includes/libarrow.pxd | 17 +-
python/pyarrow/io.pyx | 6 +-
python/pyarrow/schema.pyx | 9 +-
python/pyarrow/table.pyx | 65 +++----
python/pyarrow/tests/test_parquet.py | 2 +-
python/pyarrow/tests/test_table.py | 33 ++--
39 files changed, 623 insertions(+), 761 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/c_glib/arrow-glib/table.cpp
----------------------------------------------------------------------
diff --git a/c_glib/arrow-glib/table.cpp b/c_glib/arrow-glib/table.cpp
index 2410e76..2f82ffa 100644
--- a/c_glib/arrow-glib/table.cpp
+++ b/c_glib/arrow-glib/table.cpp
@@ -126,15 +126,13 @@ garrow_table_class_init(GArrowTableClass *klass)
/**
* garrow_table_new:
- * @name: The name of the table.
* @schema: The schema of the table.
* @columns: (element-type GArrowColumn): The columns of the table.
*
* Returns: A newly created #GArrowTable.
*/
GArrowTable *
-garrow_table_new(const gchar *name,
- GArrowSchema *schema,
+garrow_table_new(GArrowSchema *schema,
GList *columns)
{
std::vector<std::shared_ptr<arrow::Column>> arrow_columns;
@@ -144,26 +142,12 @@ garrow_table_new(const gchar *name,
}
auto arrow_table =
- std::make_shared<arrow::Table>(name,
- garrow_schema_get_raw(schema),
+ std::make_shared<arrow::Table>(garrow_schema_get_raw(schema),
arrow_columns);
return garrow_table_new_raw(&arrow_table);
}
/**
- * garrow_table_get_name:
- * @table: A #GArrowTable.
- *
- * Returns: The name of the table.
- */
-const gchar *
-garrow_table_get_name(GArrowTable *table)
-{
- const auto arrow_table = garrow_table_get_raw(table);
- return arrow_table->name().c_str();
-}
-
-/**
* garrow_table_get_schema:
* @table: A #GArrowTable.
*
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/c_glib/arrow-glib/table.h
----------------------------------------------------------------------
diff --git a/c_glib/arrow-glib/table.h b/c_glib/arrow-glib/table.h
index 34a89a7..4dbb8c5 100644
--- a/c_glib/arrow-glib/table.h
+++ b/c_glib/arrow-glib/table.h
@@ -66,11 +66,9 @@ struct _GArrowTableClass
GType garrow_table_get_type (void) G_GNUC_CONST;
-GArrowTable *garrow_table_new (const gchar *name,
- GArrowSchema *schema,
+GArrowTable *garrow_table_new (GArrowSchema *schema,
GList *columns);
-const gchar *garrow_table_get_name (GArrowTable *table);
GArrowSchema *garrow_table_get_schema (GArrowTable *table);
GArrowColumn *garrow_table_get_column (GArrowTable *table,
guint i);
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/c_glib/test/test-table.rb
----------------------------------------------------------------------
diff --git a/c_glib/test/test-table.rb b/c_glib/test/test-table.rb
index 1687d2f..0583e81 100644
--- a/c_glib/test/test-table.rb
+++ b/c_glib/test/test-table.rb
@@ -29,8 +29,7 @@ class TestTable < Test::Unit::TestCase
Arrow::Column.new(fields[0], build_boolean_array([true])),
Arrow::Column.new(fields[1], build_boolean_array([false])),
]
- table = Arrow::Table.new("memos", schema, columns)
- assert_equal("memos", table.name)
+ table = Arrow::Table.new(schema, columns)
end
end
@@ -45,11 +44,7 @@ class TestTable < Test::Unit::TestCase
Arrow::Column.new(fields[0], build_boolean_array([true])),
Arrow::Column.new(fields[1], build_boolean_array([false])),
]
- @table = Arrow::Table.new("memos", schema, columns)
- end
-
- def test_name
- assert_equal("memos", @table.name)
+ @table = Arrow::Table.new(schema, columns)
end
def test_schema
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/cpp/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index e4c18ca..e11de1b 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -784,12 +784,10 @@ set(ARROW_SRCS
src/arrow/array.cc
src/arrow/buffer.cc
src/arrow/builder.cc
- src/arrow/column.cc
src/arrow/compare.cc
src/arrow/loader.cc
src/arrow/memory_pool.cc
src/arrow/pretty_print.cc
- src/arrow/schema.cc
src/arrow/status.cc
src/arrow/table.cc
src/arrow/tensor.cc
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/cpp/src/arrow/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index f965f1d..5c9aadf 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -22,12 +22,10 @@ install(FILES
array.h
buffer.h
builder.h
- column.h
compare.h
loader.h
memory_pool.h
pretty_print.h
- schema.h
status.h
table.h
type.h
@@ -59,7 +57,6 @@ ADD_ARROW_TEST(array-string-test)
ADD_ARROW_TEST(array-struct-test)
ADD_ARROW_TEST(array-union-test)
ADD_ARROW_TEST(buffer-test)
-ADD_ARROW_TEST(column-test)
ADD_ARROW_TEST(memory_pool-test)
ADD_ARROW_TEST(pretty_print-test)
ADD_ARROW_TEST(status-test)
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/cpp/src/arrow/api.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/api.h b/cpp/src/arrow/api.h
index ea818b6..50a0951 100644
--- a/cpp/src/arrow/api.h
+++ b/cpp/src/arrow/api.h
@@ -23,12 +23,10 @@
#include "arrow/array.h"
#include "arrow/buffer.h"
#include "arrow/builder.h"
-#include "arrow/column.h"
#include "arrow/compare.h"
#include "arrow/loader.h"
#include "arrow/memory_pool.h"
#include "arrow/pretty_print.h"
-#include "arrow/schema.h"
#include "arrow/status.h"
#include "arrow/table.h"
#include "arrow/type.h"
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/cpp/src/arrow/column-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/column-test.cc b/cpp/src/arrow/column-test.cc
deleted file mode 100644
index 872fcb9..0000000
--- a/cpp/src/arrow/column-test.cc
+++ /dev/null
@@ -1,191 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <cstdint>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "gtest/gtest.h"
-
-#include "arrow/array.h"
-#include "arrow/column.h"
-#include "arrow/schema.h"
-#include "arrow/test-common.h"
-#include "arrow/test-util.h"
-#include "arrow/type.h"
-
-using std::shared_ptr;
-using std::vector;
-
-namespace arrow {
-
-class TestChunkedArray : public TestBase {
- protected:
- virtual void Construct() {
- one_ = std::make_shared<ChunkedArray>(arrays_one_);
- another_ = std::make_shared<ChunkedArray>(arrays_another_);
- }
-
- ArrayVector arrays_one_;
- ArrayVector arrays_another_;
-
- std::shared_ptr<ChunkedArray> one_;
- std::shared_ptr<ChunkedArray> another_;
-};
-
-TEST_F(TestChunkedArray, BasicEquals) {
- std::vector<bool> null_bitmap(100, true);
- std::vector<int32_t> data(100, 1);
- std::shared_ptr<Array> array;
- ArrayFromVector<Int32Type, int32_t>(null_bitmap, data, &array);
- arrays_one_.push_back(array);
- arrays_another_.push_back(array);
-
- Construct();
- ASSERT_TRUE(one_->Equals(one_));
- ASSERT_FALSE(one_->Equals(nullptr));
- ASSERT_TRUE(one_->Equals(another_));
- ASSERT_TRUE(one_->Equals(*another_.get()));
-}
-
-TEST_F(TestChunkedArray, EqualsDifferingTypes) {
- std::vector<bool> null_bitmap(100, true);
- std::vector<int32_t> data32(100, 1);
- std::vector<int64_t> data64(100, 1);
- std::shared_ptr<Array> array;
- ArrayFromVector<Int32Type, int32_t>(null_bitmap, data32, &array);
- arrays_one_.push_back(array);
- ArrayFromVector<Int64Type, int64_t>(null_bitmap, data64, &array);
- arrays_another_.push_back(array);
-
- Construct();
- ASSERT_FALSE(one_->Equals(another_));
- ASSERT_FALSE(one_->Equals(*another_.get()));
-}
-
-TEST_F(TestChunkedArray, EqualsDifferingLengths) {
- std::vector<bool> null_bitmap100(100, true);
- std::vector<bool> null_bitmap101(101, true);
- std::vector<int32_t> data100(100, 1);
- std::vector<int32_t> data101(101, 1);
- std::shared_ptr<Array> array;
- ArrayFromVector<Int32Type, int32_t>(null_bitmap100, data100, &array);
- arrays_one_.push_back(array);
- ArrayFromVector<Int32Type, int32_t>(null_bitmap101, data101, &array);
- arrays_another_.push_back(array);
-
- Construct();
- ASSERT_FALSE(one_->Equals(another_));
- ASSERT_FALSE(one_->Equals(*another_.get()));
-
- std::vector<bool> null_bitmap1(1, true);
- std::vector<int32_t> data1(1, 1);
- ArrayFromVector<Int32Type, int32_t>(null_bitmap1, data1, &array);
- arrays_one_.push_back(array);
-
- Construct();
- ASSERT_TRUE(one_->Equals(another_));
- ASSERT_TRUE(one_->Equals(*another_.get()));
-}
-
-class TestColumn : public TestChunkedArray {
- protected:
- void Construct() override {
- TestChunkedArray::Construct();
-
- one_col_ = std::make_shared<Column>(one_field_, one_);
- another_col_ = std::make_shared<Column>(another_field_, another_);
- }
-
- std::shared_ptr<ChunkedArray> data_;
- std::unique_ptr<Column> column_;
-
- std::shared_ptr<Field> one_field_;
- std::shared_ptr<Field> another_field_;
-
- std::shared_ptr<Column> one_col_;
- std::shared_ptr<Column> another_col_;
-};
-
-TEST_F(TestColumn, BasicAPI) {
- ArrayVector arrays;
- arrays.push_back(MakePrimitive<Int32Array>(100));
- arrays.push_back(MakePrimitive<Int32Array>(100, 10));
- arrays.push_back(MakePrimitive<Int32Array>(100, 20));
-
- auto field = std::make_shared<Field>("c0", int32());
- column_.reset(new Column(field, arrays));
-
- ASSERT_EQ("c0", column_->name());
- ASSERT_TRUE(column_->type()->Equals(int32()));
- ASSERT_EQ(300, column_->length());
- ASSERT_EQ(30, column_->null_count());
- ASSERT_EQ(3, column_->data()->num_chunks());
-
- // nullptr array should not break
- column_.reset(new Column(field, std::shared_ptr<Array>(nullptr)));
- ASSERT_NE(column_.get(), nullptr);
-}
-
-TEST_F(TestColumn, ChunksInhomogeneous) {
- ArrayVector arrays;
- arrays.push_back(MakePrimitive<Int32Array>(100));
- arrays.push_back(MakePrimitive<Int32Array>(100, 10));
-
- auto field = std::make_shared<Field>("c0", int32());
- column_.reset(new Column(field, arrays));
-
- ASSERT_OK(column_->ValidateData());
-
- arrays.push_back(MakePrimitive<Int16Array>(100, 10));
- column_.reset(new Column(field, arrays));
- ASSERT_RAISES(Invalid, column_->ValidateData());
-}
-
-TEST_F(TestColumn, Equals) {
- std::vector<bool> null_bitmap(100, true);
- std::vector<int32_t> data(100, 1);
- std::shared_ptr<Array> array;
- ArrayFromVector<Int32Type, int32_t>(null_bitmap, data, &array);
- arrays_one_.push_back(array);
- arrays_another_.push_back(array);
-
- one_field_ = std::make_shared<Field>("column", int32());
- another_field_ = std::make_shared<Field>("column", int32());
-
- Construct();
- ASSERT_TRUE(one_col_->Equals(one_col_));
- ASSERT_FALSE(one_col_->Equals(nullptr));
- ASSERT_TRUE(one_col_->Equals(another_col_));
- ASSERT_TRUE(one_col_->Equals(*another_col_.get()));
-
- // Field is different
- another_field_ = std::make_shared<Field>("two", int32());
- Construct();
- ASSERT_FALSE(one_col_->Equals(another_col_));
- ASSERT_FALSE(one_col_->Equals(*another_col_.get()));
-
- // ChunkedArray is different
- another_field_ = std::make_shared<Field>("column", int32());
- arrays_another_.push_back(array);
- Construct();
- ASSERT_FALSE(one_col_->Equals(another_col_));
- ASSERT_FALSE(one_col_->Equals(*another_col_.get()));
-}
-
-} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/cpp/src/arrow/column.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/column.cc b/cpp/src/arrow/column.cc
deleted file mode 100644
index 78501f9..0000000
--- a/cpp/src/arrow/column.cc
+++ /dev/null
@@ -1,132 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/column.h"
-
-#include <memory>
-#include <sstream>
-
-#include "arrow/array.h"
-#include "arrow/status.h"
-#include "arrow/type.h"
-
-namespace arrow {
-
-ChunkedArray::ChunkedArray(const ArrayVector& chunks) : chunks_(chunks) {
- length_ = 0;
- null_count_ = 0;
- for (const std::shared_ptr<Array>& chunk : chunks) {
- length_ += chunk->length();
- null_count_ += chunk->null_count();
- }
-}
-
-bool ChunkedArray::Equals(const ChunkedArray& other) const {
- if (length_ != other.length()) { return false; }
- if (null_count_ != other.null_count()) { return false; }
-
- // Check contents of the underlying arrays. This checks for equality of
- // the underlying data independently of the chunk size.
- int this_chunk_idx = 0;
- int64_t this_start_idx = 0;
- int other_chunk_idx = 0;
- int64_t other_start_idx = 0;
-
- int64_t elements_compared = 0;
- while (elements_compared < length_) {
- const std::shared_ptr<Array> this_array = chunks_[this_chunk_idx];
- const std::shared_ptr<Array> other_array = other.chunk(other_chunk_idx);
- int64_t common_length = std::min(
- this_array->length() - this_start_idx, other_array->length() - other_start_idx);
- if (!this_array->RangeEquals(this_start_idx, this_start_idx + common_length,
- other_start_idx, other_array)) {
- return false;
- }
-
- elements_compared += common_length;
-
- // If we have exhausted the current chunk, proceed to the next one individually.
- if (this_start_idx + common_length == this_array->length()) {
- this_chunk_idx++;
- this_start_idx = 0;
- } else {
- this_start_idx += common_length;
- }
-
- if (other_start_idx + common_length == other_array->length()) {
- other_chunk_idx++;
- other_start_idx = 0;
- } else {
- other_start_idx += common_length;
- }
- }
- return true;
-}
-
-bool ChunkedArray::Equals(const std::shared_ptr<ChunkedArray>& other) const {
- if (this == other.get()) { return true; }
- if (!other) { return false; }
- return Equals(*other.get());
-}
-
-Column::Column(const std::shared_ptr<Field>& field, const ArrayVector& chunks)
- : field_(field) {
- data_ = std::make_shared<ChunkedArray>(chunks);
-}
-
-Column::Column(const std::shared_ptr<Field>& field, const std::shared_ptr<Array>& data)
- : field_(field) {
- if (data) {
- data_ = std::make_shared<ChunkedArray>(ArrayVector({data}));
- } else {
- data_ = std::make_shared<ChunkedArray>(ArrayVector({}));
- }
-}
-
-Column::Column(const std::string& name, const std::shared_ptr<Array>& data)
- : Column(::arrow::field(name, data->type()), data) {}
-
-Column::Column(
- const std::shared_ptr<Field>& field, const std::shared_ptr<ChunkedArray>& data)
- : field_(field), data_(data) {}
-
-bool Column::Equals(const Column& other) const {
- if (!field_->Equals(other.field())) { return false; }
- return data_->Equals(other.data());
-}
-
-bool Column::Equals(const std::shared_ptr<Column>& other) const {
- if (this == other.get()) { return true; }
- if (!other) { return false; }
-
- return Equals(*other.get());
-}
-
-Status Column::ValidateData() {
- for (int i = 0; i < data_->num_chunks(); ++i) {
- std::shared_ptr<DataType> type = data_->chunk(i)->type();
- if (!this->type()->Equals(type)) {
- std::stringstream ss;
- ss << "In chunk " << i << " expected type " << this->type()->ToString()
- << " but saw " << type->ToString();
- return Status::Invalid(ss.str());
- }
- }
- return Status::OK();
-}
-
-} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/cpp/src/arrow/column.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/column.h b/cpp/src/arrow/column.h
deleted file mode 100644
index bfcfd8e..0000000
--- a/cpp/src/arrow/column.h
+++ /dev/null
@@ -1,104 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#ifndef ARROW_COLUMN_H
-#define ARROW_COLUMN_H
-
-#include <cstdint>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "arrow/type.h"
-#include "arrow/util/visibility.h"
-
-namespace arrow {
-
-class Array;
-class Status;
-
-typedef std::vector<std::shared_ptr<Array>> ArrayVector;
-
-// A data structure managing a list of primitive Arrow arrays logically as one
-// large array
-class ARROW_EXPORT ChunkedArray {
- public:
- explicit ChunkedArray(const ArrayVector& chunks);
-
- // @returns: the total length of the chunked array; computed on construction
- int64_t length() const { return length_; }
-
- int64_t null_count() const { return null_count_; }
-
- int num_chunks() const { return static_cast<int>(chunks_.size()); }
-
- std::shared_ptr<Array> chunk(int i) const { return chunks_[i]; }
-
- const ArrayVector& chunks() const { return chunks_; }
-
- bool Equals(const ChunkedArray& other) const;
- bool Equals(const std::shared_ptr<ChunkedArray>& other) const;
-
- protected:
- ArrayVector chunks_;
- int64_t length_;
- int64_t null_count_;
-};
-
-// An immutable column data structure consisting of a field (type metadata) and
-// a logical chunked data array (which can be validated as all being the same
-// type).
-class ARROW_EXPORT Column {
- public:
- Column(const std::shared_ptr<Field>& field, const ArrayVector& chunks);
- Column(const std::shared_ptr<Field>& field, const std::shared_ptr<ChunkedArray>& data);
-
- Column(const std::shared_ptr<Field>& field, const std::shared_ptr<Array>& data);
-
- /// Construct from name and array
- Column(const std::string& name, const std::shared_ptr<Array>& data);
-
- int64_t length() const { return data_->length(); }
-
- int64_t null_count() const { return data_->null_count(); }
-
- std::shared_ptr<Field> field() const { return field_; }
-
- // @returns: the column's name in the passed metadata
- const std::string& name() const { return field_->name; }
-
- // @returns: the column's type according to the metadata
- std::shared_ptr<DataType> type() const { return field_->type; }
-
- // @returns: the column's data as a chunked logical array
- std::shared_ptr<ChunkedArray> data() const { return data_; }
-
- bool Equals(const Column& other) const;
- bool Equals(const std::shared_ptr<Column>& other) const;
-
- // Verify that the column's array data is consistent with the passed field's
- // metadata
- Status ValidateData();
-
- protected:
- std::shared_ptr<Field> field_;
- std::shared_ptr<ChunkedArray> data_;
-};
-
-} // namespace arrow
-
-#endif // ARROW_COLUMN_H
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/cpp/src/arrow/ipc/feather.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc
index 000bba9..5820563 100644
--- a/cpp/src/arrow/ipc/feather.cc
+++ b/cpp/src/arrow/ipc/feather.cc
@@ -30,12 +30,12 @@
#include "arrow/array.h"
#include "arrow/buffer.h"
-#include "arrow/column.h"
#include "arrow/io/file.h"
#include "arrow/ipc/feather-internal.h"
#include "arrow/ipc/feather_generated.h"
#include "arrow/loader.h"
#include "arrow/status.h"
+#include "arrow/table.h"
#include "arrow/util/bit-util.h"
namespace arrow {
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/cpp/src/arrow/ipc/ipc-json-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/ipc-json-test.cc b/cpp/src/arrow/ipc/ipc-json-test.cc
index 68261ab..9cf6a88 100644
--- a/cpp/src/arrow/ipc/ipc-json-test.cc
+++ b/cpp/src/arrow/ipc/ipc-json-test.cc
@@ -52,7 +52,7 @@ void TestSchemaRoundTrip(const Schema& schema) {
std::shared_ptr<Schema> out;
ASSERT_OK(ReadJsonSchema(d, &out));
- if (!schema.Equals(out)) {
+ if (!schema.Equals(*out)) {
FAIL() << "In schema: " << schema.ToString() << "\nOut schema: " << out->ToString();
}
}
@@ -263,14 +263,14 @@ TEST(TestJsonFileReadWrite, BasicRoundTrip) {
reinterpret_cast<const uint8_t*>(result.c_str()), static_cast<int>(result.size()));
ASSERT_OK(JsonReader::Open(buffer, &reader));
- ASSERT_TRUE(reader->schema()->Equals(*schema.get()));
+ ASSERT_TRUE(reader->schema()->Equals(*schema));
ASSERT_EQ(nbatches, reader->num_record_batches());
for (int i = 0; i < nbatches; ++i) {
std::shared_ptr<RecordBatch> batch;
ASSERT_OK(reader->GetRecordBatch(i, &batch));
- ASSERT_TRUE(batch->Equals(*batches[i].get()));
+ ASSERT_TRUE(batch->Equals(*batches[i]));
}
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/cpp/src/arrow/ipc/ipc-read-write-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/ipc-read-write-test.cc b/cpp/src/arrow/ipc/ipc-read-write-test.cc
index 48e546e..6ddda3f 100644
--- a/cpp/src/arrow/ipc/ipc-read-write-test.cc
+++ b/cpp/src/arrow/ipc/ipc-read-write-test.cc
@@ -158,7 +158,7 @@ class IpcTestFixture : public io::MemoryMapFixture {
void CheckReadResult(const RecordBatch& result, const RecordBatch& expected) {
EXPECT_EQ(expected.num_rows(), result.num_rows());
- ASSERT_TRUE(expected.schema()->Equals(result.schema()));
+ ASSERT_TRUE(expected.schema()->Equals(*result.schema()));
ASSERT_EQ(expected.num_columns(), result.num_columns())
<< expected.schema()->ToString() << " result: " << result.schema()->ToString();
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/cpp/src/arrow/ipc/json-integration-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/json-integration-test.cc b/cpp/src/arrow/ipc/json-integration-test.cc
index c16074e..aa95500 100644
--- a/cpp/src/arrow/ipc/json-integration-test.cc
+++ b/cpp/src/arrow/ipc/json-integration-test.cc
@@ -33,10 +33,10 @@
#include "arrow/ipc/reader.h"
#include "arrow/ipc/writer.h"
#include "arrow/pretty_print.h"
-#include "arrow/schema.h"
#include "arrow/status.h"
#include "arrow/table.h"
#include "arrow/test-util.h"
+#include "arrow/type.h"
DEFINE_string(arrow, "", "Arrow file name");
DEFINE_string(json, "", "JSON file name");
@@ -143,7 +143,7 @@ static Status ValidateArrowVsJson(
auto json_schema = json_reader->schema();
auto arrow_schema = arrow_reader->schema();
- if (!json_schema->Equals(arrow_schema)) {
+ if (!json_schema->Equals(*arrow_schema)) {
std::stringstream ss;
ss << "JSON schema: \n"
<< json_schema->ToString() << "\n"
@@ -170,7 +170,7 @@ static Status ValidateArrowVsJson(
RETURN_NOT_OK(json_reader->GetRecordBatch(i, &json_batch));
RETURN_NOT_OK(arrow_reader->GetRecordBatch(i, &arrow_batch));
- if (!json_batch->ApproxEquals(*arrow_batch.get())) {
+ if (!json_batch->ApproxEquals(*arrow_batch)) {
std::stringstream ss;
ss << "Record batch " << i << " did not match";
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/cpp/src/arrow/ipc/json-internal.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc
index 95ab011..9572a0a 100644
--- a/cpp/src/arrow/ipc/json-internal.cc
+++ b/cpp/src/arrow/ipc/json-internal.cc
@@ -33,7 +33,6 @@
#include "arrow/array.h"
#include "arrow/builder.h"
#include "arrow/memory_pool.h"
-#include "arrow/schema.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/cpp/src/arrow/ipc/json.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/json.cc b/cpp/src/arrow/ipc/json.cc
index a01be19..8056b6f 100644
--- a/cpp/src/arrow/ipc/json.cc
+++ b/cpp/src/arrow/ipc/json.cc
@@ -26,7 +26,6 @@
#include "arrow/buffer.h"
#include "arrow/ipc/json-internal.h"
#include "arrow/memory_pool.h"
-#include "arrow/schema.h"
#include "arrow/status.h"
#include "arrow/table.h"
#include "arrow/type.h"
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/cpp/src/arrow/ipc/metadata.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/metadata.cc b/cpp/src/arrow/ipc/metadata.cc
index 85dc8b3..36ba4b2 100644
--- a/cpp/src/arrow/ipc/metadata.cc
+++ b/cpp/src/arrow/ipc/metadata.cc
@@ -29,7 +29,6 @@
#include "arrow/io/interfaces.h"
#include "arrow/ipc/File_generated.h"
#include "arrow/ipc/Message_generated.h"
-#include "arrow/schema.h"
#include "arrow/status.h"
#include "arrow/type.h"
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/cpp/src/arrow/ipc/reader.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc
index 03c678a..28320d9 100644
--- a/cpp/src/arrow/ipc/reader.cc
+++ b/cpp/src/arrow/ipc/reader.cc
@@ -30,9 +30,9 @@
#include "arrow/ipc/Message_generated.h"
#include "arrow/ipc/metadata.h"
#include "arrow/ipc/util.h"
-#include "arrow/schema.h"
#include "arrow/status.h"
#include "arrow/table.h"
+#include "arrow/type.h"
#include "arrow/util/logging.h"
namespace arrow {
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/cpp/src/arrow/ipc/test-common.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h
index 994e128..583f909 100644
--- a/cpp/src/arrow/ipc/test-common.h
+++ b/cpp/src/arrow/ipc/test-common.h
@@ -49,7 +49,7 @@ static inline void AssertSchemaEqual(const Schema& lhs, const Schema& rhs) {
}
static inline void CompareBatch(const RecordBatch& left, const RecordBatch& right) {
- if (!left.schema()->Equals(right.schema())) {
+ if (!left.schema()->Equals(*right.schema())) {
FAIL() << "Left schema: " << left.schema()->ToString()
<< "\nRight schema: " << right.schema()->ToString();
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/cpp/src/arrow/ipc/writer.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc
index 92e6194..db5f082 100644
--- a/cpp/src/arrow/ipc/writer.cc
+++ b/cpp/src/arrow/ipc/writer.cc
@@ -32,7 +32,6 @@
#include "arrow/ipc/util.h"
#include "arrow/loader.h"
#include "arrow/memory_pool.h"
-#include "arrow/schema.h"
#include "arrow/status.h"
#include "arrow/table.h"
#include "arrow/type.h"
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/cpp/src/arrow/python/pandas-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/pandas-test.cc b/cpp/src/arrow/python/pandas-test.cc
index ae2527e..0d643df 100644
--- a/cpp/src/arrow/python/pandas-test.cc
+++ b/cpp/src/arrow/python/pandas-test.cc
@@ -25,7 +25,6 @@
#include "arrow/array.h"
#include "arrow/builder.h"
#include "arrow/python/pandas_convert.h"
-#include "arrow/schema.h"
#include "arrow/table.h"
#include "arrow/test-util.h"
#include "arrow/type.h"
@@ -52,7 +51,7 @@ TEST(PandasConversionTest, TestObjectBlockWriteFails) {
std::make_shared<Column>(f2, arr), std::make_shared<Column>(f3, arr)};
auto schema = std::make_shared<Schema>(fields);
- auto table = std::make_shared<Table>("", schema, cols);
+ auto table = std::make_shared<Table>(schema, cols);
PyObject* out;
Py_BEGIN_ALLOW_THREADS;
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/cpp/src/arrow/python/pandas_convert.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc
index f2c2415..685b1f4 100644
--- a/cpp/src/arrow/python/pandas_convert.cc
+++ b/cpp/src/arrow/python/pandas_convert.cc
@@ -35,7 +35,6 @@
#include <vector>
#include "arrow/array.h"
-#include "arrow/column.h"
#include "arrow/loader.h"
#include "arrow/python/builtin_convert.h"
#include "arrow/python/common.h"
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/cpp/src/arrow/schema.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/schema.cc b/cpp/src/arrow/schema.cc
deleted file mode 100644
index aa38fd3..0000000
--- a/cpp/src/arrow/schema.cc
+++ /dev/null
@@ -1,72 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/schema.h"
-
-#include <memory>
-#include <sstream>
-#include <string>
-#include <vector>
-
-#include "arrow/type.h"
-
-namespace arrow {
-
-Schema::Schema(const std::vector<std::shared_ptr<Field>>& fields) : fields_(fields) {}
-
-bool Schema::Equals(const Schema& other) const {
- if (this == &other) { return true; }
-
- if (num_fields() != other.num_fields()) { return false; }
- for (int i = 0; i < num_fields(); ++i) {
- if (!field(i)->Equals(*other.field(i).get())) { return false; }
- }
- return true;
-}
-
-bool Schema::Equals(const std::shared_ptr<Schema>& other) const {
- return Equals(*other.get());
-}
-
-std::shared_ptr<Field> Schema::GetFieldByName(const std::string& name) {
- if (fields_.size() > 0 && name_to_index_.size() == 0) {
- for (size_t i = 0; i < fields_.size(); ++i) {
- name_to_index_[fields_[i]->name] = static_cast<int>(i);
- }
- }
-
- auto it = name_to_index_.find(name);
- if (it == name_to_index_.end()) {
- return nullptr;
- } else {
- return fields_[it->second];
- }
-}
-
-std::string Schema::ToString() const {
- std::stringstream buffer;
-
- int i = 0;
- for (auto field : fields_) {
- if (i > 0) { buffer << std::endl; }
- buffer << field->ToString();
- ++i;
- }
- return buffer.str();
-}
-
-} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/cpp/src/arrow/schema.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/schema.h b/cpp/src/arrow/schema.h
deleted file mode 100644
index 37cdbf7..0000000
--- a/cpp/src/arrow/schema.h
+++ /dev/null
@@ -1,59 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#ifndef ARROW_SCHEMA_H
-#define ARROW_SCHEMA_H
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "arrow/type.h"
-#include "arrow/util/visibility.h"
-
-namespace arrow {
-
-class ARROW_EXPORT Schema {
- public:
- explicit Schema(const std::vector<std::shared_ptr<Field>>& fields);
-
- // Returns true if all of the schema fields are equal
- bool Equals(const Schema& other) const;
- bool Equals(const std::shared_ptr<Schema>& other) const;
-
- // Return the ith schema element. Does not boundscheck
- std::shared_ptr<Field> field(int i) const { return fields_[i]; }
-
- // Returns nullptr if name not found
- std::shared_ptr<Field> GetFieldByName(const std::string& name);
-
- const std::vector<std::shared_ptr<Field>>& fields() const { return fields_; }
-
- // Render a string representation of the schema suitable for debugging
- std::string ToString() const;
-
- int num_fields() const { return static_cast<int>(fields_.size()); }
-
- private:
- std::vector<std::shared_ptr<Field>> fields_;
- std::unordered_map<std::string, int> name_to_index_;
-};
-
-} // namespace arrow
-
-#endif // ARROW_FIELD_H
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/cpp/src/arrow/table-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/table-test.cc b/cpp/src/arrow/table-test.cc
index 6bb3163..3853306 100644
--- a/cpp/src/arrow/table-test.cc
+++ b/cpp/src/arrow/table-test.cc
@@ -22,8 +22,6 @@
#include "gtest/gtest.h"
#include "arrow/array.h"
-#include "arrow/column.h"
-#include "arrow/schema.h"
#include "arrow/status.h"
#include "arrow/table.h"
#include "arrow/test-common.h"
@@ -35,6 +33,160 @@ using std::vector;
namespace arrow {
+class TestChunkedArray : public TestBase {
+ protected:
+ virtual void Construct() {
+ one_ = std::make_shared<ChunkedArray>(arrays_one_);
+ another_ = std::make_shared<ChunkedArray>(arrays_another_);
+ }
+
+ ArrayVector arrays_one_;
+ ArrayVector arrays_another_;
+
+ std::shared_ptr<ChunkedArray> one_;
+ std::shared_ptr<ChunkedArray> another_;
+};
+
+TEST_F(TestChunkedArray, BasicEquals) {
+ std::vector<bool> null_bitmap(100, true);
+ std::vector<int32_t> data(100, 1);
+ std::shared_ptr<Array> array;
+ ArrayFromVector<Int32Type, int32_t>(null_bitmap, data, &array);
+ arrays_one_.push_back(array);
+ arrays_another_.push_back(array);
+
+ Construct();
+ ASSERT_TRUE(one_->Equals(one_));
+ ASSERT_FALSE(one_->Equals(nullptr));
+ ASSERT_TRUE(one_->Equals(another_));
+ ASSERT_TRUE(one_->Equals(*another_.get()));
+}
+
+TEST_F(TestChunkedArray, EqualsDifferingTypes) {
+ std::vector<bool> null_bitmap(100, true);
+ std::vector<int32_t> data32(100, 1);
+ std::vector<int64_t> data64(100, 1);
+ std::shared_ptr<Array> array;
+ ArrayFromVector<Int32Type, int32_t>(null_bitmap, data32, &array);
+ arrays_one_.push_back(array);
+ ArrayFromVector<Int64Type, int64_t>(null_bitmap, data64, &array);
+ arrays_another_.push_back(array);
+
+ Construct();
+ ASSERT_FALSE(one_->Equals(another_));
+ ASSERT_FALSE(one_->Equals(*another_.get()));
+}
+
+TEST_F(TestChunkedArray, EqualsDifferingLengths) {
+ std::vector<bool> null_bitmap100(100, true);
+ std::vector<bool> null_bitmap101(101, true);
+ std::vector<int32_t> data100(100, 1);
+ std::vector<int32_t> data101(101, 1);
+ std::shared_ptr<Array> array;
+ ArrayFromVector<Int32Type, int32_t>(null_bitmap100, data100, &array);
+ arrays_one_.push_back(array);
+ ArrayFromVector<Int32Type, int32_t>(null_bitmap101, data101, &array);
+ arrays_another_.push_back(array);
+
+ Construct();
+ ASSERT_FALSE(one_->Equals(another_));
+ ASSERT_FALSE(one_->Equals(*another_.get()));
+
+ std::vector<bool> null_bitmap1(1, true);
+ std::vector<int32_t> data1(1, 1);
+ ArrayFromVector<Int32Type, int32_t>(null_bitmap1, data1, &array);
+ arrays_one_.push_back(array);
+
+ Construct();
+ ASSERT_TRUE(one_->Equals(another_));
+ ASSERT_TRUE(one_->Equals(*another_.get()));
+}
+
+class TestColumn : public TestChunkedArray {
+ protected:
+ void Construct() override {
+ TestChunkedArray::Construct();
+
+ one_col_ = std::make_shared<Column>(one_field_, one_);
+ another_col_ = std::make_shared<Column>(another_field_, another_);
+ }
+
+ std::shared_ptr<ChunkedArray> data_;
+ std::unique_ptr<Column> column_;
+
+ std::shared_ptr<Field> one_field_;
+ std::shared_ptr<Field> another_field_;
+
+ std::shared_ptr<Column> one_col_;
+ std::shared_ptr<Column> another_col_;
+};
+
+TEST_F(TestColumn, BasicAPI) {
+ ArrayVector arrays;
+ arrays.push_back(MakePrimitive<Int32Array>(100));
+ arrays.push_back(MakePrimitive<Int32Array>(100, 10));
+ arrays.push_back(MakePrimitive<Int32Array>(100, 20));
+
+ auto field = std::make_shared<Field>("c0", int32());
+ column_.reset(new Column(field, arrays));
+
+ ASSERT_EQ("c0", column_->name());
+ ASSERT_TRUE(column_->type()->Equals(int32()));
+ ASSERT_EQ(300, column_->length());
+ ASSERT_EQ(30, column_->null_count());
+ ASSERT_EQ(3, column_->data()->num_chunks());
+
+ // nullptr array should not break
+ column_.reset(new Column(field, std::shared_ptr<Array>(nullptr)));
+ ASSERT_NE(column_.get(), nullptr);
+}
+
+TEST_F(TestColumn, ChunksInhomogeneous) {
+ ArrayVector arrays;
+ arrays.push_back(MakePrimitive<Int32Array>(100));
+ arrays.push_back(MakePrimitive<Int32Array>(100, 10));
+
+ auto field = std::make_shared<Field>("c0", int32());
+ column_.reset(new Column(field, arrays));
+
+ ASSERT_OK(column_->ValidateData());
+
+ arrays.push_back(MakePrimitive<Int16Array>(100, 10));
+ column_.reset(new Column(field, arrays));
+ ASSERT_RAISES(Invalid, column_->ValidateData());
+}
+
+TEST_F(TestColumn, Equals) {
+ std::vector<bool> null_bitmap(100, true);
+ std::vector<int32_t> data(100, 1);
+ std::shared_ptr<Array> array;
+ ArrayFromVector<Int32Type, int32_t>(null_bitmap, data, &array);
+ arrays_one_.push_back(array);
+ arrays_another_.push_back(array);
+
+ one_field_ = std::make_shared<Field>("column", int32());
+ another_field_ = std::make_shared<Field>("column", int32());
+
+ Construct();
+ ASSERT_TRUE(one_col_->Equals(one_col_));
+ ASSERT_FALSE(one_col_->Equals(nullptr));
+ ASSERT_TRUE(one_col_->Equals(another_col_));
+ ASSERT_TRUE(one_col_->Equals(*another_col_.get()));
+
+ // Field is different
+ another_field_ = std::make_shared<Field>("two", int32());
+ Construct();
+ ASSERT_FALSE(one_col_->Equals(another_col_));
+ ASSERT_FALSE(one_col_->Equals(*another_col_.get()));
+
+ // ChunkedArray is different
+ another_field_ = std::make_shared<Field>("column", int32());
+ arrays_another_.push_back(array);
+ Construct();
+ ASSERT_FALSE(one_col_->Equals(another_col_));
+ ASSERT_FALSE(one_col_->Equals(*another_col_.get()));
+}
+
class TestTable : public TestBase {
public:
void MakeExample1(int length) {
@@ -63,7 +215,7 @@ class TestTable : public TestBase {
TEST_F(TestTable, EmptySchema) {
auto empty_schema = shared_ptr<Schema>(new Schema({}));
- table_.reset(new Table("data", empty_schema, columns_));
+ table_.reset(new Table(empty_schema, columns_));
ASSERT_OK(table_->ValidateColumns());
ASSERT_EQ(0, table_->num_rows());
ASSERT_EQ(0, table_->num_columns());
@@ -73,17 +225,13 @@ TEST_F(TestTable, Ctors) {
const int length = 100;
MakeExample1(length);
- std::string name = "data";
-
- table_.reset(new Table(name, schema_, columns_));
+ table_.reset(new Table(schema_, columns_));
ASSERT_OK(table_->ValidateColumns());
- ASSERT_EQ(name, table_->name());
ASSERT_EQ(length, table_->num_rows());
ASSERT_EQ(3, table_->num_columns());
- table_.reset(new Table(name, schema_, columns_, length));
+ table_.reset(new Table(schema_, columns_, length));
ASSERT_OK(table_->ValidateColumns());
- ASSERT_EQ(name, table_->name());
ASSERT_EQ(length, table_->num_rows());
}
@@ -91,10 +239,9 @@ TEST_F(TestTable, Metadata) {
const int length = 100;
MakeExample1(length);
- std::string name = "data";
- table_.reset(new Table(name, schema_, columns_));
+ table_.reset(new Table(schema_, columns_));
- ASSERT_TRUE(table_->schema()->Equals(schema_));
+ ASSERT_TRUE(table_->schema()->Equals(*schema_));
auto col = table_->column(0);
ASSERT_EQ(schema_->field(0)->name, col->name());
@@ -106,13 +253,13 @@ TEST_F(TestTable, InvalidColumns) {
const int length = 100;
MakeExample1(length);
- table_.reset(new Table("data", schema_, columns_, length - 1));
+ table_.reset(new Table(schema_, columns_, length - 1));
ASSERT_RAISES(Invalid, table_->ValidateColumns());
columns_.clear();
// Wrong number of columns
- table_.reset(new Table("data", schema_, columns_, length));
+ table_.reset(new Table(schema_, columns_, length));
ASSERT_RAISES(Invalid, table_->ValidateColumns());
columns_ = {
@@ -120,7 +267,7 @@ TEST_F(TestTable, InvalidColumns) {
std::make_shared<Column>(schema_->field(1), MakePrimitive<UInt8Array>(length)),
std::make_shared<Column>(schema_->field(2), MakePrimitive<Int16Array>(length - 1))};
- table_.reset(new Table("data", schema_, columns_, length));
+ table_.reset(new Table(schema_, columns_, length));
ASSERT_RAISES(Invalid, table_->ValidateColumns());
}
@@ -128,26 +275,22 @@ TEST_F(TestTable, Equals) {
const int length = 100;
MakeExample1(length);
- std::string name = "data";
- table_.reset(new Table(name, schema_, columns_));
+ table_.reset(new Table(schema_, columns_));
- ASSERT_TRUE(table_->Equals(table_));
- ASSERT_FALSE(table_->Equals(nullptr));
- // Differing name
- ASSERT_FALSE(table_->Equals(std::make_shared<Table>("other_name", schema_, columns_)));
+ ASSERT_TRUE(table_->Equals(*table_));
// Differing schema
auto f0 = std::make_shared<Field>("f3", int32());
auto f1 = std::make_shared<Field>("f4", uint8());
auto f2 = std::make_shared<Field>("f5", int16());
vector<shared_ptr<Field>> fields = {f0, f1, f2};
auto other_schema = std::make_shared<Schema>(fields);
- ASSERT_FALSE(table_->Equals(std::make_shared<Table>(name, other_schema, columns_)));
+ ASSERT_FALSE(table_->Equals(Table(other_schema, columns_)));
// Differing columns
std::vector<std::shared_ptr<Column>> other_columns = {
std::make_shared<Column>(schema_->field(0), MakePrimitive<Int32Array>(length, 10)),
std::make_shared<Column>(schema_->field(1), MakePrimitive<UInt8Array>(length, 10)),
std::make_shared<Column>(schema_->field(2), MakePrimitive<Int16Array>(length, 10))};
- ASSERT_FALSE(table_->Equals(std::make_shared<Table>(name, schema_, other_columns)));
+ ASSERT_FALSE(table_->Equals(Table(schema_, other_columns)));
}
TEST_F(TestTable, FromRecordBatches) {
@@ -157,10 +300,10 @@ TEST_F(TestTable, FromRecordBatches) {
auto batch1 = std::make_shared<RecordBatch>(schema_, length, arrays_);
std::shared_ptr<Table> result, expected;
- ASSERT_OK(Table::FromRecordBatches("foo", {batch1}, &result));
+ ASSERT_OK(Table::FromRecordBatches({batch1}, &result));
- expected = std::make_shared<Table>("foo", schema_, columns_);
- ASSERT_TRUE(result->Equals(expected));
+ expected = std::make_shared<Table>(schema_, columns_);
+ ASSERT_TRUE(result->Equals(*expected));
std::vector<std::shared_ptr<Column>> other_columns;
for (int i = 0; i < schema_->num_fields(); ++i) {
@@ -168,20 +311,20 @@ TEST_F(TestTable, FromRecordBatches) {
other_columns.push_back(std::make_shared<Column>(schema_->field(i), col_arrays));
}
- ASSERT_OK(Table::FromRecordBatches("foo", {batch1, batch1}, &result));
- expected = std::make_shared<Table>("foo", schema_, other_columns);
- ASSERT_TRUE(result->Equals(expected));
+ ASSERT_OK(Table::FromRecordBatches({batch1, batch1}, &result));
+ expected = std::make_shared<Table>(schema_, other_columns);
+ ASSERT_TRUE(result->Equals(*expected));
// Error states
std::vector<std::shared_ptr<RecordBatch>> empty_batches;
- ASSERT_RAISES(Invalid, Table::FromRecordBatches("", empty_batches, &result));
+ ASSERT_RAISES(Invalid, Table::FromRecordBatches(empty_batches, &result));
std::vector<std::shared_ptr<Field>> fields = {schema_->field(0), schema_->field(1)};
auto other_schema = std::make_shared<Schema>(fields);
std::vector<std::shared_ptr<Array>> other_arrays = {arrays_[0], arrays_[1]};
auto batch2 = std::make_shared<RecordBatch>(other_schema, length, other_arrays);
- ASSERT_RAISES(Invalid, Table::FromRecordBatches("", {batch1, batch2}, &result));
+ ASSERT_RAISES(Invalid, Table::FromRecordBatches({batch1, batch2}, &result));
}
TEST_F(TestTable, ConcatenateTables) {
@@ -195,25 +338,50 @@ TEST_F(TestTable, ConcatenateTables) {
auto batch2 = std::make_shared<RecordBatch>(schema_, length, arrays_);
std::shared_ptr<Table> t1, t2, t3, result, expected;
- ASSERT_OK(Table::FromRecordBatches("foo", {batch1}, &t1));
- ASSERT_OK(Table::FromRecordBatches("foo", {batch2}, &t2));
+ ASSERT_OK(Table::FromRecordBatches({batch1}, &t1));
+ ASSERT_OK(Table::FromRecordBatches({batch2}, &t2));
- ASSERT_OK(ConcatenateTables("bar", {t1, t2}, &result));
- ASSERT_OK(Table::FromRecordBatches("bar", {batch1, batch2}, &expected));
- ASSERT_TRUE(result->Equals(expected));
+ ASSERT_OK(ConcatenateTables({t1, t2}, &result));
+ ASSERT_OK(Table::FromRecordBatches({batch1, batch2}, &expected));
+ ASSERT_TRUE(result->Equals(*expected));
// Error states
std::vector<std::shared_ptr<Table>> empty_tables;
- ASSERT_RAISES(Invalid, ConcatenateTables("", empty_tables, &result));
+ ASSERT_RAISES(Invalid, ConcatenateTables(empty_tables, &result));
std::vector<std::shared_ptr<Field>> fields = {schema_->field(0), schema_->field(1)};
auto other_schema = std::make_shared<Schema>(fields);
std::vector<std::shared_ptr<Array>> other_arrays = {arrays_[0], arrays_[1]};
auto batch3 = std::make_shared<RecordBatch>(other_schema, length, other_arrays);
- ASSERT_OK(Table::FromRecordBatches("", {batch3}, &t3));
+ ASSERT_OK(Table::FromRecordBatches({batch3}, &t3));
+
+ ASSERT_RAISES(Invalid, ConcatenateTables({t1, t3}, &result));
+}
+
+TEST_F(TestTable, RemoveColumn) {
+ const int64_t length = 10;
+ MakeExample1(length);
+
+ Table table(schema_, columns_);
+
+ std::shared_ptr<Table> result;
+ ASSERT_OK(table.RemoveColumn(0, &result));
+
+ auto ex_schema =
+ std::shared_ptr<Schema>(new Schema({schema_->field(1), schema_->field(2)}));
+ std::vector<std::shared_ptr<Column>> ex_columns = {table.column(1), table.column(2)};
+ ASSERT_TRUE(result->Equals(Table(ex_schema, ex_columns)));
+
+ ASSERT_OK(table.RemoveColumn(1, &result));
+ ex_schema = std::shared_ptr<Schema>(new Schema({schema_->field(0), schema_->field(2)}));
+ ex_columns = {table.column(0), table.column(2)};
+ ASSERT_TRUE(result->Equals(Table(ex_schema, ex_columns)));
- ASSERT_RAISES(Invalid, ConcatenateTables("foo", {t1, t3}, &result));
+ ASSERT_OK(table.RemoveColumn(2, &result));
+ ex_schema = std::shared_ptr<Schema>(new Schema({schema_->field(0), schema_->field(1)}));
+ ex_columns = {table.column(0), table.column(1)};
+ ASSERT_TRUE(result->Equals(Table(ex_schema, ex_columns)));
}
class TestRecordBatch : public TestBase {};
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/cpp/src/arrow/table.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc
index 3f254aa..8e283f4 100644
--- a/cpp/src/arrow/table.cc
+++ b/cpp/src/arrow/table.cc
@@ -23,12 +23,122 @@
#include <sstream>
#include "arrow/array.h"
-#include "arrow/column.h"
-#include "arrow/schema.h"
#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/stl.h"
namespace arrow {
+// ----------------------------------------------------------------------
+// ChunkedArray and Column methods
+
+ChunkedArray::ChunkedArray(const ArrayVector& chunks) : chunks_(chunks) {
+ length_ = 0;
+ null_count_ = 0;
+ for (const std::shared_ptr<Array>& chunk : chunks) {
+ length_ += chunk->length();
+ null_count_ += chunk->null_count();
+ }
+}
+
+bool ChunkedArray::Equals(const ChunkedArray& other) const {
+ if (length_ != other.length()) { return false; }
+ if (null_count_ != other.null_count()) { return false; }
+
+ // Check contents of the underlying arrays. This checks for equality of
+ // the underlying data independently of the chunk size.
+ int this_chunk_idx = 0;
+ int64_t this_start_idx = 0;
+ int other_chunk_idx = 0;
+ int64_t other_start_idx = 0;
+
+ int64_t elements_compared = 0;
+ while (elements_compared < length_) {
+ const std::shared_ptr<Array> this_array = chunks_[this_chunk_idx];
+ const std::shared_ptr<Array> other_array = other.chunk(other_chunk_idx);
+ int64_t common_length = std::min(
+ this_array->length() - this_start_idx, other_array->length() - other_start_idx);
+ if (!this_array->RangeEquals(this_start_idx, this_start_idx + common_length,
+ other_start_idx, other_array)) {
+ return false;
+ }
+
+ elements_compared += common_length;
+
+ // If we have exhausted the current chunk, proceed to the next one individually.
+ if (this_start_idx + common_length == this_array->length()) {
+ this_chunk_idx++;
+ this_start_idx = 0;
+ } else {
+ this_start_idx += common_length;
+ }
+
+ if (other_start_idx + common_length == other_array->length()) {
+ other_chunk_idx++;
+ other_start_idx = 0;
+ } else {
+ other_start_idx += common_length;
+ }
+ }
+ return true;
+}
+
+bool ChunkedArray::Equals(const std::shared_ptr<ChunkedArray>& other) const {
+ if (this == other.get()) { return true; }
+ if (!other) { return false; }
+ return Equals(*other.get());
+}
+
+Column::Column(const std::shared_ptr<Field>& field, const ArrayVector& chunks)
+ : field_(field) {
+ data_ = std::make_shared<ChunkedArray>(chunks);
+}
+
+Column::Column(const std::shared_ptr<Field>& field, const std::shared_ptr<Array>& data)
+ : field_(field) {
+ if (data) {
+ data_ = std::make_shared<ChunkedArray>(ArrayVector({data}));
+ } else {
+ data_ = std::make_shared<ChunkedArray>(ArrayVector({}));
+ }
+}
+
+Column::Column(const std::string& name, const std::shared_ptr<Array>& data)
+ : Column(::arrow::field(name, data->type()), data) {}
+
+Column::Column(
+ const std::shared_ptr<Field>& field, const std::shared_ptr<ChunkedArray>& data)
+ : field_(field), data_(data) {}
+
+bool Column::Equals(const Column& other) const {
+ if (!field_->Equals(other.field())) { return false; }
+ return data_->Equals(other.data());
+}
+
+bool Column::Equals(const std::shared_ptr<Column>& other) const {
+ if (this == other.get()) { return true; }
+ if (!other) { return false; }
+
+ return Equals(*other.get());
+}
+
+Status Column::ValidateData() {
+ for (int i = 0; i < data_->num_chunks(); ++i) {
+ std::shared_ptr<DataType> type = data_->chunk(i)->type();
+ if (!this->type()->Equals(type)) {
+ std::stringstream ss;
+ ss << "In chunk " << i << " expected type " << this->type()->ToString()
+ << " but saw " << type->ToString();
+ return Status::Invalid(ss.str());
+ }
+ }
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// RecordBatch methods
+
RecordBatch::RecordBatch(const std::shared_ptr<Schema>& schema, int64_t num_rows,
const std::vector<std::shared_ptr<Array>>& columns)
: schema_(schema), num_rows_(num_rows), columns_(columns) {}
@@ -83,9 +193,9 @@ std::shared_ptr<RecordBatch> RecordBatch::Slice(int64_t offset, int64_t length)
// ----------------------------------------------------------------------
// Table methods
-Table::Table(const std::string& name, const std::shared_ptr<Schema>& schema,
+Table::Table(const std::shared_ptr<Schema>& schema,
const std::vector<std::shared_ptr<Column>>& columns)
- : name_(name), schema_(schema), columns_(columns) {
+ : schema_(schema), columns_(columns) {
if (columns.size() == 0) {
num_rows_ = 0;
} else {
@@ -93,12 +203,11 @@ Table::Table(const std::string& name, const std::shared_ptr<Schema>& schema,
}
}
-Table::Table(const std::string& name, const std::shared_ptr<Schema>& schema,
+Table::Table(const std::shared_ptr<Schema>& schema,
const std::vector<std::shared_ptr<Column>>& columns, int64_t num_rows)
- : name_(name), schema_(schema), columns_(columns), num_rows_(num_rows) {}
+ : schema_(schema), columns_(columns), num_rows_(num_rows) {}
-Status Table::FromRecordBatches(const std::string& name,
- const std::vector<std::shared_ptr<RecordBatch>>& batches,
+Status Table::FromRecordBatches(const std::vector<std::shared_ptr<RecordBatch>>& batches,
std::shared_ptr<Table>* table) {
if (batches.size() == 0) {
return Status::Invalid("Must pass at least one record batch");
@@ -110,7 +219,7 @@ Status Table::FromRecordBatches(const std::string& name,
const int ncolumns = static_cast<int>(schema->num_fields());
for (int i = 1; i < nbatches; ++i) {
- if (!batches[i]->schema()->Equals(schema)) {
+ if (!batches[i]->schema()->Equals(*schema)) {
std::stringstream ss;
ss << "Schema at index " << static_cast<int>(i) << " was different: \n"
<< schema->ToString() << "\nvs\n"
@@ -129,11 +238,11 @@ Status Table::FromRecordBatches(const std::string& name,
columns[i] = std::make_shared<Column>(schema->field(i), column_arrays);
}
- *table = std::make_shared<Table>(name, schema, columns);
+ *table = std::make_shared<Table>(schema, columns);
return Status::OK();
}
-Status ConcatenateTables(const std::string& output_name,
+Status ConcatenateTables(
const std::vector<std::shared_ptr<Table>>& tables, std::shared_ptr<Table>* table) {
if (tables.size() == 0) { return Status::Invalid("Must pass at least one table"); }
@@ -143,7 +252,7 @@ Status ConcatenateTables(const std::string& output_name,
const int ncolumns = static_cast<int>(schema->num_fields());
for (int i = 1; i < ntables; ++i) {
- if (!tables[i]->schema()->Equals(schema)) {
+ if (!tables[i]->schema()->Equals(*schema)) {
std::stringstream ss;
ss << "Schema at index " << static_cast<int>(i) << " was different: \n"
<< schema->ToString() << "\nvs\n"
@@ -164,13 +273,13 @@ Status ConcatenateTables(const std::string& output_name,
}
columns[i] = std::make_shared<Column>(schema->field(i), column_arrays);
}
- *table = std::make_shared<Table>(output_name, schema, columns);
+ *table = std::make_shared<Table>(schema, columns);
return Status::OK();
}
bool Table::Equals(const Table& other) const {
- if (name_ != other.name()) { return false; }
- if (!schema_->Equals(other.schema())) { return false; }
+ if (this == &other) { return true; }
+ if (!schema_->Equals(*other.schema())) { return false; }
if (static_cast<int64_t>(columns_.size()) != other.num_columns()) { return false; }
for (int i = 0; i < static_cast<int>(columns_.size()); i++) {
@@ -179,10 +288,12 @@ bool Table::Equals(const Table& other) const {
return true;
}
-bool Table::Equals(const std::shared_ptr<Table>& other) const {
- if (this == other.get()) { return true; }
- if (!other) { return false; }
- return Equals(*other.get());
+Status Table::RemoveColumn(int i, std::shared_ptr<Table>* out) const {
+ std::shared_ptr<Schema> new_schema;
+ RETURN_NOT_OK(schema_->RemoveField(i, &new_schema));
+
+ *out = std::make_shared<Table>(new_schema, DeleteVectorElement(columns_, i));
+ return Status::OK();
}
Status Table::ValidateColumns() const {
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/cpp/src/arrow/table.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h
index bf0d99c..7b739c9 100644
--- a/cpp/src/arrow/table.h
+++ b/cpp/src/arrow/table.h
@@ -23,6 +23,7 @@
#include <string>
#include <vector>
+#include "arrow/type.h"
#include "arrow/util/visibility.h"
namespace arrow {
@@ -32,6 +33,74 @@ class Column;
class Schema;
class Status;
+using ArrayVector = std::vector<std::shared_ptr<Array>>;
+
+// A data structure managing a list of primitive Arrow arrays logically as one
+// large array
+class ARROW_EXPORT ChunkedArray {
+ public:
+ explicit ChunkedArray(const ArrayVector& chunks);
+
+ // @returns: the total length of the chunked array; computed on construction
+ int64_t length() const { return length_; }
+
+ int64_t null_count() const { return null_count_; }
+
+ int num_chunks() const { return static_cast<int>(chunks_.size()); }
+
+ std::shared_ptr<Array> chunk(int i) const { return chunks_[i]; }
+
+ const ArrayVector& chunks() const { return chunks_; }
+
+ bool Equals(const ChunkedArray& other) const;
+ bool Equals(const std::shared_ptr<ChunkedArray>& other) const;
+
+ protected:
+ ArrayVector chunks_;
+ int64_t length_;
+ int64_t null_count_;
+};
+
+// An immutable column data structure consisting of a field (type metadata) and
+// a logical chunked data array (which can be validated as all being the same
+// type).
+class ARROW_EXPORT Column {
+ public:
+ Column(const std::shared_ptr<Field>& field, const ArrayVector& chunks);
+ Column(const std::shared_ptr<Field>& field, const std::shared_ptr<ChunkedArray>& data);
+
+ Column(const std::shared_ptr<Field>& field, const std::shared_ptr<Array>& data);
+
+ /// Construct from name and array
+ Column(const std::string& name, const std::shared_ptr<Array>& data);
+
+ int64_t length() const { return data_->length(); }
+
+ int64_t null_count() const { return data_->null_count(); }
+
+ std::shared_ptr<Field> field() const { return field_; }
+
+ // @returns: the column's name in the passed metadata
+ const std::string& name() const { return field_->name; }
+
+ // @returns: the column's type according to the metadata
+ std::shared_ptr<DataType> type() const { return field_->type; }
+
+ // @returns: the column's data as a chunked logical array
+ std::shared_ptr<ChunkedArray> data() const { return data_; }
+
+ bool Equals(const Column& other) const;
+ bool Equals(const std::shared_ptr<Column>& other) const;
+
+ // Verify that the column's array data is consistent with the passed field's
+ // metadata
+ Status ValidateData();
+
+ protected:
+ std::shared_ptr<Field> field_;
+ std::shared_ptr<ChunkedArray> data_;
+};
+
// A record batch is a simpler and more rigid table data structure intended for
// use primarily in shared memory IPC. It contains a schema (metadata) and a
// corresponding sequence of equal-length Arrow arrays
@@ -81,25 +150,22 @@ class ARROW_EXPORT RecordBatch {
class ARROW_EXPORT Table {
public:
// If columns is zero-length, the table's number of rows is zero
- Table(const std::string& name, const std::shared_ptr<Schema>& schema,
+ Table(const std::shared_ptr<Schema>& schema,
const std::vector<std::shared_ptr<Column>>& columns);
// num_rows is a parameter to allow for tables of a particular size not
// having any materialized columns. Each column should therefore have the
// same length as num_rows -- you can validate this using
// Table::ValidateColumns
- Table(const std::string& name, const std::shared_ptr<Schema>& schema,
- const std::vector<std::shared_ptr<Column>>& columns, int64_t nubm_rows);
+ Table(const std::shared_ptr<Schema>& schema,
+ const std::vector<std::shared_ptr<Column>>& columns, int64_t num_rows);
// Construct table from RecordBatch, but only if all of the batch schemas are
// equal. Returns Status::Invalid if there is some problem
- static Status FromRecordBatches(const std::string& name,
+ static Status FromRecordBatches(
const std::vector<std::shared_ptr<RecordBatch>>& batches,
std::shared_ptr<Table>* table);
- // @returns: the table's name, if any (may be length 0)
- const std::string& name() const { return name_; }
-
// @returns: the table's schema
std::shared_ptr<Schema> schema() const { return schema_; }
@@ -107,6 +173,10 @@ class ARROW_EXPORT Table {
// @returns: the i-th column
std::shared_ptr<Column> column(int i) const { return columns_[i]; }
+ /// Remove column from the table, producing a new Table (because tables and
+ /// schemas are immutable)
+ Status RemoveColumn(int i, std::shared_ptr<Table>* out) const;
+
// @returns: the number of columns in the table
int num_columns() const { return static_cast<int>(columns_.size()); }
@@ -114,15 +184,11 @@ class ARROW_EXPORT Table {
int64_t num_rows() const { return num_rows_; }
bool Equals(const Table& other) const;
- bool Equals(const std::shared_ptr<Table>& other) const;
// After construction, perform any checks to validate the input arguments
Status ValidateColumns() const;
private:
- // The table's name, optional
- std::string name_;
-
std::shared_ptr<Schema> schema_;
std::vector<std::shared_ptr<Column>> columns_;
@@ -131,7 +197,7 @@ class ARROW_EXPORT Table {
// Construct table from multiple input tables. Return Status::Invalid if
// schemas are not equal
-Status ARROW_EXPORT ConcatenateTables(const std::string& output_name,
+Status ARROW_EXPORT ConcatenateTables(
const std::vector<std::shared_ptr<Table>>& tables, std::shared_ptr<Table>* table);
} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/cpp/src/arrow/test-common.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/test-common.h b/cpp/src/arrow/test-common.h
index f704b6b..dc11e76 100644
--- a/cpp/src/arrow/test-common.h
+++ b/cpp/src/arrow/test-common.h
@@ -29,7 +29,6 @@
#include "arrow/array.h"
#include "arrow/buffer.h"
#include "arrow/builder.h"
-#include "arrow/column.h"
#include "arrow/memory_pool.h"
#include "arrow/table.h"
#include "arrow/test-util.h"
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/cpp/src/arrow/test-util.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h
index bed5559..94937b5 100644
--- a/cpp/src/arrow/test-util.h
+++ b/cpp/src/arrow/test-util.h
@@ -30,9 +30,7 @@
#include "arrow/array.h"
#include "arrow/buffer.h"
#include "arrow/builder.h"
-#include "arrow/column.h"
#include "arrow/memory_pool.h"
-#include "arrow/schema.h"
#include "arrow/status.h"
#include "arrow/table.h"
#include "arrow/type.h"
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/cpp/src/arrow/type-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type-test.cc b/cpp/src/arrow/type-test.cc
index ed86543..70c1734 100644
--- a/cpp/src/arrow/type-test.cc
+++ b/cpp/src/arrow/type-test.cc
@@ -23,7 +23,6 @@
#include "gtest/gtest.h"
-#include "arrow/schema.h"
#include "arrow/type.h"
using std::shared_ptr;
@@ -75,11 +74,8 @@ TEST_F(TestSchema, Basics) {
vector<shared_ptr<Field>> fields3 = {f0, f1_optional, f2};
auto schema3 = std::make_shared<Schema>(fields3);
- ASSERT_TRUE(schema->Equals(schema2));
- ASSERT_FALSE(schema->Equals(schema3));
-
- ASSERT_TRUE(schema->Equals(*schema2.get()));
- ASSERT_FALSE(schema->Equals(*schema3.get()));
+ ASSERT_TRUE(schema->Equals(*schema2));
+ ASSERT_FALSE(schema->Equals(*schema3));
}
TEST_F(TestSchema, ToString) {
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/cpp/src/arrow/type.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index c790f6e..e6e6f5c 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -24,6 +24,7 @@
#include "arrow/compare.h"
#include "arrow/status.h"
#include "arrow/util/logging.h"
+#include "arrow/util/stl.h"
#include "arrow/visitor.h"
namespace arrow {
@@ -45,6 +46,8 @@ std::string Field::ToString() const {
return ss.str();
}
+DataType::~DataType() {}
+
bool DataType::Equals(const DataType& other) const {
bool are_equal = false;
Status error = TypeEquals(*this, other, &are_equal);
@@ -225,6 +228,56 @@ std::string NullType::ToString() const {
}
// ----------------------------------------------------------------------
+// Schema implementation
+
+Schema::Schema(const std::vector<std::shared_ptr<Field>>& fields) : fields_(fields) {}
+
+bool Schema::Equals(const Schema& other) const {
+  // Equal when it is the same object, or the field lists match element by element.
+  if (&other == this) { return true; }
+  if (other.num_fields() != num_fields()) { return false; }
+  for (int pos = 0; pos < num_fields(); ++pos) {
+    if (!fields_[pos]->Equals(*other.fields_[pos])) { return false; }
+  }
+  return true;
+}
+
+std::shared_ptr<Field> Schema::GetFieldByName(const std::string& name) {
+  // The name index is filled in the first time it is found empty. When several
+  // fields share a name, the one at the highest position is the one recorded.
+  if (name_to_index_.empty() && !fields_.empty()) {
+    int position = 0;
+    for (const auto& entry : fields_) {
+      name_to_index_[entry->name] = position++;
+    }
+  }
+
+  const auto found = name_to_index_.find(name);
+  if (found != name_to_index_.end()) { return fields_[found->second]; }
+  return nullptr;
+}
+
+Status Schema::RemoveField(int i, std::shared_ptr<Schema>* out) const {
+  // Report an out-of-range index through the returned Status; it must not reach
+  // DeleteVectorElement, where it would be converted to size_t. *out is untouched on error.
+  if (i < 0 || i >= this->num_fields()) { return Status::Invalid("Invalid field index"); }
+  *out = std::make_shared<Schema>(DeleteVectorElement(fields_, i));
+  return Status::OK();
+}
+
+std::string Schema::ToString() const {
+  // One Field::ToString() per field, newline-separated, with no trailing newline.
+  std::stringstream buffer;
+  bool first = true;
+  for (const auto& field : fields_) {  // by reference: no shared_ptr copy per field
+    if (!first) { buffer << '\n'; }    // plain newline; std::endl would also flush
+    buffer << field->ToString();
+    first = false;
+  }
+  return buffer.str();
+}
+
+// ----------------------------------------------------------------------
// Visitors and factory functions
#define ACCEPT_VISITOR(TYPE) \
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/cpp/src/arrow/type.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 2a73f6b..4f93190 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -22,6 +22,7 @@
#include <memory>
#include <ostream>
#include <string>
+#include <unordered_map>
#include <vector>
#include "arrow/status.h"
@@ -132,7 +133,7 @@ struct ARROW_EXPORT DataType {
explicit DataType(Type::type type) : type(type) {}
- virtual ~DataType() = default;
+ virtual ~DataType();
// Return whether the types are equal
//
@@ -597,6 +598,36 @@ class ARROW_EXPORT DictionaryType : public FixedWidthType {
};
// ----------------------------------------------------------------------
+// Schema
+
+class ARROW_EXPORT Schema {
+ public:
+  explicit Schema(const std::vector<std::shared_ptr<Field>>& fields);
+
+  // Returns true if both schemas hold the same number of fields and they are pairwise equal
+  bool Equals(const Schema& other) const;
+  // Return the i-th field (zero-based). The index is not bounds-checked
+  std::shared_ptr<Field> field(int i) const { return fields_[i]; }
+
+  // Look up a field by name; returns nullptr if no field has that name.
+  // Non-const because the name lookup table is filled in on first use
+  std::shared_ptr<Field> GetFieldByName(const std::string& name);
+  // All fields, in schema order
+  const std::vector<std::shared_ptr<Field>>& fields() const { return fields_; }
+
+  // Render a string representation of the schema suitable for debugging
+  std::string ToString() const;
+  // Write to *out a new Schema without the i-th field; this schema is left unchanged
+  Status RemoveField(int i, std::shared_ptr<Schema>* out) const;
+  // Number of fields in the schema
+  int num_fields() const { return static_cast<int>(fields_.size()); }
+
+ private:
+  std::vector<std::shared_ptr<Field>> fields_;
+  std::unordered_map<std::string, int> name_to_index_;  // field name -> index in fields_
+};
+
+// ----------------------------------------------------------------------
// Factory functions
std::shared_ptr<DataType> ARROW_EXPORT fixed_width_binary(int32_t byte_width);
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/cpp/src/arrow/util/stl.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/stl.h b/cpp/src/arrow/util/stl.h
new file mode 100644
index 0000000..3ec535d
--- /dev/null
+++ b/cpp/src/arrow/util/stl.h
@@ -0,0 +1,40 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef ARROW_UTIL_STL_H
+#define ARROW_UTIL_STL_H
+
+#include <vector>
+
+namespace arrow {
+
+template <typename T>
+inline std::vector<T> DeleteVectorElement(const std::vector<T>& values, size_t index) {
+  // Returns a copy of `values` without the element at `index`. An index past the
+  // end removes nothing, so an empty input or a bad index yields a plain copy
+  // instead of an underflowed reserve() size or an out-of-range read.
+  std::vector<T> out;
+  out.reserve(index < values.size() ? values.size() - 1 : values.size());
+  for (size_t i = 0; i < values.size(); ++i) {
+    if (i != index) { out.push_back(values[i]); }
+  }
+  return out;
+}
+
+} // namespace arrow
+
+#endif // ARROW_UTIL_STL_H
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/python/pyarrow/array.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx
index 654f5ab..6cae196 100644
--- a/python/pyarrow/array.pyx
+++ b/python/pyarrow/array.pyx
@@ -19,6 +19,8 @@
# distutils: language = c++
# cython: embedsignature = True
+from cython.operator cimport dereference as deref
+
import numpy as np
from pyarrow.includes.libarrow cimport *
@@ -216,7 +218,7 @@ cdef class Array:
return '{0}\n{1}'.format(type_format, values)
def equals(Array self, Array other):
- return self.ap.Equals(other.sp_array)
+ return self.ap.Equals(deref(other.ap))
def __len__(self):
if self.sp_array.get():
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index bdbd18b..8e428b4 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -59,7 +59,6 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
cdef cppclass CDataType" arrow::DataType":
Type type
- c_bool Equals(const shared_ptr[CDataType]& other)
c_bool Equals(const CDataType& other)
c_string ToString()
@@ -71,7 +70,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
int64_t null_count()
Type type_enum()
- c_bool Equals(const shared_ptr[CArray]& arr)
+ c_bool Equals(const CArray& arr)
c_bool IsNull(int i)
shared_ptr[CArray] Slice(int64_t offset)
@@ -155,7 +154,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
cdef cppclass CSchema" arrow::Schema":
CSchema(const vector[shared_ptr[CField]]& fields)
- c_bool Equals(const shared_ptr[CSchema]& other)
+ c_bool Equals(const CSchema& other)
shared_ptr[CField] field(int i)
shared_ptr[CField] GetFieldByName(c_string& name)
@@ -231,7 +230,6 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
const vector[shared_ptr[CArray]]& chunks)
c_bool Equals(const CColumn& other)
- c_bool Equals(const shared_ptr[CColumn]& other)
int64_t length()
int64_t null_count()
@@ -258,12 +256,11 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
shared_ptr[CRecordBatch] Slice(int64_t offset, int64_t length)
cdef cppclass CTable" arrow::Table":
- CTable(const c_string& name, const shared_ptr[CSchema]& schema,
+ CTable(const shared_ptr[CSchema]& schema,
const vector[shared_ptr[CColumn]]& columns)
@staticmethod
CStatus FromRecordBatches(
- const c_string& name,
const vector[shared_ptr[CRecordBatch]]& batches,
shared_ptr[CTable]* table)
@@ -271,15 +268,13 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
int num_rows()
c_bool Equals(const CTable& other)
- c_bool Equals(const shared_ptr[CTable]& other)
-
- const c_string& name()
shared_ptr[CSchema] schema()
shared_ptr[CColumn] column(int i)
- CStatus ConcatenateTables(const c_string& output_name,
- const vector[shared_ptr[CTable]]& tables,
+ CStatus RemoveColumn(int i, shared_ptr[CTable]* out)
+
+ CStatus ConcatenateTables(const vector[shared_ptr[CTable]]& tables,
shared_ptr[CTable]* result)
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/python/pyarrow/io.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/io.pyx b/python/pyarrow/io.pyx
index cb44ce8..d528bdc 100644
--- a/python/pyarrow/io.pyx
+++ b/python/pyarrow/io.pyx
@@ -956,7 +956,6 @@ cdef class _StreamReader:
vector[shared_ptr[CRecordBatch]] batches
shared_ptr[CRecordBatch] batch
shared_ptr[CTable] table
- c_string name = b''
with nogil:
while True:
@@ -965,7 +964,7 @@ cdef class _StreamReader:
break
batches.push_back(batch)
- check_status(CTable.FromRecordBatches(name, batches, &table))
+ check_status(CTable.FromRecordBatches(batches, &table))
return table_from_ctable(table)
@@ -1033,7 +1032,6 @@ cdef class _FileReader:
cdef:
vector[shared_ptr[CRecordBatch]] batches
shared_ptr[CTable] table
- c_string name = b''
int i, nbatches
nbatches = self.num_record_batches
@@ -1042,6 +1040,6 @@ cdef class _FileReader:
with nogil:
for i in range(nbatches):
check_status(self.reader.get().GetRecordBatch(i, &batches[i]))
- check_status(CTable.FromRecordBatches(name, batches, &table))
+ check_status(CTable.FromRecordBatches(batches, &table))
return table_from_ctable(table)
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/python/pyarrow/schema.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/schema.pyx b/python/pyarrow/schema.pyx
index ab5ae5f..4f02901 100644
--- a/python/pyarrow/schema.pyx
+++ b/python/pyarrow/schema.pyx
@@ -166,7 +166,7 @@ cdef class Schema:
cdef Schema _other
_other = other
- return self.sp_schema.get().Equals(_other.sp_schema)
+ return self.sp_schema.get().Equals(deref(_other.schema))
def field_by_name(self, name):
"""
@@ -200,11 +200,16 @@ cdef class Schema:
return result
- def __repr__(self):
+ def __str__(self):
return frombytes(self.schema.ToString())
+ def __repr__(self):
+ return self.__str__()
+
+
cdef dict _type_cache = {}
+
cdef DataType primitive_type(Type type):
if type in _type_cache:
return _type_cache[type]
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/python/pyarrow/table.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/table.pyx b/python/pyarrow/table.pyx
index 58f5d68..e6fddbd 100644
--- a/python/pyarrow/table.pyx
+++ b/python/pyarrow/table.pyx
@@ -298,7 +298,7 @@ cdef _schema_from_arrays(arrays, names, shared_ptr[CSchema]* schema):
-cdef _dataframe_to_arrays(df, name, timestamps_to_ms, Schema schema):
+cdef _dataframe_to_arrays(df, timestamps_to_ms, Schema schema):
cdef:
list names = []
list arrays = []
@@ -474,7 +474,7 @@ cdef class RecordBatch:
-------
pyarrow.table.RecordBatch
"""
- names, arrays = _dataframe_to_arrays(df, None, False, schema)
+ names, arrays = _dataframe_to_arrays(df, False, schema)
return cls.from_arrays(arrays, names)
@staticmethod
@@ -573,6 +573,9 @@ cdef class Table:
def __cinit__(self):
self.table = NULL
+ def __repr__(self):
+ return 'pyarrow.Table\n{0}'.format(str(self.schema))
+
cdef init(self, const shared_ptr[CTable]& table):
self.sp_table = table
self.table = table.get()
@@ -608,7 +611,7 @@ cdef class Table:
return result
@classmethod
- def from_pandas(cls, df, name=None, timestamps_to_ms=False, schema=None):
+ def from_pandas(cls, df, timestamps_to_ms=False, schema=None):
"""
Convert pandas.DataFrame to an Arrow Table
@@ -616,8 +619,6 @@ cdef class Table:
----------
df: pandas.DataFrame
- name: str
-
timestamps_to_ms: bool
Convert datetime columns to ms resolution. This is needed for
compability with other functionality like Parquet I/O which
@@ -643,13 +644,13 @@ cdef class Table:
>>> pa.Table.from_pandas(df)
<pyarrow.table.Table object at 0x7f05d1fb1b40>
"""
- names, arrays = _dataframe_to_arrays(df, name=name,
+ names, arrays = _dataframe_to_arrays(df,
timestamps_to_ms=timestamps_to_ms,
schema=schema)
- return cls.from_arrays(arrays, names=names, name=name)
+ return cls.from_arrays(arrays, names=names)
@staticmethod
- def from_arrays(arrays, names=None, name=None):
+ def from_arrays(arrays, names=None):
"""
Construct a Table from Arrow arrays or columns
@@ -660,8 +661,6 @@ cdef class Table:
names: list of str, optional
Names for the table columns. If Columns passed, will be
inferred. If Arrays passed, this argument is required
- name: str, optional
- name for the Table
Returns
-------
@@ -669,7 +668,6 @@ cdef class Table:
"""
cdef:
- c_string c_name
vector[shared_ptr[CField]] fields
vector[shared_ptr[CColumn]] columns
shared_ptr[CSchema] schema
@@ -689,16 +687,11 @@ cdef class Table:
else:
raise ValueError(type(arrays[i]))
- if name is None:
- c_name = ''
- else:
- c_name = tobytes(name)
-
- table.reset(new CTable(c_name, schema, columns))
+ table.reset(new CTable(schema, columns))
return table_from_ctable(table)
@staticmethod
- def from_batches(batches, name=None):
+ def from_batches(batches):
"""
Construct a Table from a list of Arrow RecordBatches
@@ -712,16 +705,12 @@ cdef class Table:
vector[shared_ptr[CRecordBatch]] c_batches
shared_ptr[CTable] c_table
RecordBatch batch
- Table table
- c_string c_name
-
- c_name = b'' if name is None else tobytes(name)
for batch in batches:
c_batches.push_back(batch.sp_batch)
with nogil:
- check_status(CTable.FromRecordBatches(c_name, c_batches, &c_table))
+ check_status(CTable.FromRecordBatches(c_batches, &c_table))
return table_from_ctable(c_table)
@@ -762,18 +751,6 @@ cdef class Table:
return OrderedDict(entries)
@property
- def name(self):
- """
- Label of the table
-
- Returns
- -------
- str
- """
- self._check_nullptr()
- return frombytes(self.table.name())
-
- @property
def schema(self):
"""
Schema of the table and its columns
@@ -851,8 +828,19 @@ cdef class Table:
"""
return (self.num_rows, self.num_columns)
+ def remove_column(self, int i):
+ """
+ Create new Table with the indicated column removed
+ """
+ cdef shared_ptr[CTable] c_table
-def concat_tables(tables, output_name=None):
+ with nogil:
+ check_status(self.table.RemoveColumn(i, &c_table))
+
+ return table_from_ctable(c_table)
+
+
+def concat_tables(tables):
"""
Perform zero-copy concatenation of pyarrow.Table objects. Raises exception
if all of the Table schemas are not the same
@@ -867,15 +855,12 @@ def concat_tables(tables, output_name=None):
vector[shared_ptr[CTable]] c_tables
shared_ptr[CTable] c_result
Table table
- c_string c_name
-
- c_name = b'' if output_name is None else tobytes(output_name)
for table in tables:
c_tables.push_back(table.sp_table)
with nogil:
- check_status(ConcatenateTables(c_name, c_tables, &c_result))
+ check_status(ConcatenateTables(c_tables, &c_result))
return table_from_ctable(c_result)
http://git-wip-us.apache.org/repos/asf/arrow/blob/47fad3f4/python/pyarrow/tests/test_parquet.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index c72ff9e..fc32b9f 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -47,7 +47,7 @@ def test_single_pylist_column_roundtrip(tmpdir):
filename = tmpdir.join('single_{}_column.parquet'
.format(dtype.__name__))
data = [pa.from_pylist(list(map(dtype, range(5))))]
- table = pa.Table.from_arrays(data, names=('a', 'b'), name='table_name')
+ table = pa.Table.from_arrays(data, names=('a', 'b'))
pq.write_table(table, filename.strpath)
table_read = pq.read_table(filename.strpath)
for col_written, col_read in zip(table.itercolumns(),