You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/04/27 13:28:59 UTC
arrow git commit: ARROW-898: [C++/Python] Use shared_ptr to avoid
copying KeyValueMetadata, add to Field type also
Repository: arrow
Updated Branches:
refs/heads/master 3fdeac74c -> e876abbdf
ARROW-898: [C++/Python] Use shared_ptr to avoid copying KeyValueMetadata, add to Field type also
This also adds support for adding and removing schema-level metadata to the Python Schema wrapper object. Need to do the same for Field but putting this up for @cpcloud to review since he's working on using this in parquet-cpp
Author: Wes McKinney <we...@twosigma.com>
Closes #605 from wesm/ARROW-898 and squashes the following commits:
03873f7 [Wes McKinney] RemoveMetadata not return Status
c621c2c [Wes McKinney] Add metadata methods to Field, some code cleaning
581b9fa [Wes McKinney] Add unit tests for passing metadata to Field constructor
51fae29 [Wes McKinney] Add metadata to Field
2ce4003 [Wes McKinney] Test sharing metadata
48aa3ca [Wes McKinney] Use shared_ptr<const T> for KeyValueMetadata so metadata can be shared / not copied
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/e876abbd
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/e876abbd
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/e876abbd
Branch: refs/heads/master
Commit: e876abbdf4f7bac53ae5d56f4680259f021ea8d9
Parents: 3fdeac7
Author: Wes McKinney <we...@twosigma.com>
Authored: Thu Apr 27 09:28:54 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Thu Apr 27 09:28:54 2017 -0400
----------------------------------------------------------------------
cpp/src/arrow/builder.cc | 11 +-
cpp/src/arrow/io/io-memory-benchmark.cc | 6 +-
cpp/src/arrow/io/memory.cc | 4 +-
cpp/src/arrow/ipc/metadata.cc | 38 ++--
cpp/src/arrow/type-test.cc | 61 ++++--
cpp/src/arrow/type.cc | 60 ++++--
cpp/src/arrow/type.h | 32 ++-
cpp/src/arrow/util/key-value-metadata-test.cc | 9 +
cpp/src/arrow/util/key_value_metadata.cc | 4 +
cpp/src/arrow/util/key_value_metadata.h | 6 +
cpp/src/arrow/util/memory.h | 8 +-
python/pyarrow/_array.pxd | 2 -
python/pyarrow/_array.pyx | 220 +++++++++++++++++----
python/pyarrow/_table.pyx | 7 +-
python/pyarrow/includes/libarrow.pxd | 31 ++-
python/pyarrow/tests/pandas_examples.py | 4 +-
python/pyarrow/tests/test_convert_pandas.py | 64 +++---
python/pyarrow/tests/test_schema.py | 48 ++++-
18 files changed, 466 insertions(+), 149 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/cpp/src/arrow/builder.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index 4ecb8d3..ab43c2a 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -406,17 +406,16 @@ Status DecimalBuilder::Finish(std::shared_ptr<Array>* out) {
ListBuilder::ListBuilder(MemoryPool* pool, std::shared_ptr<ArrayBuilder> value_builder,
const std::shared_ptr<DataType>& type)
- : ArrayBuilder(pool,
- type ? type : std::static_pointer_cast<DataType>(
- std::make_shared<ListType>(value_builder->type()))),
+ : ArrayBuilder(
+ pool, type ? type : std::static_pointer_cast<DataType>(
+ std::make_shared<ListType>(value_builder->type()))),
offset_builder_(pool),
value_builder_(value_builder) {}
ListBuilder::ListBuilder(MemoryPool* pool, std::shared_ptr<Array> values,
const std::shared_ptr<DataType>& type)
- : ArrayBuilder(pool,
- type ? type : std::static_pointer_cast<DataType>(
- std::make_shared<ListType>(values->type()))),
+ : ArrayBuilder(pool, type ? type : std::static_pointer_cast<DataType>(
+ std::make_shared<ListType>(values->type()))),
offset_builder_(pool),
values_(values) {}
http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/cpp/src/arrow/io/io-memory-benchmark.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/io/io-memory-benchmark.cc b/cpp/src/arrow/io/io-memory-benchmark.cc
index 59b511a..6aa9577 100644
--- a/cpp/src/arrow/io/io-memory-benchmark.cc
+++ b/cpp/src/arrow/io/io-memory-benchmark.cc
@@ -26,7 +26,7 @@
namespace arrow {
static void BM_SerialMemcopy(benchmark::State& state) { // NOLINT non-const reference
- constexpr int64_t kTotalSize = 100 * 1024 * 1024; // 100MB
+ constexpr int64_t kTotalSize = 100 * 1024 * 1024; // 100MB
auto buffer1 = std::make_shared<PoolBuffer>(default_memory_pool());
buffer1->Resize(kTotalSize);
@@ -43,7 +43,7 @@ static void BM_SerialMemcopy(benchmark::State& state) { // NOLINT non-const ref
}
static void BM_ParallelMemcopy(benchmark::State& state) { // NOLINT non-const reference
- constexpr int64_t kTotalSize = 100 * 1024 * 1024; // 100MB
+ constexpr int64_t kTotalSize = 100 * 1024 * 1024; // 100MB
auto buffer1 = std::make_shared<PoolBuffer>(default_memory_pool());
buffer1->Resize(kTotalSize);
@@ -72,4 +72,4 @@ BENCHMARK(BM_ParallelMemcopy)
->MinTime(1.0)
->UseRealTime();
-} // namespace arrow
+} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/cpp/src/arrow/io/memory.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/io/memory.cc b/cpp/src/arrow/io/memory.cc
index 95c6206..faf02d2 100644
--- a/cpp/src/arrow/io/memory.cc
+++ b/cpp/src/arrow/io/memory.cc
@@ -140,8 +140,8 @@ Status FixedSizeBufferWriter::Tell(int64_t* position) {
Status FixedSizeBufferWriter::Write(const uint8_t* data, int64_t nbytes) {
if (nbytes > memcopy_threshold_ && memcopy_num_threads_ > 1) {
- parallel_memcopy(mutable_data_ + position_, data, nbytes,
- memcopy_blocksize_, memcopy_num_threads_);
+ parallel_memcopy(mutable_data_ + position_, data, nbytes, memcopy_blocksize_,
+ memcopy_num_threads_);
} else {
memcpy(mutable_data_ + position_, data, nbytes);
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/cpp/src/arrow/ipc/metadata.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/metadata.cc b/cpp/src/arrow/ipc/metadata.cc
index c0b518a..706ab2e 100644
--- a/cpp/src/arrow/ipc/metadata.cc
+++ b/cpp/src/arrow/ipc/metadata.cc
@@ -593,20 +593,27 @@ static Status SchemaToFlatbuffer(FBB& fbb, const Schema& schema,
field_offsets.push_back(offset);
}
+ auto fb_offsets = fbb.CreateVector(field_offsets);
+
/// Custom metadata
- const auto& custom_metadata_ = schema.custom_metadata();
- std::vector<KeyValueOffset> key_value_offsets;
- size_t metadata_size = custom_metadata_.size();
- key_value_offsets.reserve(metadata_size);
- for (size_t i = 0; i < metadata_size; ++i) {
- const auto& key = custom_metadata_.key(i);
- const auto& value = custom_metadata_.value(i);
- key_value_offsets.push_back(
- flatbuf::CreateKeyValue(fbb, fbb.CreateString(key), fbb.CreateString(value)));
+ const KeyValueMetadata* metadata = schema.metadata().get();
+
+ if (metadata != nullptr) {
+ std::vector<KeyValueOffset> key_value_offsets;
+ size_t metadata_size = metadata->size();
+ key_value_offsets.reserve(metadata_size);
+ for (size_t i = 0; i < metadata_size; ++i) {
+ const auto& key = metadata->key(i);
+ const auto& value = metadata->value(i);
+ key_value_offsets.push_back(
+ flatbuf::CreateKeyValue(fbb, fbb.CreateString(key), fbb.CreateString(value)));
+ }
+ *out = flatbuf::CreateSchema(
+ fbb, endianness(), fb_offsets, fbb.CreateVector(key_value_offsets));
+ } else {
+ *out = flatbuf::CreateSchema(fbb, endianness(), fb_offsets);
}
- *out = flatbuf::CreateSchema(fbb, endianness(), fbb.CreateVector(field_offsets),
- fbb.CreateVector(key_value_offsets));
return Status::OK();
}
@@ -955,17 +962,16 @@ Status GetSchema(const void* opaque_schema, const DictionaryMemo& dictionary_mem
RETURN_NOT_OK(FieldFromFlatbuffer(field, dictionary_memo, &fields[i]));
}
- KeyValueMetadata custom_metadata;
+ auto metadata = std::make_shared<KeyValueMetadata>();
auto fb_metadata = schema->custom_metadata();
if (fb_metadata != nullptr) {
- custom_metadata.reserve(fb_metadata->size());
-
+ metadata->reserve(fb_metadata->size());
for (const auto& pair : *fb_metadata) {
- custom_metadata.Append(pair->key()->str(), pair->value()->str());
+ metadata->Append(pair->key()->str(), pair->value()->str());
}
}
- *out = std::make_shared<Schema>(fields, custom_metadata);
+ *out = std::make_shared<Schema>(fields, metadata);
return Status::OK();
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/cpp/src/arrow/type-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type-test.cc b/cpp/src/arrow/type-test.cc
index 8e2dfd5..e73adec 100644
--- a/cpp/src/arrow/type-test.cc
+++ b/cpp/src/arrow/type-test.cc
@@ -23,6 +23,7 @@
#include "gtest/gtest.h"
+#include "arrow/test-util.h"
#include "arrow/type.h"
using std::shared_ptr;
@@ -50,6 +51,40 @@ TEST(TestField, Equals) {
ASSERT_FALSE(f0.Equals(f0_nn));
}
+TEST(TestField, TestMetadataConstruction) {
+ auto metadata = std::shared_ptr<KeyValueMetadata>(
+ new KeyValueMetadata({"foo", "bar"}, {"bizz", "buzz"}));
+ auto metadata2 = metadata->Copy();
+ auto f0 = field("f0", int32(), true, metadata);
+ auto f1 = field("f0", int32(), true, metadata2);
+ ASSERT_TRUE(metadata->Equals(*f0->metadata()));
+ ASSERT_TRUE(f0->Equals(*f1));
+}
+
+TEST(TestField, TestAddMetadata) {
+ auto metadata = std::shared_ptr<KeyValueMetadata>(
+ new KeyValueMetadata({"foo", "bar"}, {"bizz", "buzz"}));
+ auto f0 = field("f0", int32());
+ auto f1 = field("f0", int32(), true, metadata);
+ std::shared_ptr<Field> f2;
+ ASSERT_OK(f0->AddMetadata(metadata, &f2));
+
+ ASSERT_FALSE(f2->Equals(*f0));
+ ASSERT_TRUE(f2->Equals(*f1));
+
+ // Not copied
+ ASSERT_TRUE(metadata.get() == f1->metadata().get());
+}
+
+TEST(TestField, TestRemoveMetadata) {
+ auto metadata = std::shared_ptr<KeyValueMetadata>(
+ new KeyValueMetadata({"foo", "bar"}, {"bizz", "buzz"}));
+ auto f0 = field("f0", int32());
+ auto f1 = field("f0", int32(), true, metadata);
+ std::shared_ptr<Field> f2 = f1->RemoveMetadata();
+ ASSERT_TRUE(f2->metadata() == nullptr);
+}
+
class TestSchema : public ::testing::Test {
public:
void SetUp() {}
@@ -117,38 +152,42 @@ TEST_F(TestSchema, GetFieldByName) {
ASSERT_TRUE(result == nullptr);
}
-TEST_F(TestSchema, TestCustomMetadataConstruction) {
+TEST_F(TestSchema, TestMetadataConstruction) {
auto f0 = field("f0", int32());
auto f1 = field("f1", uint8(), false);
auto f2 = field("f2", utf8());
vector<shared_ptr<Field>> fields = {f0, f1, f2};
- KeyValueMetadata metadata({"foo", "bar"}, {"bizz", "buzz"});
+ auto metadata = std::shared_ptr<KeyValueMetadata>(
+ new KeyValueMetadata({"foo", "bar"}, {"bizz", "buzz"}));
auto schema = std::make_shared<Schema>(fields, metadata);
- ASSERT_TRUE(metadata.Equals(schema->custom_metadata()));
+ ASSERT_TRUE(metadata->Equals(*schema->metadata()));
}
-TEST_F(TestSchema, TestAddCustomMetadata) {
+TEST_F(TestSchema, TestAddMetadata) {
auto f0 = field("f0", int32());
auto f1 = field("f1", uint8(), false);
auto f2 = field("f2", utf8());
vector<shared_ptr<Field>> fields = {f0, f1, f2};
- KeyValueMetadata metadata({"foo", "bar"}, {"bizz", "buzz"});
+ auto metadata = std::shared_ptr<KeyValueMetadata>(
+ new KeyValueMetadata({"foo", "bar"}, {"bizz", "buzz"}));
auto schema = std::make_shared<Schema>(fields);
std::shared_ptr<Schema> new_schema;
- schema->AddCustomMetadata(metadata, &new_schema);
- ASSERT_TRUE(metadata.Equals(new_schema->custom_metadata()));
+ schema->AddMetadata(metadata, &new_schema);
+ ASSERT_TRUE(metadata->Equals(*new_schema->metadata()));
+
+ // Not copied
+ ASSERT_TRUE(metadata.get() == new_schema->metadata().get());
}
-TEST_F(TestSchema, TestRemoveCustomMetadata) {
+TEST_F(TestSchema, TestRemoveMetadata) {
auto f0 = field("f0", int32());
auto f1 = field("f1", uint8(), false);
auto f2 = field("f2", utf8());
vector<shared_ptr<Field>> fields = {f0, f1, f2};
KeyValueMetadata metadata({"foo", "bar"}, {"bizz", "buzz"});
auto schema = std::make_shared<Schema>(fields);
- std::shared_ptr<Schema> new_schema;
- schema->RemoveCustomMetadata(&new_schema);
- ASSERT_EQ(0, new_schema->custom_metadata().size());
+ std::shared_ptr<Schema> new_schema = schema->RemoveMetadata();
+ ASSERT_TRUE(new_schema->metadata() == nullptr);
}
#define PRIMITIVE_TEST(KLASS, ENUM, NAME) \
http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/cpp/src/arrow/type.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index f59f8fb..b1e322c 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -31,10 +31,31 @@
namespace arrow {
+Status Field::AddMetadata(const std::shared_ptr<const KeyValueMetadata>& metadata,
+ std::shared_ptr<Field>* out) const {
+ *out = std::make_shared<Field>(name_, type_, nullable_, metadata);
+ return Status::OK();
+}
+
+std::shared_ptr<Field> Field::RemoveMetadata() const {
+ return std::make_shared<Field>(name_, type_, nullable_);
+}
+
bool Field::Equals(const Field& other) const {
- return (this == &other) ||
- (this->name_ == other.name_ && this->nullable_ == other.nullable_ &&
- this->type_->Equals(*other.type_.get()));
+ if (this == &other) {
+ return true;
+ }
+ if (this->name_ == other.name_ && this->nullable_ == other.nullable_ &&
+ this->type_->Equals(*other.type_.get())) {
+ if (metadata_ == nullptr && other.metadata_ == nullptr) {
+ return true;
+ } else if ((metadata_ == nullptr) ^ (other.metadata_ == nullptr)) {
+ return false;
+ } else {
+ return metadata_->Equals(*other.metadata_);
+ }
+ }
+ return false;
}
bool Field::Equals(const std::shared_ptr<Field>& other) const {
@@ -233,8 +254,8 @@ std::string NullType::ToString() const {
// Schema implementation
Schema::Schema(const std::vector<std::shared_ptr<Field>>& fields,
- const KeyValueMetadata& custom_metadata)
- : fields_(fields), custom_metadata_(custom_metadata) {}
+ const std::shared_ptr<const KeyValueMetadata>& metadata)
+ : fields_(fields), metadata_(metadata) {}
bool Schema::Equals(const Schema& other) const {
if (this == &other) { return true; }
@@ -266,26 +287,25 @@ Status Schema::AddField(
DCHECK_GE(i, 0);
DCHECK_LE(i, this->num_fields());
- *out = std::make_shared<Schema>(AddVectorElement(fields_, i, field), custom_metadata_);
+ *out = std::make_shared<Schema>(AddVectorElement(fields_, i, field), metadata_);
return Status::OK();
}
-Status Schema::AddCustomMetadata(
- const KeyValueMetadata& custom_metadata, std::shared_ptr<Schema>* out) const {
- *out = std::make_shared<Schema>(fields_, custom_metadata);
+Status Schema::AddMetadata(const std::shared_ptr<const KeyValueMetadata>& metadata,
+ std::shared_ptr<Schema>* out) const {
+ *out = std::make_shared<Schema>(fields_, metadata);
return Status::OK();
}
-Status Schema::RemoveCustomMetadata(std::shared_ptr<Schema>* out) {
- *out = std::make_shared<Schema>(fields_, KeyValueMetadata());
- return Status::OK();
+std::shared_ptr<Schema> Schema::RemoveMetadata() const {
+ return std::make_shared<Schema>(fields_);
}
Status Schema::RemoveField(int i, std::shared_ptr<Schema>* out) const {
DCHECK_GE(i, 0);
DCHECK_LT(i, this->num_fields());
- *out = std::make_shared<Schema>(DeleteVectorElement(fields_, i), custom_metadata_);
+ *out = std::make_shared<Schema>(DeleteVectorElement(fields_, i), metadata_);
return Status::OK();
}
@@ -298,6 +318,15 @@ std::string Schema::ToString() const {
buffer << field->ToString();
++i;
}
+
+ if (metadata_) {
+ buffer << "\n-- metadata --";
+ for (int64_t i = 0; i < metadata_->size(); ++i) {
+ buffer << "\n" << metadata_->key(i) << ": "
+ << metadata_->value(i);
+ }
+ }
+
return buffer.str();
}
@@ -391,8 +420,9 @@ std::shared_ptr<DataType> dictionary(const std::shared_ptr<DataType>& index_type
}
std::shared_ptr<Field> field(
- const std::string& name, const TypePtr& type, bool nullable) {
- return std::make_shared<Field>(name, type, nullable);
+ const std::string& name, const std::shared_ptr<DataType>& type, bool nullable,
+ const std::shared_ptr<const KeyValueMetadata>& metadata) {
+ return std::make_shared<Field>(name, type, nullable, metadata);
}
std::shared_ptr<DataType> decimal(int precision, int scale) {
http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/cpp/src/arrow/type.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index dc94561..bb25857 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -203,8 +203,16 @@ class NoExtraMeta {};
class ARROW_EXPORT Field {
public:
Field(const std::string& name, const std::shared_ptr<DataType>& type,
- bool nullable = true)
- : name_(name), type_(type), nullable_(nullable) {}
+ bool nullable = true,
+ const std::shared_ptr<const KeyValueMetadata>& metadata = nullptr)
+ : name_(name), type_(type), nullable_(nullable), metadata_(metadata) {}
+
+ std::shared_ptr<const KeyValueMetadata> metadata() const { return metadata_; }
+
+ Status AddMetadata(const std::shared_ptr<const KeyValueMetadata>& metadata,
+ std::shared_ptr<Field>* out) const;
+
+ std::shared_ptr<Field> RemoveMetadata() const;
bool Equals(const Field& other) const;
bool Equals(const std::shared_ptr<Field>& other) const;
@@ -224,6 +232,9 @@ class ARROW_EXPORT Field {
// Fields can be nullable
bool nullable_;
+
+ // The field's metadata, if any
+ std::shared_ptr<const KeyValueMetadata> metadata_;
};
typedef std::shared_ptr<Field> FieldPtr;
@@ -679,7 +690,7 @@ class ARROW_EXPORT DictionaryType : public FixedWidthType {
class ARROW_EXPORT Schema {
public:
explicit Schema(const std::vector<std::shared_ptr<Field>>& fields,
- const KeyValueMetadata& custom_metadata = KeyValueMetadata());
+ const std::shared_ptr<const KeyValueMetadata>& metadata = nullptr);
// Returns true if all of the schema fields are equal
bool Equals(const Schema& other) const;
@@ -691,7 +702,7 @@ class ARROW_EXPORT Schema {
std::shared_ptr<Field> GetFieldByName(const std::string& name);
const std::vector<std::shared_ptr<Field>>& fields() const { return fields_; }
- const KeyValueMetadata& custom_metadata() const { return custom_metadata_; }
+ std::shared_ptr<const KeyValueMetadata> metadata() const { return metadata_; }
// Render a string representation of the schema suitable for debugging
std::string ToString() const;
@@ -700,16 +711,18 @@ class ARROW_EXPORT Schema {
int i, const std::shared_ptr<Field>& field, std::shared_ptr<Schema>* out) const;
Status RemoveField(int i, std::shared_ptr<Schema>* out) const;
- Status AddCustomMetadata(
- const KeyValueMetadata& metadata, std::shared_ptr<Schema>* out) const;
- Status RemoveCustomMetadata(std::shared_ptr<Schema>* out);
+ Status AddMetadata(const std::shared_ptr<const KeyValueMetadata>& metadata,
+ std::shared_ptr<Schema>* out) const;
+
+ std::shared_ptr<Schema> RemoveMetadata() const;
int num_fields() const { return static_cast<int>(fields_.size()); }
private:
std::vector<std::shared_ptr<Field>> fields_;
std::unordered_map<std::string, int> name_to_index_;
- KeyValueMetadata custom_metadata_;
+
+ std::shared_ptr<const KeyValueMetadata> metadata_;
};
// ----------------------------------------------------------------------
@@ -741,7 +754,8 @@ std::shared_ptr<DataType> ARROW_EXPORT dictionary(
const std::shared_ptr<DataType>& index_type, const std::shared_ptr<Array>& values);
std::shared_ptr<Field> ARROW_EXPORT field(
- const std::string& name, const std::shared_ptr<DataType>& type, bool nullable = true);
+ const std::string& name, const std::shared_ptr<DataType>& type, bool nullable = true,
+ const std::shared_ptr<const KeyValueMetadata>& metadata = nullptr);
// ----------------------------------------------------------------------
//
http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/cpp/src/arrow/util/key-value-metadata-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/key-value-metadata-test.cc b/cpp/src/arrow/util/key-value-metadata-test.cc
index aadc989..59cfdf5 100644
--- a/cpp/src/arrow/util/key-value-metadata-test.cc
+++ b/cpp/src/arrow/util/key-value-metadata-test.cc
@@ -72,6 +72,15 @@ TEST(KeyValueMetadataTest, StringAppend) {
ASSERT_EQ("red", metadata.value(3));
}
+TEST(KeyValueMetadataTest, Copy) {
+ std::vector<std::string> keys = {"foo", "bar"};
+ std::vector<std::string> values = {"bizz", "buzz"};
+
+ KeyValueMetadata metadata(keys, values);
+ auto metadata2 = metadata.Copy();
+ ASSERT_TRUE(metadata.Equals(*metadata2));
+}
+
TEST(KeyValueMetadataTest, Equals) {
std::vector<std::string> keys = {"foo", "bar"};
std::vector<std::string> values = {"bizz", "buzz"};
http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/cpp/src/arrow/util/key_value_metadata.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/key_value_metadata.cc b/cpp/src/arrow/util/key_value_metadata.cc
index c91478b..8bddd5d 100644
--- a/cpp/src/arrow/util/key_value_metadata.cc
+++ b/cpp/src/arrow/util/key_value_metadata.cc
@@ -91,6 +91,10 @@ std::string KeyValueMetadata::value(int64_t i) const {
return values_[static_cast<size_t>(i)];
}
+std::shared_ptr<KeyValueMetadata> KeyValueMetadata::Copy() const {
+ return std::make_shared<KeyValueMetadata>(keys_, values_);
+}
+
bool KeyValueMetadata::Equals(const KeyValueMetadata& other) const {
return size() == other.size() &&
std::equal(keys_.cbegin(), keys_.cend(), other.keys_.cbegin()) &&
http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/cpp/src/arrow/util/key_value_metadata.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/key_value_metadata.h b/cpp/src/arrow/util/key_value_metadata.h
index 713b2c0..bae4ad8 100644
--- a/cpp/src/arrow/util/key_value_metadata.h
+++ b/cpp/src/arrow/util/key_value_metadata.h
@@ -19,10 +19,12 @@
#define ARROW_UTIL_KEY_VALUE_METADATA_H
#include <cstdint>
+#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
+#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
@@ -44,11 +46,15 @@ class ARROW_EXPORT KeyValueMetadata {
std::string key(int64_t i) const;
std::string value(int64_t i) const;
+ std::shared_ptr<KeyValueMetadata> Copy() const;
+
bool Equals(const KeyValueMetadata& other) const;
private:
std::vector<std::string> keys_;
std::vector<std::string> values_;
+
+ DISALLOW_COPY_AND_ASSIGN(KeyValueMetadata);
};
} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/cpp/src/arrow/util/memory.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/memory.h b/cpp/src/arrow/util/memory.h
index 7feeb29..c5c17ef 100644
--- a/cpp/src/arrow/util/memory.h
+++ b/cpp/src/arrow/util/memory.h
@@ -31,7 +31,7 @@ uint8_t* pointer_logical_and(const uint8_t* address, uintptr_t bits) {
// A helper function for doing memcpy with multiple threads. This is required
// to saturate the memory bandwidth of modern cpus.
void parallel_memcopy(uint8_t* dst, const uint8_t* src, int64_t nbytes,
- uintptr_t block_size, int num_threads) {
+ uintptr_t block_size, int num_threads) {
std::vector<std::thread> threadpool(num_threads);
uint8_t* left = pointer_logical_and(src + block_size - 1, ~(block_size - 1));
uint8_t* right = pointer_logical_and(src + nbytes, ~(block_size - 1));
@@ -52,8 +52,8 @@ void parallel_memcopy(uint8_t* dst, const uint8_t* src, int64_t nbytes,
// Start all threads first and handle leftovers while threads run.
for (int i = 0; i < num_threads; i++) {
- threadpool[i] = std::thread(memcpy, dst + prefix + i * chunk_size,
- left + i * chunk_size, chunk_size);
+ threadpool[i] = std::thread(
+ memcpy, dst + prefix + i * chunk_size, left + i * chunk_size, chunk_size);
}
memcpy(dst, src, prefix);
@@ -64,6 +64,6 @@ void parallel_memcopy(uint8_t* dst, const uint8_t* src, int64_t nbytes,
}
}
-} // namespace arrow
+} // namespace arrow
#endif // ARROW_UTIL_MEMORY_H
http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/python/pyarrow/_array.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/_array.pxd b/python/pyarrow/_array.pxd
index 4d5db86..464de31 100644
--- a/python/pyarrow/_array.pxd
+++ b/python/pyarrow/_array.pxd
@@ -81,8 +81,6 @@ cdef class Schema:
cdef init(self, const vector[shared_ptr[CField]]& fields)
cdef init_schema(self, const shared_ptr[CSchema]& schema)
- cpdef dict custom_metadata(self)
-
cdef class Scalar:
cdef readonly:
http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/python/pyarrow/_array.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/_array.pyx b/python/pyarrow/_array.pyx
index 2fb20b7..658f4b3 100644
--- a/python/pyarrow/_array.pyx
+++ b/python/pyarrow/_array.pyx
@@ -172,7 +172,14 @@ cdef class DecimalType(FixedSizeBinaryType):
cdef class Field:
+ """
+ Represents a named field, with a data type, nullability, and optional
+ metadata
+ Notes
+ -----
+ Do not use this class's constructor directly; use pyarrow.field
+ """
def __cinit__(self):
pass
@@ -181,32 +188,77 @@ cdef class Field:
self.field = field.get()
self.type = box_data_type(field.get().type())
- @classmethod
- def from_py(cls, object name, DataType type, bint nullable=True):
- cdef Field result = Field()
- result.type = type
- result.sp_field.reset(new CField(tobytes(name), type.sp_type,
- nullable))
- result.field = result.sp_field.get()
+ def equals(self, Field other):
+ """
+ Test if this field is equal to the other
+ """
+ return self.field.Equals(deref(other.field))
- return result
+ def __str__(self):
+ self._check_null()
+ return 'pyarrow.Field<{0}>'.format(frombytes(self.field.ToString()))
def __repr__(self):
- return 'Field({0!r}, type={1})'.format(self.name, str(self.type))
+ return self.__str__()
property nullable:
def __get__(self):
+ self._check_null()
return self.field.nullable()
property name:
def __get__(self):
- if box_field(self.sp_field) is None:
- raise ReferenceError(
- 'Field not initialized (references NULL pointer)')
+ self._check_null()
return frombytes(self.field.name())
+ property metadata:
+
+ def __get__(self):
+ self._check_null()
+ return box_metadata(self.field.metadata().get())
+
+ def _check_null(self):
+ if self.field == NULL:
+ raise ReferenceError(
+ 'Field not initialized (references NULL pointer)')
+
+ def add_metadata(self, dict metadata):
+ """
+ Add metadata as dict of string keys and values to Field
+
+ Parameters
+ ----------
+ metadata : dict
+ Keys and values must be string-like / coercible to bytes
+
+ Returns
+ -------
+ field : pyarrow.Field
+ """
+ cdef shared_ptr[CKeyValueMetadata] c_meta
+ convert_metadata(metadata, &c_meta)
+
+ cdef shared_ptr[CField] new_field
+ with nogil:
+ check_status(self.field.AddMetadata(c_meta, &new_field))
+
+ return box_field(new_field)
+
+ def remove_metadata(self):
+ """
+ Create new field without metadata, if any
+
+ Returns
+ -------
+ field : pyarrow.Field
+ """
+ cdef shared_ptr[CField] new_field
+ with nogil:
+ new_field = self.field.RemoveMetadata()
+ return box_field(new_field)
+
cdef class Schema:
@@ -226,6 +278,14 @@ cdef class Schema:
return result
+ cdef init(self, const vector[shared_ptr[CField]]& fields):
+ self.schema = new CSchema(fields)
+ self.sp_schema.reset(self.schema)
+
+ cdef init_schema(self, const shared_ptr[CSchema]& schema):
+ self.schema = schema.get()
+ self.sp_schema = schema
+
property names:
def __get__(self):
@@ -236,20 +296,10 @@ cdef class Schema:
result.append(name)
return result
- cdef init(self, const vector[shared_ptr[CField]]& fields):
- self.schema = new CSchema(fields)
- self.sp_schema.reset(self.schema)
-
- cdef init_schema(self, const shared_ptr[CSchema]& schema):
- self.schema = schema.get()
- self.sp_schema = schema
+ property metadata:
- cpdef dict custom_metadata(self):
- cdef:
- CKeyValueMetadata metadata = self.schema.custom_metadata()
- unordered_map[c_string, c_string] result
- metadata.ToUnorderedMap(&result)
- return result
+ def __get__(self):
+ return box_metadata(self.schema.metadata().get())
def equals(self, other):
"""
@@ -274,23 +324,40 @@ cdef class Schema:
"""
return box_field(self.schema.GetFieldByName(tobytes(name)))
- @classmethod
- def from_fields(cls, fields):
- cdef:
- Schema result
- Field field
- vector[shared_ptr[CField]] c_fields
+ def add_metadata(self, dict metadata):
+ """
+ Add metadata as dict of string keys and values to Schema
- c_fields.resize(len(fields))
+ Parameters
+ ----------
+ metadata : dict
+ Keys and values must be string-like / coercible to bytes
- for i in range(len(fields)):
- field = fields[i]
- c_fields[i] = field.sp_field
+ Returns
+ -------
+ schema : pyarrow.Schema
+ """
+ cdef shared_ptr[CKeyValueMetadata] c_meta
+ convert_metadata(metadata, &c_meta)
- result = Schema()
- result.init(c_fields)
+ cdef shared_ptr[CSchema] new_schema
+ with nogil:
+ check_status(self.schema.AddMetadata(c_meta, &new_schema))
- return result
+ return box_schema(new_schema)
+
+ def remove_metadata(self):
+ """
+ Create new schema without metadata, if any
+
+ Returns
+ -------
+ schema : pyarrow.Schema
+ """
+ cdef shared_ptr[CSchema] new_schema
+ with nogil:
+ new_schema = self.schema.RemoveMetadata()
+ return box_schema(new_schema)
def __str__(self):
return frombytes(self.schema.ToString())
@@ -299,6 +366,15 @@ cdef class Schema:
return self.__str__()
+cdef box_metadata(const CKeyValueMetadata* metadata):
+ cdef unordered_map[c_string, c_string] result
+ if metadata != NULL:
+ metadata.ToUnorderedMap(&result)
+ return result
+ else:
+ return None
+
+
cdef dict _type_cache = {}
@@ -315,8 +391,49 @@ cdef DataType primitive_type(Type type):
#------------------------------------------------------------
# Type factory functions
-def field(name, type, bint nullable=True):
- return Field.from_py(name, type, nullable)
+cdef int convert_metadata(dict metadata,
+ shared_ptr[CKeyValueMetadata]* out) except -1:
+ cdef:
+ shared_ptr[CKeyValueMetadata] meta = (
+ make_shared[CKeyValueMetadata]())
+ c_string key, value
+
+ for py_key, py_value in metadata.items():
+ key = tobytes(py_key)
+ value = tobytes(py_value)
+ meta.get().Append(key, value)
+ out[0] = meta
+ return 0
+
+
+def field(name, DataType type, bint nullable=True, dict metadata=None):
+ """
+ Create a pyarrow.Field instance
+
+ Parameters
+ ----------
+ name : string or bytes
+ type : pyarrow.DataType
+ nullable : boolean, default True
+ metadata : dict, default None
+ Keys and values must be coercible to bytes
+
+ Returns
+ -------
+ field : pyarrow.Field
+ """
+ cdef:
+ shared_ptr[CKeyValueMetadata] c_meta
+ Field result = Field()
+
+ if metadata is not None:
+ convert_metadata(metadata, &c_meta)
+
+ result.sp_field.reset(new CField(tobytes(name), type.sp_type,
+ nullable, c_meta))
+ result.field = result.sp_field.get()
+ result.type = type
+ return result
cdef set PRIMITIVE_TYPES = set([
@@ -554,7 +671,28 @@ def struct(fields):
def schema(fields):
- return Schema.from_fields(fields)
+ """
+ Construct pyarrow.Schema from collection of fields
+
+ Parameters
+ ----------
+ field : list or iterable
+
+ Returns
+ -------
+ schema : pyarrow.Schema
+ """
+ cdef:
+ Schema result
+ Field field
+ vector[shared_ptr[CField]] c_fields
+
+ for i, field in enumerate(fields):
+ c_fields.push_back(field.sp_field)
+
+ result = Schema()
+ result.init(c_fields)
+ return result
cdef DataType box_data_type(const shared_ptr[CDataType]& type):
http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/python/pyarrow/_table.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/_table.pyx b/python/pyarrow/_table.pyx
index ed0782b..599e046 100644
--- a/python/pyarrow/_table.pyx
+++ b/python/pyarrow/_table.pyx
@@ -272,11 +272,12 @@ cdef class Column:
return chunked_array
-cdef CKeyValueMetadata key_value_metadata_from_dict(dict metadata):
+cdef shared_ptr[const CKeyValueMetadata] key_value_metadata_from_dict(
+ dict metadata):
cdef:
unordered_map[c_string, c_string] unordered_metadata = metadata
- CKeyValueMetadata c_metadata = CKeyValueMetadata(unordered_metadata)
- return c_metadata
+ return (<shared_ptr[const CKeyValueMetadata]>
+ make_shared[CKeyValueMetadata](unordered_metadata))
cdef int _schema_from_arrays(
http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index ef1a332..8a730b3 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -23,6 +23,10 @@ cdef extern from "arrow/util/key_value_metadata.h" namespace "arrow" nogil:
cdef cppclass CKeyValueMetadata" arrow::KeyValueMetadata":
CKeyValueMetadata()
CKeyValueMetadata(const unordered_map[c_string, c_string]&)
+
+ c_bool Equals(const CKeyValueMetadata& other)
+
+ void Append(const c_string& key, const c_string& value)
void ToUnorderedMap(unordered_map[c_string, c_string]*) const
cdef extern from "arrow/api.h" namespace "arrow" nogil:
@@ -168,25 +172,48 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
shared_ptr[CDataType] type()
c_bool nullable()
+ c_string ToString()
+ c_bool Equals(const CField& other)
+
+ shared_ptr[const CKeyValueMetadata] metadata()
+
CField(const c_string& name, const shared_ptr[CDataType]& type,
c_bool nullable)
+ CField(const c_string& name, const shared_ptr[CDataType]& type,
+ c_bool nullable, const shared_ptr[CKeyValueMetadata]& metadata)
+
+ # Removed const in Cython so don't have to cast to get code to generate
+ CStatus AddMetadata(const shared_ptr[CKeyValueMetadata]& metadata,
+ shared_ptr[CField]* out)
+ shared_ptr[CField] RemoveMetadata()
+
+
cdef cppclass CStructType" arrow::StructType"(CDataType):
CStructType(const vector[shared_ptr[CField]]& fields)
cdef cppclass CSchema" arrow::Schema":
CSchema(const vector[shared_ptr[CField]]& fields)
CSchema(const vector[shared_ptr[CField]]& fields,
- const CKeyValueMetadata& custom_metadata)
+ const shared_ptr[const CKeyValueMetadata]& metadata)
+
+ # Does not actually exist, but gets Cython to not complain
+ CSchema(const vector[shared_ptr[CField]]& fields,
+ const shared_ptr[CKeyValueMetadata]& metadata)
c_bool Equals(const CSchema& other)
shared_ptr[CField] field(int i)
- const CKeyValueMetadata& custom_metadata() const
+ shared_ptr[const CKeyValueMetadata] metadata()
shared_ptr[CField] GetFieldByName(c_string& name)
int num_fields()
c_string ToString()
+ # Removed const in Cython so don't have to cast to get code to generate
+ CStatus AddMetadata(const shared_ptr[CKeyValueMetadata]& metadata,
+ shared_ptr[CSchema]* out)
+ shared_ptr[CSchema] RemoveMetadata()
+
cdef cppclass CBooleanArray" arrow::BooleanArray"(CArray):
c_bool Value(int i)
http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/python/pyarrow/tests/pandas_examples.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/pandas_examples.py b/python/pyarrow/tests/pandas_examples.py
index e081c38..313a3ae 100644
--- a/python/pyarrow/tests/pandas_examples.py
+++ b/python/pyarrow/tests/pandas_examples.py
@@ -73,7 +73,7 @@ def dataframe_with_arrays():
]
df = pd.DataFrame(arrays)
- schema = pa.Schema.from_fields(fields)
+ schema = pa.schema(fields)
return df, schema
@@ -114,6 +114,6 @@ def dataframe_with_lists():
]
df = pd.DataFrame(arrays)
- schema = pa.Schema.from_fields(fields)
+ schema = pa.schema(fields)
return df, schema
http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index f360234..2779da3 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -106,10 +106,10 @@ class TestPandasConversion(unittest.TestCase):
for numpy_dtype, arrow_dtype in dtypes:
values = np.random.randn(num_values)
data[numpy_dtype] = values.astype(numpy_dtype)
- fields.append(pa.Field.from_py(numpy_dtype, arrow_dtype))
+ fields.append(pa.field(numpy_dtype, arrow_dtype))
df = pd.DataFrame(data)
- schema = pa.Schema.from_fields(fields)
+ schema = pa.schema(fields)
self._check_pandas_roundtrip(df, expected_schema=schema)
def test_float_nulls(self):
@@ -127,7 +127,7 @@ class TestPandasConversion(unittest.TestCase):
arr = pa.Array.from_pandas(values, null_mask)
arrays.append(arr)
- fields.append(pa.Field.from_py(name, arrow_dtype))
+ fields.append(pa.field(name, arrow_dtype))
values[null_mask] = np.nan
expected_cols.append(values)
@@ -136,7 +136,7 @@ class TestPandasConversion(unittest.TestCase):
columns=names)
table = pa.Table.from_arrays(arrays, names)
- assert table.schema.equals(pa.Schema.from_fields(fields))
+ assert table.schema.equals(pa.schema(fields))
result = table.to_pandas()
tm.assert_frame_equal(result, ex_frame)
@@ -159,10 +159,10 @@ class TestPandasConversion(unittest.TestCase):
min(info.max, np.iinfo('i8').max),
size=num_values)
data[dtype] = values.astype(dtype)
- fields.append(pa.Field.from_py(dtype, arrow_dtype))
+ fields.append(pa.field(dtype, arrow_dtype))
df = pd.DataFrame(data)
- schema = pa.Schema.from_fields(fields)
+ schema = pa.schema(fields)
self._check_pandas_roundtrip(df, expected_schema=schema)
def test_integer_with_nulls(self):
@@ -200,8 +200,8 @@ class TestPandasConversion(unittest.TestCase):
np.random.seed(0)
df = pd.DataFrame({'bools': np.random.randn(num_values) > 0})
- field = pa.Field.from_py('bools', pa.bool_())
- schema = pa.Schema.from_fields([field])
+ field = pa.field('bools', pa.bool_())
+ schema = pa.schema([field])
self._check_pandas_roundtrip(df, expected_schema=schema)
def test_boolean_nulls(self):
@@ -217,8 +217,8 @@ class TestPandasConversion(unittest.TestCase):
expected = values.astype(object)
expected[mask] = None
- field = pa.Field.from_py('bools', pa.bool_())
- schema = pa.Schema.from_fields([field])
+ field = pa.field('bools', pa.bool_())
+ schema = pa.schema([field])
ex_frame = pd.DataFrame({'bools': expected})
table = pa.Table.from_arrays([arr], ['bools'])
@@ -230,16 +230,16 @@ class TestPandasConversion(unittest.TestCase):
def test_boolean_object_nulls(self):
arr = np.array([False, None, True] * 100, dtype=object)
df = pd.DataFrame({'bools': arr})
- field = pa.Field.from_py('bools', pa.bool_())
- schema = pa.Schema.from_fields([field])
+ field = pa.field('bools', pa.bool_())
+ schema = pa.schema([field])
self._check_pandas_roundtrip(df, expected_schema=schema)
def test_unicode(self):
repeats = 1000
values = [u'foo', None, u'bar', u'ma�ana', np.nan]
df = pd.DataFrame({'strings': values * repeats})
- field = pa.Field.from_py('strings', pa.string())
- schema = pa.Schema.from_fields([field])
+ field = pa.field('strings', pa.string())
+ schema = pa.schema([field])
self._check_pandas_roundtrip(df, expected_schema=schema)
@@ -257,7 +257,7 @@ class TestPandasConversion(unittest.TestCase):
def test_fixed_size_bytes(self):
values = [b'foo', None, b'bar', None, None, b'hey']
df = pd.DataFrame({'strings': values})
- schema = pa.Schema.from_fields([pa.field('strings', pa.binary(3))])
+ schema = pa.schema([pa.field('strings', pa.binary(3))])
table = pa.Table.from_pandas(df, schema=schema)
assert table.schema[0].type == schema[0].type
assert table.schema[0].name == schema[0].name
@@ -267,7 +267,7 @@ class TestPandasConversion(unittest.TestCase):
def test_fixed_size_bytes_does_not_accept_varying_lengths(self):
values = [b'foo', None, b'ba', None, None, b'hey']
df = pd.DataFrame({'strings': values})
- schema = pa.Schema.from_fields([pa.field('strings', pa.binary(3))])
+ schema = pa.schema([pa.field('strings', pa.binary(3))])
with self.assertRaises(pa.ArrowInvalid):
pa.Table.from_pandas(df, schema=schema)
@@ -279,8 +279,8 @@ class TestPandasConversion(unittest.TestCase):
'2010-08-13T05:46:57.437'],
dtype='datetime64[ms]')
})
- field = pa.Field.from_py('datetime64', pa.timestamp('ms'))
- schema = pa.Schema.from_fields([field])
+ field = pa.field('datetime64', pa.timestamp('ms'))
+ schema = pa.schema([field])
self._check_pandas_roundtrip(df, timestamps_to_ms=True,
expected_schema=schema)
@@ -291,8 +291,8 @@ class TestPandasConversion(unittest.TestCase):
'2010-08-13T05:46:57.437699912'],
dtype='datetime64[ns]')
})
- field = pa.Field.from_py('datetime64', pa.timestamp('ns'))
- schema = pa.Schema.from_fields([field])
+ field = pa.field('datetime64', pa.timestamp('ns'))
+ schema = pa.schema([field])
self._check_pandas_roundtrip(df, timestamps_to_ms=False,
expected_schema=schema)
@@ -304,8 +304,8 @@ class TestPandasConversion(unittest.TestCase):
'2010-08-13T05:46:57.437'],
dtype='datetime64[ms]')
})
- field = pa.Field.from_py('datetime64', pa.timestamp('ms'))
- schema = pa.Schema.from_fields([field])
+ field = pa.field('datetime64', pa.timestamp('ms'))
+ schema = pa.schema([field])
self._check_pandas_roundtrip(df, timestamps_to_ms=True,
expected_schema=schema)
@@ -316,8 +316,8 @@ class TestPandasConversion(unittest.TestCase):
'2010-08-13T05:46:57.437699912'],
dtype='datetime64[ns]')
})
- field = pa.Field.from_py('datetime64', pa.timestamp('ns'))
- schema = pa.Schema.from_fields([field])
+ field = pa.field('datetime64', pa.timestamp('ns'))
+ schema = pa.schema([field])
self._check_pandas_roundtrip(df, timestamps_to_ms=False,
expected_schema=schema)
@@ -353,8 +353,8 @@ class TestPandasConversion(unittest.TestCase):
datetime.date(1970, 1, 1),
datetime.date(2040, 2, 26)]})
table = pa.Table.from_pandas(df)
- field = pa.Field.from_py('date', pa.date32())
- schema = pa.Schema.from_fields([field])
+ field = pa.field('date', pa.date32())
+ schema = pa.schema([field])
assert table.schema.equals(schema)
result = table.to_pandas()
expected = df.copy()
@@ -526,8 +526,8 @@ class TestPandasConversion(unittest.TestCase):
]
})
converted = pa.Table.from_pandas(expected)
- field = pa.Field.from_py('decimals', pa.decimal(7, 3))
- schema = pa.Schema.from_fields([field])
+ field = pa.field('decimals', pa.decimal(7, 3))
+ schema = pa.schema([field])
assert converted.schema.equals(schema)
def test_decimal_32_to_pandas(self):
@@ -549,8 +549,8 @@ class TestPandasConversion(unittest.TestCase):
]
})
converted = pa.Table.from_pandas(expected)
- field = pa.Field.from_py('decimals', pa.decimal(12, 6))
- schema = pa.Schema.from_fields([field])
+ field = pa.field('decimals', pa.decimal(12, 6))
+ schema = pa.schema([field])
assert converted.schema.equals(schema)
def test_decimal_64_to_pandas(self):
@@ -572,8 +572,8 @@ class TestPandasConversion(unittest.TestCase):
]
})
converted = pa.Table.from_pandas(expected)
- field = pa.Field.from_py('decimals', pa.decimal(26, 11))
- schema = pa.Schema.from_fields([field])
+ field = pa.field('decimals', pa.decimal(26, 11))
+ schema = pa.schema([field])
assert converted.schema.equals(schema)
def test_decimal_128_to_pandas(self):
http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/python/pyarrow/tests/test_schema.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py
index da704f3..b3abc0f 100644
--- a/python/pyarrow/tests/test_schema.py
+++ b/python/pyarrow/tests/test_schema.py
@@ -118,7 +118,7 @@ def test_field():
assert f.name == 'foo'
assert f.nullable
assert f.type is t
- assert repr(f) == "Field('foo', type=string)"
+ assert repr(f) == "pyarrow.Field<foo: string>"
f = pa.field('foo', t, False)
assert not f.nullable
@@ -152,6 +152,52 @@ def test_field_empty():
repr(f)
+def test_field_add_remove_metadata():
+ f0 = pa.field('foo', pa.int32())
+
+ assert f0.metadata is None
+
+ metadata = {b'foo': b'bar', b'pandas': b'badger'}
+
+ f1 = f0.add_metadata(metadata)
+ assert f1.metadata == metadata
+
+ f3 = f1.remove_metadata()
+ assert f3.metadata is None
+
+ # idempotent
+ f4 = f3.remove_metadata()
+ assert f4.metadata is None
+
+ f5 = pa.field('foo', pa.int32(), True, metadata)
+ f6 = f0.add_metadata(metadata)
+ assert f5.equals(f6)
+
+
+def test_schema_add_remove_metadata():
+ fields = [
+ pa.field('foo', pa.int32()),
+ pa.field('bar', pa.string()),
+ pa.field('baz', pa.list_(pa.int8()))
+ ]
+
+ s1 = pa.schema(fields)
+
+ assert s1.metadata is None
+
+ metadata = {b'foo': b'bar', b'pandas': b'badger'}
+
+ s2 = s1.add_metadata(metadata)
+ assert s2.metadata == metadata
+
+ s3 = s2.remove_metadata()
+ assert s3.metadata is None
+
+ # idempotent
+ s4 = s3.remove_metadata()
+ assert s4.metadata is None
+
+
def test_schema_equals():
fields = [
pa.field('foo', pa.int32()),