You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/04/25 21:36:36 UTC
arrow git commit: ARROW-483: [C++/Python] Provide access to
"custom_metadata" Field attribute in IPC setting
Repository: arrow
Updated Branches:
refs/heads/master 949249d9e -> 7d433dc27
ARROW-483: [C++/Python] Provide access to "custom_metadata" Field attribute in IPC setting
Author: Phillip Cloud <cp...@gmail.com>
Closes #588 from cpcloud/ARROW-483 and squashes the following commits:
f671ba4 [Phillip Cloud] ARROW-483: [C++/Python] Provide access to "custom_metadata" Field attribute in IPC setting
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/7d433dc2
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/7d433dc2
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/7d433dc2
Branch: refs/heads/master
Commit: 7d433dc27bf70b5d80b8c88261a19cdc615defdb
Parents: 949249d
Author: Phillip Cloud <cp...@gmail.com>
Authored: Tue Apr 25 17:36:31 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Tue Apr 25 17:36:31 2017 -0400
----------------------------------------------------------------------
cpp/CMakeLists.txt | 1 +
cpp/src/arrow/array.cc | 2 +-
cpp/src/arrow/builder.cc | 13 ++-
cpp/src/arrow/ipc/metadata.cc | 30 ++++++-
cpp/src/arrow/type-test.cc | 34 ++++++++
cpp/src/arrow/type.cc | 20 ++++-
cpp/src/arrow/type.h | 10 ++-
cpp/src/arrow/util/CMakeLists.txt | 2 +
cpp/src/arrow/util/key-value-metadata-test.cc | 87 +++++++++++++++++++
cpp/src/arrow/util/key_value_metadata.cc | 99 ++++++++++++++++++++++
cpp/src/arrow/util/key_value_metadata.h | 56 ++++++++++++
format/Schema.fbs | 2 +-
python/.gitignore | 1 +
python/pyarrow/_array.pxd | 2 +
python/pyarrow/_array.pyx | 7 ++
python/pyarrow/_table.pyx | 64 ++++++++------
python/pyarrow/includes/common.pxd | 3 +-
python/pyarrow/includes/libarrow.pxd | 11 ++-
18 files changed, 401 insertions(+), 43 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/cpp/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 2d8c00f..5abe5f1 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -944,6 +944,7 @@ set(ARROW_SRCS
src/arrow/util/bit-util.cc
src/arrow/util/decimal.cc
+ src/arrow/util/key_value_metadata.cc
)
if (ARROW_IPC)
http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/cpp/src/arrow/array.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index e640bbd..76dda2c 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -113,7 +113,7 @@ Status Array::Validate() const {
static inline void ConformSliceParams(
int64_t array_offset, int64_t array_length, int64_t* offset, int64_t* length) {
DCHECK_LE(*offset, array_length);
- DCHECK_GE(offset, 0);
+ DCHECK_NE(offset, nullptr);
*length = std::min(array_length - *offset, *length);
*offset = array_offset + *offset;
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/cpp/src/arrow/builder.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index d85eb32..4ecb8d3 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -363,8 +363,6 @@ ARROW_EXPORT Status DecimalBuilder::Append(const decimal::Decimal128& value) {
return Status::OK();
}
-template ARROW_EXPORT Status DecimalBuilder::Append(const decimal::Decimal128& val);
-
Status DecimalBuilder::Init(int64_t capacity) {
RETURN_NOT_OK(FixedSizeBinaryBuilder::Init(capacity));
if (byte_width_ == 16) {
@@ -408,16 +406,17 @@ Status DecimalBuilder::Finish(std::shared_ptr<Array>* out) {
ListBuilder::ListBuilder(MemoryPool* pool, std::shared_ptr<ArrayBuilder> value_builder,
const std::shared_ptr<DataType>& type)
- : ArrayBuilder(
- pool, type ? type : std::static_pointer_cast<DataType>(
- std::make_shared<ListType>(value_builder->type()))),
+ : ArrayBuilder(pool,
+ type ? type : std::static_pointer_cast<DataType>(
+ std::make_shared<ListType>(value_builder->type()))),
offset_builder_(pool),
value_builder_(value_builder) {}
ListBuilder::ListBuilder(MemoryPool* pool, std::shared_ptr<Array> values,
const std::shared_ptr<DataType>& type)
- : ArrayBuilder(pool, type ? type : std::static_pointer_cast<DataType>(
- std::make_shared<ListType>(values->type()))),
+ : ArrayBuilder(pool,
+ type ? type : std::static_pointer_cast<DataType>(
+ std::make_shared<ListType>(values->type()))),
offset_builder_(pool),
values_(values) {}
http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/cpp/src/arrow/ipc/metadata.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/metadata.cc b/cpp/src/arrow/ipc/metadata.cc
index 791948b..c0b518a 100644
--- a/cpp/src/arrow/ipc/metadata.cc
+++ b/cpp/src/arrow/ipc/metadata.cc
@@ -45,6 +45,7 @@ namespace ipc {
using FBB = flatbuffers::FlatBufferBuilder;
using DictionaryOffset = flatbuffers::Offset<flatbuf::DictionaryEncoding>;
using FieldOffset = flatbuffers::Offset<flatbuf::Field>;
+using KeyValueOffset = flatbuffers::Offset<flatbuf::KeyValue>;
using RecordBatchOffset = flatbuffers::Offset<flatbuf::RecordBatch>;
using VectorLayoutOffset = flatbuffers::Offset<arrow::flatbuf::VectorLayout>;
using Offset = flatbuffers::Offset<void>;
@@ -583,6 +584,7 @@ flatbuf::Endianness endianness() {
static Status SchemaToFlatbuffer(FBB& fbb, const Schema& schema,
DictionaryMemo* dictionary_memo, flatbuffers::Offset<flatbuf::Schema>* out) {
+ /// Fields
std::vector<FieldOffset> field_offsets;
for (int i = 0; i < schema.num_fields(); ++i) {
std::shared_ptr<Field> field = schema.field(i);
@@ -591,7 +593,20 @@ static Status SchemaToFlatbuffer(FBB& fbb, const Schema& schema,
field_offsets.push_back(offset);
}
- *out = flatbuf::CreateSchema(fbb, endianness(), fbb.CreateVector(field_offsets));
+ /// Custom metadata
+ const auto& custom_metadata_ = schema.custom_metadata();
+ std::vector<KeyValueOffset> key_value_offsets;
+ size_t metadata_size = custom_metadata_.size();
+ key_value_offsets.reserve(metadata_size);
+ for (size_t i = 0; i < metadata_size; ++i) {
+ const auto& key = custom_metadata_.key(i);
+ const auto& value = custom_metadata_.value(i);
+ key_value_offsets.push_back(
+ flatbuf::CreateKeyValue(fbb, fbb.CreateString(key), fbb.CreateString(value)));
+ }
+
+ *out = flatbuf::CreateSchema(fbb, endianness(), fbb.CreateVector(field_offsets),
+ fbb.CreateVector(key_value_offsets));
return Status::OK();
}
@@ -939,7 +954,18 @@ Status GetSchema(const void* opaque_schema, const DictionaryMemo& dictionary_mem
const flatbuf::Field* field = schema->fields()->Get(i);
RETURN_NOT_OK(FieldFromFlatbuffer(field, dictionary_memo, &fields[i]));
}
- *out = std::make_shared<Schema>(fields);
+
+ KeyValueMetadata custom_metadata;
+ auto fb_metadata = schema->custom_metadata();
+ if (fb_metadata != nullptr) {
+ custom_metadata.reserve(fb_metadata->size());
+
+ for (const auto& pair : *fb_metadata) {
+ custom_metadata.Append(pair->key()->str(), pair->value()->str());
+ }
+ }
+
+ *out = std::make_shared<Schema>(fields, custom_metadata);
return Status::OK();
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/cpp/src/arrow/type-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type-test.cc b/cpp/src/arrow/type-test.cc
index dec7268..8e2dfd5 100644
--- a/cpp/src/arrow/type-test.cc
+++ b/cpp/src/arrow/type-test.cc
@@ -117,6 +117,40 @@ TEST_F(TestSchema, GetFieldByName) {
ASSERT_TRUE(result == nullptr);
}
+// A Schema constructed with custom metadata must expose an equal
+// KeyValueMetadata through custom_metadata().
+TEST_F(TestSchema, TestCustomMetadataConstruction) {
+  const KeyValueMetadata expected({"foo", "bar"}, {"bizz", "buzz"});
+  vector<shared_ptr<Field>> columns = {
+      field("f0", int32()), field("f1", uint8(), false), field("f2", utf8())};
+  auto schema = std::make_shared<Schema>(columns, expected);
+  ASSERT_TRUE(expected.Equals(schema->custom_metadata()));
+}
+
+// AddCustomMetadata must produce a new schema that carries the supplied
+// metadata.
+TEST_F(TestSchema, TestAddCustomMetadata) {
+  auto f0 = field("f0", int32());
+  auto f1 = field("f1", uint8(), false);
+  auto f2 = field("f2", utf8());
+  vector<shared_ptr<Field>> fields = {f0, f1, f2};
+  KeyValueMetadata metadata({"foo", "bar"}, {"bizz", "buzz"});
+  auto schema = std::make_shared<Schema>(fields);
+  std::shared_ptr<Schema> new_schema;
+  // Check the returned Status instead of discarding it, so that a failure is
+  // reported here rather than as a null dereference further down.
+  ASSERT_TRUE(schema->AddCustomMetadata(metadata, &new_schema).ok());
+  ASSERT_TRUE(new_schema != nullptr);
+  ASSERT_TRUE(metadata.Equals(new_schema->custom_metadata()));
+  // The schema the call was made on was built without metadata and still has
+  // none.
+  ASSERT_EQ(0, schema->custom_metadata().size());
+}
+
+// RemoveCustomMetadata must produce a new schema with empty metadata.
+TEST_F(TestSchema, TestRemoveCustomMetadata) {
+  auto f0 = field("f0", int32());
+  auto f1 = field("f1", uint8(), false);
+  auto f2 = field("f2", utf8());
+  vector<shared_ptr<Field>> fields = {f0, f1, f2};
+  KeyValueMetadata metadata({"foo", "bar"}, {"bizz", "buzz"});
+  // Start from a schema that really has metadata; a schema built without any
+  // would make the final assertion pass even if removal did nothing.
+  auto schema = std::make_shared<Schema>(fields, metadata);
+  ASSERT_EQ(2, schema->custom_metadata().size());
+  std::shared_ptr<Schema> new_schema;
+  ASSERT_TRUE(schema->RemoveCustomMetadata(&new_schema).ok());
+  ASSERT_TRUE(new_schema != nullptr);
+  ASSERT_EQ(0, new_schema->custom_metadata().size());
+}
+
#define PRIMITIVE_TEST(KLASS, ENUM, NAME) \
TEST(TypesTest, TestPrimitive_##ENUM) { \
KLASS tp; \
http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/cpp/src/arrow/type.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index 2e454ae..f59f8fb 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -24,6 +24,7 @@
#include "arrow/array.h"
#include "arrow/compare.h"
#include "arrow/status.h"
+#include "arrow/util/key_value_metadata.h"
#include "arrow/util/logging.h"
#include "arrow/util/stl.h"
#include "arrow/visitor.h"
@@ -231,7 +232,9 @@ std::string NullType::ToString() const {
// ----------------------------------------------------------------------
// Schema implementation
-Schema::Schema(const std::vector<std::shared_ptr<Field>>& fields) : fields_(fields) {}
+Schema::Schema(const std::vector<std::shared_ptr<Field>>& fields,
+ const KeyValueMetadata& custom_metadata)
+ : fields_(fields), custom_metadata_(custom_metadata) {}
bool Schema::Equals(const Schema& other) const {
if (this == &other) { return true; }
@@ -263,7 +266,18 @@ Status Schema::AddField(
DCHECK_GE(i, 0);
DCHECK_LE(i, this->num_fields());
- *out = std::make_shared<Schema>(AddVectorElement(fields_, i, field));
+ *out = std::make_shared<Schema>(AddVectorElement(fields_, i, field), custom_metadata_);
+ return Status::OK();
+}
+
+// Writes to *out a new Schema with this schema's fields and exactly the given
+// custom_metadata (the metadata currently held by this schema is not merged
+// in). Always returns Status::OK().
+Status Schema::AddCustomMetadata(
+    const KeyValueMetadata& custom_metadata, std::shared_ptr<Schema>* out) const {
+  auto with_metadata = std::make_shared<Schema>(fields_, custom_metadata);
+  *out = with_metadata;
+  return Status::OK();
+}
+
+// Writes to *out a new Schema with this schema's fields and empty custom
+// metadata. Always returns Status::OK().
+// NOTE(review): unlike AddCustomMetadata this member is not declared const,
+// although it does not modify the schema it is called on.
+Status Schema::RemoveCustomMetadata(std::shared_ptr<Schema>* out) {
+  const KeyValueMetadata no_metadata;
+  *out = std::make_shared<Schema>(fields_, no_metadata);
+  return Status::OK();
+}
@@ -271,7 +285,7 @@ Status Schema::RemoveField(int i, std::shared_ptr<Schema>* out) const {
DCHECK_GE(i, 0);
DCHECK_LT(i, this->num_fields());
- *out = std::make_shared<Schema>(DeleteVectorElement(fields_, i));
+ *out = std::make_shared<Schema>(DeleteVectorElement(fields_, i), custom_metadata_);
return Status::OK();
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/cpp/src/arrow/type.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index ea4ea03..dc94561 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -28,6 +28,7 @@
#include "arrow/status.h"
#include "arrow/type_fwd.h"
+#include "arrow/util/key_value_metadata.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
#include "arrow/visitor.h"
@@ -677,7 +678,8 @@ class ARROW_EXPORT DictionaryType : public FixedWidthType {
class ARROW_EXPORT Schema {
public:
- explicit Schema(const std::vector<std::shared_ptr<Field>>& fields);
+ explicit Schema(const std::vector<std::shared_ptr<Field>>& fields,
+ const KeyValueMetadata& custom_metadata = KeyValueMetadata());
// Returns true if all of the schema fields are equal
bool Equals(const Schema& other) const;
@@ -689,6 +691,7 @@ class ARROW_EXPORT Schema {
std::shared_ptr<Field> GetFieldByName(const std::string& name);
const std::vector<std::shared_ptr<Field>>& fields() const { return fields_; }
+ const KeyValueMetadata& custom_metadata() const { return custom_metadata_; }
// Render a string representation of the schema suitable for debugging
std::string ToString() const;
@@ -697,11 +700,16 @@ class ARROW_EXPORT Schema {
int i, const std::shared_ptr<Field>& field, std::shared_ptr<Schema>* out) const;
Status RemoveField(int i, std::shared_ptr<Schema>* out) const;
+ Status AddCustomMetadata(
+ const KeyValueMetadata& metadata, std::shared_ptr<Schema>* out) const;
+ Status RemoveCustomMetadata(std::shared_ptr<Schema>* out);
+
int num_fields() const { return static_cast<int>(fields_.size()); }
private:
std::vector<std::shared_ptr<Field>> fields_;
std::unordered_map<std::string, int> name_to_index_;
+ KeyValueMetadata custom_metadata_;
};
// ----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/cpp/src/arrow/util/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt
index b22c8ac..ac7e866 100644
--- a/cpp/src/arrow/util/CMakeLists.txt
+++ b/cpp/src/arrow/util/CMakeLists.txt
@@ -26,6 +26,7 @@ install(FILES
macros.h
random.h
visibility.h
+ key_value_metadata.h
DESTINATION include/arrow/util)
#######################################
@@ -52,3 +53,4 @@ endif()
ADD_ARROW_TEST(bit-util-test)
ADD_ARROW_TEST(stl-util-test)
ADD_ARROW_TEST(decimal-test)
+ADD_ARROW_TEST(key-value-metadata-test)
http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/cpp/src/arrow/util/key-value-metadata-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/key-value-metadata-test.cc b/cpp/src/arrow/util/key-value-metadata-test.cc
new file mode 100644
index 0000000..aadc989
--- /dev/null
+++ b/cpp/src/arrow/util/key-value-metadata-test.cc
@@ -0,0 +1,87 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gtest/gtest.h"
+
+#include "arrow/util/key_value_metadata.h"
+
+#include "arrow/test-util.h"
+
+namespace arrow {
+
+// A default-constructed KeyValueMetadata holds no pairs.
+TEST(KeyValueMetadataTest, SimpleConstruction) {
+  KeyValueMetadata empty_metadata;
+  ASSERT_EQ(0, empty_metadata.size());
+}
+
+// Construction from parallel key/value vectors keeps every pair at the index
+// it had in the input.
+TEST(KeyValueMetadataTest, StringVectorConstruction) {
+  const std::vector<std::string> keys = {"foo", "bar"};
+  const std::vector<std::string> values = {"bizz", "buzz"};
+  const KeyValueMetadata metadata(keys, values);
+
+  ASSERT_EQ(2, metadata.size());
+
+  ASSERT_EQ("foo", metadata.key(0));
+  ASSERT_EQ("bizz", metadata.value(0));
+
+  ASSERT_EQ("bar", metadata.key(1));
+  ASSERT_EQ("buzz", metadata.value(1));
+}
+
+// Construction from an unordered_map followed by ToUnorderedMap must give
+// back an equal map.
+TEST(KeyValueMetadataTest, StringMapConstruction) {
+  const std::unordered_map<std::string, std::string> expected = {
+      {"foo", "bizz"}, {"bar", "buzz"}};
+  const KeyValueMetadata metadata(expected);
+
+  std::unordered_map<std::string, std::string> round_tripped;
+  round_tripped.reserve(expected.size());
+  metadata.ToUnorderedMap(&round_tripped);
+
+  ASSERT_EQ(expected, round_tripped);
+  ASSERT_EQ(2, metadata.size());
+}
+
+// Append adds new pairs after the existing ones and grows size() accordingly.
+TEST(KeyValueMetadataTest, StringAppend) {
+  std::vector<std::string> keys = {"foo", "bar"};
+  std::vector<std::string> values = {"bizz", "buzz"};
+
+  KeyValueMetadata metadata(keys, values);
+  ASSERT_EQ("foo", metadata.key(0));
+  ASSERT_EQ("bar", metadata.key(1));
+  ASSERT_EQ("bizz", metadata.value(0));
+  ASSERT_EQ("buzz", metadata.value(1));
+  ASSERT_EQ(2, metadata.size());
+
+  metadata.Append("purple", "orange");
+  metadata.Append("blue", "red");
+
+  // The size must reflect both appended pairs.
+  ASSERT_EQ(4, metadata.size());
+
+  // The pairs that were present before the appends are untouched.
+  ASSERT_EQ("foo", metadata.key(0));
+  ASSERT_EQ("buzz", metadata.value(1));
+
+  ASSERT_EQ("purple", metadata.key(2));
+  ASSERT_EQ("blue", metadata.key(3));
+
+  ASSERT_EQ("orange", metadata.value(2));
+  ASSERT_EQ("red", metadata.value(3));
+}
+
+// Equals is true for identical key/value sequences and false when the values
+// are paired with the keys differently.
+TEST(KeyValueMetadataTest, Equals) {
+  const std::vector<std::string> keys = {"foo", "bar"};
+  const std::vector<std::string> values = {"bizz", "buzz"};
+
+  const KeyValueMetadata reference(keys, values);
+  const KeyValueMetadata same_pairs(keys, values);
+  const KeyValueMetadata swapped_values(keys, {"buzz", "bizz"});
+
+  ASSERT_TRUE(reference.Equals(same_pairs));
+  ASSERT_FALSE(reference.Equals(swapped_values));
+}
+
+} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/cpp/src/arrow/util/key_value_metadata.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/key_value_metadata.cc b/cpp/src/arrow/util/key_value_metadata.cc
new file mode 100644
index 0000000..c91478b
--- /dev/null
+++ b/cpp/src/arrow/util/key_value_metadata.cc
@@ -0,0 +1,99 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <algorithm>
+
+#include "arrow/util/key_value_metadata.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+// Returns the keys of `map`, in the map's iteration order.
+static std::vector<std::string> UnorderedMapKeys(
+    const std::unordered_map<std::string, std::string>& map) {
+  std::vector<std::string> result;
+  result.reserve(map.size());
+  for (auto it = map.begin(); it != map.end(); ++it) {
+    result.push_back(it->first);
+  }
+  return result;
+}
+
+// Returns the mapped values of `map`, in the map's iteration order.
+static std::vector<std::string> UnorderedMapValues(
+    const std::unordered_map<std::string, std::string>& map) {
+  std::vector<std::string> result;
+  result.reserve(map.size());
+  for (auto it = map.begin(); it != map.end(); ++it) {
+    result.push_back(it->second);
+  }
+  return result;
+}
+
+// Creates an empty metadata object: both internal vectors start out empty.
+KeyValueMetadata::KeyValueMetadata() {}
+
+// Builds the metadata from an unordered_map. Keys and values are collected by
+// two separate passes over `map` (UnorderedMapKeys / UnorderedMapValues); the
+// pairing of keys_[i] with values_[i] relies on both passes visiting the
+// entries of the unmodified map in the same order.
+KeyValueMetadata::KeyValueMetadata(
+ const std::unordered_map<std::string, std::string>& map)
+ : keys_(UnorderedMapKeys(map)), values_(UnorderedMapValues(map)) {}
+
+// Builds the metadata from parallel vectors: values[i] is the value stored
+// for keys[i]. Both vectors are copied.
+KeyValueMetadata::KeyValueMetadata(
+    const std::vector<std::string>& keys, const std::vector<std::string>& values)
+    : keys_(keys), values_(values) {
+  // Every key needs a value at the same index.
+  DCHECK_EQ(keys_.size(), values_.size());
+}
+
+// Inserts every stored key/value pair into *out. Entries already present in
+// *out are kept, and a key that is already in the map is not overwritten.
+void KeyValueMetadata::ToUnorderedMap(
+    std::unordered_map<std::string, std::string>* out) const {
+  DCHECK_NE(out, nullptr);
+  const int64_t num_pairs = size();
+  out->reserve(num_pairs);
+  for (int64_t idx = 0; idx != num_pairs; ++idx) {
+    out->emplace(key(idx), value(idx));
+  }
+}
+
+// Adds one key/value pair after the existing ones. No check for an already
+// present key is made.
+void KeyValueMetadata::Append(const std::string& key, const std::string& value) {
+  keys_.emplace_back(key);
+  values_.emplace_back(value);
+}
+
+// Reserves storage for n pairs in both the key and the value vector.
+// n must not be negative.
+void KeyValueMetadata::reserve(int64_t n) {
+  DCHECK_GE(n, 0);
+  const size_t capacity = static_cast<size_t>(n);
+  keys_.reserve(capacity);
+  values_.reserve(capacity);
+}
+
+// Returns the number of stored pairs (the length of the key vector, which is
+// expected to match the length of the value vector).
+int64_t KeyValueMetadata::size() const {
+  const auto num_keys = keys_.size();
+  DCHECK_EQ(num_keys, values_.size());
+  return static_cast<int64_t>(num_keys);
+}
+
+// Returns a copy of the key stored at position i.
+// i must satisfy 0 <= i < number of keys.
+std::string KeyValueMetadata::key(int64_t i) const {
+  DCHECK_GE(i, 0);
+  // Also reject indices past the end: operator[] performs no bounds check of
+  // its own.
+  DCHECK_LT(i, static_cast<int64_t>(keys_.size()));
+  return keys_[static_cast<size_t>(i)];
+}
+
+// Returns a copy of the value stored at position i.
+// i must satisfy 0 <= i < number of values.
+std::string KeyValueMetadata::value(int64_t i) const {
+  DCHECK_GE(i, 0);
+  // Also reject indices past the end: operator[] performs no bounds check of
+  // its own.
+  DCHECK_LT(i, static_cast<int64_t>(values_.size()));
+  return values_[static_cast<size_t>(i)];
+}
+
+// Two instances are equal when they hold the same keys and the same values,
+// in the same order.
+bool KeyValueMetadata::Equals(const KeyValueMetadata& other) const {
+  // size() is still called on both sides so that its keys/values length check
+  // keeps running.
+  if (size() != other.size()) { return false; }
+  // vector::operator== compares lengths before elements, so unlike the
+  // three-iterator std::equal it cannot read past the end of the other
+  // object's vectors when values_ and other.values_ differ in length.
+  return keys_ == other.keys_ && values_ == other.values_;
+}
+} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/cpp/src/arrow/util/key_value_metadata.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/key_value_metadata.h b/cpp/src/arrow/util/key_value_metadata.h
new file mode 100644
index 0000000..713b2c0
--- /dev/null
+++ b/cpp/src/arrow/util/key_value_metadata.h
@@ -0,0 +1,56 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef ARROW_UTIL_KEY_VALUE_METADATA_H
+#define ARROW_UTIL_KEY_VALUE_METADATA_H
+
+#include <cstdint>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+// An ordered list of string key/value pairs, stored as two parallel vectors
+// (keys_[i] belongs with values_[i]).
+class ARROW_EXPORT KeyValueMetadata {
+ public:
+ // Creates an empty list.
+ KeyValueMetadata();
+ // Copies the given parallel vectors; keys and values are expected to have
+ // the same length.
+ KeyValueMetadata(
+ const std::vector<std::string>& keys, const std::vector<std::string>& values);
+ // Copies all entries of map, in the map's iteration order.
+ explicit KeyValueMetadata(const std::unordered_map<std::string, std::string>& map);
+
+ // Inserts every stored pair into *out; out must not be null.
+ void ToUnorderedMap(std::unordered_map<std::string, std::string>* out) const;
+
+ // Adds one pair at the end.
+ void Append(const std::string& key, const std::string& value);
+
+ // Reserves storage for n pairs; n must not be negative.
+ void reserve(int64_t n);
+ // Number of stored pairs.
+ int64_t size() const;
+
+ // Copy of the i-th key / i-th value.
+ std::string key(int64_t i) const;
+ std::string value(int64_t i) const;
+
+ // True when both objects hold the same keys and values in the same order.
+ bool Equals(const KeyValueMetadata& other) const;
+
+ private:
+ // Parallel vectors: values_[i] is the value for keys_[i].
+ std::vector<std::string> keys_;
+ std::vector<std::string> values_;
+};
+
+} // namespace arrow
+
+#endif // ARROW_UTIL_KEY_VALUE_METADATA_H
http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/format/Schema.fbs
----------------------------------------------------------------------
diff --git a/format/Schema.fbs b/format/Schema.fbs
index b48859f..8de5c6d 100644
--- a/format/Schema.fbs
+++ b/format/Schema.fbs
@@ -200,7 +200,7 @@ table VectorLayout {
table KeyValue {
key: string;
- value: [ubyte];
+ value: string;
}
/// ----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/python/.gitignore
----------------------------------------------------------------------
diff --git a/python/.gitignore b/python/.gitignore
index ba40c3e..6c0d5a9 100644
--- a/python/.gitignore
+++ b/python/.gitignore
@@ -33,3 +33,4 @@ coverage.xml
# benchmark working dir
.asv
+pyarrow/_table_api.h
http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/python/pyarrow/_array.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/_array.pxd b/python/pyarrow/_array.pxd
index 464de31..4d5db86 100644
--- a/python/pyarrow/_array.pxd
+++ b/python/pyarrow/_array.pxd
@@ -81,6 +81,8 @@ cdef class Schema:
cdef init(self, const vector[shared_ptr[CField]]& fields)
cdef init_schema(self, const shared_ptr[CSchema]& schema)
+ cpdef dict custom_metadata(self)
+
cdef class Scalar:
cdef readonly:
http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/python/pyarrow/_array.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/_array.pyx b/python/pyarrow/_array.pyx
index 1c571ba..2fb20b7 100644
--- a/python/pyarrow/_array.pyx
+++ b/python/pyarrow/_array.pyx
@@ -244,6 +244,13 @@ cdef class Schema:
self.schema = schema.get()
self.sp_schema = schema
+ # Return the schema's custom key/value metadata. The C++ KeyValueMetadata
+ # is copied, emptied into a C++ unordered_map through ToUnorderedMap, and
+ # that map is returned under the declared `dict` return type.
+ cpdef dict custom_metadata(self):
+ cdef:
+ CKeyValueMetadata metadata = self.schema.custom_metadata()
+ unordered_map[c_string, c_string] result
+ metadata.ToUnorderedMap(&result)
+ return result
+
def equals(self, other):
"""
Test if this schema is equal to the other
http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/python/pyarrow/_table.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/_table.pyx b/python/pyarrow/_table.pyx
index 78fec75..ed0782b 100644
--- a/python/pyarrow/_table.pyx
+++ b/python/pyarrow/_table.pyx
@@ -34,7 +34,6 @@ from pyarrow._error import ArrowException
from pyarrow._array import field
from pyarrow.compat import frombytes, tobytes
-
from collections import OrderedDict
@@ -273,15 +272,22 @@ cdef class Column:
return chunked_array
-cdef _schema_from_arrays(arrays, names, shared_ptr[CSchema]* schema):
+# Build a C++ KeyValueMetadata from a Python dict: the dict is first assigned
+# to a C++ unordered_map[c_string, c_string], which is then handed to the
+# KeyValueMetadata map constructor.
+# NOTE(review): presumably keys and values must be bytes for the assignment
+# to the c_string map to succeed - confirm against callers.
+cdef CKeyValueMetadata key_value_metadata_from_dict(dict metadata):
+ cdef:
+ unordered_map[c_string, c_string] unordered_metadata = metadata
+ CKeyValueMetadata c_metadata = CKeyValueMetadata(unordered_metadata)
+ return c_metadata
+
+
+cdef int _schema_from_arrays(
+ arrays, names, dict metadata, shared_ptr[CSchema]* schema) except -1:
cdef:
Array arr
Column col
c_string c_name
vector[shared_ptr[CField]] fields
- cdef shared_ptr[CDataType] type_
-
- cdef int K = len(arrays)
+ shared_ptr[CDataType] type_
+ int K = len(arrays)
fields.resize(K)
@@ -306,15 +312,16 @@ cdef _schema_from_arrays(arrays, names, shared_ptr[CSchema]* schema):
else:
raise TypeError(type(arrays[0]))
- schema.reset(new CSchema(fields))
-
+ schema.reset(new CSchema(fields, key_value_metadata_from_dict(metadata)))
+ return 0
-cdef _dataframe_to_arrays(df, timestamps_to_ms, Schema schema):
+cdef tuple _dataframe_to_arrays(df, bint timestamps_to_ms, Schema schema):
cdef:
list names = []
list arrays = []
DataType type = None
+ dict metadata = {}
for name in df.columns:
col = df[name]
@@ -326,7 +333,7 @@ cdef _dataframe_to_arrays(df, timestamps_to_ms, Schema schema):
names.append(name)
arrays.append(arr)
- return names, arrays
+ return names, arrays, metadata
cdef class RecordBatch:
@@ -486,11 +493,11 @@ cdef class RecordBatch:
-------
pyarrow.table.RecordBatch
"""
- names, arrays = _dataframe_to_arrays(df, False, schema)
- return cls.from_arrays(arrays, names)
+ names, arrays, metadata = _dataframe_to_arrays(df, False, schema)
+ return cls.from_arrays(arrays, names, metadata)
@staticmethod
- def from_arrays(arrays, names):
+ def from_arrays(list arrays, list names, dict metadata=None):
"""
Construct a RecordBatch from multiple pyarrow.Arrays
@@ -512,15 +519,17 @@ cdef class RecordBatch:
shared_ptr[CRecordBatch] batch
vector[shared_ptr[CArray]] c_arrays
int64_t num_rows
+ int64_t i
+ int64_t number_of_arrays = len(arrays)
- if len(arrays) == 0:
+ if not number_of_arrays:
raise ValueError('Record batch cannot contain no arrays (for now)')
num_rows = len(arrays[0])
- _schema_from_arrays(arrays, names, &schema)
+ _schema_from_arrays(arrays, names, metadata or {}, &schema)
- for i in range(len(arrays)):
- arr = arrays[i]
+ c_arrays.reserve(len(arrays))
+ for arr in arrays:
c_arrays.push_back(arr.sp_array)
batch.reset(new CRecordBatch(schema, num_rows, c_arrays))
@@ -656,13 +665,13 @@ cdef class Table:
>>> pa.Table.from_pandas(df)
<pyarrow.table.Table object at 0x7f05d1fb1b40>
"""
- names, arrays = _dataframe_to_arrays(df,
+ names, arrays, metadata = _dataframe_to_arrays(df,
timestamps_to_ms=timestamps_to_ms,
schema=schema)
- return cls.from_arrays(arrays, names=names)
+ return cls.from_arrays(arrays, names=names, metadata=metadata)
@staticmethod
- def from_arrays(arrays, names=None):
+ def from_arrays(arrays, names=None, dict metadata=None):
"""
Construct a Table from Arrow arrays or columns
@@ -680,22 +689,25 @@ cdef class Table:
"""
cdef:
- vector[shared_ptr[CField]] fields
vector[shared_ptr[CColumn]] columns
shared_ptr[CSchema] schema
shared_ptr[CTable] table
+ size_t K = len(arrays)
- _schema_from_arrays(arrays, names, &schema)
+ _schema_from_arrays(arrays, names, metadata or {}, &schema)
- cdef int K = len(arrays)
- columns.resize(K)
+ columns.reserve(K)
for i in range(K):
if isinstance(arrays[i], Array):
- columns[i].reset(new CColumn(schema.get().field(i),
- (<Array> arrays[i]).sp_array))
+ columns.push_back(
+ make_shared[CColumn](
+ schema.get().field(i),
+ (<Array> arrays[i]).sp_array
+ )
+ )
elif isinstance(arrays[i], Column):
- columns[i] = (<Column> arrays[i]).sp_column
+ columns.push_back((<Column> arrays[i]).sp_column)
else:
raise ValueError(type(arrays[i]))
http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/python/pyarrow/includes/common.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/common.pxd b/python/pyarrow/includes/common.pxd
index 44723fa..cc3b4b6 100644
--- a/python/pyarrow/includes/common.pxd
+++ b/python/pyarrow/includes/common.pxd
@@ -19,9 +19,10 @@
from libc.stdint cimport *
from libcpp cimport bool as c_bool
-from libcpp.memory cimport shared_ptr, unique_ptr
+from libcpp.memory cimport shared_ptr, unique_ptr, make_shared
from libcpp.string cimport string as c_string
from libcpp.vector cimport vector
+from libcpp.unordered_map cimport unordered_map
from cpython cimport PyObject
cimport cpython
http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 473a0b9..ef1a332 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -1,4 +1,4 @@
-#t Licensed to the Apache Software Foundation (ASF) under one
+# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
@@ -19,6 +19,12 @@
from pyarrow.includes.common cimport *
+# Cython declaration of arrow::KeyValueMetadata. Only the default
+# constructor, the unordered_map constructor and ToUnorderedMap are declared
+# here.
+cdef extern from "arrow/util/key_value_metadata.h" namespace "arrow" nogil:
+ cdef cppclass CKeyValueMetadata" arrow::KeyValueMetadata":
+ CKeyValueMetadata()
+ CKeyValueMetadata(const unordered_map[c_string, c_string]&)
+ void ToUnorderedMap(unordered_map[c_string, c_string]*) const
+
cdef extern from "arrow/api.h" namespace "arrow" nogil:
enum Type" arrow::Type::type":
@@ -170,10 +176,13 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
cdef cppclass CSchema" arrow::Schema":
CSchema(const vector[shared_ptr[CField]]& fields)
+ CSchema(const vector[shared_ptr[CField]]& fields,
+ const CKeyValueMetadata& custom_metadata)
c_bool Equals(const CSchema& other)
shared_ptr[CField] field(int i)
+ const CKeyValueMetadata& custom_metadata() const
shared_ptr[CField] GetFieldByName(c_string& name)
int num_fields()
c_string ToString()