You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/12/19 21:51:18 UTC
[arrow] branch master updated: ARROW-3545: [C++/Python] Use "field"
terminology with StructType, specify behavior with duplicate field names
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new e39e364 ARROW-3545: [C++/Python] Use "field" terminology with StructType, specify behavior with duplicate field names
e39e364 is described below
commit e39e36441b94f211a57685887d14a8ff1d1b5f98
Author: Wes McKinney <we...@apache.org>
AuthorDate: Wed Dec 19 15:51:08 2018 -0600
ARROW-3545: [C++/Python] Use "field" terminology with StructType, specify behavior with duplicate field names
Author: Wes McKinney <we...@apache.org>
Closes #3220 from wesm/ARROW-3545 and squashes the following commits:
dc212e61c <Wes McKinney> Fix more deprecated API uses
16e198473 <Wes McKinney> Remove field_by_name/field APIs from Python bindings, cdef only
3c4abed05 <Wes McKinney> Fix use of deprecated APIs
2eecdbf57 <Wes McKinney> Rename GetChildIndex, GetChildByName for better semantic consistency. Define behavior of these functions when there are duplicate field names. Reflect changes in Python
---
cpp/src/arrow/array.cc | 2 +-
cpp/src/arrow/type-test.cc | 30 ++++++++++++++++++++----------
cpp/src/arrow/type.cc | 33 ++++++++++++++++++++++++++++++---
cpp/src/arrow/type.h | 9 ++++++++-
python/pyarrow/includes/libarrow.pxd | 4 ++--
python/pyarrow/lib.pxd | 3 ++-
python/pyarrow/scalar.pxi | 2 +-
python/pyarrow/tests/test_types.py | 17 +++++++++++++----
python/pyarrow/types.pxi | 12 +++++++++---
9 files changed, 86 insertions(+), 26 deletions(-)
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index 7e45e90..d07c27f 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -395,7 +395,7 @@ std::shared_ptr<Array> StructArray::field(int i) const {
}
std::shared_ptr<Array> StructArray::GetFieldByName(const std::string& name) const {
- int i = struct_type()->GetChildIndex(name);
+ int i = struct_type()->GetFieldIndex(name);
return i == -1 ? nullptr : field(i);
}
diff --git a/cpp/src/arrow/type-test.cc b/cpp/src/arrow/type-test.cc
index 20b7aff..5b758d7 100644
--- a/cpp/src/arrow/type-test.cc
+++ b/cpp/src/arrow/type-test.cc
@@ -448,7 +448,7 @@ TEST(TestStructType, Basics) {
// TODO(wesm): out of bounds for field(...)
}
-TEST(TestStructType, GetChildByName) {
+TEST(TestStructType, GetFieldByName) {
auto f0 = field("f0", int32());
auto f1 = field("f1", uint8(), false);
auto f2 = field("f2", utf8());
@@ -457,17 +457,17 @@ TEST(TestStructType, GetChildByName) {
StructType struct_type({f0, f1, f2, f3});
std::shared_ptr<Field> result;
- result = struct_type.GetChildByName("f1");
+ result = struct_type.GetFieldByName("f1");
ASSERT_EQ(f1, result);
- result = struct_type.GetChildByName("f3");
+ result = struct_type.GetFieldByName("f3");
ASSERT_EQ(f3, result);
- result = struct_type.GetChildByName("not-found");
+ result = struct_type.GetFieldByName("not-found");
ASSERT_EQ(result, nullptr);
}
-TEST(TestStructType, GetChildIndex) {
+TEST(TestStructType, GetFieldIndex) {
auto f0 = field("f0", int32());
auto f1 = field("f1", uint8(), false);
auto f2 = field("f2", utf8());
@@ -475,11 +475,21 @@ TEST(TestStructType, GetChildIndex) {
StructType struct_type({f0, f1, f2, f3});
- ASSERT_EQ(0, struct_type.GetChildIndex(f0->name()));
- ASSERT_EQ(1, struct_type.GetChildIndex(f1->name()));
- ASSERT_EQ(2, struct_type.GetChildIndex(f2->name()));
- ASSERT_EQ(3, struct_type.GetChildIndex(f3->name()));
- ASSERT_EQ(-1, struct_type.GetChildIndex("not-found"));
+ ASSERT_EQ(0, struct_type.GetFieldIndex(f0->name()));
+ ASSERT_EQ(1, struct_type.GetFieldIndex(f1->name()));
+ ASSERT_EQ(2, struct_type.GetFieldIndex(f2->name()));
+ ASSERT_EQ(3, struct_type.GetFieldIndex(f3->name()));
+ ASSERT_EQ(-1, struct_type.GetFieldIndex("not-found"));
+}
+
+TEST(TestStructType, GetFieldIndexDuplicates) {
+ auto f0 = field("f0", int32());
+ auto f1 = field("f1", int64());
+ auto f2 = field("f1", utf8());
+ StructType struct_type({f0, f1, f2});
+
+ ASSERT_EQ(0, struct_type.GetFieldIndex("f0"));
+ ASSERT_EQ(-1, struct_type.GetFieldIndex("f1"));
}
TEST(TestDictionaryType, Equals) {
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index 753cb65..ee7fda7 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -232,18 +232,37 @@ std::string StructType::ToString() const {
return s.str();
}
-std::shared_ptr<Field> StructType::GetChildByName(const std::string& name) const {
- int i = GetChildIndex(name);
+std::shared_ptr<Field> StructType::GetFieldByName(const std::string& name) const {
+ int i = GetFieldIndex(name);
return i == -1 ? nullptr : children_[i];
}
-int StructType::GetChildIndex(const std::string& name) const {
+int StructType::GetFieldIndex(const std::string& name) const {
if (children_.size() > 0 && name_to_index_.size() == 0) {
for (size_t i = 0; i < children_.size(); ++i) {
name_to_index_[children_[i]->name()] = static_cast<int>(i);
}
}
+ if (name_to_index_.size() < children_.size()) {
+ // There are duplicate field names. Refuse to guess
+ int counts = 0;
+ int last_observed_index = -1;
+ for (size_t i = 0; i < children_.size(); ++i) {
+ if (children_[i]->name() == name) {
+ ++counts;
+ last_observed_index = static_cast<int>(i);
+ }
+ }
+
+ if (counts == 1) {
+ return last_observed_index;
+ } else {
+ // Duplicate or not found
+ return -1;
+ }
+ }
+
auto it = name_to_index_.find(name);
if (it == name_to_index_.end()) {
return -1;
@@ -252,6 +271,14 @@ int StructType::GetChildIndex(const std::string& name) const {
}
}
+std::shared_ptr<Field> StructType::GetChildByName(const std::string& name) const {
+ return GetFieldByName(name);
+}
+
+int StructType::GetChildIndex(const std::string& name) const {
+ return GetFieldIndex(name);
+}
+
// ----------------------------------------------------------------------
// DictionaryType
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 8f6cfd6..95b5189 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -516,9 +516,16 @@ class ARROW_EXPORT StructType : public NestedType {
std::string name() const override { return "struct"; }
/// Returns null if name not found
+ std::shared_ptr<Field> GetFieldByName(const std::string& name) const;
+
+ /// Returns -1 if name not found or if there are multiple fields having the
+ /// same name
+ int GetFieldIndex(const std::string& name) const;
+
+ ARROW_DEPRECATED("Use GetFieldByName")
std::shared_ptr<Field> GetChildByName(const std::string& name) const;
- /// Returns -1 if name not found
+ ARROW_DEPRECATED("Use GetChildIndex")
int GetChildIndex(const std::string& name) const;
private:
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 61517e4..f4629af 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -276,8 +276,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
cdef cppclass CStructType" arrow::StructType"(CDataType):
CStructType(const vector[shared_ptr[CField]]& fields)
- shared_ptr[CField] GetChildByName(const c_string& name)
- int GetChildIndex(const c_string& name)
+ shared_ptr[CField] GetFieldByName(const c_string& name)
+ int GetFieldIndex(const c_string& name)
cdef cppclass CUnionType" arrow::UnionType"(CDataType):
CUnionType(const vector[shared_ptr[CField]]& fields,
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index 3e62826..d829d6a 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -65,7 +65,8 @@ cdef class StructType(DataType):
cdef:
const CStructType* struct_type
- cdef Field child_by_name(self, name)
+ cdef Field field(self, int i)
+ cdef Field field_by_name(self, name)
cdef class DictionaryType(DataType):
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index a2a133b..fd3f580 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -470,7 +470,7 @@ cdef class StructValue(ArrayValue):
int index
type = <CStructType*> self.type.type
- index = type.GetChildIndex(tobytes(key))
+ index = type.GetFieldIndex(tobytes(key))
if index < 0:
raise KeyError(key)
diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py
index af2d113..729c76e 100644
--- a/python/pyarrow/tests/test_types.py
+++ b/python/pyarrow/tests/test_types.py
@@ -231,9 +231,12 @@ def test_list_type():
def test_struct_type():
- fields = [pa.field('a', pa.int64()),
- pa.field('a', pa.int32()),
- pa.field('b', pa.int32())]
+ fields = [
+ # Duplicate field name on purpose
+ pa.field('a', pa.int64()),
+ pa.field('a', pa.int32()),
+ pa.field('b', pa.int32())
+ ]
ty = pa.struct(fields)
assert len(ty) == ty.num_children == 3
@@ -243,11 +246,17 @@ def test_struct_type():
with pytest.raises(IndexError):
assert ty[3]
- assert ty['a'] == ty[1]
assert ty['b'] == ty[2]
+
+ # Duplicate
+ with pytest.raises(KeyError):
+ ty['a']
+
+ # Not found
with pytest.raises(KeyError):
ty['c']
+ # Neither integer nor string
with pytest.raises(TypeError):
ty[None]
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index d367a8a..29b2a1e 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -213,13 +213,19 @@ cdef class StructType(DataType):
DataType.init(self, type)
self.struct_type = <const CStructType*> type.get()
- cdef Field child_by_name(self, name):
+ cdef Field field(self, int i):
+ """
+ Alias for child(i)
+ """
+ return self.child(i)
+
+ cdef Field field_by_name(self, name):
"""
Access a child field by its name rather than the column index.
"""
cdef shared_ptr[CField] field
- field = self.struct_type.GetChildByName(tobytes(name))
+ field = self.struct_type.GetFieldByName(tobytes(name))
if field == nullptr:
raise KeyError(name)
@@ -234,7 +240,7 @@ cdef class StructType(DataType):
def __getitem__(self, i):
if isinstance(i, six.string_types):
- return self.child_by_name(i)
+ return self.field_by_name(i)
elif isinstance(i, six.integer_types):
return self.child(i)
else: