You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/12/19 21:51:18 UTC

[arrow] branch master updated: ARROW-3545: [C++/Python] Use "field" terminology with StructType, specify behavior with duplicate field names

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new e39e364  ARROW-3545: [C++/Python] Use "field" terminology with StructType, specify behavior with duplicate field names
e39e364 is described below

commit e39e36441b94f211a57685887d14a8ff1d1b5f98
Author: Wes McKinney <we...@apache.org>
AuthorDate: Wed Dec 19 15:51:08 2018 -0600

    ARROW-3545: [C++/Python] Use "field" terminology with StructType, specify behavior with duplicate field names
    
    Author: Wes McKinney <we...@apache.org>
    
    Closes #3220 from wesm/ARROW-3545 and squashes the following commits:
    
    dc212e61c <Wes McKinney> Fix more deprecated API uses
    16e198473 <Wes McKinney> Remove field_by_name/field APIs from Python bindings, cdef only
    3c4abed05 <Wes McKinney> Fix use of deprecated APIs
    2eecdbf57 <Wes McKinney> Rename GetChildIndex, GetChildByName for better semantic consistency. Define behavior of these functions when there are duplicate field names. Reflect changes in Python
---
 cpp/src/arrow/array.cc               |  2 +-
 cpp/src/arrow/type-test.cc           | 30 ++++++++++++++++++++----------
 cpp/src/arrow/type.cc                | 33 ++++++++++++++++++++++++++++++---
 cpp/src/arrow/type.h                 |  9 ++++++++-
 python/pyarrow/includes/libarrow.pxd |  4 ++--
 python/pyarrow/lib.pxd               |  3 ++-
 python/pyarrow/scalar.pxi            |  2 +-
 python/pyarrow/tests/test_types.py   | 17 +++++++++++++----
 python/pyarrow/types.pxi             | 12 +++++++++---
 9 files changed, 86 insertions(+), 26 deletions(-)

diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index 7e45e90..d07c27f 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -395,7 +395,7 @@ std::shared_ptr<Array> StructArray::field(int i) const {
 }
 
 std::shared_ptr<Array> StructArray::GetFieldByName(const std::string& name) const {
-  int i = struct_type()->GetChildIndex(name);
+  int i = struct_type()->GetFieldIndex(name);
   return i == -1 ? nullptr : field(i);
 }
 
diff --git a/cpp/src/arrow/type-test.cc b/cpp/src/arrow/type-test.cc
index 20b7aff..5b758d7 100644
--- a/cpp/src/arrow/type-test.cc
+++ b/cpp/src/arrow/type-test.cc
@@ -448,7 +448,7 @@ TEST(TestStructType, Basics) {
   // TODO(wesm): out of bounds for field(...)
 }
 
-TEST(TestStructType, GetChildByName) {
+TEST(TestStructType, GetFieldByName) {
   auto f0 = field("f0", int32());
   auto f1 = field("f1", uint8(), false);
   auto f2 = field("f2", utf8());
@@ -457,17 +457,17 @@ TEST(TestStructType, GetChildByName) {
   StructType struct_type({f0, f1, f2, f3});
   std::shared_ptr<Field> result;
 
-  result = struct_type.GetChildByName("f1");
+  result = struct_type.GetFieldByName("f1");
   ASSERT_EQ(f1, result);
 
-  result = struct_type.GetChildByName("f3");
+  result = struct_type.GetFieldByName("f3");
   ASSERT_EQ(f3, result);
 
-  result = struct_type.GetChildByName("not-found");
+  result = struct_type.GetFieldByName("not-found");
   ASSERT_EQ(result, nullptr);
 }
 
-TEST(TestStructType, GetChildIndex) {
+TEST(TestStructType, GetFieldIndex) {
   auto f0 = field("f0", int32());
   auto f1 = field("f1", uint8(), false);
   auto f2 = field("f2", utf8());
@@ -475,11 +475,21 @@ TEST(TestStructType, GetChildIndex) {
 
   StructType struct_type({f0, f1, f2, f3});
 
-  ASSERT_EQ(0, struct_type.GetChildIndex(f0->name()));
-  ASSERT_EQ(1, struct_type.GetChildIndex(f1->name()));
-  ASSERT_EQ(2, struct_type.GetChildIndex(f2->name()));
-  ASSERT_EQ(3, struct_type.GetChildIndex(f3->name()));
-  ASSERT_EQ(-1, struct_type.GetChildIndex("not-found"));
+  ASSERT_EQ(0, struct_type.GetFieldIndex(f0->name()));
+  ASSERT_EQ(1, struct_type.GetFieldIndex(f1->name()));
+  ASSERT_EQ(2, struct_type.GetFieldIndex(f2->name()));
+  ASSERT_EQ(3, struct_type.GetFieldIndex(f3->name()));
+  ASSERT_EQ(-1, struct_type.GetFieldIndex("not-found"));
+}
+
+TEST(TestStructType, GetFieldIndexDuplicates) {
+  auto f0 = field("f0", int32());
+  auto f1 = field("f1", int64());
+  auto f2 = field("f1", utf8());
+  StructType struct_type({f0, f1, f2});
+
+  ASSERT_EQ(0, struct_type.GetFieldIndex("f0"));
+  ASSERT_EQ(-1, struct_type.GetFieldIndex("f1"));
 }
 
 TEST(TestDictionaryType, Equals) {
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index 753cb65..ee7fda7 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -232,18 +232,37 @@ std::string StructType::ToString() const {
   return s.str();
 }
 
-std::shared_ptr<Field> StructType::GetChildByName(const std::string& name) const {
-  int i = GetChildIndex(name);
+std::shared_ptr<Field> StructType::GetFieldByName(const std::string& name) const {
+  int i = GetFieldIndex(name);
   return i == -1 ? nullptr : children_[i];
 }
 
-int StructType::GetChildIndex(const std::string& name) const {
+int StructType::GetFieldIndex(const std::string& name) const {
   if (children_.size() > 0 && name_to_index_.size() == 0) {
     for (size_t i = 0; i < children_.size(); ++i) {
       name_to_index_[children_[i]->name()] = static_cast<int>(i);
     }
   }
 
+  if (name_to_index_.size() < children_.size()) {
+    // There are duplicate field names. Refuse to guess
+    int counts = 0;
+    int last_observed_index = -1;
+    for (size_t i = 0; i < children_.size(); ++i) {
+      if (children_[i]->name() == name) {
+        ++counts;
+        last_observed_index = static_cast<int>(i);
+      }
+    }
+
+    if (counts == 1) {
+      return last_observed_index;
+    } else {
+      // Duplicate or not found
+      return -1;
+    }
+  }
+
   auto it = name_to_index_.find(name);
   if (it == name_to_index_.end()) {
     return -1;
@@ -252,6 +271,14 @@ int StructType::GetChildIndex(const std::string& name) const {
   }
 }
 
+std::shared_ptr<Field> StructType::GetChildByName(const std::string& name) const {
+  return GetFieldByName(name);
+}
+
+int StructType::GetChildIndex(const std::string& name) const {
+  return GetFieldIndex(name);
+}
+
 // ----------------------------------------------------------------------
 // DictionaryType
 
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 8f6cfd6..95b5189 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -516,9 +516,16 @@ class ARROW_EXPORT StructType : public NestedType {
   std::string name() const override { return "struct"; }
 
   /// Returns null if name not found
+  std::shared_ptr<Field> GetFieldByName(const std::string& name) const;
+
+  /// Returns -1 if name not found or if there are multiple fields having the
+  /// same name
+  int GetFieldIndex(const std::string& name) const;
+
+  ARROW_DEPRECATED("Use GetFieldByName")
   std::shared_ptr<Field> GetChildByName(const std::string& name) const;
 
-  /// Returns -1 if name not found
+  ARROW_DEPRECATED("Use GetChildIndex")
   int GetChildIndex(const std::string& name) const;
 
  private:
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 61517e4..f4629af 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -276,8 +276,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
     cdef cppclass CStructType" arrow::StructType"(CDataType):
         CStructType(const vector[shared_ptr[CField]]& fields)
 
-        shared_ptr[CField] GetChildByName(const c_string& name)
-        int GetChildIndex(const c_string& name)
+        shared_ptr[CField] GetFieldByName(const c_string& name)
+        int GetFieldIndex(const c_string& name)
 
     cdef cppclass CUnionType" arrow::UnionType"(CDataType):
         CUnionType(const vector[shared_ptr[CField]]& fields,
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index 3e62826..d829d6a 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -65,7 +65,8 @@ cdef class StructType(DataType):
     cdef:
         const CStructType* struct_type
 
-    cdef Field child_by_name(self, name)
+    cdef Field field(self, int i)
+    cdef Field field_by_name(self, name)
 
 
 cdef class DictionaryType(DataType):
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index a2a133b..fd3f580 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -470,7 +470,7 @@ cdef class StructValue(ArrayValue):
             int index
 
         type = <CStructType*> self.type.type
-        index = type.GetChildIndex(tobytes(key))
+        index = type.GetFieldIndex(tobytes(key))
 
         if index < 0:
             raise KeyError(key)
diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py
index af2d113..729c76e 100644
--- a/python/pyarrow/tests/test_types.py
+++ b/python/pyarrow/tests/test_types.py
@@ -231,9 +231,12 @@ def test_list_type():
 
 
 def test_struct_type():
-    fields = [pa.field('a', pa.int64()),
-              pa.field('a', pa.int32()),
-              pa.field('b', pa.int32())]
+    fields = [
+        # Duplicate field name on purpose
+        pa.field('a', pa.int64()),
+        pa.field('a', pa.int32()),
+        pa.field('b', pa.int32())
+    ]
     ty = pa.struct(fields)
 
     assert len(ty) == ty.num_children == 3
@@ -243,11 +246,17 @@ def test_struct_type():
     with pytest.raises(IndexError):
         assert ty[3]
 
-    assert ty['a'] == ty[1]
     assert ty['b'] == ty[2]
+
+    # Duplicate
+    with pytest.raises(KeyError):
+        ty['a']
+
+    # Not found
     with pytest.raises(KeyError):
         ty['c']
 
+    # Neither integer nor string
     with pytest.raises(TypeError):
         ty[None]
 
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index d367a8a..29b2a1e 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -213,13 +213,19 @@ cdef class StructType(DataType):
         DataType.init(self, type)
         self.struct_type = <const CStructType*> type.get()
 
-    cdef Field child_by_name(self, name):
+    cdef Field field(self, int i):
+        """
+        Alias for child(i)
+        """
+        return self.child(i)
+
+    cdef Field field_by_name(self, name):
         """
         Access a child field by its name rather than the column index.
         """
         cdef shared_ptr[CField] field
 
-        field = self.struct_type.GetChildByName(tobytes(name))
+        field = self.struct_type.GetFieldByName(tobytes(name))
         if field == nullptr:
             raise KeyError(name)
 
@@ -234,7 +240,7 @@ cdef class StructType(DataType):
 
     def __getitem__(self, i):
         if isinstance(i, six.string_types):
-            return self.child_by_name(i)
+            return self.field_by_name(i)
         elif isinstance(i, six.integer_types):
             return self.child(i)
         else: