You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2018/12/27 16:12:10 UTC

[arrow] branch master updated: ARROW-4102: [C++] Return common IdentityCast when casting to equal type

This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new a536529  ARROW-4102: [C++] Return common IdentityCast when casting to equal type
a536529 is described below

commit a536529a624b793ffa18c3c39581fdf777e85f8f
Author: Wes McKinney <we...@apache.org>
AuthorDate: Thu Dec 27 17:11:58 2018 +0100

    ARROW-4102: [C++] Return common IdentityCast when casting to equal type
    
    I also added some code to make it easier to write cast tests in JSON.
    
    As one issue with the JSON parser -- we have a number of tests in cast-test.cc that check that values that are in null positions are ignored. We might augment the parser to be able to pass both values and validity bitmap as separate JSON strings
    
    Author: Wes McKinney <we...@apache.org>
    
    Closes #3265 from wesm/ARROW-4102 and squashes the following commits:
    
    8c27ba2a <Wes McKinney> Fix bad memory access
    9c52297f <Wes McKinney> Add various identity cast tests, verify that fixed_size_binary identity casts work now
---
 cpp/src/arrow/compute/kernels/cast-test.cc | 118 ++++++++++++++++-------------
 cpp/src/arrow/compute/kernels/cast.cc      |  27 +++++--
 cpp/src/arrow/ipc/json-simple.cc           |   5 ++
 python/pyarrow/tests/test_array.py         |   3 +-
 4 files changed, 95 insertions(+), 58 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/cast-test.cc b/cpp/src/arrow/compute/kernels/cast-test.cc
index 4c39928..781e0af 100644
--- a/cpp/src/arrow/compute/kernels/cast-test.cc
+++ b/cpp/src/arrow/compute/kernels/cast-test.cc
@@ -51,6 +51,10 @@ using std::vector;
 namespace arrow {
 namespace compute {
 
+static std::vector<std::shared_ptr<DataType>> kNumericTypes = {
+    uint8(), int8(),   uint16(), int16(),   uint32(),
+    int32(), uint64(), int64(),  float32(), float64()};
+
 static void AssertBufferSame(const Array& left, const Array& right, int buffer_index) {
   ASSERT_EQ(left.data()->buffers[buffer_index].get(),
             right.data()->buffers[buffer_index].get());
@@ -81,8 +85,10 @@ class TestCast : public ComputeFixture, public TestBase {
   void CheckZeroCopy(const Array& input, const shared_ptr<DataType>& out_type) {
     shared_ptr<Array> result;
     ASSERT_OK(Cast(&ctx_, input, out_type, {}, &result));
-    AssertBufferSame(input, *result, 0);
-    AssertBufferSame(input, *result, 1);
+    ASSERT_EQ(input.data()->buffers.size(), result->data()->buffers.size());
+    for (size_t i = 0; i < input.data()->buffers.size(); ++i) {
+      AssertBufferSame(input, *result, static_cast<int>(i));
+    }
   }
 
   template <typename InType, typename I_TYPE, typename OutType, typename O_TYPE>
@@ -106,15 +112,25 @@ class TestCast : public ComputeFixture, public TestBase {
       CheckPass(*input->Slice(1), *expected->Slice(1), out_type, options);
     }
   }
-};
 
-TEST_F(TestCast, SameTypeZeroCopy) {
-  vector<bool> is_valid = {true, false, true, true, true};
-  vector<int32_t> v1 = {0, 1, 2, 3, 4};
+  void CheckCaseJSON(const shared_ptr<DataType>& in_type,
+                     const shared_ptr<DataType>& out_type, const std::string& in_json,
+                     const std::string& expected_json,
+                     const CastOptions& options = CastOptions()) {
+    shared_ptr<Array> input = ArrayFromJSON(in_type, in_json);
+    shared_ptr<Array> expected = ArrayFromJSON(out_type, expected_json);
+    DCHECK_EQ(input->length(), expected->length());
+    CheckPass(*input, *expected, out_type, options);
 
-  shared_ptr<Array> arr;
-  ArrayFromVector<Int32Type, int32_t>(int32(), is_valid, v1, &arr);
+    // Check a sliced variant
+    if (input->length() > 1) {
+      CheckPass(*input->Slice(1), *expected->Slice(1), out_type, options);
+    }
+  }
+};
 
+TEST_F(TestCast, SameTypeZeroCopy) {
+  shared_ptr<Array> arr = ArrayFromJSON(int32(), "[0, null, 2, 3, 4]");
   shared_ptr<Array> result;
   ASSERT_OK(Cast(&this->ctx_, *arr, int32(), {}, &result));
 
@@ -124,20 +140,16 @@ TEST_F(TestCast, SameTypeZeroCopy) {
 
 TEST_F(TestCast, ToBoolean) {
   CastOptions options;
+  for (auto type : kNumericTypes) {
+    CheckCaseJSON(type, boolean(), "[0, null, 127, 1, 0]",
+                  "[false, null, true, true, false]");
+  }
 
-  vector<bool> is_valid = {true, false, true, true, true};
-
-  // int8, should suffice for other integers
-  vector<int8_t> v1 = {0, 1, 127, -1, 0};
-  vector<bool> e1 = {false, true, true, true, false};
-  CheckCase<Int8Type, int8_t, BooleanType, bool>(int8(), v1, is_valid, boolean(), e1,
-                                                 options);
-
-  // floating point
-  vector<double> v2 = {1.0, 0, 0, -1.0, 5.0};
-  vector<bool> e2 = {true, false, false, true, true};
-  CheckCase<DoubleType, double, BooleanType, bool>(float64(), v2, is_valid, boolean(), e2,
-                                                   options);
+  // Check negative numbers
+  CheckCaseJSON(int8(), boolean(), "[0, null, 127, -1, 0]",
+                "[false, null, true, true, false]");
+  CheckCaseJSON(float64(), boolean(), "[0, null, 127, -1, 0]",
+                "[false, null, true, true, false]");
 }
 
 TEST_F(TestCast, ToIntUpcast) {
@@ -648,36 +660,6 @@ TEST_F(TestCast, TimeToCompatible) {
                          options);
 }
 
-TEST_F(TestCast, PrimitiveZeroCopy) {
-  shared_ptr<Array> arr;
-
-  ArrayFromVector<UInt8Type, uint8_t>(uint8(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr);
-  CheckZeroCopy(*arr, uint8());
-  ArrayFromVector<Int8Type, int8_t>(int8(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr);
-  CheckZeroCopy(*arr, int8());
-
-  ArrayFromVector<UInt16Type, uint16_t>(uint16(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr);
-  CheckZeroCopy(*arr, uint16());
-  ArrayFromVector<Int16Type, int8_t>(int16(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr);
-  CheckZeroCopy(*arr, int16());
-
-  ArrayFromVector<UInt32Type, uint32_t>(uint32(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr);
-  CheckZeroCopy(*arr, uint32());
-  ArrayFromVector<Int32Type, int8_t>(int32(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr);
-  CheckZeroCopy(*arr, int32());
-
-  ArrayFromVector<UInt64Type, uint64_t>(uint64(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr);
-  CheckZeroCopy(*arr, uint64());
-  ArrayFromVector<Int64Type, int8_t>(int64(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr);
-  CheckZeroCopy(*arr, int64());
-
-  ArrayFromVector<FloatType, float>(float32(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr);
-  CheckZeroCopy(*arr, float32());
-
-  ArrayFromVector<DoubleType, double>(float64(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr);
-  CheckZeroCopy(*arr, float64());
-}
-
 TEST_F(TestCast, DateToCompatible) {
   CastOptions options;
 
@@ -1193,5 +1175,39 @@ TEST_F(TestCast, ListToList) {
   CheckPass(*float64_list_array, *int64_list_array, int64_list_array->type(), options);
 }
 
+TEST_F(TestCast, IdentityCasts) {
+  // ARROW-4102
+  auto CheckIdentityCast = [this](std::shared_ptr<DataType> type,
+                                  const std::string& json) {
+    auto arr = ArrayFromJSON(type, json);
+    CheckZeroCopy(*arr, type);
+  };
+
+  CheckIdentityCast(null(), "[null, null, null]");
+  CheckIdentityCast(boolean(), "[false, true, null, false]");
+
+  for (auto type : kNumericTypes) {
+    CheckIdentityCast(type, "[1, 2, null, 4]");
+  }
+  CheckIdentityCast(binary(), "[\"foo\", \"bar\"]");
+  CheckIdentityCast(utf8(), "[\"foo\", \"bar\"]");
+  CheckIdentityCast(fixed_size_binary(3), "[\"foo\", \"bar\"]");
+
+  CheckIdentityCast(list(int8()), "[[1, 2], [null], [], [3]]");
+
+  CheckIdentityCast(time32(TimeUnit::MILLI), "[1, 2, 3, 4]");
+  CheckIdentityCast(time64(TimeUnit::MICRO), "[1, 2, 3, 4]");
+  CheckIdentityCast(date32(), "[1, 2, 3, 4]");
+  CheckIdentityCast(date64(), "[86400000, 0]");
+  CheckIdentityCast(timestamp(TimeUnit::SECOND), "[1, 2, 3, 4]");
+
+  {
+    auto dict_type = dictionary(int8(), ArrayFromJSON(int8(), "[1, 2, 3]"));
+    auto dict_indices = ArrayFromJSON(int8(), "[0, 1, 2, 0, null, 2]");
+    auto dict_array = std::make_shared<DictionaryArray>(dict_type, dict_indices);
+    CheckZeroCopy(*dict_array, dict_type);
+  }
+}
+
 }  // namespace compute
 }  // namespace arrow
diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc
index 7976ef0..15746d4 100644
--- a/cpp/src/arrow/compute/kernels/cast.cc
+++ b/cpp/src/arrow/compute/kernels/cast.cc
@@ -99,6 +99,8 @@ struct is_zero_copy_cast {
   static constexpr bool value = false;
 };
 
+// TODO(wesm): ARROW-4110; this is no longer needed, but may be useful if we
+// ever _do_ want to generate identity cast kernels at compile time
 template <typename O, typename I>
 struct is_zero_copy_cast<
     O, I,
@@ -1143,6 +1145,17 @@ static Status AllocateIfNotPreallocated(FunctionContext* ctx, const ArrayData& i
   return Status::OK();
 }
 
+class IdentityCast : public UnaryKernel {
+ public:
+  IdentityCast() {}
+
+  Status Call(FunctionContext* ctx, const Datum& input, Datum* out) override {
+    DCHECK_EQ(input.kind(), Datum::ARRAY);
+    out->value = input.array()->Copy();
+    return Status::OK();
+  }
+};
+
 class CastKernel : public UnaryKernel {
  public:
   CastKernel(const CastOptions& options, const CastFunction& func, bool is_zero_copy,
@@ -1188,6 +1201,8 @@ class CastKernel : public UnaryKernel {
   std::shared_ptr<DataType> out_type_;
 };
 
+// TODO(wesm): ARROW-4110 Do not generate cases that could return IdentityCast
+
 #define CAST_CASE(InType, OutType)                                                      \
   case OutType::type_id:                                                                \
     is_zero_copy = is_zero_copy_cast<OutType, InType>::value;                           \
@@ -1233,12 +1248,10 @@ class CastKernel : public UnaryKernel {
   FN(Int64Type, Date64Type);
 
 #define DATE32_CASES(FN, IN_TYPE) \
-  FN(Date32Type, Date32Type);     \
   FN(Date32Type, Date64Type);     \
   FN(Date32Type, Int32Type);
 
 #define DATE64_CASES(FN, IN_TYPE) \
-  FN(Date64Type, Date64Type);     \
   FN(Date64Type, Date32Type);     \
   FN(Date64Type, Int64Type);
 
@@ -1258,12 +1271,9 @@ class CastKernel : public UnaryKernel {
   FN(TimestampType, Date64Type);     \
   FN(TimestampType, Int64Type);
 
-#define BINARY_CASES(FN, IN_TYPE) \
-  FN(BinaryType, BinaryType);     \
-  FN(BinaryType, StringType);
+#define BINARY_CASES(FN, IN_TYPE) FN(BinaryType, StringType);
 
 #define STRING_CASES(FN, IN_TYPE) \
-  FN(StringType, StringType);     \
   FN(StringType, BooleanType);    \
   FN(StringType, UInt8Type);      \
   FN(StringType, Int8Type);       \
@@ -1365,6 +1375,11 @@ Status GetListCastFunc(const DataType& in_type, const std::shared_ptr<DataType>&
 
 Status GetCastFunction(const DataType& in_type, const std::shared_ptr<DataType>& out_type,
                        const CastOptions& options, std::unique_ptr<UnaryKernel>* kernel) {
+  if (in_type.Equals(out_type)) {
+    *kernel = std::unique_ptr<UnaryKernel>(new IdentityCast);
+    return Status::OK();
+  }
+
   switch (in_type.id()) {
     CAST_FUNCTION_CASE(NullType);
     CAST_FUNCTION_CASE(BooleanType);
diff --git a/cpp/src/arrow/ipc/json-simple.cc b/cpp/src/arrow/ipc/json-simple.cc
index 7a78fe4..047788c 100644
--- a/cpp/src/arrow/ipc/json-simple.cc
+++ b/cpp/src/arrow/ipc/json-simple.cc
@@ -474,7 +474,12 @@ Status GetConverter(const std::shared_ptr<DataType>& type,
     SIMPLE_CONVERTER_CASE(Type::INT8, IntegerConverter<Int8Type>)
     SIMPLE_CONVERTER_CASE(Type::INT16, IntegerConverter<Int16Type>)
     SIMPLE_CONVERTER_CASE(Type::INT32, IntegerConverter<Int32Type>)
+    SIMPLE_CONVERTER_CASE(Type::TIME32, IntegerConverter<Int32Type>)
+    SIMPLE_CONVERTER_CASE(Type::DATE32, IntegerConverter<Date32Type>)
     SIMPLE_CONVERTER_CASE(Type::INT64, IntegerConverter<Int64Type>)
+    SIMPLE_CONVERTER_CASE(Type::TIME64, IntegerConverter<Int64Type>)
+    SIMPLE_CONVERTER_CASE(Type::TIMESTAMP, IntegerConverter<Int64Type>)
+    SIMPLE_CONVERTER_CASE(Type::DATE64, IntegerConverter<Date64Type>)
     SIMPLE_CONVERTER_CASE(Type::UINT8, IntegerConverter<UInt8Type>)
     SIMPLE_CONVERTER_CASE(Type::UINT16, IntegerConverter<UInt16Type>)
     SIMPLE_CONVERTER_CASE(Type::UINT32, IntegerConverter<UInt32Type>)
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 3d34021..17ff9c6 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -768,7 +768,8 @@ def test_cast_date64_to_int():
     ('float', [0.0, 0.1, 0.2]),
     ('double', [0.0, 0.1, 0.2]),
     ('string', ['a', 'b', 'c']),
-    ('binary', [b'a', b'b', b'c'])
+    ('binary', [b'a', b'b', b'c']),
+    (pa.binary(3), [b'abc', b'bcd', b'cde'])
 ])
 def test_cast_identities(ty, values):
     arr = pa.array(values, type=ty)