You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/12/03 13:50:22 UTC

[arrow] branch master updated: ARROW-3853: [C++] Implement string to timestamp cast

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 681efd8  ARROW-3853: [C++] Implement string to timestamp cast
681efd8 is described below

commit 681efd8f22fb2225dc9b74f1d5a54ddce9a79d36
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Mon Dec 3 07:50:13 2018 -0600

    ARROW-3853: [C++] Implement string to timestamp cast
    
    Author: Antoine Pitrou <an...@python.org>
    
    Closes #3044 from pitrou/ARROW-3853-cast-string-to-timestamp and squashes the following commits:
    
    00bc36067 <Antoine Pitrou> ARROW-3853:  Implement string to timestamp cast
---
 cpp/src/arrow/compute/compute-test.cc          | 33 ++++++++++++++++++++
 cpp/src/arrow/compute/kernels/cast.cc          | 32 ++++++++++++++++++-
 cpp/src/arrow/util/number-parsing-benchmark.cc | 43 ++++++++++++++++++++++++--
 cpp/src/arrow/util/parsing.h                   |  3 +-
 4 files changed, 107 insertions(+), 4 deletions(-)

diff --git a/cpp/src/arrow/compute/compute-test.cc b/cpp/src/arrow/compute/compute-test.cc
index 821569e..52fc588 100644
--- a/cpp/src/arrow/compute/compute-test.cc
+++ b/cpp/src/arrow/compute/compute-test.cc
@@ -111,8 +111,10 @@ class TestCast : public ComputeFixture, public TestBase {
   void CheckCase(const shared_ptr<DataType>& in_type, const vector<I_TYPE>& in_values,
                  const vector<bool>& is_valid, const shared_ptr<DataType>& out_type,
                  const vector<O_TYPE>& out_values, const CastOptions& options) {
+    DCHECK_EQ(in_values.size(), out_values.size());
     shared_ptr<Array> input, expected;
     if (is_valid.size() > 0) {
+      DCHECK_EQ(is_valid.size(), out_values.size());
       ArrayFromVector<InType, I_TYPE>(in_type, is_valid, in_values, &input);
       ArrayFromVector<OutType, O_TYPE>(out_type, is_valid, out_values, &expected);
     } else {
@@ -1056,6 +1058,37 @@ TEST_F(TestCast, StringToNumberErrors) {
   CheckFails<StringType, std::string>(utf8(), {"z"}, is_valid, float32(), options);
 }
 
+TEST_F(TestCast, StringToTimestamp) {
+  CastOptions options;
+
+  vector<bool> is_valid = {true, false, true};
+  vector<std::string> strings = {"1970-01-01", "xxx", "2000-02-29"};
+
+  auto type = timestamp(TimeUnit::SECOND);
+  vector<int64_t> e = {0, 0, 951782400};
+  CheckCase<StringType, std::string, TimestampType, int64_t>(utf8(), strings, is_valid,
+                                                             type, e, options);
+
+  type = timestamp(TimeUnit::MICRO);
+  e = {0, 0, 951782400000000LL};
+  CheckCase<StringType, std::string, TimestampType, int64_t>(utf8(), strings, is_valid,
+                                                             type, e, options);
+
+  // NOTE: timestamp parsing is tested comprehensively in parsing-util-test.cc
+}
+
+TEST_F(TestCast, StringToTimestampErrors) {
+  CastOptions options;
+
+  vector<bool> is_valid = {true};
+
+  for (auto unit : {TimeUnit::SECOND, TimeUnit::MILLI, TimeUnit::MICRO, TimeUnit::NANO}) {
+    auto type = timestamp(unit);
+    CheckFails<StringType, std::string>(utf8(), {""}, is_valid, type, options);
+    CheckFails<StringType, std::string>(utf8(), {"xxx"}, is_valid, type, options);
+  }
+}
+
 template <typename TestType>
 class TestDictionaryCast : public TestCast {};
 
diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc
index cd45b2d..4f7d7f8 100644
--- a/cpp/src/arrow/compute/kernels/cast.cc
+++ b/cpp/src/arrow/compute/kernels/cast.cc
@@ -981,6 +981,35 @@ struct CastFunctor<O, StringType,
 };
 
 // ----------------------------------------------------------------------
+// String to Timestamp
+
+template <>
+struct CastFunctor<TimestampType, StringType> {
+  void operator()(FunctionContext* ctx, const CastOptions& options,
+                  const ArrayData& input, ArrayData* output) {
+    using out_type = TimestampType::c_type;
+
+    StringArray input_array(input.Copy());
+    auto out_data = output->GetMutableValues<out_type>(1);
+    internal::StringConverter<TimestampType> converter(output->type);
+
+    for (int64_t i = 0; i < input.length; ++i, ++out_data) {
+      if (input_array.IsNull(i)) {
+        continue;
+      }
+
+      auto str = input_array.GetView(i);
+      if (!converter(str.data(), str.length(), out_data)) {
+        std::stringstream ss;
+        ss << "Failed to cast String '" << str << "' into " << output->type->ToString();
+        ctx->SetStatus(Status(StatusCode::Invalid, ss.str()));
+        return;
+      }
+    }
+  }
+};
+
+// ----------------------------------------------------------------------
 
 typedef std::function<void(FunctionContext*, const CastOptions& options, const ArrayData&,
                            ArrayData*)>
@@ -1170,7 +1199,8 @@ class CastKernel : public UnaryKernel {
   FN(StringType, UInt64Type);     \
   FN(StringType, Int64Type);      \
   FN(StringType, FloatType);      \
-  FN(StringType, DoubleType);
+  FN(StringType, DoubleType);     \
+  FN(StringType, TimestampType);
 
 #define DICTIONARY_CASES(FN, IN_TYPE) \
   FN(IN_TYPE, NullType);              \
diff --git a/cpp/src/arrow/util/number-parsing-benchmark.cc b/cpp/src/arrow/util/number-parsing-benchmark.cc
index 28ef76a..42c7b31 100644
--- a/cpp/src/arrow/util/number-parsing-benchmark.cc
+++ b/cpp/src/arrow/util/number-parsing-benchmark.cc
@@ -43,7 +43,7 @@ static std::vector<std::string> MakeIntStrings(int32_t num_items) {
   for (int32_t i = 0; i < num_items; ++i) {
     strings.push_back(base_strings[i % base_strings.size()]);
   }
-  return base_strings;
+  return strings;
 }
 
 static std::vector<std::string> MakeFloatStrings(int32_t num_items) {
@@ -54,7 +54,18 @@ static std::vector<std::string> MakeFloatStrings(int32_t num_items) {
   for (int32_t i = 0; i < num_items; ++i) {
     strings.push_back(base_strings[i % base_strings.size()]);
   }
-  return base_strings;
+  return strings;
+}
+
+static std::vector<std::string> MakeTimestampStrings(int32_t num_items) {
+  std::vector<std::string> base_strings = {"2018-11-13 17:11:10", "2018-11-13 11:22:33",
+                                           "2016-02-29 11:22:33"};
+
+  std::vector<std::string> strings;
+  for (int32_t i = 0; i < num_items; ++i) {
+    strings.push_back(base_strings[i % base_strings.size()]);
+  }
+  return strings;
 }
 
 template <typename ARROW_TYPE, typename C_TYPE = typename ARROW_TYPE::c_type>
@@ -97,6 +108,29 @@ static void BM_FloatParsing(benchmark::State& state) {  // NOLINT non-const refe
   state.SetItemsProcessed(state.iterations() * strings.size());
 }
 
+template <TimeUnit::type UNIT>
+static void BM_TimestampParsing(benchmark::State& state) {  // NOLINT non-const reference
+  using c_type = TimestampType::c_type;
+
+  auto strings = MakeTimestampStrings(1000);
+  auto type = timestamp(UNIT);
+  StringConverter<TimestampType> converter(type);
+
+  while (state.KeepRunning()) {
+    c_type total = 0;
+    for (const auto& s : strings) {
+      c_type value;
+      if (!converter(s.data(), s.length(), &value)) {
+        std::cerr << "Conversion failed for '" << s << "'";
+        std::abort();
+      }
+      total += value;
+    }
+    benchmark::DoNotOptimize(total);
+  }
+  state.SetItemsProcessed(state.iterations() * strings.size());
+}
+
 BENCHMARK_TEMPLATE(BM_IntegerParsing, Int8Type);
 BENCHMARK_TEMPLATE(BM_IntegerParsing, Int16Type);
 BENCHMARK_TEMPLATE(BM_IntegerParsing, Int32Type);
@@ -109,5 +143,10 @@ BENCHMARK_TEMPLATE(BM_IntegerParsing, UInt64Type);
 BENCHMARK_TEMPLATE(BM_FloatParsing, FloatType);
 BENCHMARK_TEMPLATE(BM_FloatParsing, DoubleType);
 
+BENCHMARK_TEMPLATE(BM_TimestampParsing, TimeUnit::SECOND);
+BENCHMARK_TEMPLATE(BM_TimestampParsing, TimeUnit::MILLI);
+BENCHMARK_TEMPLATE(BM_TimestampParsing, TimeUnit::MICRO);
+BENCHMARK_TEMPLATE(BM_TimestampParsing, TimeUnit::NANO);
+
 }  // namespace internal
 }  // namespace arrow
diff --git a/cpp/src/arrow/util/parsing.h b/cpp/src/arrow/util/parsing.h
index aa1f820..23e0361 100644
--- a/cpp/src/arrow/util/parsing.h
+++ b/cpp/src/arrow/util/parsing.h
@@ -419,8 +419,9 @@ class StringConverter<TimestampType> {
         *out = std::chrono::duration_cast<std::chrono::nanoseconds>(duration).count();
         return true;
     }
-    // Unreachable
+    // Unreachable, but suppress compiler warning
     assert(0);
+    *out = 0;
     return true;
   }