You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/03/17 12:15:37 UTC

[GitHub] [arrow] lidavidm commented on a change in pull request #12464: ARROW-15665: [C++] Add error handling option to StrptimeOptions

lidavidm commented on a change in pull request #12464:
URL: https://github.com/apache/arrow/pull/12464#discussion_r829048674



##########
File path: cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc
##########
@@ -1590,7 +1705,13 @@ const FunctionDoc strftime_doc{
      "does not exist on this system."),
     {"timestamps"},
     "StrftimeOptions"};
-
+const FunctionDoc strptime_doc(
+    "Parse timestamps",
+    ("For each string in `strings`, parse it as a timestamp.\n"
+     "The timestamp unit and the expected string pattern must be given\n"
+     "in StrptimeOptions.  Null inputs emit null.  If a non-null string\n"
+     "fails parsing, an error is returned by default."),

Review comment:
       Ditto here - what is the behavior if we don't raise errors and we get an invalid timestamp?

##########
File path: cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc
##########
@@ -1143,6 +1145,117 @@ struct Strftime {
 };
 #endif
 
+// ----------------------------------------------------------------------
+// Convert string representations of timestamps in arbitrary format to timestamps
+
+static std::string GetZone(std::string format) {
+  // Check for use of %z or %Z
+  size_t cur = 0;
+  std::string zone = "";
+  while (cur < format.size() - 1) {
+    if (format[cur] == '%') {
+      if (format[cur + 1] == 'z') {
+        zone = "UTC";
+        break;
+      }
+      cur++;
+    }
+    cur++;
+  }
+  return zone;
+}
+
+template <typename Duration, typename InType>
+struct Strptime {
+  const std::shared_ptr<TimestampParser> parser;
+  const TimeUnit::type unit;
+  const std::string zone;
+  const bool raise_errors;
+
+  static Result<Strptime> Make(KernelContext* ctx, const DataType& type) {
+    const StrptimeOptions& options = StrptimeState::Get(ctx);
+
+    return Strptime{TimestampParser::MakeStrptime(options.format),
+                    std::move(options.unit), GetZone(options.format),
+                    options.raise_errors};
+  }
+
+  static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
+    ARROW_ASSIGN_OR_RAISE(auto self, Make(ctx, *in.type));
+
+    if (in.is_valid) {
+      auto s = internal::UnboxScalar<InType>::Unbox(in);
+      int64_t result;
+      if ((*self.parser)(s.data(), s.size(), self.unit, &result)) {
+        *checked_cast<TimestampScalar*>(out) =
+            TimestampScalar(result, timestamp(self.unit, self.zone));
+      } else {
+        if (self.raise_errors) {
+          return Status::Invalid("Failed to parse string: '", s.data(),
+                                 "' as a scalar of type ",
+                                 TimestampType(self.unit).ToString());
+        } else {
+          out->is_valid = false;
+        }
+      }
+    } else {
+      out->is_valid = false;
+    }
+    return Status::OK();
+  }
+
+  static Status Call(KernelContext* ctx, const ArrayData& in, ArrayData* out) {
+    ARROW_ASSIGN_OR_RAISE(auto self, Make(ctx, *in.type));
+
+    std::unique_ptr<ArrayBuilder> array_builder;

Review comment:
       Using a builder is probably the reason for why we can't `can_write_into_slices`? Since the builder allocates its own buffer. Presumably the kernel has preallocation enabled so you should instead be able to write directly into `out->array()->GetValues<int64_t>(1)`.

##########
File path: python/pyarrow/_compute.pyx
##########
@@ -1458,10 +1458,12 @@ class StrptimeOptions(_StrptimeOptions):
     unit : str
         Timestamp unit of the output.
         Accepted values are "s", "ms", "us", "ns".
+    raise_errors : boolean, default True
+        Raise on parsing errors.

Review comment:
       Same here - what's the behavior otherwise?

##########
File path: cpp/src/arrow/compute/api_scalar.h
##########
@@ -267,12 +267,17 @@ class ARROW_EXPORT StructFieldOptions : public FunctionOptions {
 
 class ARROW_EXPORT StrptimeOptions : public FunctionOptions {
  public:
-  explicit StrptimeOptions(std::string format, TimeUnit::type unit);
+  explicit StrptimeOptions(std::string format, TimeUnit::type unit,
+                           bool raise_errors = true);
   StrptimeOptions();
   static constexpr char const kTypeName[] = "StrptimeOptions";
 
+  /// The desired format string.
   std::string format;
+  /// The desired time resolution
   TimeUnit::type unit;
+  /// Raise on parsing errors

Review comment:
       What is the behavior if we don't raise parse errors?

##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -27,6 +27,7 @@
 #include <utf8proc.h>
 #endif
 
+#include "arrow/compute/api.h"

Review comment:
       nit: is there a more precise header we can include

##########
File path: cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc
##########
@@ -1143,6 +1145,117 @@ struct Strftime {
 };
 #endif
 
+// ----------------------------------------------------------------------
+// Convert string representations of timestamps in arbitrary format to timestamps
+
+static std::string GetZone(std::string format) {

Review comment:
       `const std::string&`?

##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -1842,12 +1843,24 @@ TYPED_TEST(TestBaseBinaryKernels, ExtractRegexInvalid) {
 TYPED_TEST(TestStringKernels, Strptime) {
   std::string input1 = R"(["5/1/2020", null, "12/11/1900"])";
   std::string output1 = R"(["2020-05-01", null, "1900-12-11"])";
-  StrptimeOptions options("%m/%d/%Y", TimeUnit::MICRO);
+  auto input_array = ArrayFromJSON(utf8(), input1);
+  StrptimeOptions options("%m/%d/%Y", TimeUnit::MICRO, false);
   this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO), output1, &options);
 
   input1 = R"(["5/1/2020 %z", null, "12/11/1900 %z"])";
   options.format = "%m/%d/%Y %%z";
   this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO), output1, &options);
+
+  ASSERT_OK_AND_ASSIGN(auto result, CallFunction("strptime", {input_array}, &options));
+
+  options.format = "%Y-%m-%d";
+  options.raise_errors = true;
+  ASSERT_RAISES(Invalid, CallFunction("strptime", {input_array}, &options));
+
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid,
+      testing::HasSubstr("Invalid: Failed to parse string: '5/1/202012/11/1900'"),
+      Strptime(input_array, options));

Review comment:
       Can we have a test of the behavior with raise_errors = false and an invalid string?

##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -1842,12 +1843,24 @@ TYPED_TEST(TestBaseBinaryKernels, ExtractRegexInvalid) {
 TYPED_TEST(TestStringKernels, Strptime) {
   std::string input1 = R"(["5/1/2020", null, "12/11/1900"])";
   std::string output1 = R"(["2020-05-01", null, "1900-12-11"])";
-  StrptimeOptions options("%m/%d/%Y", TimeUnit::MICRO);
+  auto input_array = ArrayFromJSON(utf8(), input1);
+  StrptimeOptions options("%m/%d/%Y", TimeUnit::MICRO, false);
   this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO), output1, &options);
 
   input1 = R"(["5/1/2020 %z", null, "12/11/1900 %z"])";
   options.format = "%m/%d/%Y %%z";
   this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO), output1, &options);
+
+  ASSERT_OK_AND_ASSIGN(auto result, CallFunction("strptime", {input_array}, &options));
+
+  options.format = "%Y-%m-%d";
+  options.raise_errors = true;
+  ASSERT_RAISES(Invalid, CallFunction("strptime", {input_array}, &options));
+
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid,
+      testing::HasSubstr("Invalid: Failed to parse string: '5/1/202012/11/1900'"),

Review comment:
       This error message seems suspect, why are both rows in the error message?

##########
File path: cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc
##########
@@ -1143,6 +1145,117 @@ struct Strftime {
 };
 #endif
 
+// ----------------------------------------------------------------------
+// Convert string representations of timestamps in arbitrary format to timestamps
+
+static std::string GetZone(std::string format) {
+  // Check for use of %z or %Z
+  size_t cur = 0;
+  std::string zone = "";
+  while (cur < format.size() - 1) {
+    if (format[cur] == '%') {

Review comment:
       Do we want to respect `%%` for escaping?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org