You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/07/13 01:58:31 UTC

[GitHub] [arrow] westonpace commented on a change in pull request #10647: ARROW-13174: [C++][Compute] Add strftime kernel

westonpace commented on a change in pull request #10647:
URL: https://github.com/apache/arrow/pull/10647#discussion_r668370637



##########
File path: cpp/src/arrow/compute/kernels/scalar_temporal.cc
##########
@@ -602,6 +685,15 @@ const FunctionDoc subsecond_doc{
      "Returns an error if timestamp has a defined timezone. Null values return null."),
     {"values"}};
 
+const FunctionDoc strftime_doc{
+    "Convert timestamps to a string representation with an arbitrary format",
+    ("Strftime returns a string representation with an arbitrary format.\n"
+     "Returns an error if timestamp has a defined timezone.\n"

Review comment:
       What does this mean?  I don't see code anywhere that errors if the timestamp has a timezone but I might just not be seeing it.

##########
File path: cpp/src/arrow/compute/api_scalar.cc
##########
@@ -240,6 +243,15 @@ StrptimeOptions::StrptimeOptions(std::string format, TimeUnit::type unit)
 StrptimeOptions::StrptimeOptions() : StrptimeOptions("", TimeUnit::SECOND) {}
 constexpr char StrptimeOptions::kTypeName[];
 
+StrftimeOptions::StrftimeOptions(std::string format, std::string timezone)
+    : FunctionOptions(internal::kStrftimeOptionsType),
+      format(std::move(format)),
+      timezone(std::move(timezone)) {
+  tz = arrow_vendored::date::locate_zone(this->timezone);
+}
+StrftimeOptions::StrftimeOptions() : StrftimeOptions("%Y-%m-%dT%H:%M:%S", "UTC") {}

Review comment:
       Should the default format string include the trailing `%z` so that it is ISO-8601 compliant?

##########
File path: cpp/src/arrow/compute/kernels/scalar_temporal.cc
##########
@@ -321,6 +322,53 @@ struct Nanosecond {
   }
 };
 
+// ----------------------------------------------------------------------
+// Convert timestamps to a string representation with an arbitrary format
+
+template <typename Duration>
+inline std::string get_timestamp(int64_t arg, const StrftimeOptions* options) {
+  auto zt = arrow_vendored::date::zoned_time<Duration>{options->tz,
+                                                       sys_time<Duration>(Duration{arg})};
+  return arrow_vendored::date::format(options->format, zt.get_local_time());
+}
+
+template <typename Duration>
+struct Strftime {
+  static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
+    const StrftimeOptions options = StrftimeState::Get(ctx);
+
+    if (in.is_valid) {
+      const auto& in_val = internal::UnboxScalar<const TimestampType>::Unbox(in);
+      *checked_cast<StringScalar*>(out) =
+          StringScalar(get_timestamp<Duration>(in_val, &options));
+    } else {
+      out->is_valid = false;
+    }
+    return Status::OK();
+  }
+
+  static Status Call(KernelContext* ctx, const ArrayData& in, ArrayData* out) {
+    const StrftimeOptions options = StrftimeState::Get(ctx);
+
+    std::unique_ptr<ArrayBuilder> array_builder;
+    RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), utf8(), &array_builder));
+    StringBuilder* string_builder = checked_cast<StringBuilder*>(array_builder.get());
+    RETURN_NOT_OK(string_builder->Reserve(in.length * 30));

Review comment:
       Nit: How precise is this 30?  Is it based on the default format string?  If I look at 2021-07-12T23:55:41+00:00 I would think you only need ~25, maybe more if subsecond resolution.  Could we base this on the resolution?  It's not a big deal so if it seems like too much don't worry.

##########
File path: python/pyarrow/tests/test_compute.py
##########
@@ -1347,6 +1351,32 @@ def test_strptime():
     assert got == expected
 
 
+# TODO: We should test on windows once ARROW-13168 is resolved.
+@pytest.mark.pandas
+@pytest.mark.skipif(sys.platform == 'win32',
+                    reason="Timezone database is not available on Windows yet")
+def test_strftime():

Review comment:
       Can you add some examples with `%z`/`%Z` on both naive/local timestamps and UTC timestamps.  It looks like python emits an empty string for `%z`/`%Z` if the timestamp is naive but I don't know what the vendored tz lib will do.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org