You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/12/03 13:50:22 UTC
[arrow] branch master updated: ARROW-3853: [C++] Implement string to timestamp cast
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 681efd8 ARROW-3853: [C++] Implement string to timestamp cast
681efd8 is described below
commit 681efd8f22fb2225dc9b74f1d5a54ddce9a79d36
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Mon Dec 3 07:50:13 2018 -0600
ARROW-3853: [C++] Implement string to timestamp cast
Author: Antoine Pitrou <an...@python.org>
Closes #3044 from pitrou/ARROW-3853-cast-string-to-timestamp and squashes the following commits:
00bc36067 <Antoine Pitrou> ARROW-3853: Implement string to timestamp cast
---
cpp/src/arrow/compute/compute-test.cc | 33 ++++++++++++++++++++
cpp/src/arrow/compute/kernels/cast.cc | 32 ++++++++++++++++++-
cpp/src/arrow/util/number-parsing-benchmark.cc | 43 ++++++++++++++++++++++++--
cpp/src/arrow/util/parsing.h | 3 +-
4 files changed, 107 insertions(+), 4 deletions(-)
diff --git a/cpp/src/arrow/compute/compute-test.cc b/cpp/src/arrow/compute/compute-test.cc
index 821569e..52fc588 100644
--- a/cpp/src/arrow/compute/compute-test.cc
+++ b/cpp/src/arrow/compute/compute-test.cc
@@ -111,8 +111,10 @@ class TestCast : public ComputeFixture, public TestBase {
void CheckCase(const shared_ptr<DataType>& in_type, const vector<I_TYPE>& in_values,
const vector<bool>& is_valid, const shared_ptr<DataType>& out_type,
const vector<O_TYPE>& out_values, const CastOptions& options) {
+ DCHECK_EQ(in_values.size(), out_values.size());
shared_ptr<Array> input, expected;
if (is_valid.size() > 0) {
+ DCHECK_EQ(is_valid.size(), out_values.size());
ArrayFromVector<InType, I_TYPE>(in_type, is_valid, in_values, &input);
ArrayFromVector<OutType, O_TYPE>(out_type, is_valid, out_values, &expected);
} else {
@@ -1056,6 +1058,37 @@ TEST_F(TestCast, StringToNumberErrors) {
CheckFails<StringType, std::string>(utf8(), {"z"}, is_valid, float32(), options);
}
+TEST_F(TestCast, StringToTimestamp) {
+ CastOptions options;
+
+ vector<bool> is_valid = {true, false, true};
+ vector<std::string> strings = {"1970-01-01", "xxx", "2000-02-29"};
+
+ auto type = timestamp(TimeUnit::SECOND);
+ vector<int64_t> e = {0, 0, 951782400};
+ CheckCase<StringType, std::string, TimestampType, int64_t>(utf8(), strings, is_valid,
+ type, e, options);
+
+ type = timestamp(TimeUnit::MICRO);
+ e = {0, 0, 951782400000000LL};
+ CheckCase<StringType, std::string, TimestampType, int64_t>(utf8(), strings, is_valid,
+ type, e, options);
+
+ // NOTE: timestamp parsing is tested comprehensively in parsing-util-test.cc
+}
+
+TEST_F(TestCast, StringToTimestampErrors) {
+ CastOptions options;
+
+ vector<bool> is_valid = {true};
+
+ for (auto unit : {TimeUnit::SECOND, TimeUnit::MILLI, TimeUnit::MICRO, TimeUnit::NANO}) {
+ auto type = timestamp(unit);
+ CheckFails<StringType, std::string>(utf8(), {""}, is_valid, type, options);
+ CheckFails<StringType, std::string>(utf8(), {"xxx"}, is_valid, type, options);
+ }
+}
+
template <typename TestType>
class TestDictionaryCast : public TestCast {};
diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc
index cd45b2d..4f7d7f8 100644
--- a/cpp/src/arrow/compute/kernels/cast.cc
+++ b/cpp/src/arrow/compute/kernels/cast.cc
@@ -981,6 +981,35 @@ struct CastFunctor<O, StringType,
};
// ----------------------------------------------------------------------
+// String to Timestamp
+
+template <>
+struct CastFunctor<TimestampType, StringType> {
+ void operator()(FunctionContext* ctx, const CastOptions& options,
+ const ArrayData& input, ArrayData* output) {
+ using out_type = TimestampType::c_type;
+
+ StringArray input_array(input.Copy());
+ auto out_data = output->GetMutableValues<out_type>(1);
+ internal::StringConverter<TimestampType> converter(output->type);
+
+ for (int64_t i = 0; i < input.length; ++i, ++out_data) {
+ if (input_array.IsNull(i)) {
+ continue;
+ }
+
+ auto str = input_array.GetView(i);
+ if (!converter(str.data(), str.length(), out_data)) {
+ std::stringstream ss;
+ ss << "Failed to cast String '" << str << "' into " << output->type->ToString();
+ ctx->SetStatus(Status(StatusCode::Invalid, ss.str()));
+ return;
+ }
+ }
+ }
+};
+
+// ----------------------------------------------------------------------
typedef std::function<void(FunctionContext*, const CastOptions& options, const ArrayData&,
ArrayData*)>
@@ -1170,7 +1199,8 @@ class CastKernel : public UnaryKernel {
FN(StringType, UInt64Type); \
FN(StringType, Int64Type); \
FN(StringType, FloatType); \
- FN(StringType, DoubleType);
+ FN(StringType, DoubleType); \
+ FN(StringType, TimestampType);
#define DICTIONARY_CASES(FN, IN_TYPE) \
FN(IN_TYPE, NullType); \
diff --git a/cpp/src/arrow/util/number-parsing-benchmark.cc b/cpp/src/arrow/util/number-parsing-benchmark.cc
index 28ef76a..42c7b31 100644
--- a/cpp/src/arrow/util/number-parsing-benchmark.cc
+++ b/cpp/src/arrow/util/number-parsing-benchmark.cc
@@ -43,7 +43,7 @@ static std::vector<std::string> MakeIntStrings(int32_t num_items) {
for (int32_t i = 0; i < num_items; ++i) {
strings.push_back(base_strings[i % base_strings.size()]);
}
- return base_strings;
+ return strings;
}
static std::vector<std::string> MakeFloatStrings(int32_t num_items) {
@@ -54,7 +54,18 @@ static std::vector<std::string> MakeFloatStrings(int32_t num_items) {
for (int32_t i = 0; i < num_items; ++i) {
strings.push_back(base_strings[i % base_strings.size()]);
}
- return base_strings;
+ return strings;
+}
+
+static std::vector<std::string> MakeTimestampStrings(int32_t num_items) {
+ std::vector<std::string> base_strings = {"2018-11-13 17:11:10", "2018-11-13 11:22:33",
+ "2016-02-29 11:22:33"};
+
+ std::vector<std::string> strings;
+ for (int32_t i = 0; i < num_items; ++i) {
+ strings.push_back(base_strings[i % base_strings.size()]);
+ }
+ return strings;
}
template <typename ARROW_TYPE, typename C_TYPE = typename ARROW_TYPE::c_type>
@@ -97,6 +108,29 @@ static void BM_FloatParsing(benchmark::State& state) { // NOLINT non-const refe
state.SetItemsProcessed(state.iterations() * strings.size());
}
+template <TimeUnit::type UNIT>
+static void BM_TimestampParsing(benchmark::State& state) { // NOLINT non-const reference
+ using c_type = TimestampType::c_type;
+
+ auto strings = MakeTimestampStrings(1000);
+ auto type = timestamp(UNIT);
+ StringConverter<TimestampType> converter(type);
+
+ while (state.KeepRunning()) {
+ c_type total = 0;
+ for (const auto& s : strings) {
+ c_type value;
+ if (!converter(s.data(), s.length(), &value)) {
+ std::cerr << "Conversion failed for '" << s << "'";
+ std::abort();
+ }
+ total += value;
+ }
+ benchmark::DoNotOptimize(total);
+ }
+ state.SetItemsProcessed(state.iterations() * strings.size());
+}
+
BENCHMARK_TEMPLATE(BM_IntegerParsing, Int8Type);
BENCHMARK_TEMPLATE(BM_IntegerParsing, Int16Type);
BENCHMARK_TEMPLATE(BM_IntegerParsing, Int32Type);
@@ -109,5 +143,10 @@ BENCHMARK_TEMPLATE(BM_IntegerParsing, UInt64Type);
BENCHMARK_TEMPLATE(BM_FloatParsing, FloatType);
BENCHMARK_TEMPLATE(BM_FloatParsing, DoubleType);
+BENCHMARK_TEMPLATE(BM_TimestampParsing, TimeUnit::SECOND);
+BENCHMARK_TEMPLATE(BM_TimestampParsing, TimeUnit::MILLI);
+BENCHMARK_TEMPLATE(BM_TimestampParsing, TimeUnit::MICRO);
+BENCHMARK_TEMPLATE(BM_TimestampParsing, TimeUnit::NANO);
+
} // namespace internal
} // namespace arrow
diff --git a/cpp/src/arrow/util/parsing.h b/cpp/src/arrow/util/parsing.h
index aa1f820..23e0361 100644
--- a/cpp/src/arrow/util/parsing.h
+++ b/cpp/src/arrow/util/parsing.h
@@ -419,8 +419,9 @@ class StringConverter<TimestampType> {
*out = std::chrono::duration_cast<std::chrono::nanoseconds>(duration).count();
return true;
}
- // Unreachable
+ // Unreachable, but suppress compiler warning
assert(0);
+ *out = 0;
return true;
}