You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/05/04 10:21:44 UTC

[GitHub] [arrow] pitrou commented on a change in pull request #7088: ARROW-8111: [C++] User-defined timestamp parser option to CSV, new TimestampParser interface, and strptime-compatible impl

pitrou commented on a change in pull request #7088:
URL: https://github.com/apache/arrow/pull/7088#discussion_r419336015



##########
File path: cpp/src/arrow/util/value_parsing.cc
##########
@@ -79,5 +86,46 @@ bool StringToFloatConverter::StringToFloat(const char* s, size_t length, double*
   return true;
 }
 
+// ----------------------------------------------------------------------
+// strptime-like parsing
+
+class StrptimeTimestampParser : public TimestampParser {
+ public:
+  explicit StrptimeTimestampParser(std::string format) : format_(std::move(format)) {}
+
+  bool operator()(const char* s, size_t length, TimeUnit::type out_unit,
+                  value_type* out) const override {
+    arrow_vendored::date::sys_seconds time_point;
+    std::istringstream stream(std::string(s, length));
+    if (stream >> arrow_vendored::date::parse(format_, time_point)) {

Review comment:
       Shouldn't you also check that the stream is exhausted? Otherwise trailing data will be ignored.

##########
File path: cpp/src/arrow/util/value_parsing.h
##########
@@ -356,177 +377,194 @@ class StringConverter<Int64Type> : public StringToSignedIntConverterMixin<Int64T
   using StringToSignedIntConverterMixin<Int64Type>::StringToSignedIntConverterMixin;
 };
 
-template <>
-class StringConverter<TimestampType> {
- public:
-  using value_type = TimestampType::c_type;
+// Inline-able ISO-8601 parser
 
-  explicit StringConverter(const std::shared_ptr<DataType>& type)
-      : unit_(checked_cast<TimestampType*>(type.get())->unit()) {}
+namespace detail {
 
-  bool operator()(const char* s, size_t length, value_type* out) {
-    // We allow the following formats:
-    // - "YYYY-MM-DD"
-    // - "YYYY-MM-DD[ T]hh"
-    // - "YYYY-MM-DD[ T]hhZ"
-    // - "YYYY-MM-DD[ T]hh:mm"
-    // - "YYYY-MM-DD[ T]hh:mmZ"
-    // - "YYYY-MM-DD[ T]hh:mm:ss"
-    // - "YYYY-MM-DD[ T]hh:mm:ssZ"
-    // UTC is always assumed, and the DataType's timezone is ignored.
-    arrow_vendored::date::year_month_day ymd;
-    if (ARROW_PREDICT_FALSE(length < 10)) {
-      return false;
-    }
-    if (length == 10) {
-      if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &ymd))) {
-        return false;
-      }
-      return ConvertTimePoint(arrow_vendored::date::sys_days(ymd), out);
-    }
-    if (ARROW_PREDICT_FALSE(s[10] != ' ') && ARROW_PREDICT_FALSE(s[10] != 'T')) {
-      return false;
-    }
-    if (s[length - 1] == 'Z') {
-      --length;
-    }
-    if (length == 13) {
-      if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &ymd))) {
-        return false;
-      }
-      std::chrono::duration<value_type> seconds;
-      if (ARROW_PREDICT_FALSE(!ParseHH(s + 11, &seconds))) {
-        return false;
-      }
-      return ConvertTimePoint(arrow_vendored::date::sys_days(ymd) + seconds, out);
-    }
-    if (length == 16) {
-      if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &ymd))) {
-        return false;
-      }
-      std::chrono::duration<value_type> seconds;
-      if (ARROW_PREDICT_FALSE(!ParseHH_MM(s + 11, &seconds))) {
-        return false;
-      }
-      return ConvertTimePoint(arrow_vendored::date::sys_days(ymd) + seconds, out);
-    }
-    if (length == 19) {
-      if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &ymd))) {
-        return false;
-      }
-      std::chrono::duration<value_type> seconds;
-      if (ARROW_PREDICT_FALSE(!ParseHH_MM_SS(s + 11, &seconds))) {
-        return false;
-      }
-      return ConvertTimePoint(arrow_vendored::date::sys_days(ymd) + seconds, out);
-    }
+using ts_type = TimestampType::c_type;
+
+template <class TimePoint>
+static inline ts_type ConvertTimePoint(TimePoint tp, TimeUnit::type unit) {
+  auto duration = tp.time_since_epoch();
+  switch (unit) {
+    case TimeUnit::SECOND:
+      return std::chrono::duration_cast<std::chrono::seconds>(duration).count();
+    case TimeUnit::MILLI:
+      return std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
+    case TimeUnit::MICRO:
+      return std::chrono::duration_cast<std::chrono::microseconds>(duration).count();
+    case TimeUnit::NANO:
+      return std::chrono::duration_cast<std::chrono::nanoseconds>(duration).count();
+    default:
+      // Compiler errors without default case even though all enum cases are handled
+      assert(0);
+      return 0;
+  }
+}
+
+static inline bool ParseYYYY_MM_DD(const char* s,
+                                   arrow_vendored::date::year_month_day* out) {
+  uint16_t year;
+  uint8_t month, day;
+  if (ARROW_PREDICT_FALSE(s[4] != '-') || ARROW_PREDICT_FALSE(s[7] != '-')) {
     return false;
   }
+  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 4, &year))) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 5, 2, &month))) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 8, 2, &day))) {
+    return false;
+  }
+  *out = {arrow_vendored::date::year{year}, arrow_vendored::date::month{month},
+          arrow_vendored::date::day{day}};
+  return out->ok();
+}
 
- protected:
-  template <class TimePoint>
-  bool ConvertTimePoint(TimePoint tp, value_type* out) {
-    auto duration = tp.time_since_epoch();
-    switch (unit_) {
-      case TimeUnit::SECOND:
-        *out = std::chrono::duration_cast<std::chrono::seconds>(duration).count();
-        return true;
-      case TimeUnit::MILLI:
-        *out = std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
-        return true;
-      case TimeUnit::MICRO:
-        *out = std::chrono::duration_cast<std::chrono::microseconds>(duration).count();
-        return true;
-      case TimeUnit::NANO:
-        *out = std::chrono::duration_cast<std::chrono::nanoseconds>(duration).count();
-        return true;
-    }
-    // Unreachable, but suppress compiler warning
-    assert(0);
-    *out = 0;
-    return true;
+static inline bool ParseHH(const char* s, std::chrono::duration<ts_type>* out) {
+  uint8_t hours;
+  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 2, &hours))) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(hours >= 24)) {
+    return false;
   }
+  *out = std::chrono::duration<ts_type>(3600U * hours);
+  return true;
+}
 
-  bool ParseYYYY_MM_DD(const char* s, arrow_vendored::date::year_month_day* out) {
-    uint16_t year;
-    uint8_t month, day;
-    if (ARROW_PREDICT_FALSE(s[4] != '-') || ARROW_PREDICT_FALSE(s[7] != '-')) {
-      return false;
-    }
-    if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 0, 4, &year))) {
-      return false;
-    }
-    if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 5, 2, &month))) {
-      return false;
-    }
-    if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 8, 2, &day))) {
-      return false;
-    }
-    *out = {arrow_vendored::date::year{year}, arrow_vendored::date::month{month},
-            arrow_vendored::date::day{day}};
-    return out->ok();
+static inline bool ParseHH_MM(const char* s, std::chrono::duration<ts_type>* out) {
+  uint8_t hours, minutes;
+  if (ARROW_PREDICT_FALSE(s[2] != ':')) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 2, &hours))) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 3, 2, &minutes))) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(hours >= 24)) {
+    return false;
   }
+  if (ARROW_PREDICT_FALSE(minutes >= 60)) {
+    return false;
+  }
+  *out = std::chrono::duration<ts_type>(3600U * hours + 60U * minutes);
+  return true;
+}
 
-  bool ParseHH(const char* s, std::chrono::duration<value_type>* out) {
-    uint8_t hours;
-    if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 0, 2, &hours))) {
-      return false;
-    }
-    if (ARROW_PREDICT_FALSE(hours >= 24)) {
-      return false;
-    }
-    *out = std::chrono::duration<value_type>(3600U * hours);
-    return true;
+static inline bool ParseHH_MM_SS(const char* s, std::chrono::duration<ts_type>* out) {
+  uint8_t hours, minutes, seconds;
+  if (ARROW_PREDICT_FALSE(s[2] != ':') || ARROW_PREDICT_FALSE(s[5] != ':')) {
+    return false;
   }
+  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 2, &hours))) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 3, 2, &minutes))) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 6, 2, &seconds))) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(hours >= 24)) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(minutes >= 60)) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(seconds >= 60)) {
+    return false;
+  }
+  *out = std::chrono::duration<ts_type>(3600U * hours + 60U * minutes + seconds);
+  return true;
+}
 
-  bool ParseHH_MM(const char* s, std::chrono::duration<value_type>* out) {
-    uint8_t hours, minutes;
-    if (ARROW_PREDICT_FALSE(s[2] != ':')) {
-      return false;
-    }
-    if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 0, 2, &hours))) {
-      return false;
-    }
-    if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 3, 2, &minutes))) {
-      return false;
-    }
-    if (ARROW_PREDICT_FALSE(hours >= 24)) {
-      return false;
-    }
-    if (ARROW_PREDICT_FALSE(minutes >= 60)) {
+}  // namespace detail
+
+static inline bool ParseISO8601(const char* s, size_t length, TimeUnit::type unit,

Review comment:
       This could be left in `detail`.

##########
File path: cpp/src/arrow/csv/converter_benchmark.cc
##########
@@ -20,15 +20,67 @@
 #include <sstream>
 #include <string>
 
+#include "arrow/buffer.h"
 #include "arrow/csv/converter.h"
 #include "arrow/csv/options.h"
 #include "arrow/csv/parser.h"
+#include "arrow/csv/reader.h"
 #include "arrow/csv/test_common.h"
+#include "arrow/io/memory.h"
 #include "arrow/testing/gtest_util.h"
+#include "arrow/util/value_parsing.h"
 
 namespace arrow {
 namespace csv {
 
+static std::shared_ptr<Buffer> GenerateTimestampsCSV(
+    const std::vector<std::string>& dates, int32_t cols, int32_t rows) {
+  std::stringstream ss;
+  for (int32_t row = 0; row < rows; ++row) {
+    for (int32_t col = 0; col < cols; ++col) {
+      ss << dates[rand() % dates.size()];
+    }
+    ss << "\n";
+  }
+  return Buffer::FromString(ss.str());
+}
+
+static Status ReadCSV(const Buffer& data, const csv::ConvertOptions& convert_opt) {
+  ARROW_ASSIGN_OR_RAISE(
+      auto table_reader,
+      csv::TableReader::Make(
+          default_memory_pool(), std::make_shared<io::BufferReader>(data),
+          csv::ReadOptions::Defaults(), csv::ParseOptions::Defaults(), convert_opt));
+  return table_reader->Read().status();
+}
+
+const std::vector<std::string> kExampleDates = {
+    "1917-10-17,", "2018-09-13 22,", "1941-06-22 04:00,", "1945-05-09 09:45:38,"};
+constexpr int32_t kNumRows = 10000000;
+constexpr int32_t kNumCols = 10;
+
+static void ConvertTimestampVirtualISO8601(benchmark::State& state) {
+  auto data = GenerateTimestampsCSV(kExampleDates, kNumCols, kNumRows);
+  auto convert_options = csv::ConvertOptions::Defaults();
+  convert_options.timestamp_parsers.push_back(TimestampParser::MakeISO8601());
+  for (auto _ : state) {
+    ABORT_NOT_OK(ReadCSV(*data, convert_options));

Review comment:
       I would much rather follow the same pattern as other conversion benchmarks, i.e. call `BenchmarkConversion` with a pre-populated parser. Otherwise figures are not comparable.

##########
File path: cpp/src/arrow/csv/converter_benchmark.cc
##########
@@ -20,15 +20,67 @@
 #include <sstream>
 #include <string>
 
+#include "arrow/buffer.h"
 #include "arrow/csv/converter.h"
 #include "arrow/csv/options.h"
 #include "arrow/csv/parser.h"
+#include "arrow/csv/reader.h"
 #include "arrow/csv/test_common.h"
+#include "arrow/io/memory.h"
 #include "arrow/testing/gtest_util.h"
+#include "arrow/util/value_parsing.h"
 
 namespace arrow {
 namespace csv {
 
+static std::shared_ptr<Buffer> GenerateTimestampsCSV(
+    const std::vector<std::string>& dates, int32_t cols, int32_t rows) {
+  std::stringstream ss;
+  for (int32_t row = 0; row < rows; ++row) {
+    for (int32_t col = 0; col < cols; ++col) {
+      ss << dates[rand() % dates.size()];

Review comment:
       Perhaps we should use a fixed seed and random generator state to make benchmark numbers more reliable (though I'm not sure that's necessary).

##########
File path: cpp/src/arrow/csv/converter.cc
##########
@@ -381,32 +383,98 @@ class NumericConverter : public ConcreteConverter {
 /////////////////////////////////////////////////////////////////////////
 // Concrete Converter for timestamps
 
+namespace {
+
+struct InlineISO8601 {

Review comment:
       Why not reuse `StringConverter<TimestampType>`?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org