You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/05/02 23:09:33 UTC

[GitHub] [arrow] wesm opened a new pull request #7088: ARROW-8111: [C++] User-defined timestamp parser option to CSV, new TimestampParser interface, and strptime-compatible impl

wesm opened a new pull request #7088:
URL: https://github.com/apache/arrow/pull/7088


   This builds on the work from #6631 while adding unit tests and additional benchmarks.
   
   I also renamed arrow/util/parsing.h to arrow/util/value_parsing.h to make it slightly more discoverable.


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on a change in pull request #7088: ARROW-8111: [C++] User-defined timestamp parser option to CSV, new TimestampParser interface, and strptime-compatible impl

Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #7088:
URL: https://github.com/apache/arrow/pull/7088#discussion_r419336015



##########
File path: cpp/src/arrow/util/value_parsing.cc
##########
@@ -79,5 +86,46 @@ bool StringToFloatConverter::StringToFloat(const char* s, size_t length, double*
   return true;
 }
 
+// ----------------------------------------------------------------------
+// strptime-like parsing
+
+class StrptimeTimestampParser : public TimestampParser {
+ public:
+  explicit StrptimeTimestampParser(std::string format) : format_(std::move(format)) {}
+
+  bool operator()(const char* s, size_t length, TimeUnit::type out_unit,
+                  value_type* out) const override {
+    arrow_vendored::date::sys_seconds time_point;
+    std::istringstream stream(std::string(s, length));
+    if (stream >> arrow_vendored::date::parse(format_, time_point)) {

Review comment:
       Shouldn't you also check that the stream is exhausted? Otherwise trailing data will be ignored.

##########
File path: cpp/src/arrow/util/value_parsing.h
##########
@@ -356,177 +377,194 @@ class StringConverter<Int64Type> : public StringToSignedIntConverterMixin<Int64T
   using StringToSignedIntConverterMixin<Int64Type>::StringToSignedIntConverterMixin;
 };
 
-template <>
-class StringConverter<TimestampType> {
- public:
-  using value_type = TimestampType::c_type;
+// Inline-able ISO-8601 parser
 
-  explicit StringConverter(const std::shared_ptr<DataType>& type)
-      : unit_(checked_cast<TimestampType*>(type.get())->unit()) {}
+namespace detail {
 
-  bool operator()(const char* s, size_t length, value_type* out) {
-    // We allow the following formats:
-    // - "YYYY-MM-DD"
-    // - "YYYY-MM-DD[ T]hh"
-    // - "YYYY-MM-DD[ T]hhZ"
-    // - "YYYY-MM-DD[ T]hh:mm"
-    // - "YYYY-MM-DD[ T]hh:mmZ"
-    // - "YYYY-MM-DD[ T]hh:mm:ss"
-    // - "YYYY-MM-DD[ T]hh:mm:ssZ"
-    // UTC is always assumed, and the DataType's timezone is ignored.
-    arrow_vendored::date::year_month_day ymd;
-    if (ARROW_PREDICT_FALSE(length < 10)) {
-      return false;
-    }
-    if (length == 10) {
-      if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &ymd))) {
-        return false;
-      }
-      return ConvertTimePoint(arrow_vendored::date::sys_days(ymd), out);
-    }
-    if (ARROW_PREDICT_FALSE(s[10] != ' ') && ARROW_PREDICT_FALSE(s[10] != 'T')) {
-      return false;
-    }
-    if (s[length - 1] == 'Z') {
-      --length;
-    }
-    if (length == 13) {
-      if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &ymd))) {
-        return false;
-      }
-      std::chrono::duration<value_type> seconds;
-      if (ARROW_PREDICT_FALSE(!ParseHH(s + 11, &seconds))) {
-        return false;
-      }
-      return ConvertTimePoint(arrow_vendored::date::sys_days(ymd) + seconds, out);
-    }
-    if (length == 16) {
-      if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &ymd))) {
-        return false;
-      }
-      std::chrono::duration<value_type> seconds;
-      if (ARROW_PREDICT_FALSE(!ParseHH_MM(s + 11, &seconds))) {
-        return false;
-      }
-      return ConvertTimePoint(arrow_vendored::date::sys_days(ymd) + seconds, out);
-    }
-    if (length == 19) {
-      if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &ymd))) {
-        return false;
-      }
-      std::chrono::duration<value_type> seconds;
-      if (ARROW_PREDICT_FALSE(!ParseHH_MM_SS(s + 11, &seconds))) {
-        return false;
-      }
-      return ConvertTimePoint(arrow_vendored::date::sys_days(ymd) + seconds, out);
-    }
+using ts_type = TimestampType::c_type;
+
+template <class TimePoint>
+static inline ts_type ConvertTimePoint(TimePoint tp, TimeUnit::type unit) {
+  auto duration = tp.time_since_epoch();
+  switch (unit) {
+    case TimeUnit::SECOND:
+      return std::chrono::duration_cast<std::chrono::seconds>(duration).count();
+    case TimeUnit::MILLI:
+      return std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
+    case TimeUnit::MICRO:
+      return std::chrono::duration_cast<std::chrono::microseconds>(duration).count();
+    case TimeUnit::NANO:
+      return std::chrono::duration_cast<std::chrono::nanoseconds>(duration).count();
+    default:
+      // Compiler errors without default case even though all enum cases are handled
+      assert(0);
+      return 0;
+  }
+}
+
+static inline bool ParseYYYY_MM_DD(const char* s,
+                                   arrow_vendored::date::year_month_day* out) {
+  uint16_t year;
+  uint8_t month, day;
+  if (ARROW_PREDICT_FALSE(s[4] != '-') || ARROW_PREDICT_FALSE(s[7] != '-')) {
     return false;
   }
+  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 4, &year))) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 5, 2, &month))) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 8, 2, &day))) {
+    return false;
+  }
+  *out = {arrow_vendored::date::year{year}, arrow_vendored::date::month{month},
+          arrow_vendored::date::day{day}};
+  return out->ok();
+}
 
- protected:
-  template <class TimePoint>
-  bool ConvertTimePoint(TimePoint tp, value_type* out) {
-    auto duration = tp.time_since_epoch();
-    switch (unit_) {
-      case TimeUnit::SECOND:
-        *out = std::chrono::duration_cast<std::chrono::seconds>(duration).count();
-        return true;
-      case TimeUnit::MILLI:
-        *out = std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
-        return true;
-      case TimeUnit::MICRO:
-        *out = std::chrono::duration_cast<std::chrono::microseconds>(duration).count();
-        return true;
-      case TimeUnit::NANO:
-        *out = std::chrono::duration_cast<std::chrono::nanoseconds>(duration).count();
-        return true;
-    }
-    // Unreachable, but suppress compiler warning
-    assert(0);
-    *out = 0;
-    return true;
+static inline bool ParseHH(const char* s, std::chrono::duration<ts_type>* out) {
+  uint8_t hours;
+  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 2, &hours))) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(hours >= 24)) {
+    return false;
   }
+  *out = std::chrono::duration<ts_type>(3600U * hours);
+  return true;
+}
 
-  bool ParseYYYY_MM_DD(const char* s, arrow_vendored::date::year_month_day* out) {
-    uint16_t year;
-    uint8_t month, day;
-    if (ARROW_PREDICT_FALSE(s[4] != '-') || ARROW_PREDICT_FALSE(s[7] != '-')) {
-      return false;
-    }
-    if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 0, 4, &year))) {
-      return false;
-    }
-    if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 5, 2, &month))) {
-      return false;
-    }
-    if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 8, 2, &day))) {
-      return false;
-    }
-    *out = {arrow_vendored::date::year{year}, arrow_vendored::date::month{month},
-            arrow_vendored::date::day{day}};
-    return out->ok();
+static inline bool ParseHH_MM(const char* s, std::chrono::duration<ts_type>* out) {
+  uint8_t hours, minutes;
+  if (ARROW_PREDICT_FALSE(s[2] != ':')) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 2, &hours))) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 3, 2, &minutes))) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(hours >= 24)) {
+    return false;
   }
+  if (ARROW_PREDICT_FALSE(minutes >= 60)) {
+    return false;
+  }
+  *out = std::chrono::duration<ts_type>(3600U * hours + 60U * minutes);
+  return true;
+}
 
-  bool ParseHH(const char* s, std::chrono::duration<value_type>* out) {
-    uint8_t hours;
-    if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 0, 2, &hours))) {
-      return false;
-    }
-    if (ARROW_PREDICT_FALSE(hours >= 24)) {
-      return false;
-    }
-    *out = std::chrono::duration<value_type>(3600U * hours);
-    return true;
+static inline bool ParseHH_MM_SS(const char* s, std::chrono::duration<ts_type>* out) {
+  uint8_t hours, minutes, seconds;
+  if (ARROW_PREDICT_FALSE(s[2] != ':') || ARROW_PREDICT_FALSE(s[5] != ':')) {
+    return false;
   }
+  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 2, &hours))) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 3, 2, &minutes))) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 6, 2, &seconds))) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(hours >= 24)) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(minutes >= 60)) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(seconds >= 60)) {
+    return false;
+  }
+  *out = std::chrono::duration<ts_type>(3600U * hours + 60U * minutes + seconds);
+  return true;
+}
 
-  bool ParseHH_MM(const char* s, std::chrono::duration<value_type>* out) {
-    uint8_t hours, minutes;
-    if (ARROW_PREDICT_FALSE(s[2] != ':')) {
-      return false;
-    }
-    if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 0, 2, &hours))) {
-      return false;
-    }
-    if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 3, 2, &minutes))) {
-      return false;
-    }
-    if (ARROW_PREDICT_FALSE(hours >= 24)) {
-      return false;
-    }
-    if (ARROW_PREDICT_FALSE(minutes >= 60)) {
+}  // namespace detail
+
+static inline bool ParseISO8601(const char* s, size_t length, TimeUnit::type unit,

Review comment:
       This could be left in `detail`.

##########
File path: cpp/src/arrow/csv/converter_benchmark.cc
##########
@@ -20,15 +20,67 @@
 #include <sstream>
 #include <string>
 
+#include "arrow/buffer.h"
 #include "arrow/csv/converter.h"
 #include "arrow/csv/options.h"
 #include "arrow/csv/parser.h"
+#include "arrow/csv/reader.h"
 #include "arrow/csv/test_common.h"
+#include "arrow/io/memory.h"
 #include "arrow/testing/gtest_util.h"
+#include "arrow/util/value_parsing.h"
 
 namespace arrow {
 namespace csv {
 
+static std::shared_ptr<Buffer> GenerateTimestampsCSV(
+    const std::vector<std::string>& dates, int32_t cols, int32_t rows) {
+  std::stringstream ss;
+  for (int32_t row = 0; row < rows; ++row) {
+    for (int32_t col = 0; col < cols; ++col) {
+      ss << dates[rand() % dates.size()];
+    }
+    ss << "\n";
+  }
+  return Buffer::FromString(ss.str());
+}
+
+static Status ReadCSV(const Buffer& data, const csv::ConvertOptions& convert_opt) {
+  ARROW_ASSIGN_OR_RAISE(
+      auto table_reader,
+      csv::TableReader::Make(
+          default_memory_pool(), std::make_shared<io::BufferReader>(data),
+          csv::ReadOptions::Defaults(), csv::ParseOptions::Defaults(), convert_opt));
+  return table_reader->Read().status();
+}
+
+const std::vector<std::string> kExampleDates = {
+    "1917-10-17,", "2018-09-13 22,", "1941-06-22 04:00,", "1945-05-09 09:45:38,"};
+constexpr int32_t kNumRows = 10000000;
+constexpr int32_t kNumCols = 10;
+
+static void ConvertTimestampVirtualISO8601(benchmark::State& state) {
+  auto data = GenerateTimestampsCSV(kExampleDates, kNumCols, kNumRows);
+  auto convert_options = csv::ConvertOptions::Defaults();
+  convert_options.timestamp_parsers.push_back(TimestampParser::MakeISO8601());
+  for (auto _ : state) {
+    ABORT_NOT_OK(ReadCSV(*data, convert_options));

Review comment:
       I would much rather follow the same pattern as other conversion benchmarks, i.e. call `BenchmarkConversion` with a pre-populated parser. Otherwise figures are not comparable.

##########
File path: cpp/src/arrow/csv/converter_benchmark.cc
##########
@@ -20,15 +20,67 @@
 #include <sstream>
 #include <string>
 
+#include "arrow/buffer.h"
 #include "arrow/csv/converter.h"
 #include "arrow/csv/options.h"
 #include "arrow/csv/parser.h"
+#include "arrow/csv/reader.h"
 #include "arrow/csv/test_common.h"
+#include "arrow/io/memory.h"
 #include "arrow/testing/gtest_util.h"
+#include "arrow/util/value_parsing.h"
 
 namespace arrow {
 namespace csv {
 
+static std::shared_ptr<Buffer> GenerateTimestampsCSV(
+    const std::vector<std::string>& dates, int32_t cols, int32_t rows) {
+  std::stringstream ss;
+  for (int32_t row = 0; row < rows; ++row) {
+    for (int32_t col = 0; col < cols; ++col) {
+      ss << dates[rand() % dates.size()];

Review comment:
       Perhaps we should use a fixed seed and random generator state to make benchmark numbers more reliable (though I'm not sure that's necessary).

##########
File path: cpp/src/arrow/csv/converter.cc
##########
@@ -381,32 +383,98 @@ class NumericConverter : public ConcreteConverter {
 /////////////////////////////////////////////////////////////////////////
 // Concrete Converter for timestamps
 
+namespace {
+
+struct InlineISO8601 {

Review comment:
       Why not reuse `StringConverter<TimestampType>`?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] wesm commented on pull request #7088: ARROW-8111: [C++] User-defined timestamp parser option to CSV, new TimestampParser interface, and strptime-compatible impl

Posted by GitBox <gi...@apache.org>.
wesm commented on pull request #7088:
URL: https://github.com/apache/arrow/pull/7088#issuecomment-624164634


   @pitrou that sounds good to me. 


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on pull request #7088: ARROW-8111: [C++] User-defined timestamp parser option to CSV, new TimestampParser interface, and strptime-compatible impl

Posted by GitBox <gi...@apache.org>.
pitrou commented on pull request #7088:
URL: https://github.com/apache/arrow/pull/7088#issuecomment-624157808


   I see. Then I think we should vendor a `strptime` implementation, for example musl's: https://github.com/esmil/musl/blob/master/src/time/strptime.c


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] wesm commented on pull request #7088: ARROW-8111: [C++] User-defined timestamp parser option to CSV, new TimestampParser interface, and strptime-compatible impl

Posted by GitBox <gi...@apache.org>.
wesm commented on pull request #7088:
URL: https://github.com/apache/arrow/pull/7088#issuecomment-623788547


   Turns out Gandiva already had a `strptime` wrapper so wasn't that hard to adapt it. This is ready to merge pending CI 


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] github-actions[bot] commented on pull request #7088: ARROW-8111: [C++] User-defined timestamp parser option to CSV, new TimestampParser interface, and strptime-compatible impl

Posted by GitBox <gi...@apache.org>.
github-actions[bot] commented on pull request #7088:
URL: https://github.com/apache/arrow/pull/7088#issuecomment-623028005


   https://issues.apache.org/jira/browse/ARROW-8111


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on pull request #7088: ARROW-8111: [C++] User-defined timestamp parser option to CSV, new TimestampParser interface, and strptime-compatible impl

Posted by GitBox <gi...@apache.org>.
pitrou commented on pull request #7088:
URL: https://github.com/apache/arrow/pull/7088#issuecomment-624089054


   I'm taking this up.


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] wesm commented on a change in pull request #7088: ARROW-8111: [C++] User-defined timestamp parser option to CSV, new TimestampParser interface, and strptime-compatible impl

Posted by GitBox <gi...@apache.org>.
wesm commented on a change in pull request #7088:
URL: https://github.com/apache/arrow/pull/7088#discussion_r419792382



##########
File path: cpp/src/arrow/util/value_parsing.cc
##########
@@ -79,5 +86,46 @@ bool StringToFloatConverter::StringToFloat(const char* s, size_t length, double*
   return true;
 }
 
+// ----------------------------------------------------------------------
+// strptime-like parsing
+
+class StrptimeTimestampParser : public TimestampParser {
+ public:
+  explicit StrptimeTimestampParser(std::string format) : format_(std::move(format)) {}
+
+  bool operator()(const char* s, size_t length, TimeUnit::type out_unit,
+                  value_type* out) const override {
+    arrow_vendored::date::sys_seconds time_point;
+    std::istringstream stream(std::string(s, length));
+    if (stream >> arrow_vendored::date::parse(format_, time_point)) {

Review comment:
       This won't work anyway because HowardHinnant/date doesn't completely work with gcc < 7




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] wesm commented on pull request #7088: ARROW-8111: [C++] User-defined timestamp parser option to CSV, new TimestampParser interface, and strptime-compatible impl

Posted by GitBox <gi...@apache.org>.
wesm commented on pull request #7088:
URL: https://github.com/apache/arrow/pull/7088#issuecomment-624318646


   Thanks @pitrou!


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] tfmark commented on pull request #7088: ARROW-8111: [C++] User-defined timestamp parser option to CSV, new TimestampParser interface, and strptime-compatible impl

Posted by GitBox <gi...@apache.org>.
tfmark commented on pull request #7088:
URL: https://github.com/apache/arrow/pull/7088#issuecomment-1048771652


   The [musl implementation](https://github.com/esmil/musl/blob/master/src/time/strptime.c) does not appear to support fractional seconds? Or am I missing something?


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] wesm commented on a change in pull request #7088: ARROW-8111: [C++] User-defined timestamp parser option to CSV, new TimestampParser interface, and strptime-compatible impl

Posted by GitBox <gi...@apache.org>.
wesm commented on a change in pull request #7088:
URL: https://github.com/apache/arrow/pull/7088#discussion_r419799490



##########
File path: cpp/src/arrow/csv/converter_benchmark.cc
##########
@@ -20,15 +20,67 @@
 #include <sstream>
 #include <string>
 
+#include "arrow/buffer.h"
 #include "arrow/csv/converter.h"
 #include "arrow/csv/options.h"
 #include "arrow/csv/parser.h"
+#include "arrow/csv/reader.h"
 #include "arrow/csv/test_common.h"
+#include "arrow/io/memory.h"
 #include "arrow/testing/gtest_util.h"
+#include "arrow/util/value_parsing.h"
 
 namespace arrow {
 namespace csv {
 
+static std::shared_ptr<Buffer> GenerateTimestampsCSV(
+    const std::vector<std::string>& dates, int32_t cols, int32_t rows) {
+  std::stringstream ss;
+  for (int32_t row = 0; row < rows; ++row) {
+    for (int32_t col = 0; col < cols; ++col) {
+      ss << dates[rand() % dates.size()];
+    }
+    ss << "\n";
+  }
+  return Buffer::FromString(ss.str());
+}
+
+static Status ReadCSV(const Buffer& data, const csv::ConvertOptions& convert_opt) {
+  ARROW_ASSIGN_OR_RAISE(
+      auto table_reader,
+      csv::TableReader::Make(
+          default_memory_pool(), std::make_shared<io::BufferReader>(data),
+          csv::ReadOptions::Defaults(), csv::ParseOptions::Defaults(), convert_opt));
+  return table_reader->Read().status();
+}
+
+const std::vector<std::string> kExampleDates = {
+    "1917-10-17,", "2018-09-13 22,", "1941-06-22 04:00,", "1945-05-09 09:45:38,"};
+constexpr int32_t kNumRows = 10000000;
+constexpr int32_t kNumCols = 10;
+
+static void ConvertTimestampVirtualISO8601(benchmark::State& state) {
+  auto data = GenerateTimestampsCSV(kExampleDates, kNumCols, kNumRows);
+  auto convert_options = csv::ConvertOptions::Defaults();
+  convert_options.timestamp_parsers.push_back(TimestampParser::MakeISO8601());
+  for (auto _ : state) {
+    ABORT_NOT_OK(ReadCSV(*data, convert_options));

Review comment:
       sorry I missed this, done




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] wesm commented on pull request #7088: ARROW-8111: [C++] User-defined timestamp parser option to CSV, new TimestampParser interface, and strptime-compatible impl

Posted by GitBox <gi...@apache.org>.
wesm commented on pull request #7088:
URL: https://github.com/apache/arrow/pull/7088#issuecomment-624157356


   The HowardHinnant date library is broken for parsing in gcc < 7 so probably won't work for MinGW


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] wesm commented on a change in pull request #7088: ARROW-8111: [C++] User-defined timestamp parser option to CSV, new TimestampParser interface, and strptime-compatible impl

Posted by GitBox <gi...@apache.org>.
wesm commented on a change in pull request #7088:
URL: https://github.com/apache/arrow/pull/7088#discussion_r419459324



##########
File path: cpp/src/arrow/csv/converter.cc
##########
@@ -381,32 +383,98 @@ class NumericConverter : public ConcreteConverter {
 /////////////////////////////////////////////////////////////////////////
 // Concrete Converter for timestamps
 
+namespace {
+
+struct InlineISO8601 {

Review comment:
       That class needs to know the TimeUnit but it is not known yet when this class is constructed 




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] wesm commented on a change in pull request #7088: ARROW-8111: [C++] User-defined timestamp parser option to CSV, new TimestampParser interface, and strptime-compatible impl

Posted by GitBox <gi...@apache.org>.
wesm commented on a change in pull request #7088:
URL: https://github.com/apache/arrow/pull/7088#discussion_r419459324



##########
File path: cpp/src/arrow/csv/converter.cc
##########
@@ -381,32 +383,98 @@ class NumericConverter : public ConcreteConverter {
 /////////////////////////////////////////////////////////////////////////
 // Concrete Converter for timestamps
 
+namespace {
+
+struct InlineISO8601 {

Review comment:
       That class needs to know the TimeUnit at construction time but it is not known yet when this class is constructed 




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] wesm commented on pull request #7088: ARROW-8111: [C++] User-defined timestamp parser option to CSV, new TimestampParser interface, and strptime-compatible impl

Posted by GitBox <gi...@apache.org>.
wesm commented on pull request #7088:
URL: https://github.com/apache/arrow/pull/7088#issuecomment-623031355


   Hm unfortunately it seems that Howard Hinnant's date library does not fully support gcc < 7 -- there are issues with `date::parse`
   
   ```
   /usr/bin/ccache /usr/bin/g++-4.8  -DARROW_EXPORTING -DARROW_EXTRA_ERROR_CONTEXT -DARROW_HAVE_AVX2 -DARROW_HAVE_SSE4_2 -DARROW_HDFS -DARROW_JEMALLOC -DARROW_JEMALLOC_INCLUDE_DIR="" -DARROW_NO_DEPRECATED_API -DARROW_WITH_BACKTRACE -DARROW_WITH_BROTLI -DARROW_WITH_BZ2 -DARROW_WITH_LZ4 -DARROW_WITH_SNAPPY -DARROW_WITH_TIMING_TESTS -DARROW_WITH_ZLIB -DARROW_WITH_ZSTD -DURI_STATIC_BUILD -Isrc -I../src -I../src/generated -isystem ../thirdparty/flatbuffers/include -isystem /home/wesm/cpp-toolchain/include -isystem jemalloc_ep-prefix/src -isystem ../thirdparty/hadoop/include -ggdb -O0  -Wall -Wno-conversion -Wno-sign-conversion -Wno-unused-variable -Werror -Wno-attributes -mavx2 -fno-omit-frame-pointer -g -fPIC   -std=gnu++11 -MD -MT src/arrow/CMakeFiles/arrow_objlib.dir/util/value_parsing.cc.o -MF src/arrow/CMakeFiles/arrow_objlib.dir/util/value_parsing.cc.o.d -o src/arrow/CMakeFiles/arrow_objlib.dir/util/value_parsing.cc.o -c ../src/arrow/util/value_parsing.cc
   In file included from ../src/arrow/vendored/datetime.h:20:0,
                    from ../src/arrow/util/value_parsing.h:35,
                    from ../src/arrow/util/value_parsing.cc:18:
   ../src/arrow/vendored/datetime/date.h: In instantiation of ‘std::basic_istream<_CharT, _Traits>& arrow_vendored::date::from_stream(std::basic_istream<_CharT, _Traits>&, const CharT*, arrow_vendored::date::sys_time<Duration>&, std::basic_string<CharT, Traits, Alloc>*, std::chrono::minutes*) [with Duration = std::chrono::duration<long int>; CharT = char; Traits = std::char_traits<char>; Alloc = std::allocator<char>; arrow_vendored::date::sys_time<Duration> = std::chrono::time_point<std::chrono::_V2::system_clock, std::chrono::duration<long int> >; std::chrono::minutes = std::chrono::duration<int, std::ratio<60l> >]’:
   ../src/arrow/vendored/datetime/date.h:7823:74:   required from ‘std::basic_istream<_CharT, _Traits>& arrow_vendored::date::operator>>(std::basic_istream<_CharT, _Traits>&, const arrow_vendored::date::parse_manip<Parsable, CharT, Traits, Alloc>&) [with Parsable = std::chrono::time_point<std::chrono::_V2::system_clock, std::chrono::duration<long int> >; CharT = char; Traits = std::char_traits<char>; Alloc = std::allocator<char>]’
   ../src/arrow/util/value_parsing.cc:100:62:   required from here
   ../src/arrow/vendored/datetime/date.h:4557:5: error: converting to ‘arrow_vendored::date::weekday’ from initializer list would use explicit constructor ‘constexpr arrow_vendored::date::weekday::weekday(unsigned int)’
        fields() = default;
        ^
   ../src/arrow/vendored/datetime/date.h:7752:20: note: synthesized method ‘arrow_vendored::date::fields<Duration>::fields() [with Duration = std::chrono::duration<long int>]’ first required here 
        fields<CT> fds{};
                       ^
   ninja: build stopped: subcommand failed.
   ```
   
   This will have to be changed to use POSIX strptime for now


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on pull request #7088: ARROW-8111: [C++] User-defined timestamp parser option to CSV, new TimestampParser interface, and strptime-compatible impl

Posted by GitBox <gi...@apache.org>.
pitrou commented on pull request #7088:
URL: https://github.com/apache/arrow/pull/7088#issuecomment-624220176


   Everything is green, I'm going to merge now.


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] wesm commented on pull request #7088: ARROW-8111: [C++] User-defined timestamp parser option to CSV, new TimestampParser interface, and strptime-compatible impl

Posted by GitBox <gi...@apache.org>.
wesm commented on pull request #7088:
URL: https://github.com/apache/arrow/pull/7088#issuecomment-623028582


   @prutskov if it's helpful, you might take a look at the changes I made to the benchmark code that you contributed to see idiomatic use of `Result<T>` and some other things (such as using in-memory buffers instead of writing data to disk)


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] wesm commented on a change in pull request #7088: ARROW-8111: [C++] User-defined timestamp parser option to CSV, new TimestampParser interface, and strptime-compatible impl

Posted by GitBox <gi...@apache.org>.
wesm commented on a change in pull request #7088:
URL: https://github.com/apache/arrow/pull/7088#discussion_r419799536



##########
File path: cpp/src/arrow/csv/converter_benchmark.cc
##########
@@ -20,15 +20,67 @@
 #include <sstream>
 #include <string>
 
+#include "arrow/buffer.h"
 #include "arrow/csv/converter.h"
 #include "arrow/csv/options.h"
 #include "arrow/csv/parser.h"
+#include "arrow/csv/reader.h"
 #include "arrow/csv/test_common.h"
+#include "arrow/io/memory.h"
 #include "arrow/testing/gtest_util.h"
+#include "arrow/util/value_parsing.h"
 
 namespace arrow {
 namespace csv {
 
+static std::shared_ptr<Buffer> GenerateTimestampsCSV(
+    const std::vector<std::string>& dates, int32_t cols, int32_t rows) {
+  std::stringstream ss;
+  for (int32_t row = 0; row < rows; ++row) {
+    for (int32_t col = 0; col < cols; ++col) {
+      ss << dates[rand() % dates.size()];

Review comment:
       Removed this code in favor of your comment below




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] wesm edited a comment on pull request #7088: ARROW-8111: [C++] User-defined timestamp parser option to CSV, new TimestampParser interface, and strptime-compatible impl

Posted by GitBox <gi...@apache.org>.
wesm edited a comment on pull request #7088:
URL: https://github.com/apache/arrow/pull/7088#issuecomment-623788547


   Turns out Gandiva already had a `strptime` wrapper (that was ~10+x faster than vendored/datetime.h) so wasn't that hard to adapt it. This is ready to merge pending CI 


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org