You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/05/06 01:52:04 UTC

[GitHub] [arrow] westonpace commented on a change in pull request #10202: ARROW-12001: [C++] Add parser handler for incorrect column counts

westonpace commented on a change in pull request #10202:
URL: https://github.com/apache/arrow/pull/10202#discussion_r627015865



##########
File path: cpp/src/arrow/csv/options.h
##########
@@ -56,6 +62,8 @@ struct ARROW_EXPORT ParseOptions {
   /// Whether empty lines are ignored.  If false, an empty line represents
   /// a single empty value (assuming a one-column CSV file).
   bool ignore_empty_lines = true;
+  /// A handler function for rows which do not have the correct number of columns
+  util::optional<InvalidRowHandler> invalid_row_handler;

Review comment:
       My vote would be that this is too much complexity.  I think simply back filling nulls (in the `InvalidRowHandler::Force` style) or skipping rows would be sufficient.  Maybe a `InvalidRowHandler` is still used under the hood to accommodate the two approaches but the interface to the user should be simpler (enum or bools).
   
   In the rare event that someone wanted to do something custom (e.g. a unique value for each column) they could fill nulls and then in a later compute step replace null with a default value.

##########
File path: cpp/src/arrow/csv/parser.cc
##########
@@ -76,9 +76,45 @@ class PresizedDataWriter {
     parsed_[parsed_size_++] = static_cast<uint8_t>(c);
   }
 
+  // Push the value of a fully complete field. This should only be used to fill in missing
+  // values. This method can reallocate the buffer if there isn't enough extra space for
+  // the field.
+  Status PushField(const std::string& field) {
+    if (field.length() > extra_allocated_) {
+      // just in case this happens more allocate enough for 10x this amount
+      auto to_allocate = static_cast<uint32_t>(
+          std::max(field.length() * 10, static_cast<std::string::size_type>(128)));
+      int64_t new_capacity = parsed_capacity_ + to_allocate;
+      RETURN_NOT_OK(parsed_buffer_->Resize(new_capacity));
+      parsed_ = parsed_buffer_->mutable_data();
+      parsed_capacity_ = new_capacity;
+      extra_allocated_ += to_allocate;
+    }

Review comment:
       I'm not sure but do you need...
   ```
   else {
     extra_allocated_ -= field.length();
   }
   ```

##########
File path: cpp/src/arrow/csv/parser.cc
##########
@@ -299,7 +406,21 @@ class BlockParserImpl {
       if (batch_.num_cols_ == -1) {
         batch_.num_cols_ = num_cols;
       } else {
-        return MismatchingColumns(batch_.num_cols_, num_cols);
+        // Find the end of the line without newline or carriage return
+        auto end = data;
+        if (*(end - 1) == '\n') {
+          end -= 1;
+          if (*(end - 1) == '\r') {
+            end -= 1;
+          }
+        }
+        ARROW_ASSIGN_OR_RAISE(auto use_row,
+                              HandleBadNumColumns(values_writer, parsed_writer, num_cols,
+                                                  std::string(start, end - start)));

Review comment:
       Use `string_view` here.

##########
File path: cpp/src/arrow/csv/parser.cc
##########
@@ -35,14 +35,13 @@ using detail::ParsedValueDesc;
 
 namespace {
 
-Status ParseError(const char* message) {
-  return Status::Invalid("CSV parse error: ", message);
+template <typename... Args>
+Status ParseError(Args&&... args) {
+  return Status::Invalid("CSV parse error: ", std::forward<Args>(args)...);
 }
 
-Status MismatchingColumns(int32_t expected, int32_t actual) {
-  char s[50];
-  snprintf(s, sizeof(s), "Expected %d columns, got %d", expected, actual);
-  return ParseError(s);
+Status MismatchingColumns(int32_t expected, int32_t actual, const std::string& row) {

Review comment:
       The row could be very long.  If we want to provide more context is there anyway we can put the row index in here instead?  Given that we only allow \n or \r\n to be line breaks that should make it pretty easy to locate the offending row via row index (i.e. row index == line number).

##########
File path: cpp/src/arrow/csv/parser.cc
##########
@@ -76,9 +76,45 @@ class PresizedDataWriter {
     parsed_[parsed_size_++] = static_cast<uint8_t>(c);
   }
 
+  // Push the value of a fully complete field. This should only be used to fill in missing
+  // values. This method can reallocate the buffer if there isn't enough extra space for
+  // the field.
+  Status PushField(const std::string& field) {
+    if (field.length() > extra_allocated_) {
+      // just in case this happens more allocate enough for 10x this amount
+      auto to_allocate = static_cast<uint32_t>(
+          std::max(field.length() * 10, static_cast<std::string::size_type>(128)));

Review comment:
       One bonus of preventing user supplied handlers is that you can change this to `PushFields(const std::string&, int)` and remove the need to guess how much space to allocate.  

##########
File path: cpp/src/arrow/csv/parser_test.cc
##########
@@ -555,6 +555,123 @@ TEST(BlockParser, MismatchingNumColumns) {
   }
 }
 
+TEST(BlockParser, MismatchingNumColumnsSkip) {
+  ParseOptions opts = ParseOptions::Defaults();
+  opts.invalid_row_handler = InvalidRowHandlers::Skip();
+  {
+    BlockParser parser(opts);
+    ASSERT_NO_FATAL_FAILURE(AssertParseOk(parser, "a,b\nc\nd,e\n"));
+    ASSERT_EQ(2, parser.num_rows());
+    ASSERT_NO_FATAL_FAILURE(AssertLastRowEq(parser, {"d", "e"}, {false, false}));
+  }
+  {
+    BlockParser parser(opts, 2 /* num_cols */);
+    ASSERT_NO_FATAL_FAILURE(AssertParseOk(parser, "a\nb,c\n"));
+    ASSERT_EQ(1, parser.num_rows());
+    ASSERT_NO_FATAL_FAILURE(AssertLastRowEq(parser, {"b", "c"}, {false, false}));
+  }
+  {
+    BlockParser parser(opts, 2 /* num_cols */);
+    ASSERT_NO_FATAL_FAILURE(AssertParseOk(parser, "a,b,c\nd,e\n"));
+    ASSERT_EQ(1, parser.num_rows());
+    ASSERT_NO_FATAL_FAILURE(AssertLastRowEq(parser, {"d", "e"}, {false, false}));
+  }
+}
+
+TEST(BlockParser, MismatchingNumColumnsAddNulls) {
+  ParseOptions opts = ParseOptions::Defaults();
+  opts.invalid_row_handler = InvalidRowHandlers::AddNulls("NULL");
+  {
+    BlockParser parser(opts);
+    ASSERT_NO_FATAL_FAILURE(AssertParseOk(parser, "a,b\nc\n"));
+    ASSERT_EQ(2, parser.num_rows());
+    ASSERT_NO_FATAL_FAILURE(AssertLastRowEq(parser, {"c", "NULL"}, {false, false}));
+  }
+  {
+    BlockParser parser(opts, 2 /* num_cols */);
+    ASSERT_NO_FATAL_FAILURE(AssertParseOk(parser, "a\n"));
+    ASSERT_EQ(1, parser.num_rows());
+    ASSERT_NO_FATAL_FAILURE(AssertLastRowEq(parser, {"a", "NULL"}, {false, false}));
+  }
+  {
+    uint32_t out_size;
+    BlockParser parser(opts, 2 /* num_cols */);
+    Status st = Parse(parser, "a,b,c\nd,e\n", &out_size);
+    ASSERT_RAISES(Invalid, st);
+  }
+}
+
+TEST(BlockParser, MismatchingNumColumnsForce) {
+  ParseOptions opts = ParseOptions::Defaults();
+  opts.invalid_row_handler = InvalidRowHandlers::Force();

Review comment:
       Add an assert somewhere actually verifying that `::Force` is inserting a null value (e.g. checking the equality of the {c, ""} row in the first sub test here)

##########
File path: cpp/src/arrow/csv/parser.cc
##########
@@ -76,9 +76,45 @@ class PresizedDataWriter {
     parsed_[parsed_size_++] = static_cast<uint8_t>(c);
   }
 
+  // Push the value of a fully complete field. This should only be used to fill in missing
+  // values. This method can reallocate the buffer if there isn't enough extra space for
+  // the field.
+  Status PushField(const std::string& field) {
+    if (field.length() > extra_allocated_) {
+      // just in case this happens more allocate enough for 10x this amount
+      auto to_allocate = static_cast<uint32_t>(
+          std::max(field.length() * 10, static_cast<std::string::size_type>(128)));
+      int64_t new_capacity = parsed_capacity_ + to_allocate;
+      RETURN_NOT_OK(parsed_buffer_->Resize(new_capacity));
+      parsed_ = parsed_buffer_->mutable_data();
+      parsed_capacity_ = new_capacity;
+      extra_allocated_ += to_allocate;

Review comment:
       `extra_allocated_ += to_allocated - field.length();`?

##########
File path: cpp/src/arrow/csv/parser_test.cc
##########
@@ -555,6 +555,123 @@ TEST(BlockParser, MismatchingNumColumns) {
   }
 }
 
+TEST(BlockParser, MismatchingNumColumnsSkip) {
+  ParseOptions opts = ParseOptions::Defaults();
+  opts.invalid_row_handler = InvalidRowHandlers::Skip();
+  {
+    BlockParser parser(opts);
+    ASSERT_NO_FATAL_FAILURE(AssertParseOk(parser, "a,b\nc\nd,e\n"));
+    ASSERT_EQ(2, parser.num_rows());
+    ASSERT_NO_FATAL_FAILURE(AssertLastRowEq(parser, {"d", "e"}, {false, false}));
+  }
+  {
+    BlockParser parser(opts, 2 /* num_cols */);
+    ASSERT_NO_FATAL_FAILURE(AssertParseOk(parser, "a\nb,c\n"));
+    ASSERT_EQ(1, parser.num_rows());
+    ASSERT_NO_FATAL_FAILURE(AssertLastRowEq(parser, {"b", "c"}, {false, false}));
+  }
+  {
+    BlockParser parser(opts, 2 /* num_cols */);
+    ASSERT_NO_FATAL_FAILURE(AssertParseOk(parser, "a,b,c\nd,e\n"));
+    ASSERT_EQ(1, parser.num_rows());
+    ASSERT_NO_FATAL_FAILURE(AssertLastRowEq(parser, {"d", "e"}, {false, false}));
+  }
+}
+
+TEST(BlockParser, MismatchingNumColumnsAddNulls) {
+  ParseOptions opts = ParseOptions::Defaults();
+  opts.invalid_row_handler = InvalidRowHandlers::AddNulls("NULL");
+  {
+    BlockParser parser(opts);
+    ASSERT_NO_FATAL_FAILURE(AssertParseOk(parser, "a,b\nc\n"));
+    ASSERT_EQ(2, parser.num_rows());
+    ASSERT_NO_FATAL_FAILURE(AssertLastRowEq(parser, {"c", "NULL"}, {false, false}));
+  }
+  {
+    BlockParser parser(opts, 2 /* num_cols */);
+    ASSERT_NO_FATAL_FAILURE(AssertParseOk(parser, "a\n"));
+    ASSERT_EQ(1, parser.num_rows());
+    ASSERT_NO_FATAL_FAILURE(AssertLastRowEq(parser, {"a", "NULL"}, {false, false}));
+  }
+  {
+    uint32_t out_size;
+    BlockParser parser(opts, 2 /* num_cols */);
+    Status st = Parse(parser, "a,b,c\nd,e\n", &out_size);
+    ASSERT_RAISES(Invalid, st);
+  }
+}
+
+TEST(BlockParser, MismatchingNumColumnsForce) {
+  ParseOptions opts = ParseOptions::Defaults();
+  opts.invalid_row_handler = InvalidRowHandlers::Force();
+  {
+    BlockParser parser(opts);
+    ASSERT_NO_FATAL_FAILURE(AssertParseOk(parser, "a,b\nc\nd,e\n"));
+    ASSERT_EQ(3, parser.num_rows());
+    ASSERT_NO_FATAL_FAILURE(AssertLastRowEq(parser, {"d", "e"}, {false, false}));
+  }
+  {
+    BlockParser parser(opts, 2 /* num_cols */);
+    ASSERT_NO_FATAL_FAILURE(AssertParseOk(parser, "a\nb,c\n"));
+    ASSERT_EQ(2, parser.num_rows());
+    ASSERT_NO_FATAL_FAILURE(AssertLastRowEq(parser, {"b", "c"}, {false, false}));
+  }
+  {
+    BlockParser parser(opts, 2 /* num_cols */);
+    ASSERT_NO_FATAL_FAILURE(AssertParseOk(parser, "a,b,c\nd,e\n"));

Review comment:
       Maybe add a check here verifying the case where a field is removed (e.g. asserting the {"a", "b"} row here.)




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org