You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/05/19 14:59:52 UTC

[GitHub] [arrow] lidavidm commented on a change in pull request #10321: ARROW-12675: [C++] CSV parsing report row on which error occurred

lidavidm commented on a change in pull request #10321:
URL: https://github.com/apache/arrow/pull/10321#discussion_r635318248



##########
File path: python/pyarrow/tests/test_csv.py
##########
@@ -974,6 +976,36 @@ def read_csv(self, *args, validate_full=True, **kwargs):
         table.validate(full=validate_full)
         return table
 
+    def test_row_numbers_in_errors(self):
+        """ Row numbers are only correctly counted in serial reads """
+        csv, _ = make_random_csv(4, 100, write_names=False)

Review comment:
       Is there a particular reason to only test without the header row? It'd be good to check that that gets handled properly too.

##########
File path: cpp/src/arrow/csv/reader.cc
##########
@@ -1004,7 +1016,7 @@ Future<std::shared_ptr<StreamingReader>> MakeStreamingReader(
     const ParseOptions& parse_options, const ConvertOptions& convert_options) {
   std::shared_ptr<BaseStreamingReader> reader;
   reader = std::make_shared<SerialStreamingReader>(
-      io_context, cpu_executor, input, read_options, parse_options, convert_options);
+      io_context, cpu_executor, input, read_options, parse_options, convert_options, true);

Review comment:
       ```suggestion
         io_context, cpu_executor, input, read_options, parse_options, convert_options, /*count_rows=*/true);
   ```

##########
File path: cpp/src/arrow/csv/parser_test.cc
##########
@@ -623,5 +643,48 @@ TEST(BlockParser, QuotedEscape) {
   }
 }
 
+TEST(BlockParser, RowNumberAppendedToError) {
+  auto options = ParseOptions::Defaults();
+  auto csv = "a,b,c\nd,e,f\ng,h,i\n";
+  {
+    BlockParser parser(options, -1, 0);
+    ASSERT_NO_FATAL_FAILURE(AssertParseOk(parser, csv));
+    int row = 0;
+    auto status = parser.VisitColumn(
+        0, [row](const uint8_t* data, uint32_t size, bool quoted) mutable -> Status {
+          return ++row == 2 ? Status::Invalid("Bad value") : Status::OK();
+        });
+    ASSERT_RAISES(Invalid, status);
+    ASSERT_NE(std::string::npos, status.message().find("Row 1: Bad value"))
+        << status.message();

Review comment:
       You can do something like `EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("Row 1: Bad value", status)`

##########
File path: cpp/src/arrow/csv/reader.cc
##########
@@ -888,9 +899,10 @@ class AsyncThreadedTableReader
                            std::shared_ptr<io::InputStream> input,
                            const ReadOptions& read_options,
                            const ParseOptions& parse_options,
-                           const ConvertOptions& convert_options, Executor* cpu_executor)
+                           const ConvertOptions& convert_options,
+                           Executor* cpu_executor)
       : BaseTableReader(std::move(io_context), input, read_options, parse_options,
-                        convert_options),
+                        convert_options, false),

Review comment:
       ```suggestion
                           convert_options, /*count_rows=*/false),
   ```

##########
File path: cpp/src/arrow/csv/parser_test.cc
##########
@@ -623,5 +643,48 @@ TEST(BlockParser, QuotedEscape) {
   }
 }
 
+TEST(BlockParser, RowNumberAppendedToError) {
+  auto options = ParseOptions::Defaults();
+  auto csv = "a,b,c\nd,e,f\ng,h,i\n";
+  {
+    BlockParser parser(options, -1, 0);
+    ASSERT_NO_FATAL_FAILURE(AssertParseOk(parser, csv));
+    int row = 0;
+    auto status = parser.VisitColumn(
+        0, [row](const uint8_t* data, uint32_t size, bool quoted) mutable -> Status {
+          return ++row == 2 ? Status::Invalid("Bad value") : Status::OK();
+        });
+    ASSERT_RAISES(Invalid, status);
+    ASSERT_NE(std::string::npos, status.message().find("Row 1: Bad value"))
+        << status.message();
+  }
+
+  {
+    BlockParser parser(options, -1, 100);
+    ASSERT_NO_FATAL_FAILURE(AssertParseOk(parser, csv));
+    int row = 0;
+    auto status = parser.VisitColumn(
+        0, [row](const uint8_t* data, uint32_t size, bool quoted) mutable -> Status {
+          return ++row == 3 ? Status::Invalid("Bad value") : Status::OK();
+        });
+    ASSERT_RAISES(Invalid, status);
+    ASSERT_NE(std::string::npos, status.message().find("Row 102: Bad value"))
+        << status.message();
+  }
+
+  // No first row specified should not append row information
+  {
+    BlockParser parser(options, -1, -1);
+    ASSERT_NO_FATAL_FAILURE(AssertParseOk(parser, csv));
+    int row = 0;
+    auto status = parser.VisitColumn(
+        0, [row](const uint8_t* data, uint32_t size, bool quoted) mutable -> Status {
+          return ++row == 3 ? Status::Invalid("Bad value") : Status::OK();
+        });
+    ASSERT_RAISES(Invalid, status);
+    ASSERT_EQ(std::string::npos, status.message().find("Row")) << status.message();

Review comment:
       And here you can use `::testing::Not(::testing::HasSubstr())`

##########
File path: cpp/src/arrow/csv/reader.cc
##########
@@ -888,9 +899,10 @@ class AsyncThreadedTableReader
                            std::shared_ptr<io::InputStream> input,
                            const ReadOptions& read_options,
                            const ParseOptions& parse_options,
-                           const ConvertOptions& convert_options, Executor* cpu_executor)
+                           const ConvertOptions& convert_options,
+                           Executor* cpu_executor)
       : BaseTableReader(std::move(io_context), input, read_options, parse_options,
-                        convert_options),
+                        convert_options, false),

Review comment:
       (and maybe note that the parallel reader cannot count rows yet)

##########
File path: cpp/src/arrow/csv/reader.cc
##########
@@ -500,6 +510,7 @@ class ReaderMixin {
 
   // Number of columns in the CSV file
   int32_t num_csv_cols_ = -1;
+  int64_t num_rows_;

Review comment:
       nit: maybe something more like `last_row_number_` or `num_seen_rows_` and a comment noting `-1` means we aren't tracking the row count?

##########
File path: cpp/src/arrow/csv/reader.cc
##########
@@ -992,7 +1004,7 @@ Result<std::shared_ptr<TableReader>> MakeTableReader(
         io_context, input, read_options, parse_options, convert_options, cpu_executor);
   } else {
     reader = std::make_shared<SerialTableReader>(io_context, input, read_options,
-                                                 parse_options, convert_options);
+                                                 parse_options, convert_options, true);

Review comment:
       ```suggestion
                                                    parse_options, convert_options, /*count_rows=*/true);
   ```

##########
File path: cpp/src/arrow/csv/reader.cc
##########
@@ -1004,7 +1016,7 @@ Future<std::shared_ptr<StreamingReader>> MakeStreamingReader(
     const ParseOptions& parse_options, const ConvertOptions& convert_options) {
   std::shared_ptr<BaseStreamingReader> reader;
   reader = std::make_shared<SerialStreamingReader>(
-      io_context, cpu_executor, input, read_options, parse_options, convert_options);
+      io_context, cpu_executor, input, read_options, parse_options, convert_options, true);

Review comment:
       (will probably need to be reformatted here)




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org