You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/06/01 12:12:23 UTC
[GitHub] [arrow] pitrou commented on a change in pull request #10255: ARROW-12661: [C++] Add ReaderOptions::skip_rows_after_names

pitrou commented on a change in pull request #10255:
URL: https://github.com/apache/arrow/pull/10255#discussion_r643030430



##########
File path: cpp/src/arrow/util/delimiting.h
##########
@@ -138,6 +153,26 @@ class ARROW_EXPORT Chunker {
   Status ProcessFinal(std::shared_ptr<Buffer> partial, std::shared_ptr<Buffer> block,
                       std::shared_ptr<Buffer>* completion, std::shared_ptr<Buffer>* rest);
 
+  /// \brief Skip count number of rows
+  /// Pre-conditions:
+  /// - `partial` is the start of a valid block of delimited data
+  ///   (i.e. starts just after a delimiter)
+  /// - `block` follows `partial` in file order
+  ///
+  /// Post-conditions:
+  /// - Count is updated to indicate the number of rows that still need to be skipped

Review comment:
       Nit, but can you quote variable names as above: e.g. ``` `count` ```, etc.

##########
File path: cpp/src/arrow/csv/chunker_test.cc
##########
@@ -71,6 +71,36 @@ class BaseChunkerTest : public ::testing::TestWithParam<bool> {
 
   void MakeChunker() { chunker_ = ::arrow::csv::MakeChunker(options_); }
 
+  void AssertSkip(const std::string& str, const uint64_t count, const uint64_t rem_count,
+                  const int64_t rest_size) {
+    MakeChunker();
+
+    {
+      auto test_count = count;
+      auto partial = std::make_shared<Buffer>("");
+      auto block = std::make_shared<Buffer>(reinterpret_cast<const uint8_t*>(str.data()),
+                                            static_cast<int64_t>(str.size()));
+      std::shared_ptr<Buffer> rest;
+      ASSERT_OK(chunker_->ProcessSkip(partial, block, true, &test_count, &rest));
+      ASSERT_EQ(rem_count, test_count);
+      ASSERT_EQ(rest_size, rest->size());

Review comment:
       Can you also check the contents of `rest`?

##########
File path: cpp/src/arrow/csv/reader.cc
##########
@@ -166,14 +166,17 @@ namespace {
 // iterator APIs (e.g. Visit)) even though an empty optional is never used in this code.
 class BlockReader {
  public:
-  BlockReader(std::unique_ptr<Chunker> chunker, std::shared_ptr<Buffer> first_buffer)
+  BlockReader(std::unique_ptr<Chunker> chunker, std::shared_ptr<Buffer> first_buffer,
+              uint64_t skip)

Review comment:
       Can you use a more precise name? Is it `skip_rows`?

##########
File path: python/pyarrow/tests/test_csv.py
##########
@@ -319,6 +321,79 @@ def test_header_skip_rows(self):
             "kl": ["op"],
         }
 
+    def test_skip_rows_after_names(self):

Review comment:
       Can you perhaps add a test with both `skip_rows` and `skip_rows_after_names`?

##########
File path: cpp/src/arrow/csv/reader.cc
##########
@@ -254,22 +271,12 @@ class ThreadedBlockReader : public BlockReader {
  public:
   using BlockReader::BlockReader;
 
-  static Iterator<CSVBlock> MakeIterator(

Review comment:
       I presume this was unused?

##########
File path: cpp/src/arrow/util/delimiting.h
##########
@@ -53,6 +54,20 @@ class ARROW_EXPORT BoundaryFinder {
   /// `out_pos` will be -1 if no delimiter is found.
   virtual Status FindLast(util::string_view block, int64_t* out_pos) = 0;
 
+  /// \brief Find the position of the Nth delimiter inside the block
+  ///
+  /// `partial` is taken to be the beginning of the block, and `block`
+  /// its continuation.  Also, `partial` doesn't contain a delimiter.
+  ///
+  /// The returned `out_pos` is relative to `block`'s start and should point
+  /// to the first character after the first delimiter.
+  /// `out_pos` will be -1 if no delimiter is found.
+  /// \return the number of delimiters found
+  virtual Result<uint64_t> FindN(util::string_view partial, util::string_view block,
+                                 uint64_t count, int64_t* out_pos) {
+    return Status::NotImplemented("BoundaryFinder::FindN");

Review comment:
       Suggestions:
   * call this `FindNth`
   * make this a pure virtual method
   * either use two out-parameters, or return the two values as a result, e.g.:
   ```c++
   struct FindNthResult {
     int64_t num_found;
     int64_t pos;
   };
   virtual Result<FindNthResult> FindNth(...) = 0;
   ```

##########
File path: python/pyarrow/_csv.pyx
##########
@@ -72,6 +72,13 @@ cdef class ReadOptions(_Weakrefable):
     encoding: str, optional (default 'utf8')
         The character encoding of the CSV data.  Columns that cannot
         decode using this encoding can still be read as Binary.
+    skip_rows_after_names: int, optional (default 0)

Review comment:
       Can you move this up just after `skip_rows`?

##########
File path: cpp/src/arrow/util/delimiting.cc
##########
@@ -138,4 +167,25 @@ Status Chunker::ProcessFinal(std::shared_ptr<Buffer> partial,
   return Status::OK();
 }
 
+Status Chunker::ProcessSkip(std::shared_ptr<Buffer> partial,
+                            std::shared_ptr<Buffer> block, bool final, uint64_t* count,
+                            std::shared_ptr<Buffer>* rest) {
+  DCHECK_GT(*count, 0);
+  int64_t pos;
+  ARROW_ASSIGN_OR_RAISE(auto skipped,
+                        boundary_finder_->FindN(util::string_view(*partial),
+                                                util::string_view(*block), *count, &pos));
+  if (pos == BoundaryFinder::kNoDelimiterFound) {
+    return StraddlingTooLarge();
+  }
+  if (ARROW_PREDICT_FALSE(final && *count > skipped && block->size() != pos)) {
+    ++skipped;

Review comment:
       Why? Can you add a comment?

##########
File path: cpp/src/arrow/csv/chunker.cc
##########
@@ -234,6 +235,38 @@ class LexingBoundaryFinder : public BoundaryFinder {
     return Status::OK();
   }
 
+  Result<uint64_t> FindN(util::string_view partial, util::string_view block,
+                         uint64_t count, int64_t* out_pos) override {
+    Lexer<quoting, escaping> lexer(options_);
+    uint64_t found = 0;
+    const char* data = block.data();
+    const char* const data_end = block.data() + block.size();
+
+    const char* line_end;
+    if (partial.size()) {

Review comment:
       Is this conditional useful?

##########
File path: cpp/src/arrow/csv/chunker_test.cc
##########
@@ -261,5 +291,43 @@ TEST_P(BaseChunkerTest, EscapingNewline) {
   }
 }
 
+TEST_P(BaseChunkerTest, ParseSkip) {
+  {
+    auto csv = MakeCSVData({"ab,c,\n", "def,,gh\n", ",ij,kl\n"});
+    ASSERT_NO_FATAL_FAILURE(AssertSkip(csv, 1, 0, 15));

Review comment:
       I don't think `ASSERT_NO_FATAL_FAILURE` is useful, but we can keep it if you want.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org