You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/10/13 12:38:24 UTC
[GitHub] [arrow] westonpace commented on a diff in pull request #14269: ARROW-17481: [C++][Python] Major performance improvements to CSV reading from S3

westonpace commented on code in PR #14269:
URL: https://github.com/apache/arrow/pull/14269#discussion_r994580591


##########
cpp/src/arrow/csv/reader.cc:
##########
@@ -887,30 +872,37 @@ class StreamingReaderImpl : public ReaderMixin,
  protected:
   Future<> InitAfterFirstBuffer(const std::shared_ptr<Buffer>& first_buffer,
                                 AsyncGenerator<std::shared_ptr<Buffer>> buffer_generator,
-                                int max_readahead) {
+                                int max_readahead, Executor* cpu_executor) {
     if (first_buffer == nullptr) {
       return Status::Invalid("Empty CSV file");
     }
 
     std::shared_ptr<Buffer> after_header;
-    ARROW_ASSIGN_OR_RAISE(auto header_bytes_consumed,
-                          ProcessHeader(first_buffer, &after_header));
+    ARROW_ASSIGN_OR_RAISE(
+        auto header_bytes_consumed,
+        ReaderMixin<InputType>::ProcessHeader(first_buffer, &after_header));
     bytes_decoded_->fetch_add(header_bytes_consumed);
 
-    auto parser_op =
-        BlockParsingOperator(io_context_, parse_options_, num_csv_cols_, num_rows_seen_);
+    auto parser_op = BlockParsingOperator(
+        ReaderMixin<InputType>::io_context_, ReaderMixin<InputType>::parse_options_,
+        ReaderMixin<InputType>::num_csv_cols_, ReaderMixin<InputType>::num_rows_seen_);
     ARROW_ASSIGN_OR_RAISE(
         auto decoder_op,
-        BlockDecodingOperator::Make(io_context_, convert_options_, conversion_schema_));
+        BlockDecodingOperator::Make(ReaderMixin<InputType>::io_context_,
+                                    ReaderMixin<InputType>::convert_options_,
+                                    ReaderMixin<InputType>::conversion_schema_));
 
     auto block_gen = SerialBlockReader::MakeAsyncIterator(
-        std::move(buffer_generator), MakeChunker(parse_options_), std::move(after_header),
-        read_options_.skip_rows_after_names);
+        std::move(buffer_generator), MakeChunker(ReaderMixin<InputType>::parse_options_),
+        std::move(after_header),
+        ReaderMixin<InputType>::read_options_.skip_rows_after_names);
     auto parsed_block_gen =
-        MakeMappedGenerator(std::move(block_gen), std::move(parser_op));
-    auto rb_gen = MakeMappedGenerator(std::move(parsed_block_gen), std::move(decoder_op));
+        MakeApplyGenerator(std::move(block_gen), std::move(parser_op), cpu_executor);
+    auto preparse_gen =
+        MakeSerialReadaheadGenerator(parsed_block_gen, cpu_executor->GetCapacity());
+    auto rb_gen = MakeMappedGenerator(std::move(preparse_gen), std::move(decoder_op));

Review Comment:
   ```suggestion
       auto parsed_ra_gen =
           MakeSerialReadaheadGenerator(parsed_block_gen, cpu_executor->GetCapacity());
       auto rb_gen = MakeMappedGenerator(std::move(parsed_ra_gen), std::move(decoder_op));
   ```
   
   `preparse_gen` is a bit of an odd name as this is after parsing.



##########
cpp/src/arrow/dataset/file_csv.cc:
##########
@@ -18,6 +18,7 @@
 #include "arrow/dataset/file_csv.h"
 
 #include <algorithm>
+#include <iostream>

Review Comment:
   ```suggestion
   ```



##########
cpp/src/arrow/csv/reader.h:
##########
@@ -25,6 +25,7 @@
 #include "arrow/result.h"
 #include "arrow/type.h"
 #include "arrow/type_fwd.h"
+#include "arrow/util/async_generator.h"

Review Comment:
   ```suggestion
   ```
   
   I don't think this is needed?  We have been mostly trying to treat `async_generator.h` as an internal header as it includes quite a few different things and can bloat header sizes.
   
   Can you put this in `reader.cc` if needed?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org