You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ko...@apache.org on 2019/07/04 05:51:52 UTC

[arrow] 13/38: ARROW-5791: [C++] Fix infinite loop with more the 32768 columns.

This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit a262f42aa9e959d75ba6c1dd3dc2b67f6ad1ff85
Author: Micah Kornfield <em...@gmail.com>
AuthorDate: Mon Jul 1 13:42:14 2019 -0500

    ARROW-5791: [C++] Fix infinite loop with more the 32768 columns.
    
    But really 32768 columns should be enough for anyone :)
    
    Author: Micah Kornfield <em...@gmail.com>
    
    Closes #4762 from emkornfield/csv and squashes the following commits:
    
    ab0504c16 <Micah Kornfield> lower number of columns in test to satisfy ming
    8f53a8a58 <Micah Kornfield> remove test
    acfe2d894 <Micah Kornfield> remove cap, make min rows_in_chunk 512
    08ddc2238 <Micah Kornfield> remove floor duplication
    211472a12 <Micah Kornfield> powers of 2 are better
    b91a9e177 <Micah Kornfield> ARROW-5791:  Fix infinite loop with more the 32768 columns.  Cap max columns
---
 cpp/src/arrow/csv/parser-test.cc | 25 +++++++++++++++++++++++++
 cpp/src/arrow/csv/parser.cc      |  7 +++++--
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/cpp/src/arrow/csv/parser-test.cc b/cpp/src/arrow/csv/parser-test.cc
index 3655230..d1790b2 100644
--- a/cpp/src/arrow/csv/parser-test.cc
+++ b/cpp/src/arrow/csv/parser-test.cc
@@ -439,6 +439,31 @@ TEST(BlockParser, Escaping) {
   }
 }
 
+// Generate test data with the given number of columns.
+std::string MakeLotsOfCsvColumns(int32_t num_columns) {
+  std::string values, header;
+  header.reserve(num_columns * 10);
+  values.reserve(num_columns * 10);
+  for (int x = 0; x < num_columns; x++) {
+    if (x != 0) {
+      header += ",";
+      values += ",";
+    }
+    header += "c" + std::to_string(x);
+    values += std::to_string(x);
+  }
+
+  header += "\n";
+  values += "\n";
+  return MakeCSVData({header, values});
+}
+
+TEST(BlockParser, LotsOfColumns) {
+  auto options = ParseOptions::Defaults();
+  BlockParser parser(options);
+  AssertParseOk(parser, MakeLotsOfCsvColumns(1024 * 100));
+}
+
 TEST(BlockParser, QuotedEscape) {
   auto options = ParseOptions::Defaults();
   options.escaping = true;
diff --git a/cpp/src/arrow/csv/parser.cc b/cpp/src/arrow/csv/parser.cc
index a7ca71c..89c3f4c 100644
--- a/cpp/src/arrow/csv/parser.cc
+++ b/cpp/src/arrow/csv/parser.cc
@@ -397,16 +397,19 @@ Status BlockParser::DoParseSpecialized(const char* start, uint32_t size, bool is
       return ParseError("Empty CSV file or block: cannot infer number of columns");
     }
   }
+
   while (!finished_parsing && data < data_end && num_rows_ < max_num_rows_) {
     // We know the number of columns, so can presize a values array for
     // a given number of rows
     DCHECK_GE(num_cols_, 0);
 
     int32_t rows_in_chunk;
+    constexpr int32_t kTargetChunkSize = 32768;
     if (num_cols_ > 0) {
-      rows_in_chunk = std::min(32768 / num_cols_, max_num_rows_ - num_rows_);
+      rows_in_chunk = std::min(std::max(kTargetChunkSize / num_cols_, 512),
+                               max_num_rows_ - num_rows_);
     } else {
-      rows_in_chunk = std::min(32768, max_num_rows_ - num_rows_);
+      rows_in_chunk = std::min(kTargetChunkSize, max_num_rows_ - num_rows_);
     }
 
     PresizedValuesWriter values_writer(pool_, rows_in_chunk, num_cols_);