You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ko...@apache.org on 2019/07/04 05:51:52 UTC
[arrow] 13/38: ARROW-5791: [C++] Fix infinite loop with more than
32768 columns.
This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit a262f42aa9e959d75ba6c1dd3dc2b67f6ad1ff85
Author: Micah Kornfield <em...@gmail.com>
AuthorDate: Mon Jul 1 13:42:14 2019 -0500
ARROW-5791: [C++] Fix infinite loop with more than 32768 columns.
But really 32768 columns should be enough for anyone :)
Author: Micah Kornfield <em...@gmail.com>
Closes #4762 from emkornfield/csv and squashes the following commits:
ab0504c16 <Micah Kornfield> lower number of columns in test to satisfy mingw
8f53a8a58 <Micah Kornfield> remove test
acfe2d894 <Micah Kornfield> remove cap, make min rows_in_chunk 512
08ddc2238 <Micah Kornfield> remove floor duplication
211472a12 <Micah Kornfield> powers of 2 are better
b91a9e177 <Micah Kornfield> ARROW-5791: Fix infinite loop with more than 32768 columns. Cap max columns
---
cpp/src/arrow/csv/parser-test.cc | 25 +++++++++++++++++++++++++
cpp/src/arrow/csv/parser.cc | 7 +++++--
2 files changed, 30 insertions(+), 2 deletions(-)
diff --git a/cpp/src/arrow/csv/parser-test.cc b/cpp/src/arrow/csv/parser-test.cc
index 3655230..d1790b2 100644
--- a/cpp/src/arrow/csv/parser-test.cc
+++ b/cpp/src/arrow/csv/parser-test.cc
@@ -439,6 +439,31 @@ TEST(BlockParser, Escaping) {
}
}
+// Generate test data with the given number of columns.
+std::string MakeLotsOfCsvColumns(int32_t num_columns) {
+ std::string values, header;
+ header.reserve(num_columns * 10);
+ values.reserve(num_columns * 10);
+ for (int x = 0; x < num_columns; x++) {
+ if (x != 0) {
+ header += ",";
+ values += ",";
+ }
+ header += "c" + std::to_string(x);
+ values += std::to_string(x);
+ }
+
+ header += "\n";
+ values += "\n";
+ return MakeCSVData({header, values});
+}
+
+TEST(BlockParser, LotsOfColumns) {
+ auto options = ParseOptions::Defaults();
+ BlockParser parser(options);
+ AssertParseOk(parser, MakeLotsOfCsvColumns(1024 * 100));
+}
+
TEST(BlockParser, QuotedEscape) {
auto options = ParseOptions::Defaults();
options.escaping = true;
diff --git a/cpp/src/arrow/csv/parser.cc b/cpp/src/arrow/csv/parser.cc
index a7ca71c..89c3f4c 100644
--- a/cpp/src/arrow/csv/parser.cc
+++ b/cpp/src/arrow/csv/parser.cc
@@ -397,16 +397,19 @@ Status BlockParser::DoParseSpecialized(const char* start, uint32_t size, bool is
return ParseError("Empty CSV file or block: cannot infer number of columns");
}
}
+
while (!finished_parsing && data < data_end && num_rows_ < max_num_rows_) {
// We know the number of columns, so can presize a values array for
// a given number of rows
DCHECK_GE(num_cols_, 0);
int32_t rows_in_chunk;
+ constexpr int32_t kTargetChunkSize = 32768;
if (num_cols_ > 0) {
- rows_in_chunk = std::min(32768 / num_cols_, max_num_rows_ - num_rows_);
+ rows_in_chunk = std::min(std::max(kTargetChunkSize / num_cols_, 512),
+ max_num_rows_ - num_rows_);
} else {
- rows_in_chunk = std::min(32768, max_num_rows_ - num_rows_);
+ rows_in_chunk = std::min(kTargetChunkSize, max_num_rows_ - num_rows_);
}
PresizedValuesWriter values_writer(pool_, rows_in_chunk, num_cols_);