You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2022/05/31 17:07:35 UTC
[arrow] branch master updated: ARROW-16613: [C++][Parquet] Fix performance of repeated calls to AppendRowGroups()
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 7fee9db84a ARROW-16613: [C++][Parquet] Fix performance of repeated calls to AppendRowGroups()
7fee9db84a is described below
commit 7fee9db84a8bbefa62f576751e3b51118d12cab3
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Tue May 31 19:07:27 2022 +0200
ARROW-16613: [C++][Parquet] Fix performance of repeated calls to AppendRowGroups()
Repeated calls to `FileMetaData::AppendRowGroups()` would be O(n²) due to incorrect use of `std::vector::reserve`.
Closes #13265 from pitrou/ARROW-16613-parquet-metadata-complexity
Authored-by: Antoine Pitrou <an...@python.org>
Signed-off-by: Antoine Pitrou <an...@python.org>
---
cpp/src/parquet/metadata.cc | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc
index 1acb1752a3..102fa874ad 100644
--- a/cpp/src/parquet/metadata.cc
+++ b/cpp/src/parquet/metadata.cc
@@ -656,11 +656,15 @@ class FileMetaData::FileMetaDataImpl {
// ARROW-13654: `other` may point to self, be careful not to enter an infinite loop
const int n = other->num_row_groups();
- metadata_->row_groups.reserve(metadata_->row_groups.size() + n);
+ // ARROW-16613: do not use reserve() as that may suppress overallocation
+ // and incur O(n²) behavior on repeated calls to AppendRowGroups().
+ // (see https://en.cppreference.com/w/cpp/container/vector/reserve
+ // about inappropriate uses of reserve()).
+ const auto start = metadata_->row_groups.size();
+ metadata_->row_groups.resize(start + n);
for (int i = 0; i < n; i++) {
- format::RowGroup other_rg = other->row_group(i);
- metadata_->num_rows += other_rg.num_rows;
- metadata_->row_groups.push_back(std::move(other_rg));
+ metadata_->row_groups[start + i] = other->row_group(i);
+ metadata_->num_rows += metadata_->row_groups[start + i].num_rows;
}
}