You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2022/05/31 17:07:35 UTC

[arrow] branch master updated: ARROW-16613: [C++][Parquet] Fix performance of repeated calls to AppendRowGroups()

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 7fee9db84a ARROW-16613: [C++][Parquet] Fix performance of repeated calls to AppendRowGroups()
7fee9db84a is described below

commit 7fee9db84a8bbefa62f576751e3b51118d12cab3
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Tue May 31 19:07:27 2022 +0200

    ARROW-16613: [C++][Parquet] Fix performance of repeated calls to AppendRowGroups()
    
    Repeated calls to `FileMetaData::AppendRowGroups()` would be O(n²) due to incorrect use of `std::vector::reserve`.
    
    Closes #13265 from pitrou/ARROW-16613-parquet-metadata-complexity
    
    Authored-by: Antoine Pitrou <an...@python.org>
    Signed-off-by: Antoine Pitrou <an...@python.org>
---
 cpp/src/parquet/metadata.cc | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc
index 1acb1752a3..102fa874ad 100644
--- a/cpp/src/parquet/metadata.cc
+++ b/cpp/src/parquet/metadata.cc
@@ -656,11 +656,15 @@ class FileMetaData::FileMetaDataImpl {
 
     // ARROW-13654: `other` may point to self, be careful not to enter an infinite loop
     const int n = other->num_row_groups();
-    metadata_->row_groups.reserve(metadata_->row_groups.size() + n);
+    // ARROW-16613: do not use reserve() as that may suppress overallocation
+    // and incur O(n²) behavior on repeated calls to AppendRowGroups().
+    // (see https://en.cppreference.com/w/cpp/container/vector/reserve
+    //  about inappropriate uses of reserve()).
+    const auto start = metadata_->row_groups.size();
+    metadata_->row_groups.resize(start + n);
     for (int i = 0; i < n; i++) {
-      format::RowGroup other_rg = other->row_group(i);
-      metadata_->num_rows += other_rg.num_rows;
-      metadata_->row_groups.push_back(std::move(other_rg));
+      metadata_->row_groups[start + i] = other->row_group(i);
+      metadata_->num_rows += metadata_->row_groups[start + i].num_rows;
     }
   }