You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2020/05/22 01:15:03 UTC
[arrow] branch master updated: PARQUET-1861: [Parquet][Documentation] Clarify buffered stream option
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 5602c45 PARQUET-1861: [Parquet][Documentation] Clarify buffered stream option
5602c45 is described below
commit 5602c459eb8773b6be8059b1b118175e9f16b7a3
Author: François Saint-Jacques <fs...@gmail.com>
AuthorDate: Thu May 21 20:12:45 2020 -0500
PARQUET-1861: [Parquet][Documentation] Clarify buffered stream option
Closes #7221 from fsaintjacques/PARQUET-1861-buffered-stream
Authored-by: François Saint-Jacques <fs...@gmail.com>
Signed-off-by: Wes McKinney <we...@apache.org>
---
cpp/src/parquet/properties.h | 25 +++++++++++++------------
cpp/src/parquet/properties_test.cc | 4 ++--
2 files changed, 15 insertions(+), 14 deletions(-)
diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h
index 2b07569..2d9725c 100644
--- a/cpp/src/parquet/properties.h
+++ b/cpp/src/parquet/properties.h
@@ -59,31 +59,32 @@ struct ParquetVersion {
/// DataPageV2 at all.
enum class ParquetDataPageVersion { V1, V2 };
-static int64_t DEFAULT_BUFFER_SIZE = 1024;
-static bool DEFAULT_USE_BUFFERED_STREAM = false;
+/// Align the default buffer size to a small multiple of a page size.
+constexpr int64_t kDefaultBufferSize = 4096 * 4;
class PARQUET_EXPORT ReaderProperties {
public:
explicit ReaderProperties(MemoryPool* pool = ::arrow::default_memory_pool())
- : pool_(pool) {
- buffered_stream_enabled_ = DEFAULT_USE_BUFFERED_STREAM;
- buffer_size_ = DEFAULT_BUFFER_SIZE;
- }
+ : pool_(pool) {}
MemoryPool* memory_pool() const { return pool_; }
std::shared_ptr<ArrowInputStream> GetStream(std::shared_ptr<ArrowInputFile> source,
int64_t start, int64_t num_bytes);
+ /// Buffered stream reading allows the user to control the memory usage of
+ /// parquet readers. This ensures that all `RandomAccessFile::ReadAt` calls are
+ /// wrapped in a buffered reader that uses a fixed-size buffer (of size
+ /// `buffer_size()`) instead of the full size of the ReadAt.
+ ///
+ /// The primary reason for this control knob is resource control, not
+ /// performance.
bool is_buffered_stream_enabled() const { return buffered_stream_enabled_; }
-
void enable_buffered_stream() { buffered_stream_enabled_ = true; }
-
void disable_buffered_stream() { buffered_stream_enabled_ = false; }
- void set_buffer_size(int64_t buf_size) { buffer_size_ = buf_size; }
-
int64_t buffer_size() const { return buffer_size_; }
+ void set_buffer_size(int64_t size) { buffer_size_ = size; }
void file_decryption_properties(std::shared_ptr<FileDecryptionProperties> decryption) {
file_decryption_properties_ = std::move(decryption);
@@ -95,8 +96,8 @@ class PARQUET_EXPORT ReaderProperties {
private:
MemoryPool* pool_;
- int64_t buffer_size_;
- bool buffered_stream_enabled_;
+ int64_t buffer_size_ = kDefaultBufferSize;
+ bool buffered_stream_enabled_ = false;
std::shared_ptr<FileDecryptionProperties> file_decryption_properties_;
};
diff --git a/cpp/src/parquet/properties_test.cc b/cpp/src/parquet/properties_test.cc
index aef563b..39dbb7f 100644
--- a/cpp/src/parquet/properties_test.cc
+++ b/cpp/src/parquet/properties_test.cc
@@ -34,8 +34,8 @@ namespace test {
TEST(TestReaderProperties, Basics) {
ReaderProperties props;
- ASSERT_EQ(DEFAULT_BUFFER_SIZE, props.buffer_size());
- ASSERT_EQ(DEFAULT_USE_BUFFERED_STREAM, props.is_buffered_stream_enabled());
+ ASSERT_EQ(props.buffer_size(), kDefaultBufferSize);
+ ASSERT_FALSE(props.is_buffered_stream_enabled());
}
TEST(TestWriterProperties, Basics) {